In [49]:
import pandas as pd

# Load the dataset CSV from the local data directory into a DataFrame.
dataset_path = './data/final_dataset.csv'
df = pd.read_csv(dataset_path)

## Data Description

Each row contains the properties of one particular established session (flow).

| Attribute       | Description                               |
|-----------------|-------------------------------------------|
| fl_dur          | Flow duration                              |
| tot_fw_pk       | Total packets in the forward direction   |
| tot_bw_pk       | Total packets in the backward direction  |
| tot_l_fw_pkt    | Total size of packet in forward direction |
| fw_pkt_l_max    | Maximum size of packet in forward direction |
| fw_pkt_l_min    | Minimum size of packet in forward direction |
| fw_pkt_l_avg    | Average size of packet in forward direction |
| fw_pkt_l_std    | Standard deviation size of packet in forward direction |
| Bw_pkt_l_max    | Maximum size of packet in backward direction |
| Bw_pkt_l_min    | Minimum size of packet in backward direction |
| Bw_pkt_l_avg    | Mean size of packet in backward direction |
| Bw_pkt_l_std    | Standard deviation size of packet in backward direction |
| fl_byt_s        | Flow byte rate that is the number of bytes transferred per second |
| fl_pkt_s        | Flow packets rate that is the number of packets transferred per second |
| fl_iat_avg      | Average time between two packets in the flow |
| fl_iat_std      | Standard deviation of the time between two packets in the flow |
| fl_iat_max      | Maximum time between two packets in the flow |
| fl_iat_min      | Minimum time between two packets in the flow |
| fw_iat_tot      | Total time between two packets sent in the forward direction |
| fw_iat_avg      | Mean time between two packets sent in the forward direction |
| fw_iat_std      | Standard deviation time between two packets sent in the forward direction |
| fw_iat_max      | Maximum time between two packets sent in the forward direction |
| fw_iat_min      | Minimum time between two packets sent in the forward direction |
| bw_iat_tot      | Total time between two packets sent in the backward direction |
| bw_iat_avg      | Mean time between two packets sent in the backward direction |
| bw_iat_std      | Standard deviation time between two packets sent in the backward direction |
| bw_iat_max      | Maximum time between two packets sent in the backward direction |
| bw_iat_min      | Minimum time between two packets sent in the backward direction |
| fw_psh_flag     | Number of times the PSH flag was set in packets traveling in the forward direction (0 for UDP) |
| bw_psh_flag     | Number of times the PSH flag was set in packets traveling in the backward direction (0 for UDP) |
| fw_urg_flag     | Number of times the URG flag was set in packets traveling in the forward direction (0 for UDP) |
| bw_urg_flag     | Number of times the URG flag was set in packets traveling in the backward direction (0 for UDP) |
| fw_hdr_len      | Total bytes used for headers in the forward direction |
| bw_hdr_len      | Total bytes used for headers in the backward direction |
| fw_pkt_s        | Number of forward packets per second    |
| bw_pkt_s        | Number of backward packets per second   |
| pkt_len_min     | Minimum length of a flow                |
| pkt_len_max     | Maximum length of a flow                |
| pkt_len_avg     | Mean length of a flow                   |
| pkt_len_std     | Standard deviation length of a flow     |
| pkt_len_va      | Variance of the packet length           |
| fin_cnt         | Number of packets with FIN              |
| syn_cnt         | Number of packets with SYN              |
| rst_cnt         | Number of packets with RST              |
| pst_cnt         | Number of packets with PUSH             |
| ack_cnt         | Number of packets with ACK              |
| urg_cnt         | Number of packets with URG              |
| cwe_cnt         | Number of packets with CWE              |
| ece_cnt         | Number of packets with ECE              |
| down_up_ratio   | Download and upload ratio               |
| pkt_size_avg    | Average size of the packet              |
| fw_seg_avg      | Average size observed in the forward direction |
| bw_seg_avg      | Average size observed in the backward direction |
| fw_byt_blk_avg  | Average number of bytes bulk rate in the forward direction |
| fw_pkt_blk_avg  | Average number of packets bulk rate in the forward direction |
| fw_blk_rate_avg | Average number of bulk rate in the forward direction |
| bw_byt_blk_avg  | Average number of bytes bulk rate in the backward direction |
| bw_pkt_blk_avg  | Average number of packets bulk rate in the backward direction |
| bw_blk_rate_avg | Average number of bulk rate in the backward direction |
| subfl_fw_pk     | The average number of packets in a subflow in the forward direction |
| subfl_fw_byt    | The average number of bytes in a subflow in the forward direction |
| subfl_bw_pk     | The average number of packets in a subflow in the backward direction |
| subfl_bw_byt    | The average number of bytes in a subflow in the backward direction |
| fw_win_byt      | Number of bytes sent in initial window in the forward direction |
| bw_win_byt      | Number of bytes sent in initial window in the backward direction |
| Fw_act_pkt      | Number of packets with at least 1 byte of TCP data payload in the forward direction |
| fw_seg_min      | Minimum segment size observed in the forward direction |
| atv_avg         | Mean time a flow was active before becoming idle |
| atv_std         | Standard deviation time a flow was active before becoming idle |
| atv_max         | Maximum time a flow was active before becoming idle |
| atv_min         | Minimum time a flow was active before becoming idle |
| idl_avg         | Mean time a flow was idle before becoming active |
| idl_std         | Standard deviation time a flow was idle before becoming active |
| idl_max         | Maximum time a flow was idle before becoming active |
| idl_min         | Minimum time a flow was idle before becoming active |


In [42]:
# Show the dataset dimensions as a (rows, columns) tuple.
df.shape

(12794627, 85)

In [50]:
# Remove pandas' display truncation for both columns and rows, then preview
# the first five records of the dataset.
# NOTE(review): with 'display.max_rows' set to None, any frame displayed in
# full afterwards is rendered without row truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,624,192.168.4.118-203.73.24.75-4504-80-6,192.168.4.118,4504,203.73.24.75,80,6,12/06/2010 08:34:32 AM,3974862,29,44,86.0,59811.0,86.0,0.0,2.965517,15.969799,1460.0,0.0,1359.340909,372.02719,15068.950821,18.365417,55206.416667,195478.316654,1566821.0,167.0,3735347.0,133405.25,341775.688712,1805015.0,167.0,3974862.0,92438.651163,248174.820574,1566821.0,3997.0,0,0,0,0,768,896,7.295851,11.069567,0.0,1460.0,809.418919,728.862428,531240.438541,0,1,0,0,0,0,0,0,1.0,820.506849,2.965517,1359.340909,0,0,0,0,0,0,29,86,44,59811,-1,5840,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
1,625,192.168.4.118-203.73.24.75-4504-80-6,192.168.4.118,4504,203.73.24.75,80,6,12/06/2010 08:34:36 AM,63,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31746.031746,63.0,0.0,63.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,20,20,15873.015873,15873.015873,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,1,0,-1,17520,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
2,626,192.168.4.118-203.73.24.75-4505-80-6,192.168.4.118,4505,203.73.24.75,80,6,12/06/2010 08:34:36 AM,476078,2,6,86.0,3037.0,86.0,0.0,43.0,60.811183,1460.0,0.0,506.166667,740.224403,6559.849436,16.803969,68011.142857,110862.707451,232203.0,6.0,134.0,134.0,0.0,134.0,134.0,476078.0,95215.6,123467.30358,232601.0,6.0,0,0,0,0,40,136,4.200992,12.602977,0.0,1460.0,347.0,632.515217,400075.5,0,1,0,0,0,0,0,0,3.0,390.375,43.0,506.166667,0,0,0,0,0,0,2,86,6,3037,-1,5840,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
3,627,192.168.4.118-203.73.24.75-4505-80-6,192.168.4.118,4505,203.73.24.75,80,6,12/06/2010 08:34:37 AM,151,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19867.549669,75.5,98.287843,145.0,6.0,145.0,145.0,0.0,145.0,145.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40,20,13245.033113,6622.516556,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,2,0,1,0,-1,17520,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
4,628,192.168.4.118-203.73.24.75-4506-80-6,192.168.4.118,4506,203.73.24.75,80,6,12/06/2010 08:34:37 AM,472507,2,5,73.0,1050.0,73.0,0.0,36.5,51.618795,1050.0,0.0,210.0,469.574275,2376.684367,14.814595,78751.166667,118675.492251,232355.0,7.0,179.0,179.0,0.0,179.0,179.0,472507.0,118126.75,131726.857333,232875.0,7.0,0,0,0,0,40,116,4.232742,10.581854,0.0,1050.0,140.375,368.430624,135741.125,0,1,0,0,0,0,0,0,2.0,160.428571,36.5,210.0,0,0,0,0,0,0,2,73,5,1050,-1,5840,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# NOTE(review): the captured output reports inf means and NaN standard
# deviations for what appear to be "Flow Byts/s" and "Flow Pkts/s", so those
# columns seem to contain infinite values - confirm before further analysis.
df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0.1,Unnamed: 0,Src Port,Dst Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12764910.0,12794630.0,12794630.0,12794630.0,12794627.0,12794630.0,12794627.0,12794630.0,12794630.0,12794627.0,12794630.0,12794627.0,12794630.0,12794630.0,12794627.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794627.0,12794627.0,12794627.0,12794627.0,12794627.0,12794627.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794630.0,12794627.0,12794630.0,12794627.0,12794627.0
mean,2662390.0,37070.54,14642.9,8223957.0,27.19636,4.974281,1125.727,3264.957,292.2581,7.095572,76.22122,130.7919,301.0288,16.94691,91.89326,122.4333,inf,inf,1858787.0,954755.2,4340040.0,1450054.0,7144640.5,1909382.0,733794.1,3354991.0,1461141.0,5800243.0,840076.6,1034288.0,2614625.5,217372.5,278.7391,114.2484,15820.86,9268.565,7.161795,432.4783,78.25745,141.0938,47930.95,0.6935632,89.68109,76.22122,91.89326,0.0,0.0,0.0,0.0,0.0,0.0,27.19636,1125.726,4.974281,3264.955,2834214000.0,1211272000.0,24.14145,8.117674,136969.8,69363.45,204986.8,90305.39,3119924.0,109353.2,3215790.5,3019043.5
std,2169382.0,25219.85,23063.83,25147280.0,1720.577,250.9204,54791.57,538767.8,393.1046,19.74606,94.38475,189.0678,443.1349,42.53935,141.4036,181.3809,,,8888754.0,3430755.0,13021731.0,8795805.0,24485164.0,8982503.0,3547276.0,12333053.0,8819990.0,21381280.0,4042258.0,3713835.0,9532594.0,3373672.0,14176.07,5019.706,154545.4,79908.59,17.62173,515.7801,94.38339,167.4021,180488.6,0.9748132,100.5043,94.38475,141.4036,0.0,0.0,0.0,0.0,0.0,0.0,1720.577,54791.57,250.9204,538767.8,2034718000.0,1932654000.0,1711.587,9.138129,2301480.0,1399946.0,3050190.0,1934856.0,12192611.0,1414693.0,12444760.0,12065274.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-2000000.0,-1.0,0.0,-1.0,-13.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,898483.0,443.0,80.0,1262.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.215615,945.8333,0.0,1053.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,16.0,0.9515051,0.9220341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,8192.0,219.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2040991.0,50592.0,80.0,32066.0,2.0,1.0,42.0,112.0,40.0,0.0,36.0,0.0,100.0,0.0,68.0,0.0,293.7508,88.07081,18925.33,251.0229,30046.0,47.0,1272.0,457.3333,0.0,1022.0,21.0,0.0,0.0,0.0,0.0,0.0,40.0,32.0,38.51783,21.78697,0.0,101.0,54.5,34.64102,1200.0,1.0,76.5,36.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,42.0,1.0,112.0,4294967000.0,32738.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3903946.0,56215.0,38550.0,4159749.0,4.0,4.0,935.0,358.0,677.0,0.0,142.0,222.6233,350.0,0.0,113.6667,180.5,12214.05,2042.206,507729.3,456505.7,3894913.5,1973.0,141571.5,51542.0,33505.09,105016.0,89.0,1911058.5,384651.0,362895.6,969091.0,328.0,136.0,136.0,890.4719,571.1022,0.0,935.0,143.2222,317.6552,100904.9,1.0,160.375,142.0,113.6667,0.0,0.0,0.0,0.0,0.0,0.0,4.0,935.0,4.0,358.0,4294967000.0,4294967000.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7902474.0,65535.0,65535.0,4294967000.0,309628.0,291923.0,9908096.0,655452700.0,23360.0,1472.0,4660.441,7137.621,65160.0,3684.0,30375.08,22448.41,inf,inf,119999300.0,84808080.0,119999736.0,119999300.0,120000000.0,119999300.0,84852690.0,119999872.0,119999300.0,119999912.0,119963800.0,84837190.0,119977904.0,119963800.0,4644876.0,5838460.0,6000000.0,3000000.0,1460.0,65160.0,16691.33,22463.89,504626400.0,311.0,16801.14,4660.441,30375.08,0.0,0.0,0.0,0.0,0.0,0.0,309628.0,9908096.0,291923.0,655452700.0,4294967000.0,4294967000.0,309628.0,48.0,113269100.0,75232420.0,113269100.0,113269100.0,119999736.0,76393950.0,119999736.0,119999736.0


In [44]:
# Count how many rows fall under each value of the "Protocol" column.
rows_by_protocol = df.groupby("Protocol")
protocols = rows_by_protocol.size()
protocols

  protocols = df.groupby("Protocol").size()


Protocol
6     10489144
17     2189941
0       115542
dtype: int64

In [47]:
# Count how many rows carry each value of the "Label" column.
# NOTE(review): the result is stored under the name `protocols`, replacing
# any earlier value bound to that name, although it holds per-label counts.
rows_by_label = df.groupby("Label")
protocols = rows_by_label.size()
protocols

  protocols = df.groupby("Label").size()


Label
ddos      6472647
Benign    6321980
dtype: int64

## Data Preparation

#### Filtering only TCP connections

In [45]:
# Keep only the TCP rows, i.e. those whose "Protocol" value is 6.
# The comparison is done on the string form of the column so the filter
# works whether read_csv parsed "Protocol" as integers or as strings; a
# direct comparison of a numeric column with '6' matches no rows at all.
filltered = df[df["Protocol"].astype(str) == '6']
filltered.shape

(10489144, 85)