# EDA, Balancing and Sampling

In [1]:
import os.path

import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# Switches for the file-producing steps of this notebook:
#   generate_dfs     -> allow writing the balanced / sampled CSV files
#   generate_reports -> allow building the profiling HTML report
generate_dfs: bool = True
generate_reports: bool = True

In [3]:
# The dataset is split across two CSV files; load each into its own frame.
part_1, part_2 = map(
    pd.read_csv,
    ("dataset/dataset-part1.csv", "dataset/dataset-part2.csv"),
)

In [4]:
# Stack both parts row-wise and renumber the rows from 0.
df = pd.concat((part_1, part_2), axis=0, ignore_index=True)

## Exploratory Data Analysis

In [5]:
# Show every column name as a plain array.
df.columns.to_numpy()

array(['BIFLOW_DIRECTION', 'DIRECTION', 'DST_TO_SRC_SECOND_BYTES',
       'FIREWALL_EVENT', 'FIRST_SWITCHED', 'FLOW_ACTIVE_TIMEOUT',
       'FLOW_DURATION_MICROSECONDS', 'FLOW_DURATION_MILLISECONDS',
       'FLOW_END_MILLISECONDS', 'FLOW_END_SEC', 'FLOW_ID',
       'FLOW_INACTIVE_TIMEOUT', 'FLOW_START_MILLISECONDS',
       'FLOW_START_SEC', 'FRAME_LENGTH', 'IN_BYTES', 'IN_PKTS',
       'IPV4_DST_ADDR', 'IPV4_SRC_ADDR', 'L4_DST_PORT', 'L4_SRC_PORT',
       'LAST_SWITCHED', 'MAX_IP_PKT_LEN', 'MIN_IP_PKT_LEN',
       'OOORDER_IN_PKTS', 'OOORDER_OUT_PKTS', 'OUT_BYTES', 'OUT_PKTS',
       'PROTOCOL', 'PROTOCOL_MAP', 'RETRANSMITTED_IN_BYTES',
       'RETRANSMITTED_IN_PKTS', 'RETRANSMITTED_OUT_BYTES',
       'RETRANSMITTED_OUT_PKTS', 'SRC_TO_DST_SECOND_BYTES', 'TCP_FLAGS',
       'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT', 'TCP_WIN_MIN_IN',
       'TCP_WIN_MIN_OUT', 'TCP_WIN_MSS_IN', 'TCP_WIN_MSS_OUT',
       'TCP_WIN_SCALE_IN', 'TCP_WIN_SCALE_OUT', 'SRC_TOS', 'DST_TOS',
       'L7_PROTO_NAME', 'SAM

In [6]:
# Print the frame summary: number of rows, column names and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12207873 entries, 0 to 12207872
Data columns (total 50 columns):
 #   Column                      Dtype 
---  ------                      ----- 
 0   BIFLOW_DIRECTION            int64 
 1   DIRECTION                   int64 
 2   DST_TO_SRC_SECOND_BYTES     object
 3   FIREWALL_EVENT              int64 
 4   FIRST_SWITCHED              int64 
 5   FLOW_ACTIVE_TIMEOUT         int64 
 6   FLOW_DURATION_MICROSECONDS  int64 
 7   FLOW_DURATION_MILLISECONDS  int64 
 8   FLOW_END_MILLISECONDS       int64 
 9   FLOW_END_SEC                int64 
 10  FLOW_ID                     int64 
 11  FLOW_INACTIVE_TIMEOUT       int64 
 12  FLOW_START_MILLISECONDS     int64 
 13  FLOW_START_SEC              int64 
 14  FRAME_LENGTH                int64 
 15  IN_BYTES                    int64 
 16  IN_PKTS                     int64 
 17  IPV4_DST_ADDR               object
 18  IPV4_SRC_ADDR               object
 19  L4_DST_PORT                 int64 
 20  

In [7]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,BIFLOW_DIRECTION,DIRECTION,DST_TO_SRC_SECOND_BYTES,FIREWALL_EVENT,FIRST_SWITCHED,FLOW_ACTIVE_TIMEOUT,FLOW_DURATION_MICROSECONDS,FLOW_DURATION_MILLISECONDS,FLOW_END_MILLISECONDS,FLOW_END_SEC,...,TCP_WIN_MSS_IN,TCP_WIN_MSS_OUT,TCP_WIN_SCALE_IN,TCP_WIN_SCALE_OUT,SRC_TOS,DST_TOS,L7_PROTO_NAME,SAMPLING_INTERVAL,TOTAL_FLOWS_EXP,LABEL
0,1,0,40,0,1616660040,120,339,0,1616660040010,1616660040,...,1460,0,0,0,0,0,Unknown,1,2293398,Normal flow
1,1,0,",",0,1616660040,120,0,0,1616660040068,1616660040,...,0,0,0,0,40,0,ICMP,1,2293400,Normal flow
2,1,0,104,0,1616660040,120,44725,44,1616660040114,1616660040,...,0,0,0,0,0,0,TLS,1,2293404,Normal flow
3,1,0,",",0,1616660040,120,0,0,1616660040122,1616660040,...,1440,0,8,0,40,0,Unknown,1,2293407,Normal flow
4,1,0,40,0,1616660040,120,1114,1,1616660040184,1616660040,...,0,0,0,0,0,0,TLS,1,2293409,Normal flow


In [8]:
# Count / unique / top / freq statistics for the object-typed columns only.
df.describe(include=[object])

Unnamed: 0,DST_TO_SRC_SECOND_BYTES,IPV4_DST_ADDR,IPV4_SRC_ADDR,PROTOCOL_MAP,SRC_TO_DST_SECOND_BYTES,L7_PROTO_NAME,LABEL
count,12207873,12207873,12207873,12207873,12207873,12207873,12207873
unique,569243,440887,88463,5,3160289,457,4
top,40,10.114.224.65,10.114.241.166,tcp,44,Unknown,Normal flow
freq,3053652,3604686,5671376,8950058,2534718,4090802,6570058


In [9]:
# Names of all columns stored with the generic object dtype.
object_columns = df.select_dtypes(include=[object]).columns
# Preview only those columns.
df.loc[:, object_columns].head()

Unnamed: 0,DST_TO_SRC_SECOND_BYTES,IPV4_DST_ADDR,IPV4_SRC_ADDR,PROTOCOL_MAP,SRC_TO_DST_SECOND_BYTES,L7_PROTO_NAME,LABEL
0,40,10.114.225.212,162.142.125.173,tcp,44,Unknown,Normal flow
1,",",10.114.225.215,45.79.106.170,icmp,68,ICMP,Normal flow
2,104,10.114.241.165,10.114.224.65,tcp,189,TLS,Normal flow
3,",",10.114.226.23,202.179.91.28,tcp,52,Unknown,Normal flow
4,40,10.114.224.65,10.114.241.165,tcp,189,TLS,Normal flow


In [10]:
# Distinct class names present in the LABEL column, in order of first appearance.
pd.unique(df["LABEL"])

array(['Normal flow', 'SYN Scan - aggressive',
       'Denial of Service R-U-Dead-Yet', 'Denial of Service Slowloris'],
      dtype=object)

In [11]:
# List the columns that hold at least one missing value.
df.columns[df.isnull().any(axis=0)].to_list()
# An empty list here means the frame has no NaN at all.

[]

In [13]:
# Build the (expensive) profiling report only when reports are enabled and no
# report file from a previous run is already on disk.
if generate_reports and not os.path.isfile("reports/EDA.html"):
    # Create the output directory first: writing the report into a missing
    # directory would fail only after the whole profile has been computed.
    os.makedirs("reports", exist_ok=True)
    profile = ProfileReport(df, title="Exploratory Data Analysis - Clean DF", minimal=True)
    profile.to_file("reports/EDA.html")

## Dataset Balancing

In [14]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Oversample only the minority class of whatever data is passed to fit_resample.
# random_state is fixed so the duplicated rows are the same on every run
# (same seed value as the RandomUnderSampler used later in this notebook).
ros = RandomOverSampler(sampling_strategy='minority', random_state=0)

In [15]:
# Integer code -> class name, numbered in the order listed below.
labels_dic = dict(enumerate((
    "Normal flow",
    "SYN Scan - aggressive",
    "Denial of Service R-U-Dead-Yet",
    "Denial of Service Slowloris",
)))

In [16]:
# One sub-frame per class, unpacked in label-code order (0, 1, 2, 3).
normal_flow, syn_attacks, r_u_dead_attacks, dos_attacks = (
    df.loc[df["LABEL"] == labels_dic[code]] for code in range(4)
)

In [17]:
# Row count of each class, one per line: normal, SYN scan, R-U-Dead-Yet, Slowloris.
print(
    *map(len, (normal_flow, syn_attacks, r_u_dead_attacks, dos_attacks)),
    sep="\n",
)

6570058
2496814
2276947
864054


### Oversampling the Slowloris DoS attack

In [18]:
# Pair the SYN-scan rows with the Slowloris rows, then separate the
# feature columns from the LABEL target for the sampler.
attacks_oversampling = pd.concat((syn_attacks, dos_attacks))
x_oversampling = attacks_oversampling.drop(columns=["LABEL"])
y_oversampling = attacks_oversampling["LABEL"]

In [19]:
# Run the oversampler on the two-class subset.
X_oversampling_res, y_oversampling_res = ros.fit_resample(
    X=x_oversampling,
    y=y_oversampling,
)

In [20]:
# Report how many rows each class has after oversampling.
print('Resampled dataset shape %s' % (Counter(y_oversampling_res),))

Resampled dataset shape Counter({'SYN Scan - aggressive': 2496814, 'Denial of Service Slowloris': 2496814})


In [21]:
# New frame: the resampled features with their labels appended as the LABEL column.
oversampling_result = X_oversampling_res.assign(LABEL=y_oversampling_res)

In [22]:
# Rebuild the working frame: untouched normal and R-U-Dead-Yet rows plus the
# oversampled SYN / Slowloris rows. The original row labels are kept.
df = pd.concat((normal_flow, r_u_dead_attacks, oversampling_result), axis=0)

### Undersampling benign

In [27]:
from imblearn.under_sampling import RandomUnderSampler

In [23]:
# Re-split the rebuilt frame by class so the subsets reflect the oversampled data.
label_column = df["LABEL"]
normal_flow = df[label_column.eq(labels_dic[0])]
syn_attacks = df[label_column.eq(labels_dic[1])]
r_u_dead_attacks = df[label_column.eq(labels_dic[2])]
dos_attacks = df[label_column.eq(labels_dic[3])]
del label_column  # temporary helper, not part of the notebook state

In [24]:
# Two-class subset (SYN scan + normal flow) handed to the undersampler.
benign_us = pd.concat((syn_attacks, normal_flow), axis=0)

In [25]:
# Features (everything except LABEL) and target for the undersampler.
X = benign_us.drop(columns=["LABEL"])
y = benign_us["LABEL"]

In [28]:
# Seeded random undersampler; the sampling strategy is the library default, spelled out.
rus = RandomUnderSampler(sampling_strategy="auto", random_state=0)

In [29]:
# Run the undersampler on the SYN-scan / normal-flow subset.
X_undersampled, y_undersampled = rus.fit_resample(X=X, y=y)

# Report how many rows each class has afterwards.
print('Resampled dataset shape %s' % (Counter(y_undersampled),))

In [None]:
# undersample_res is an alias of X_undersampled (same object, no copy), so the
# LABEL column added below is visible through both names.
undersample_res = X_undersampled
undersample_res["LABEL"] = y_undersampled

In [None]:
# Final balanced frame: Slowloris and R-U-Dead-Yet rows plus the undersampled
# SYN-scan / normal-flow rows.
df = pd.concat((dos_attacks, r_u_dead_attacks, undersample_res), axis=0)

### Generate balanced csv file

In [None]:
# Write the balanced frame once: skipped when CSV generation is disabled or
# when a file from an earlier run already exists.
if generate_dfs and not os.path.isfile("dataset/balanced_df.csv"):
    df.to_csv(path_or_buf="dataset/balanced_df.csv", index_label=False)

# (rows, columns) of the balanced frame.
df.shape

## Sampling

In [None]:
# Stratified down-sampling: keep 5% of the rows of every LABEL group.
# random_state pins the draw so the sampled frame (and sampled_df.csv) is the
# same on every run; without it each execution picked different rows.
# NOTE: this rebinds `df`, so running the cell twice samples the already
# sampled frame again - restart and run all to get the intended 5%.
df = df.groupby('LABEL', group_keys=False).apply(
    lambda x: x.sample(frac=0.05, random_state=0)
)
# Write the sample once; an existing file from an earlier run is left untouched.
if not os.path.isfile("dataset/sampled_df.csv") and generate_dfs:
    df.to_csv("dataset/sampled_df.csv", index_label=False)

# (rows, columns) of the sampled frame.
df.shape