# EDA, Balancing and Sampling

In [None]:
import os.path

import pandas as pd
from pandas_profiling import ProfileReport

In [None]:
# Notebook-wide switches.
# When True, the profiling report is built if EDA.html is not on disk yet.
generate_reports: bool = True
# When True, the balanced / sampled CSV files are written if they are missing.
generate_dfs: bool = True

In [None]:
# The dataset is read from two CSV files; load each part into its own frame.
part_1 = pd.read_csv("dataset/dataset-part1.csv")
part_2 = pd.read_csv("dataset/dataset-part2.csv")

In [None]:
# Stack both parts into one frame; ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([part_1, part_2], ignore_index=True)

## Exploratory Data Analysis

In [None]:
# Column names as a NumPy array.
df.columns.values

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns.
df.describe(include=['object'])

In [None]:
# Peek at the first five rows of the object-dtype columns only.
object_columns = df.select_dtypes(include=['object']).columns
df.loc[:, object_columns].head()

In [None]:
# Distinct values found in the LABEL column.
df["LABEL"].unique()

In [None]:
# List every column that contains at least one NaN value.
df.columns[df.isna().any()].tolist()
# Author's note: the list came back empty, i.e. no column contains NaN.

In [None]:
# Build the (minimal) profiling report only when it is not on disk yet
# and report generation is switched on.
report_missing = not os.path.isfile("EDA.html")
if report_missing and generate_reports:
    profile = ProfileReport(
        df,
        title="Exploratory Data Analysis - Clean DF",
        minimal=True,
    )
    profile.to_file("EDA.html")

## Dataset Balancing

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Oversample only the minority class. The seed is fixed so repeated runs
# duplicate the same rows, matching the seeded RandomUnderSampler
# (random_state=0) used for the undersampling step of this notebook.
ros = RandomOverSampler(sampling_strategy='minority', random_state=0)

In [None]:
# Map each numeric class code (0..3) to the string used in the LABEL column.
labels_dic = dict(enumerate((
    "Normal flow",
    "SYN Scan - aggressive",
    "Denial of Service R-U-Dead-Yet",
    "Denial of Service Slowloris",
)))

In [None]:
# One sub-frame per class, following the code order of labels_dic:
# 0 -> normal flow, 1 -> SYN scan, 2 -> R-U-Dead-Yet, 3 -> Slowloris.
normal_flow, syn_attacks, r_u_dead_attacks, dos_attacks = (
    df.loc[df["LABEL"] == labels_dic[code]] for code in (0, 1, 2, 3)
)

In [None]:
# Row count of each class subset, one value per line.
print(
    len(normal_flow),
    len(syn_attacks),
    len(r_u_dead_attacks),
    len(dos_attacks),
    sep="\n",
)

### Oversampling DDoS attack

In [None]:
# Pool the two attack classes that are balanced against each other, then
# separate the feature columns from the LABEL target.
attacks_oversampling = pd.concat((syn_attacks, dos_attacks))
x_oversampling = attacks_oversampling.drop(columns="LABEL")
y_oversampling = attacks_oversampling["LABEL"]

In [None]:
# Resample features and labels together with the RandomOverSampler `ros`.
X_oversampling_res, y_oversampling_res = ros.fit_resample(x_oversampling, y_oversampling)

In [None]:
# Show how many rows each class has after oversampling.
oversampled_counts = Counter(y_oversampling_res)
print('Resampled dataset shape %s' % (oversampled_counts,))

In [None]:
# Re-attach the resampled labels to a copy of the resampled features
# (assign returns a new frame and leaves X_oversampling_res untouched).
oversampling_result = X_oversampling_res.assign(LABEL=y_oversampling_res)

In [None]:
# Rebuild the working frame: the two classes left untouched plus the
# oversampled attack rows.
df = pd.concat([normal_flow, r_u_dead_attacks, oversampling_result])

### Undersampling benign

from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Re-split the rebuilt frame so each subset reflects the oversampled data.
_label_col = df["LABEL"]
normal_flow = df.loc[_label_col == labels_dic[0]]
syn_attacks = df.loc[_label_col == labels_dic[1]]
r_u_dead_attacks = df.loc[_label_col == labels_dic[2]]
dos_attacks = df.loc[_label_col == labels_dic[3]]

In [None]:
# Pool the SYN-scan rows with the normal traffic; this pair is the input of
# the undersampling step.
benign_us = pd.concat([syn_attacks, normal_flow])

In [None]:
# Features (X) and target (y) for the undersampling step.
X = benign_us.drop(columns=["LABEL"])
y = benign_us["LABEL"]

In [None]:
# Undersampler with a fixed seed so the selection is repeatable across runs.
rus = RandomUnderSampler(random_state=0)

In [None]:
# Undersample features and labels together with `rus`.
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Show the per-class row counts after undersampling.
print('Resampled dataset shape %s' % Counter(y_undersampled))

In [None]:
# undersample_res is the very same object as X_undersampled (no copy is
# made), so the LABEL column shows up under both names.
undersample_res = X_undersampled
undersample_res["LABEL"] = y_undersampled

In [None]:
# Final balanced frame. ignore_index=True gives it a fresh 0..n-1 index, the
# same way the initial part_1/part_2 concat does: the three pieces come from
# separate filtering / resampling steps, so their own index labels may
# repeat and would otherwise be carried into the exported CSV.
df = pd.concat([dos_attacks, r_u_dead_attacks, undersample_res], ignore_index=True)

### Generate balanced csv file

In [None]:
# Reuse the cached balanced CSV when it exists. Otherwise keep the frame that
# was just built in memory and write it out only if generate_dfs allows it.
# (Before, a missing file combined with generate_dfs=False fell through to
# read_csv and raised FileNotFoundError.)
balanced_csv = "dataset/balanced_df.csv"
if os.path.isfile(balanced_csv):
    df = pd.read_csv(balanced_csv)
elif generate_dfs:
    df.to_csv(balanced_csv, index_label=False)


df.shape

## Sampling

In [None]:
# Stratified 5% sample of the balanced frame, cached on disk.
# - If the cached CSV exists it is reused as-is.
# - Otherwise the sample is drawn in memory and written out only when
#   generate_dfs is True. (Before, a missing file combined with
#   generate_dfs=False fell through to read_csv and raised
#   FileNotFoundError.)
sampled_csv = "dataset/sampled_df.csv"
if os.path.isfile(sampled_csv):
    df = pd.read_csv(sampled_csv)
else:
    # GroupBy.sample draws frac of the rows inside each LABEL group and
    # returns them with the LABEL column intact, which does not depend on how
    # the installed pandas version treats grouping columns in apply().
    # The seed makes the subset reproducible.
    df = df.groupby('LABEL').sample(frac=0.05, random_state=0)
    if generate_dfs:
        df.to_csv(sampled_csv, index_label=False)

df.shape