# **DDos Data Preprocessing**
---

## **Imports and Configuration**
---

In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import warnings

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, Normalizer, OneHotEncoder, QuantileTransformer, StandardScaler

In [2]:
# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
# Input data path
dataset_name = '75-20-05-udplag.syn'

input_path = os.path.join('data/clean', dataset_name)
encoders_path = os.path.join(input_path, 'encoders')
stats_path = os.path.join(input_path, 'stats')
balanced_smote_path = os.path.join(input_path, 'split-sets-balanced-smote')
unbalanced_path = os.path.join(input_path, 'split-sets-unbalanced')

## **Load Data**
---

In [4]:
file_name = 'ddos-data-clean.csv'

df = pd.read_csv(os.path.join(input_path, file_name))
df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,17,28415,97,0,42680.0,0.0,440.0,440.0,440.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DNS/LDAP
1,17,2,2,0,880.0,0.0,440.0,440.0,440.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DNS/LDAP
2,17,48549,200,0,88000.0,0.0,440.0,440.0,440.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DNS/LDAP
3,17,48337,200,0,88000.0,0.0,440.0,440.0,440.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DNS/LDAP
4,17,32026,200,0,88000.0,0.0,440.0,440.0,440.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DNS/LDAP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8852836,6,1,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Syn/UDPLag
8852837,6,1,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Syn/UDPLag
8852838,6,105,2,2,12.0,12.0,6.0,6.0,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Syn/UDPLag
8852839,6,1,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Syn/UDPLag


In [5]:
# Show label distribution
df['Label'].value_counts()

Label
Syn/UDPLag         1764050
NetBIOS/Portmap    1178457
SSDP/UDP           1114215
SNMP                991717
NTP                 978664
TFTP                977602
MSSQL               968182
DNS/LDAP            823020
BENIGN               56934
Name: count, dtype: int64

## **Split Data (train/val/test), generate Encoders (Label and One-Hot) and get Train Normalize Stats**
---

In [6]:
# Shuffle data
df = df.sample(frac=1)

# Get (X, y) from dataframe
X = df.drop(columns=['Label'])
y = df['Label']

# Show (X, y) shapes
print(f'X.shape = {X.shape}')
print(f'y.shape = {y.shape}\n')

# Get unique labels
unique_labels = y.unique()
print(f'Unique labels = {unique_labels}')

X.shape = (8852841, 78)
y.shape = (8852841,)

Unique labels = ['SSDP/UDP' 'TFTP' 'Syn/UDPLag' 'NetBIOS/Portmap' 'SNMP' 'MSSQL' 'NTP'
 'DNS/LDAP' 'BENIGN']


In [7]:
# Create and save label encoder for target labels
label_encoder = LabelEncoder()
label_encoder.fit(y)
with open(os.path.join(encoders_path, 'label-encoder.pkl'), 'wb') as file:
    pickle.dump(label_encoder, file)

# Create and save one-hot encoder for target labels
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(y.values.reshape(-1, 1))
with open(os.path.join(encoders_path, 'onehot-encoder.pkl'), 'wb') as file:
    pickle.dump(onehot_encoder, file)

In [8]:
# Split data in train and val-test data
X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, test_size=0.25)

# Split val-test in val and test data
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, test_size=0.2)

# Show train, dev and test shapes
print(f'X_train.shape = {X_train.shape}')
print(f'y_train.shape = {y_train.shape}')
print(f'X_val.shape = {X_val.shape}')
print(f'y_val.shape = {y_val.shape}')
print(f'X_test.shape = {X_test.shape}')
print(f'y_test.shape = {y_test.shape}')

X_train.shape = (6639630, 78)
y_train.shape = (6639630,)
X_val.shape = (1770568, 78)
y_val.shape = (1770568,)
X_test.shape = (442643, 78)
y_test.shape = (442643,)


In [9]:
# Save train, val and test datasets
X_train.to_csv(os.path.join(unbalanced_path, 'X_train.csv'), index=False)
y_train.to_csv(os.path.join(unbalanced_path, 'y_train.csv'), index=False)
X_val.to_csv(os.path.join(unbalanced_path, 'X_val.csv'), index=False)
y_val.to_csv(os.path.join(unbalanced_path, 'y_val.csv'), index=False)
X_test.to_csv(os.path.join(unbalanced_path, 'X_test.csv'), index=False)
y_test.to_csv(os.path.join(unbalanced_path, 'y_test.csv'), index=False)

In [4]:
# Load X train (Skip if generating the sets)
X_train = pd.read_csv(os.path.join(unbalanced_path, 'X_train.csv'))

In [10]:
# Create and save standard scaler from train data
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
with open(os.path.join(stats_path, 'standard-scaler.pkl'), 'wb') as file:
    pickle.dump(standard_scaler, file)

In [11]:
# Create and save L2 normalizer from train data
l2_normalizer = Normalizer()
l2_normalizer.fit(X_train)
with open(os.path.join(stats_path, 'l2-normalizer.pkl'), 'wb') as file:
    pickle.dump(l2_normalizer, file)

# Create and save quantile transformer from train data
quantile_transformer = QuantileTransformer(output_distribution = 'normal')
quantile_transformer.fit(X_train)
with open(os.path.join(stats_path, 'quantile-transformer.pkl'), 'wb') as file:
    pickle.dump(quantile_transformer, file)

In [12]:
y_train.value_counts()

Label
Syn/UDPLag         1324091
NetBIOS/Portmap     884306
SSDP/UDP            834979
SNMP                743327
NTP                 734271
TFTP                732465
MSSQL               725683
DNS/LDAP            617957
BENIGN               42551
Name: count, dtype: int64

## **Apply SMOTE and Normalize Data (train/val/test)**

In [4]:
# Load Data (Skip if generating the sets)
X_train = pd.read_csv(os.path.join(unbalanced_path, 'X_train.csv'))
y_train = pd.read_csv(os.path.join(unbalanced_path, 'y_train.csv'))
X_val = pd.read_csv(os.path.join(unbalanced_path, 'X_val.csv'))
y_val = pd.read_csv(os.path.join(unbalanced_path, 'y_val.csv'))
X_test = pd.read_csv(os.path.join(unbalanced_path, 'X_test.csv'))
y_test = pd.read_csv(os.path.join(unbalanced_path, 'y_test.csv'))

In [13]:
# SMOTE over train set
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)

In [14]:
y_train.value_counts()

Label
Syn/UDPLag         1324091
DNS/LDAP           1324091
NetBIOS/Portmap    1324091
MSSQL              1324091
SNMP               1324091
SSDP/UDP           1324091
NTP                1324091
TFTP               1324091
BENIGN             1324091
Name: count, dtype: int64

In [15]:
y_train.value_counts().sum()

np.int64(11916819)

In [18]:
# Normalize X data
# X_train = standard_scaler.transform(X_train)
# X_val = standard_scaler.transform(X_val)
# X_test = standard_scaler.transform(X_test)

In [16]:
# Show train, dev and test shapes
print(f'X_train.shape = {X_train.shape}')
print(f'y_train.shape = {y_train.shape}')
print(f'X_val.shape = {X_val.shape}')
print(f'y_val.shape = {y_val.shape}')
print(f'X_test.shape = {X_test.shape}')
print(f'y_test.shape = {y_test.shape}')

X_train.shape = (11916819, 78)
y_train.shape = (11916819,)
X_val.shape = (1770568, 78)
y_val.shape = (1770568,)
X_test.shape = (442643, 78)
y_test.shape = (442643,)


In [29]:
# Obtain column names from original dataset
# column_names = list(df.columns)
# column_names.remove('Label')

# Create dataframes from X data
# X_train = pd.DataFrame(X_train, columns=column_names)
# X_val = pd.DataFrame(X_val, columns=column_names)
# X_test = pd.DataFrame(X_test, columns=column_names)

In [17]:
# Save train, val and test datasets
X_train.to_csv(os.path.join(balanced_smote_path, 'X_train.csv'), index=False)
y_train.to_csv(os.path.join(balanced_smote_path, 'y_train.csv'), index=False)
X_val.to_csv(os.path.join(balanced_smote_path, 'X_val.csv'), index=False)
y_val.to_csv(os.path.join(balanced_smote_path, 'y_val.csv'), index=False)
X_test.to_csv(os.path.join(balanced_smote_path, 'X_test.csv'), index=False)
y_test.to_csv(os.path.join(balanced_smote_path, 'y_test.csv'), index=False)