# Homework 3 - Deep Learning
***Author: Christopher Wagner***

In [2]:
# Import libraries...
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from pandas import Series
from sklearn.preprocessing import StandardScaler

### Import Dataset

In [3]:
# To train on the full CICIoT2023 dataset instead, concatenate every CSV in the
# dataset directory. NOTE: this snippet needs `import os`, which the imports
# cell does not currently provide.
# DATASET_DIRECTORY = "../datasets/CICIoT2023/"
# data_files = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
# df = pd.concat([pd.read_csv(DATASET_DIRECTORY + filename) for filename in data_files])

# Using smaller dataset for testing purposes, has 1_048_575 rows
DATASET_FILE = "../datasets/IoT_Intrusion.csv"
df = pd.read_csv(DATASET_FILE)

# Define the features and target columns.
# The feature list is derived by *name* (everything except the target) rather
# than by position, so the target can never leak into X and no feature is
# silently dropped if the column order of the CSV changes.
y_column = 'label'
X_columns = [column for column in df.columns if column != y_column]

X = df[X_columns]
y = df[y_column]

### Define Binary Classification Labels

In [5]:
# Lookup table from each raw traffic label to its binary class: every label is
# an 'Attack' except 'BenignTraffic', which is 'Benign'.
dict_2classes = {
    # DDoS
    'DDoS-RSTFINFlood': 'Attack',
    'DDoS-PSHACK_Flood': 'Attack',
    'DDoS-SYN_Flood': 'Attack',
    'DDoS-UDP_Flood': 'Attack',
    'DDoS-TCP_Flood': 'Attack',
    'DDoS-ICMP_Flood': 'Attack',
    'DDoS-SynonymousIP_Flood': 'Attack',
    'DDoS-ACK_Fragmentation': 'Attack',
    'DDoS-UDP_Fragmentation': 'Attack',
    'DDoS-ICMP_Fragmentation': 'Attack',
    'DDoS-SlowLoris': 'Attack',
    'DDoS-HTTP_Flood': 'Attack',
    # DoS
    'DoS-UDP_Flood': 'Attack',
    'DoS-SYN_Flood': 'Attack',
    'DoS-TCP_Flood': 'Attack',
    'DoS-HTTP_Flood': 'Attack',
    # Mirai
    'Mirai-greeth_flood': 'Attack',
    'Mirai-greip_flood': 'Attack',
    'Mirai-udpplain': 'Attack',
    # Reconnaissance
    'Recon-PingSweep': 'Attack',
    'Recon-OSScan': 'Attack',
    'Recon-PortScan': 'Attack',
    'VulnerabilityScan': 'Attack',
    'Recon-HostDiscovery': 'Attack',
    # Spoofing
    'DNS_Spoofing': 'Attack',
    'MITM-ArpSpoofing': 'Attack',
    # Benign traffic (the only non-attack label)
    'BenignTraffic': 'Benign',
    # Web-based
    'BrowserHijacking': 'Attack',
    'Backdoor_Malware': 'Attack',
    'XSS': 'Attack',
    'Uploading_Attack': 'Attack',
    'SqlInjection': 'Attack',
    'CommandInjection': 'Attack',
    # Brute force
    'DictionaryBruteForce': 'Attack',
}

def map_label_to_binary_classes(y: Series):
    """Translate raw labels into the binary 'Attack' / 'Benign' classes.

    Each label is looked up in the module-level ``dict_2classes`` table.

    Args:
        y (Series): The raw labels, in row order.

    Returns:
        list: One binary class name per input label, in the same order.

    Raises:
        KeyError: If a label has no entry in ``dict_2classes``.
    """
    lookup = dict_2classes.__getitem__
    return list(map(lookup, y))

### Define Multi-Class Classification Labels

In [6]:
# Lookup table from each raw traffic label to one of the 8 traffic families.
# It is written as family -> member labels and then inverted, so each family's
# membership can be read in one place.
dict_8classes = {
    label: family
    for family, labels in {
        'DDoS': (
            'DDoS-RSTFINFlood',
            'DDoS-PSHACK_Flood',
            'DDoS-SYN_Flood',
            'DDoS-UDP_Flood',
            'DDoS-TCP_Flood',
            'DDoS-ICMP_Flood',
            'DDoS-SynonymousIP_Flood',
            'DDoS-ACK_Fragmentation',
            'DDoS-UDP_Fragmentation',
            'DDoS-ICMP_Fragmentation',
            'DDoS-SlowLoris',
            'DDoS-HTTP_Flood',
        ),
        'DoS': (
            'DoS-UDP_Flood',
            'DoS-SYN_Flood',
            'DoS-TCP_Flood',
            'DoS-HTTP_Flood',
        ),
        'Mirai': (
            'Mirai-greeth_flood',
            'Mirai-greip_flood',
            'Mirai-udpplain',
        ),
        'Recon': (
            'Recon-PingSweep',
            'Recon-OSScan',
            'Recon-PortScan',
            'VulnerabilityScan',
            'Recon-HostDiscovery',
        ),
        'Spoofing': (
            'DNS_Spoofing',
            'MITM-ArpSpoofing',
        ),
        'Benign': (
            'BenignTraffic',
        ),
        'Web': (
            'BrowserHijacking',
            'Backdoor_Malware',
            'XSS',
            'Uploading_Attack',
            'SqlInjection',
            'CommandInjection',
        ),
        'BruteForce': (
            'DictionaryBruteForce',
        ),
    }.items()
    for label in labels
}

def map_label_to_8_classes(y: Series):
    """Translate raw labels into the 8 traffic-family classes.

    Each label is looked up in the module-level ``dict_8classes`` table.

    Args:
        y (Series): The raw labels, in row order.

    Returns:
        list: One family name per input label, in the same order.

    Raises:
        KeyError: If a label has no entry in ``dict_8classes``.
    """
    lookup = dict_8classes.__getitem__
    return list(map(lookup, y))

### Split the Dataset into Training and Test Sets

In [7]:
# Partition the data: 80% of the rows go to training and the remainder is held
# out for testing. The fixed seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

### Pre-Process the Data

In [8]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions so the test set never influences the
# scaling statistics.
sc = StandardScaler().fit(X_train)

X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### Multi-Layer Perceptron (MLP) Model

In [14]:
def create_mlp_grid_search(activation_functions: list, random_state: int = 42):
    """Creates a grid search for the MLPClassifier.

    The search covers every combination of four L2 penalties, the supplied
    activation functions and three hidden-layer layouts, scored by accuracy.

    Args:
        activation_functions (list): Candidate values for the MLPClassifier
            ``activation`` hyper-parameter (e.g. ``['tanh', 'relu']``).
        random_state (int): Seed forwarded to the MLPClassifier so that weight
            initialisation, the early-stopping validation split and batch
            shuffling are repeatable between runs. Defaults to 42.

    Returns:
        GridSearchCV: The (unfitted) grid search.
    """
    mlpc = MLPClassifier(
        solver='adam',
        learning_rate_init=0.01,
        max_iter=300,
        # Stop when the score on an internally held-out validation split stops
        # improving, instead of always running all max_iter epochs.
        early_stopping=True,
        verbose=1,
        # Without a seed every fit starts from different random weights and a
        # different validation split, so the selected hyper-parameters and the
        # reported accuracy would change from run to run.
        random_state=random_state,
    )

    param_grid = {
        # L2 penalty (regularization term) parameter. i.e [0.1, 0.01, 0.001, 0.0001]
        'alpha': 10.0 ** -np.arange(1, 5),

        'activation': activation_functions,

        'hidden_layer_sizes': [
            # number of hidden neurons should be between the size
            # of the input layer and the size of the output layer.
            (47, 23, 12, 7),  # width roughly halves at each layer

            # number of hidden neurons should be 2/3 the size of
            # the input layer, plus the size of the output layer.
            (38, 32, 28, 25, 23, 22),

            # number of hidden neurons should be less than twice
            # the size of the input layer.
            (94, 47, 23, 12, 7)  # width roughly halves at each layer
        ],
    }

    # n_jobs=-1 runs the candidate fits in parallel on all available cores.
    return GridSearchCV(mlpc, param_grid, scoring='accuracy', n_jobs=-1, verbose=3)

### Binary Classification Model

In [15]:
# Run the hyper-parameter search against the binary (Attack / Benign) targets
# and show the winning combination.
y_train_binary = map_label_to_binary_classes(y_train)

binary_grid_search = create_mlp_grid_search(['logistic', 'tanh'])
binary_grid_search.fit(X_train, y_train_binary)

binary_grid_search.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Iteration 1, loss = 0.02941278
Validation score: 0.991035
Iteration 2, loss = 0.01730797
Validation score: 0.992716
Iteration 3, loss = 0.01657170
Validation score: 0.992633
Iteration 4, loss = 0.01615270
Validation score: 0.992895
Iteration 5, loss = 0.01612154
Validation score: 0.993086
Iteration 6, loss = 0.01595642
Validation score: 0.992692
Iteration 7, loss = 0.01593939
Validation score: 0.992430
Iteration 8, loss = 0.01595209
Validation score: 0.992883
Iteration 9, loss = 0.01589978
Validation score: 0.993348
Iteration 10, loss = 0.01567903
Validation score: 0.992728
Iteration 11, loss = 0.01567733
Validation score: 0.992537
Iteration 12, loss = 0.01560748
Validation score: 0.992764
Iteration 13, loss = 0.01580669
Validation score: 0.993074
Iteration 14, loss = 0.01577090
Validation score: 0.993539
Iteration 15, loss = 0.01570535
Validation score: 0.993610
Iteration 16, loss = 0.01567015
Validation score: 0.993443
Ite

{'activation': 'logistic',
 'alpha': 0.0001,
 'hidden_layer_sizes': (47, 23, 12, 7)}

In [16]:
# Predicting the Test set results
y_pred = binary_grid_search.predict(X_test)

# Accuracy is the fraction of predictions that match the true binary labels.
# It is computed from the predictions already in hand; calling `score` here
# would run the network over X_test a second time to produce the same number.
accuracy = float(np.mean(y_pred == np.asarray(map_label_to_binary_classes(y_test))))
accuracy

0.9939107836826169

### Multi-Class Classification Experiment

In [17]:
# Run the hyper-parameter search against the 8-class targets and show the
# winning combination.
y_train_8_classes = map_label_to_8_classes(y_train)

multi_grid_search = create_mlp_grid_search(['tanh', 'relu'])
multi_grid_search.fit(X_train, y_train_8_classes)

multi_grid_search.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Iteration 1, loss = 0.38649299
Validation score: 0.841046
Iteration 2, loss = 0.29116265
Validation score: 0.973011
Iteration 3, loss = 0.20832354
Validation score: 0.933922
Iteration 4, loss = 0.18097904
Validation score: 0.968052
Iteration 5, loss = 0.19592880
Validation score: 0.964106
Iteration 6, loss = 0.21177554
Validation score: 0.981594
Iteration 7, loss = 0.22095671
Validation score: 0.954367
Iteration 8, loss = 0.21973654
Validation score: 0.939012
Iteration 9, loss = 0.23377279
Validation score: 0.975836
Iteration 10, loss = 0.26777093
Validation score: 0.973834
Iteration 11, loss = 0.24847950
Validation score: 0.975419
Iteration 12, loss = 0.23559799
Validation score: 0.954212
Iteration 13, loss = 0.25460708
Validation score: 0.916458
Iteration 14, loss = 0.20596957
Validation score: 0.967158
Iteration 15, loss = 0.26450168
Validation score: 0.982357
Iteration 16, loss = 0.23770274
Validation score: 0.925971
Ite

{'activation': 'tanh', 'alpha': 0.1, 'hidden_layer_sizes': (47, 23, 12, 7)}

In [18]:
# Predicting the Test set results
y_pred = multi_grid_search.predict(X_test)

# Accuracy is the fraction of predictions that match the true 8-class labels.
# It is computed from the predictions already in hand; calling `score` here
# would run the network over X_test a second time to produce the same number.
accuracy = float(np.mean(y_pred == np.asarray(map_label_to_8_classes(y_test))))
accuracy

0.9829912023460411