<a href="https://colab.research.google.com/github/Frosk-Kristian/COMP6002-Group10-Models/blob/develop/COMP6002_Group_Project_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COMP6002 Computer Science Project - Group 10
Utilising Machine Learning to detect DDoS attacks.

# Setup

In [1]:
import pandas as pd
print("Using Pandas version: {}".format(pd.__version__))

Using Pandas version: 2.0.3


In [4]:
from sklearn import __version__ as skl_ver
print("Using Sklearn version: {}".format(skl_ver))

Using Sklearn version: 1.2.2


## Import & Export Trained Models

In [5]:
# helper functions for exporting/importing models trained with Sklearn, do not attempt to use with Neural Network as the library used has its' own methods for exporting/importing
# all functions defined will prompt the user for confirmation, to allow for skipping the functions when running the notebook
import joblib
print("Using Joblib version: {}".format(joblib.__version__))

# Save a trained model to the provided filepath
def SaveSKL(model, model_fpath):
  """
  Exports a trained model via joblib.

  Parameters:
    model (object): model to be saved.
    model_fpath (string): file path that the model will be saved in.
  Returns:
    : no value returned.
  """
  to_save = ""
  while to_save.lower() not in ('y', 'n'):
    to_save = input("Do you wish to save the trained model? (y/n)\n")
    if to_save.lower() in 'y':
      print("Saving model to: {}".format(model_fpath))
      try:
        joblib.dump(model, model_fpath)
        print("SUCCESS: Model saved to {}".format(model_fpath))
      except:
        print("ERROR: An unknown error has occured when calling joblib.dump()!")
    else:
      if to_save.lower() in 'n':
        print("Did not save model.")
      else:
        print("Please only enter \'y\' to save model or \'n\' to skip saving.")

# Load a trained model from the provided filepath
def LoadSKL(model_fpath):
  """
  Import a trained model via joblib.

  Parameters:
    model_fpath (string): file path to the stored model.
  Returns:
    object: if a model is found and loaded correctly, returns an object.
    None: if no matching file is found or an error occurs during loading, returns None.
  """
  to_load = ""
  while to_load.lower() not in ('y', 'n'):
    to_load = input("Do you wish to import a trained model? {y/n)\n")
    if to_load.lower() in 'y':
      model = None
      print("Attempting to import model from: {}".format(model_fpath))
      try:
        model = joblib.load(model_fpath)
        print("SUCCESS: Model successfully imported.")
      except FileNotFoundError:
        print("ERROR: The file \'{}\' does not exist!".format(model_fpath))
        model = None
      except:
        print("ERROR: An unknown error has occured when calling joblib.load()!")
        model = None
      finally:
        return model
    else:
      if to_load.lower() in 'n':
        print("Did not import model.")
        return None
      else:
        print("Please only enter \'y\' to import model or \'n\' to skip importing.")

# To-do: write function that exports model parameters, evaluation metrics, etc.

Using Joblib version: 1.4.0


# Import Data
Checks current working directory for datasets, if datasets are missing downloads a [.zip archive mirror of the CiCDDoS2019 hosted on Kaggle](https://www.kaggle.com/datasets/kristianfrossos/cicddos2019/data).

**NOTE:** the first part of this section is specific to Google Colab, and will not work outside of it. Advise writing an alternative later for local use (relevant when training Neural Network for speed and when usage limits get in the way).

## Reference
Iman Sharafaldin, Arash Habibi Lashkari, Saqib Hakak, and Ali A. Ghorbani, "Developing Realistic Distributed Denial of Service (DDoS) Attack Dataset and Taxonomy", IEEE 53rd International Carnahan Conference on Security Technology, Chennai, India, 2019.


In [6]:
!pip -q install --upgrade --force-reinstall --no-deps kaggle

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/79.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.7/79.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone


In [7]:
# Sets up dataset directory
import os

In [8]:
# mounts google drive
from google.colab import drive
drive.mount("/content/drive")

# directory that the dataset will be downloaded to
dl_dir = os.getcwd() + r'/drive/MyDrive/Colab Notebooks/COMP6002_Group10_Data'

if os.path.exists(dl_dir):
  print("Directory {} already exists.\n".format(dl_dir))
else:
  os.mkdir(dl_dir)
  print("Successfully created the directory {}".format(dl_dir))

Mounted at /content/drive
Directory /content/drive/MyDrive/Colab Notebooks/COMP6002_Group10_Data already exists.



In [9]:
# sets up kaggle environment variables (needed to access API)
from google.colab import userdata
from google.colab import files

# checks if kaggle key and username have been provided as secrets and sets environment variables appropriately
# if not found, attempts to use kaggle.json
try:
  os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
  os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
  print("Using KAGGLE_KEY and KAGGLE_USERNAME defined in secrets.")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
  print("WARN: One or more secret(s) missing or inaccessible.\n")
  if os.path.isfile('~/.kaggle/kaggle.json'):
    print("Using existing kaggle.json")
  else:
    print("Please upload kaggle.json")
    files.upload()

    if os.path.isfile(os.getcwd() + '/content/kaggle.json'):
      !rm -r ~/.kaggle
      !mkdir ~/.kaggle
      !mv ./kaggle.json ~/.kaggle/
      !chmod 600 ~/.kaggle/kaggle.json
    else:
      print("\'kaggle.json\' not uploaded.")
      raise

Using KAGGLE_KEY and KAGGLE_USERNAME defined in secrets.


In [10]:
data_dir = os.getcwd() + '/dataset'

# checks if .zip archive containing dataset already exists in google drive and downloads it if necessary
if os.path.isfile(dl_dir + '/cicddos2019.zip'):
  print("Dataset already present.")
else:
  print("Downloading zipped dataset to {}".format(dl_dir))
  !kaggle datasets download kristianfrossos/cicddos2019 -p {dl_dir.replace(' ', '\ ')}

# creates the content/dataset directory if it doesn't already exist
if os.path.exists(data_dir):
  print("Directory {} already exists.\n".format(data_dir))
else:
  print("Created directory: {}\n".format(data_dir))
  os.mkdir(data_dir)

# extracts contents of .zip archive to content/dataset if directory is not empty
if not os.listdir(data_dir):
  print("Empty directory, extracting dataset.")
  # unzips .zip archive
  !unzip {dl_dir.replace(' ', '\ ') + '/cicddos2019.zip'} -d {data_dir}
else:
  print("Non-empty directory, skipping download")

Dataset already present.
Created directory: /content/dataset

Empty directory, extracting dataset.
Archive:  /content/drive/MyDrive/Colab Notebooks/COMP6002_Group10_Data/cicddos2019.zip
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_DNS.csv  
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_LDAP.csv  
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_MSSQL.csv  
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_NTP.csv  
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_NetBIOS.csv  
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_SNMP.csv  
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_SSDP.csv  
  inflating: /content/dataset/CSV-01-12/01-12/DrDoS_UDP.csv  
  inflating: /content/dataset/CSV-01-12/01-12/Syn.csv  
  inflating: /content/dataset/CSV-01-12/01-12/TFTP.csv  
  inflating: /content/dataset/CSV-01-12/01-12/UDPLag.csv  
  inflating: /content/dataset/CSV-03-11/03-11/LDAP.csv  
  inflating: /content/dataset/CSV-03-11/03-11/MSSQL.csv  
  inflating: /content/

In [12]:
# initialises empty list
csv_list = []

# iterates through all subdirectories of /content/dataset and appends the filepath of each .csv to csv_list
for root, dirs, files in os.walk(data_dir):
  for f in files:
    if f.endswith(".csv"):
      csv_list.append(os.path.join(root, f))

# if .csv files were found, displays number of files and prints each path
if not csv_list:
  print("No .csv files found!")
else:
  print("{} .csv files found.".format(len(csv_list)))
  for csv in csv_list:
    print(csv)

18 .csv files found.
/content/dataset/CSV-01-12/01-12/DrDoS_MSSQL.csv
/content/dataset/CSV-01-12/01-12/DrDoS_DNS.csv
/content/dataset/CSV-01-12/01-12/DrDoS_SNMP.csv
/content/dataset/CSV-01-12/01-12/DrDoS_LDAP.csv
/content/dataset/CSV-01-12/01-12/DrDoS_NetBIOS.csv
/content/dataset/CSV-01-12/01-12/DrDoS_NTP.csv
/content/dataset/CSV-01-12/01-12/TFTP.csv
/content/dataset/CSV-01-12/01-12/Syn.csv
/content/dataset/CSV-01-12/01-12/DrDoS_SSDP.csv
/content/dataset/CSV-01-12/01-12/DrDoS_UDP.csv
/content/dataset/CSV-01-12/01-12/UDPLag.csv
/content/dataset/CSV-03-11/03-11/NetBIOS.csv
/content/dataset/CSV-03-11/03-11/Portmap.csv
/content/dataset/CSV-03-11/03-11/UDP.csv
/content/dataset/CSV-03-11/03-11/Syn.csv
/content/dataset/CSV-03-11/03-11/LDAP.csv
/content/dataset/CSV-03-11/03-11/UDPLag.csv
/content/dataset/CSV-03-11/03-11/MSSQL.csv


In [40]:
ddos_df = pd.DataFrame()
benign_df = pd.DataFrame()

num_samples = 100

for csv in csv_list:
  data_iter = pd.read_csv(csv, chunksize=2000)

  ddos_desired = ddos_df.size + num_samples
  benign_desired = benign_df.size + num_samples
  print("Goal:\n {} DDoS & {} Benign".format(ddos_desired, benign_desired))

  for chunk in data_iter:
    ddos_rows = chunk[chunk[' Label'].str.lower() != 'benign']
    benign_rows = chunk[chunk[' Label'].str.lower() == 'benign']

    sample_size = min(len(ddos_rows), len(benign_rows)) // 100
    ddos_sample = ddos_rows.sample(n=sample_size, random_state=42)
    benign_sample = benign_rows.sample(n=sample_size, random_state=42)

    ddos_df = pd.concat([ddos_df, ddos_sample], ignore_index=True)
    benign_df = pd.concat([benign_df, benign_sample], ignore_index=True)

    if ddos_df.size >= ddos_desired and benign_df.size >= benign_desired:
      print("Goal reached")
      break

Goal:
 100 DDoS & 100 Benign
Goal reached
Goal:
 628 DDoS & 628 Benign
Goal reached
Goal:
 980 DDoS & 980 Benign
Goal reached
Goal:
 1244 DDoS & 1244 Benign
Goal:
 1244 DDoS & 1244 Benign
Goal reached
Goal:
 1508 DDoS & 1508 Benign
Goal reached
Goal:
 1772 DDoS & 1772 Benign
Goal reached
Goal:
 2388 DDoS & 2388 Benign
Goal:
 2388 DDoS & 2388 Benign
Goal:
 2388 DDoS & 2388 Benign
Goal reached
Goal:
 2564 DDoS & 2564 Benign
Goal reached
Goal:
 2828 DDoS & 2828 Benign
Goal:
 2828 DDoS & 2828 Benign
Goal reached
Goal:
 3708 DDoS & 3708 Benign
Goal reached
Goal:
 4236 DDoS & 4236 Benign
Goal reached
Goal:
 4764 DDoS & 4764 Benign
Goal reached
Goal:
 5556 DDoS & 5556 Benign
Goal reached
Goal:
 5732 DDoS & 5732 Benign
Goal reached


In [85]:
# concatenates ddos and benign dataframes into one subset
subset = pd.concat([ddos_df, benign_df], ignore_index=True)

# class weights
weights = subset.value_counts(' Label', normalize=True)

# prints classes and their weights
print("     Class      |     Weight")
for index in weights.index:
  print(f'{index:<15} | {(weights[index] * 100.0):n}%')

# prints total (for error checking)
print(f'Total: {sum(weights.values) * 100.0}%')

     Class      |     Weight
BENIGN          | 50%
Portmap         | 7.46269%
NetBIOS         | 6.71642%
MSSQL           | 5.97015%
TFTP            | 5.22388%
DrDoS_MSSQL     | 4.47761%
Syn             | 4.47761%
DrDoS_DNS       | 2.98507%
DrDoS_NTP       | 2.23881%
DrDoS_NetBIOS   | 2.23881%
DrDoS_SNMP      | 2.23881%
DrDoS_UDP       | 1.49254%
UDP             | 1.49254%
UDP-lag         | 1.49254%
LDAP            | 0.746269%
WebDDoS         | 0.746269%
Total: 100.0%


## Data Preprocessing
**To-Do:**
*   Substitute missing values with the mean of their respective columns.
    *  Not feasible with size of dataset.
    *  Break dataset into several batches and preprocess per batch?
*   Encode categorical columns.
*   Normalise dataset.

## Dropped Features
*   Unnamed: 0: unknown feature.
*   Flow Id: constructed from Source Ip, Destination Ip, Source Port, Destination Port and Protocol.

In [86]:
from sklearn.model_selection import train_test_split
import ipaddress

# drops irrelevant columns
subset.drop(columns = ['Unnamed: 0', 'Flow ID'],
            inplace = True)

# drop columns with at least 50% missing values
subset.dropna(axis = 1,
              thresh = int(0.5 * subset.shape[0]),
              inplace = True)

# drop duplicate rows
subset.drop_duplicates(inplace = True)

# converts source and destination IP addresses to useable integer values
subset['Source IP_int'] = subset.apply(lambda x: int (ipaddress.IPv4Address(x[' Source IP'])), axis=1)
subset['Destination IP_int'] = subset.apply(lambda x: int (ipaddress.IPv4Address(x[' Destination IP'])), axis=1)

# converts date and time values to unix timestamps
subset['UnixTimestamp'] = subset.apply(lambda x: (pd.to_datetime(x[' Timestamp']).timestamp()), axis=1)

# drops original columns
subset.drop(columns = [' Source IP', ' Destination IP', ' Timestamp'],
            inplace = True)

# save processed subset to .csv
subset.to_csv((dl_dir + '/COMP6002_Processed_Subset.csv'), index = False)

# Random Forest

In [None]:
# build random forest classifier model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state = 42)

# grid of parameters to search through when performing cross validation
rf_params = {
    'class_weight' : ['balanced']
}

# tests all permutations of the parameters outline in rf_params, returns the best performing model
rf_model = GridSearchCV(estimator = rf,
                        param_grid = rf_params,
                        scoring = ["accuracy", "f1_weighted", "roc_auc_ovr"],
                        refit = "f1_weighted",
                        cv = 5,
                        verbose = 3,
                        return_train_score = True)

# rf_model.fit(x_train, y_train)

# Neural Network

## NN Setup
Import and install required libraries, sets some initial values.

In [None]:
# import PyTorch and confirm version
import torch
from torch import nn
print("Using PyTorch version: {}".format(torch.__version__))

# check the availability of and set the device
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print("Using {} device.".format(device))

Using PyTorch version: 2.2.1+cu121

Using cpu device.


In [None]:
# install Skorch, providing a wrapper for using PyTorch with Sklearn
!pip install skorch

# import Skorch and confirm version
from skorch import __version__ as skorch_version
from skorch import NeuralNetClassifier
print("Using Skorch version: {}".format(skorch_version))



## Define Neural Network
Currently using a Multilayer Perceptron (MLP), consider swapping to a hybrid model of a MLP and Convolutional Neural Network (CNN) later.

In [None]:
class NN_MLP(nn.Module):
  """Class that defines a multilayer perceptron model."""
  def __init__(self, input_size, hidden_size, output_size):
    """
    Construct a new NN_MLP object.

    Parameters:
      input_size (int): number of inputs to the input layer.
      hidden_size (int): number of inputs to the hidden layer(s).
      output_size (int): number of outputs from the output layer, typically 1.
    Returns:
      : no value returned.
    """
    super(NN_MLP, self).__init__()
    # layers
    self.h1 = nn.Linear(input_size, hidden_size)
    self.h2 = nn.Linear(hidden_size, hidden_size)
    self.output = nn.Linear(hidden_size, output_size)
    # activation functions
    self.relu = nn.ReLu()

  def forward(self, X):
    """
    Parameters:
      X (Any): features to make prediction on.
    Returns:
      Any: predicted value.
    """
    out = self.h1(X)
    out = self.relu(out)
    out = self.h2(out)
    out = self.relu(out)
    out = self.output(out)

    return out