<a href="https://colab.research.google.com/github/Frosk-Kristian/COMP6002-Group10-Models/blob/develop/COMP6002_Group_Project_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COMP6002 Computer Science Project - Group 10
Utilising Machine Learning to detect DDoS attacks.

## Reference
Iman Sharafaldin, Arash Habibi Lashkari, Saqib Hakak, and Ali A. Ghorbani, "Developing Realistic Distributed Denial of Service (DDoS) Attack Dataset and Taxonomy", IEEE 53rd International Carnahan Conference on Security Technology, Chennai, India, 2019.

# Setup
Run all of these first. Sets up libraries and directories used throughout notebook.

In [None]:
import pandas as pd
print(f"Using Pandas version: {pd.__version__}")

Using Pandas version: 2.0.3


In [None]:
import numpy as np
print(f"Using Numpy version: {np.__version__}")

Using Numpy version: 1.25.2


In [None]:
from sklearn import __version__ as skl_ver
print(f"Using Sklearn version: {skl_ver}")

Using Sklearn version: 1.2.2


In [None]:
# installs CodeCarbon into the current runtime before it is imported below
!pip install codecarbon

from codecarbon import EmissionsTracker
from codecarbon import __version__ as cc_ver
print(f"Using CodeCarbon.io version: {cc_ver}")

# track project emissions
# the tracker is started here and stopped in the final "Emissions" cell via tracker.stop(),
# so this cell has to run before any work that should be included in the measurement
tracker = EmissionsTracker()
tracker.start()

In [None]:
import os
from google.colab import drive

# absolute mount point for Google Drive; the export path below is derived from it
# rather than from os.getcwd(), so it stays correct if the working directory changes
drive_mount = "/content/drive"

# mounts google drive
drive.mount(drive_mount)

# directory that all exports will be stored in
dl_dir = drive_mount + '/MyDrive/Colab Notebooks/COMP6002_Group10_Data'
# directory that the runtime will store the unzipped dataset in (not on your drive)
data_dir = os.getcwd() + '/dataset'

# checks if export directory already exists, if not creates it
if os.path.exists(dl_dir):
  print(f"Directory {dl_dir} already exists.\n")
else:
  # makedirs (unlike mkdir) also creates missing parent folders, e.g. 'Colab Notebooks' on a fresh Drive
  os.makedirs(dl_dir)
  print(f"Successfully created the directory {dl_dir}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory /content/drive/MyDrive/Colab Notebooks/COMP6002_Group10_Data already exists.



## Instance Specs
Gets hardware specifications of the Google Colab instance.

In [1]:
# CPU specifications
from psutil import *

print(f"Number of CPU: {cpu_count()}")

!cat /proc/cpuinfo

Number of CPU: 2
processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa mmio_stale_data retbleed
bogomips	: 4399.99
clflush size	: 64
cache_alig

In [2]:
# RAM specifications (virtual_memory() reports the instance's system memory, not GPU VRAM)
virtual_memory()

svmem(total=13609451520, available=12429377536, percent=8.7, used=873287680, free=8717107200, active=612868096, inactive=4022714368, buffers=344756224, cached=3674300416, shared=1409024, slab=154161152)

In [3]:
# GPU specifications
# NOTE(review): the recorded output is "nvidia-smi: command not found", presumably because
# this run used a CPU-only runtime; confirm the runtime type before relying on this cell
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


## Define Helper Functions

In [None]:
# helper functions for exporting/importing models trained with Sklearn; do not use them with the Neural Network, as the library used for it has its own methods for exporting/importing
# all functions defined will prompt the user for confirmation, to allow for skipping the functions when running the notebook
import joblib
print(f"Using Joblib version: {joblib.__version__}")

# finds an unused file name by inserting a counter between the file name and its extension
def IncrementFname(file_path):
  """
  Builds 'path/to/file(i).extension' from the given path, starting at i = 1 and counting upwards
  until the resulting path does not exist on disk.

  Parameters:
    file_path (string): full file path to increment, including extension.
  Returns:
    string: new file name in the form of 'path/to/file(i).extension'.
  """
  stem, suffix = os.path.splitext(file_path)
  counter = 1

  while True:
    candidate = f'{stem}({counter}){suffix}'
    # the first candidate that is not present on disk is the result
    if not os.path.exists(candidate):
      return candidate
    counter += 1

# Save a trained model to the provided filepath
def SaveSKL(model, model_fpath):
  """
  Exports a trained model via joblib, after asking the user for confirmation.

  The user is prompted until they answer 'y' or 'n' (case-insensitive, surrounding whitespace ignored).
  On 'y' the model is written to model_fpath, or to an incremented name produced by IncrementFname()
  if that file already exists. A failure inside joblib.dump() is reported on stdout, not raised.

  Parameters:
    model (object): model to be saved.
    model_fpath (string): file path that the model will be saved in, including filename and extension.
  Returns:
    : no value returned.
  """
  while True:
    to_save = input("Do you wish to save the trained model? (y/n)\n").strip().lower()

    if to_save == 'n':
      print("Did not save model.")
      return

    # exact comparison on purpose: a substring test such as `to_save in 'y'` is also True for an
    # empty answer, which would save the model when the user just pressed enter
    if to_save != 'y':
      print("Please only enter \'y\' to save model or \'n\' to skip saving.")
      continue

    fpath = model_fpath # assigns value to new variable to allow for reassignment

    if os.path.isfile(fpath):
      print(f"File \'{fpath}\' already exists...")
      fpath = IncrementFname(fpath)
      print(f"...Using the name \'{fpath}\' instead.")

    print(f"Saving model to: {fpath}")
    try:
      joblib.dump(model, fpath)
      print(f"SUCCESS: Model saved to {fpath}")
    except Exception as err:
      # report the actual cause instead of hiding it; KeyboardInterrupt is deliberately not caught
      print(f"ERROR: joblib.dump() failed: {err!r}")
    return

# Load a trained model from the provided filepath
def LoadSKL(model_fpath):
  """
  Import a trained model via joblib, after asking the user for confirmation.

  The user is prompted until they answer 'y' or 'n' (case-insensitive, surrounding whitespace ignored).

  NOTE: joblib.load() unpickles the file, which can execute arbitrary code; only load files you trust.

  Parameters:
    model_fpath (string): file path to the stored model.
  Returns:
    object: if a model is found and loaded correctly, returns an object.
    None: if no matching file is found, an error occurs during loading, or the user declines, returns None.
  """
  while True:
    to_load = input("Do you wish to import a trained model? (y/n)\n").strip().lower()

    if to_load == 'n':
      print("Did not import model.")
      return None

    # exact comparison on purpose: a substring test such as `to_load in 'y'` is also True for an
    # empty answer, which would start loading when the user just pressed enter
    if to_load != 'y':
      print("Please only enter \'y\' to import model or \'n\' to skip importing.")
      continue

    print(f"Attempting to import model from: {model_fpath}")
    try:
      model = joblib.load(model_fpath)
    except FileNotFoundError:
      print(f"ERROR: The file \'{model_fpath}\' does not exist!")
      return None
    except Exception as err:
      # no `return` inside `finally` here: that would also swallow KeyboardInterrupt/SystemExit
      print(f"ERROR: joblib.load() failed: {err!r}")
      return None

    print("SUCCESS: Model successfully imported.")
    return model

# To-do: write function that exports model parameters, evaluation metrics, etc.

Using Joblib version: 1.4.0


# Import Data
Checks the current working directory for the dataset. If it is missing, downloads a [.zip archive mirror of CICDDoS2019 hosted on Kaggle](https://www.kaggle.com/datasets/kristianfrossos/cicddos2019/data).

**NOTE:** the first part of this section is specific to Google Colab, and will not work outside of it. Advise writing an alternative later for local use (relevant when training Neural Network for speed and when usage limits get in the way).


In [None]:
!pip -q install --upgrade --force-reinstall --no-deps kaggle

In [None]:
# sets up kaggle environment variables (needed to access API)
from google.colab import userdata
from google.colab import files

# checks if kaggle key and username have been provided as secrets and sets environment variables appropriately
# if not found, attempts to use kaggle.json
try:
  os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
  os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
  print("Using KAGGLE_KEY and KAGGLE_USERNAME defined in secrets.")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
  print("WARN: One or more secret(s) missing or inaccessible.\n")
  if os.path.isfile('~/.kaggle/kaggle.json'):
    print("Using existing kaggle.json")
  else:
    print("Please upload kaggle.json")
    files.upload()

    if os.path.isfile(os.getcwd() + '/content/kaggle.json'):
      !rm -r ~/.kaggle
      !mkdir ~/.kaggle
      !mv ./kaggle.json ~/.kaggle/
      !chmod 600 ~/.kaggle/kaggle.json
    else:
      print("\'kaggle.json\' not uploaded.")
      raise

Using KAGGLE_KEY and KAGGLE_USERNAME defined in secrets.


In [None]:
# checks if .zip archive containing dataset already exists in google drive and downloads it if necessary
if os.path.isfile(dl_dir + '/cicddos2019.zip'):
  print("Dataset already present.")
else:
  print(f"Downloading zipped dataset to {dl_dir}")
  !kaggle datasets download kristianfrossos/cicddos2019 -p {dl_dir.replace(' ', '\ ')}

# creates the content/dataset directory if it doesn't already exist
if os.path.exists(data_dir):
  print(f"Directory {data_dir} already exists.")
else:
  print(f"Created directory: {data_dir}")
  os.mkdir(data_dir)

# extracts contents of .zip archive to content/dataset if directory is not empty
if not os.listdir(data_dir):
  print("Empty directory, extracting dataset.")
  # unzips .zip archive
  !unzip {dl_dir.replace(' ', '\ ') + '/cicddos2019.zip'} -d {data_dir}
else:
  print("Non-empty directory, skipping download")

Dataset already present.
Directory /content/dataset already exists.
Non-empty directory, skipping download


In [None]:
# initialises empty list
csv_list = []

# iterates through all subdirectories of /content/dataset and appends the filepath of each .csv to csv_list
# the loop variables avoid the name `files`, which would overwrite the `files` module imported from google.colab
for root, _subdirs, fnames in os.walk(data_dir):
  for fname in fnames:
    if fname.endswith(".csv"):
      csv_list.append(os.path.join(root, fname))

# os.walk yields entries in filesystem order; sorting keeps the file order (and therefore the
# row order of the sampled subset) reproducible between runs
csv_list.sort()

# if .csv files were found, displays number of files and prints each path
if not csv_list:
  print("No .csv files found!")
else:
  print(f"{len(csv_list)} .csv files found.")
  for csv_path in csv_list:
    print(csv_path)

18 .csv files found.
/content/dataset/CSV-01-12/01-12/DrDoS_MSSQL.csv
/content/dataset/CSV-01-12/01-12/DrDoS_DNS.csv
/content/dataset/CSV-01-12/01-12/DrDoS_SNMP.csv
/content/dataset/CSV-01-12/01-12/DrDoS_LDAP.csv
/content/dataset/CSV-01-12/01-12/DrDoS_NetBIOS.csv
/content/dataset/CSV-01-12/01-12/DrDoS_NTP.csv
/content/dataset/CSV-01-12/01-12/TFTP.csv
/content/dataset/CSV-01-12/01-12/Syn.csv
/content/dataset/CSV-01-12/01-12/DrDoS_SSDP.csv
/content/dataset/CSV-01-12/01-12/DrDoS_UDP.csv
/content/dataset/CSV-01-12/01-12/UDPLag.csv
/content/dataset/CSV-03-11/03-11/NetBIOS.csv
/content/dataset/CSV-03-11/03-11/Portmap.csv
/content/dataset/CSV-03-11/03-11/UDP.csv
/content/dataset/CSV-03-11/03-11/Syn.csv
/content/dataset/CSV-03-11/03-11/LDAP.csv
/content/dataset/CSV-03-11/03-11/UDPLag.csv
/content/dataset/CSV-03-11/03-11/MSSQL.csv


## Samples Dataset and Creates Subset

In [None]:
# sampling parameters: every chunk of `chunk_size` rows contributes
# min(#ddos rows, #benign rows) // sample_divisor rows of each class
chunk_size = 2000
sample_divisor = 100

# per-chunk samples are collected in lists and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows on every iteration
ddos_samples = []
benign_samples = []

for csv in csv_list:
  data_iter = pd.read_csv(csv, chunksize=chunk_size)
  print(f"Reading {csv}...")

  for chunk in data_iter:
    # label comparison is case-insensitive; computed once and reused for both classes
    is_benign = chunk[' Label'].str.lower() == 'benign'
    ddos_rows = chunk[~is_benign]
    benign_rows = chunk[is_benign]

    # equal number of rows from both classes keeps the subset balanced (benign vs. attack)
    sample_size = min(len(ddos_rows), len(benign_rows)) // sample_divisor
    ddos_samples.append(ddos_rows.sample(n=sample_size, random_state=42))
    benign_samples.append(benign_rows.sample(n=sample_size, random_state=42))

# pd.concat raises on an empty list, so fall back to an empty frame when no chunk was read
ddos_df = pd.concat(ddos_samples, ignore_index=True) if ddos_samples else pd.DataFrame()
benign_df = pd.concat(benign_samples, ignore_index=True) if benign_samples else pd.DataFrame()

Reading /content/dataset/CSV-01-12/01-12/DrDoS_MSSQL.csv...
Reading /content/dataset/CSV-01-12/01-12/DrDoS_DNS.csv...
Reading /content/dataset/CSV-01-12/01-12/DrDoS_SNMP.csv...
Reading /content/dataset/CSV-01-12/01-12/DrDoS_LDAP.csv...
Reading /content/dataset/CSV-01-12/01-12/DrDoS_NetBIOS.csv...
Reading /content/dataset/CSV-01-12/01-12/DrDoS_NTP.csv...
Reading /content/dataset/CSV-01-12/01-12/TFTP.csv...
Reading /content/dataset/CSV-01-12/01-12/Syn.csv...
Reading /content/dataset/CSV-01-12/01-12/DrDoS_SSDP.csv...
Reading /content/dataset/CSV-01-12/01-12/DrDoS_UDP.csv...
Reading /content/dataset/CSV-01-12/01-12/UDPLag.csv...
Reading /content/dataset/CSV-03-11/03-11/NetBIOS.csv...
Reading /content/dataset/CSV-03-11/03-11/Portmap.csv...
Reading /content/dataset/CSV-03-11/03-11/UDP.csv...
Reading /content/dataset/CSV-03-11/03-11/Syn.csv...
Reading /content/dataset/CSV-03-11/03-11/LDAP.csv...
Reading /content/dataset/CSV-03-11/03-11/UDPLag.csv...
Reading /content/dataset/CSV-03-11/03-11/MS

In [None]:
# merges the sampled attack rows and the sampled benign rows into a single subset
subset = pd.concat([ddos_df, benign_df], ignore_index=True)

# share of each class within the subset, and the absolute number of rows per class
weights = subset.value_counts(' Label', normalize=True)
counts = subset.value_counts(' Label')

# table with one line per class: name, weight in percent, row count
print("     Class      |     Weight     |     Count")
for label, share in weights.items():
  print(f'{label:<15} | {(share * 100.0):<14n} | {counts[label]:n}')

# totals row as a sanity check (weights should add up to 100)
print(f"{'Total':<15} | {sum(weights.values) * 100.0:<14n} | {sum(counts.values)}")

# writes the unprocessed subset to the export directory
subset.to_csv((dl_dir + '/COMP6002_Raw_Subset.csv'), index = False)

     Class      |     Weight     |     Count
BENIGN          | 50             | 330
Syn             | 11.8182        | 78
TFTP            | 11.6667        | 77
DrDoS_NTP       | 5.60606        | 37
UDP-lag         | 4.24242        | 28
DrDoS_DNS       | 2.57576        | 17
UDPLag          | 2.42424        | 16
MSSQL           | 2.27273        | 15
UDP             | 1.9697         | 13
Portmap         | 1.81818        | 12
NetBIOS         | 1.36364        | 9
DrDoS_UDP       | 1.06061        | 7
DrDoS_MSSQL     | 0.909091       | 6
LDAP            | 0.757576       | 5
WebDDoS         | 0.606061       | 4
DrDoS_SNMP      | 0.454545       | 3
DrDoS_NetBIOS   | 0.454545       | 3
Total           | 100            | 660


# Data Preprocessing
**To-Do:**
*   Identify features that can be dropped.

## Dropped Features
*   Unnamed: 0: unknown feature.
*   Flow ID: constructed from Source IP, Destination IP, Source Port, Destination Port and Protocol.
*   SimillarHTTP (spelled this way in the dataset): object with no meaningful way to encode (the exact same objects won't necessarily exist in real data).

In [None]:
# optionally import an unprocessed subset, skips above steps.
raw_path = dl_dir + '/COMP6002_Raw_Subset.csv'
raw_load = ""

# keeps prompting until the (trimmed, lower-cased) answer is exactly 'y' or 'n'
while raw_load not in ('y', 'n'):
  raw_load = input("Do you wish to load a previously generated (but not processed) subset? (y/n)\n").strip().lower()

  # exact comparisons on purpose: `x in ('y')` is a substring test against the string 'y'
  # and is also True for an empty answer
  if raw_load == 'y':
    try:
      subset = pd.read_csv(raw_path)
      print(f"\'{raw_path}\' loaded successfully.")
    except Exception as err:
      # show the real cause (missing file, parse error, ...) instead of hiding it
      print(f"ERROR: could not load \'{raw_path}\': {err!r}")
  elif raw_load == 'n':
    print("Did not attempt to load existing data.")
  else:
    print("WARNING: please only input \'y\' to load the data or \'n\' to skip loading.")

del raw_load

Do you wish to load a previously generated (but not processed) subset? (y/n)
y
'/content/drive/MyDrive/Colab Notebooks/COMP6002_Group10_Data/COMP6002_Raw_Subset.csv' loaded successfully.


In [None]:
from sklearn.model_selection import train_test_split
import ipaddress

# NOTE: this cell modifies `subset` in place and drops columns it needs, so it can only be run once
# per loaded/generated subset; re-create or reload `subset` before running it again

# drops irrelevant columns
subset.drop(columns = ['Unnamed: 0', 'Flow ID', 'SimillarHTTP'],
            inplace = True)

# replace infinite values with NaN so they are caught by the next 2 steps
subset.replace([np.inf, -np.inf], np.nan, inplace = True)

# drop columns in which fewer than 50% of the rows hold a (non-missing) value
subset.dropna(axis = 1,
              thresh = int(0.5 * subset.shape[0]),
              inplace = True)

# replace missing values with the mean of their columns
# only numeric columns are imputed (a mean is undefined for text columns such as labels or addresses);
# assigning the result back avoids calling fillna(inplace=True) on a column selection
numeric_cols = subset.select_dtypes(include = 'number').columns
subset[numeric_cols] = subset[numeric_cols].fillna(subset[numeric_cols].mean())

# drop duplicate rows
subset.drop_duplicates(inplace = True)

# converts source and destination IP addresses to useable integer values
# Series.map works on the single column instead of building a row object for every record
subset['Source IP_int'] = subset[' Source IP'].map(lambda ip: int(ipaddress.IPv4Address(ip)))
subset['Destination IP_int'] = subset[' Destination IP'].map(lambda ip: int(ipaddress.IPv4Address(ip)))

# converts date and time values to unix timestamps
# each value is parsed on its own, so rows with differing timestamp formats are still accepted
subset['UnixTimestamp'] = subset[' Timestamp'].map(lambda ts: pd.to_datetime(ts).timestamp())

# drops original columns
subset.drop(columns = [' Source IP', ' Destination IP', ' Timestamp'],
            inplace = True)

# splits subset into features (X) and target labels (y)
X = subset.drop(columns = [' Label'], inplace = False)
y = subset[' Label']

In [None]:
# default location of the processed subset inside the export directory;
# read by the "Save Processed Data" cell and assigned again by the "Load Previously Processed Data" cell
subset_path = dl_dir + '/COMP6002_Processed_Subset.csv'

## Save Processed Data

In [None]:
# optionally save processed subset to .csv
subset_save = ""

# keeps prompting until the (trimmed, lower-cased) answer is exactly 'y' or 'n'
while subset_save not in ('y', 'n'):
  subset_save = input("Do you wish to export the subset to a .csv? (y/n)").strip().lower()

  # exact comparisons on purpose: `x in ('y')` is a substring test against the string 'y'
  # and is also True for an empty answer, which would export without confirmation
  if subset_save == 'y':
    new_subset_path = subset_path

    # never overwrite an earlier export; pick 'name(i).csv' instead
    if os.path.exists(new_subset_path):
      new_subset_path = IncrementFname(subset_path)

    try:
      subset.to_csv(new_subset_path, index = False)
      print(f"Subset saved as \'{new_subset_path}\'.")
    except Exception as err:
      # show the real cause instead of hiding it
      print(f"ERROR: could not save \'{new_subset_path}\': {err!r}")
  elif subset_save == 'n':
    print("Skipped exporting to .csv file.")
  else:
    print("WARNING: please only input \'y\' to save the data to a .csv or \'n\' to skip saving.")

del subset_save

Do you wish to export the subset to a .csv? (y/n)y
Subset saved as '/content/drive/MyDrive/Colab Notebooks/COMP6002_Group10_Data/COMP6002_Processed_Subset.csv'.


## Load Previously Processed Data

In [None]:
# optionally import a preprocessed subset, skips above steps.
subset_load = ""

# keeps prompting until the (trimmed, lower-cased) answer is exactly 'y' or 'n'
while subset_load not in ('y', 'n'):
  subset_load = input("Do you wish to load a previously preprocessed subset? (y/n)\n").strip().lower()

  # exact comparisons on purpose: `x in ('y')` is a substring test against the string 'y'
  # and is also True for an empty answer
  if subset_load == 'y':
    subset_path = dl_dir + '/COMP6002_Processed_Subset.csv'
    try:
      subset = pd.read_csv(subset_path)
      # rebuild features/labels from the loaded data; otherwise the split below would use X and y
      # from the preprocessing cell, which do not exist when that cell was skipped
      X = subset.drop(columns = [' Label'])
      y = subset[' Label']
      print(f"\'{subset_path}\' loaded successfully.")
    except Exception as err:
      # show the real cause (missing file, missing label column, ...) instead of hiding it
      print(f"ERROR: could not load \'{subset_path}\': {err!r}")
  elif subset_load == 'n':
    print("Did not attempt to load existing data.")
  else:
    print("WARNING: please only input \'y\' to load the data or \'n\' to skip loading.")

del subset_load

Do you wish to load a previously preprocessed subset? (y/n)
n
Did not attempt to load existing data.


## Split Training/Testing Data

In [None]:
# imported here as well so this cell also works when the preprocessing cell was skipped
# in favour of loading an already processed subset
from sklearn.model_selection import train_test_split

# set aside 20% of data to be used in testing, keeping the remaining 80% for training
# stratify = y keeps the class proportions of the full subset in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

## Normalise Data

In [None]:
from sklearn.preprocessing import StandardScaler

# the scaler is fitted on the training split only, so nothing about the test split leaks into it
scaler = StandardScaler().fit(X_train)

# both splits must pass through the same transformation: a model trained on scaled features
# cannot be evaluated on unscaled test features
# new frames are built (keeping column names and row index) instead of assigning into the split slices
X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

## Feature Selection

In [None]:
from sklearn.feature_selection import SelectPercentile

# keeps the best-scoring 50% of the features, scored on the training split only
selector = SelectPercentile(percentile=50).fit(X_train, y_train)
selected_features = selector.get_feature_names_out()

# selected columns of the full (unsplit) feature matrix
X_new = X[selected_features]

# apply the selection to the data the models are actually trained and evaluated on;
# without this the selection above has no effect on any model
# NOTE: run this cell once per split - running it again selects 50% of the already reduced columns
X_train = X_train[selected_features]
X_test = X_test[selected_features]

print(f"Features remaining: {X_new.shape[1]}")
for col in X_new.columns:
  print(f"{col}")

Features remaining: 42
 Source Port
 Destination Port
 Protocol
 Flow Duration
 Total Fwd Packets
Total Length of Fwd Packets
 Fwd Packet Length Max
 Fwd Packet Length Min
 Fwd Packet Length Mean
 Bwd Packet Length Min
 Bwd Packet Length Mean
 Bwd Packet Length Std
Flow Bytes/s
 Flow Packets/s
Fwd IAT Total
Fwd PSH Flags
 Fwd Header Length
Fwd Packets/s
 Min Packet Length
 Max Packet Length
 Packet Length Mean
 Packet Length Std
 RST Flag Count
 ACK Flag Count
 URG Flag Count
 CWE Flag Count
 Down/Up Ratio
 Average Packet Size
 Avg Fwd Segment Size
 Avg Bwd Segment Size
 Fwd Header Length.1
Subflow Fwd Packets
 Subflow Fwd Bytes
Init_Win_bytes_forward
 act_data_pkt_fwd
 min_seg_size_forward
Idle Mean
 Idle Max
 Idle Min
 Inbound
Destination IP_int
UnixTimestamp


  f = msb / msw


# Build Models

In [None]:
from sklearn.metrics import f1_score, roc_auc_score as roc_auc, accuracy_score as accuracy

## Random Forest
**To-Do:**
*   Hyperparameter tuning.

In [None]:
# build random forest classifier model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# hyperparameter grid explored during cross validation: 3 x 3 x 1 = 9 candidate combinations
rf_params = dict(
    n_estimators = [100, 200, 500],
    max_features = [0.5, 0.75, 1.0],
    class_weight = ['balanced'],
)

# base estimator; the fixed seed keeps runs repeatable
rf = RandomForestClassifier(random_state = 42)

# exhaustive search over rf_params with 5-fold cross validation
# three metrics are recorded per candidate; weighted F1 decides which candidate is refitted
rf_model = GridSearchCV(
    rf,
    rf_params,
    scoring = ["accuracy", "f1_weighted", "roc_auc_ovr"],
    refit = "f1_weighted",
    cv = 5,
    verbose = 3,
    return_train_score = True,
)

### Train Random Forest

In [None]:
# train model
# runs the grid search configured in the cell above on the training split (cv = 5, so every
# parameter combination in rf_params is fitted once per fold; verbose = 3 prints the progress)
rf_model.fit(X_train, y_train)

In [None]:
# training results
rf_train = rf_model.predict(X_train)
# class probabilities, needed for the multiclass ROC AUC below
rf_train_proba = rf_model.predict_proba(X_train)

# displays the best model produced in training as well as its hyperparameters and f1 score
print(f'Training Result:\n Best Model: {rf_model.best_estimator_}\n Best Parameters: {rf_model.best_params_}\n F1 Score: {rf_model.best_score_}')

# the target is multiclass, so:
#  - f1_score needs an explicit average (its default, 'binary', raises an error for more than two classes)
#  - roc_auc_score needs per-class probabilities and a multi_class strategy, not the predicted labels
# 'weighted' / 'ovr' mirror the "f1_weighted" and "roc_auc_ovr" scorers used by the grid search
try:
  rf_train_auc = roc_auc(y_train, rf_train_proba, multi_class = 'ovr', labels = rf_model.classes_)
except ValueError as err:
  # raised e.g. when a class known to the model does not occur in y_train
  rf_train_auc = float('nan')
  print(f"WARN: roc area under curve could not be computed: {err}")

# evaluate the models performance and display scores
print(f'Random Forest (TRAINING):\n accuracy: {accuracy(y_train, rf_train):f}\n f1 score: {f1_score(y_train, rf_train, average = "weighted"):f}\n roc area under curve: {rf_train_auc:f}')

### Test Random Forest

In [None]:
# make predictions on test data
rf_test = rf_model.predict(X_test)
# class probabilities, needed for the multiclass ROC AUC below
rf_test_proba = rf_model.predict_proba(X_test)

# the target is multiclass, so:
#  - f1_score needs an explicit average (its default, 'binary', raises an error for more than two classes)
#  - roc_auc_score needs per-class probabilities and a multi_class strategy, not the predicted labels
try:
  rf_test_auc = roc_auc(y_test, rf_test_proba, multi_class = 'ovr', labels = rf_model.classes_)
except ValueError as err:
  # raised e.g. when a rare class did not make it into the test split
  rf_test_auc = float('nan')
  print(f"WARN: roc area under curve could not be computed: {err}")

# evaluate the models performance and display scores (same number format as the training report)
print(f'Random Forest (TESTING):\n accuracy: {accuracy(y_test, rf_test):f}\n f1 score: {f1_score(y_test, rf_test, average = "weighted"):f}\n roc area under curve: {rf_test_auc:f}')

### Export Model

In [None]:
# uses joblib to serialise trained random forest model
# the keyword must match SaveSKL's parameter name (model_fpath); `model_path` raises a TypeError
SaveSKL(model = rf_model, model_fpath = (dl_dir + "/random_forest.joblib"))

## Neural Network

### NN Setup
Import and install required libraries, sets some initial values.

In [None]:
# import PyTorch and confirm version
import torch
from torch import nn
print("Using PyTorch version: {}".format(torch.__version__))

# pick the compute device in order of preference: CUDA first, then MPS, otherwise the CPU
if torch.cuda.is_available():
  device = "cuda"
elif torch.backends.mps.is_available():
  device = "mps"
else:
  device = "cpu"
print(f"Using {device} device.")

Using PyTorch version: 2.2.1+cu121

Using cpu device.


In [None]:
# install Skorch, providing a wrapper for using PyTorch with Sklearn
!pip install skorch

# import Skorch and confirm version
from skorch import __version__ as skorch_version
from skorch import NeuralNetClassifier
print("Using Skorch version: {}".format(skorch_version))



### Define Neural Network
Currently using a Multilayer Perceptron (MLP); consider swapping to a hybrid model of an MLP and a Convolutional Neural Network (CNN) later.

In [None]:
class NN_MLP(nn.Module):
  """
  Multilayer perceptron with two hidden layers.

  Structure: Linear -> ReLU -> Linear -> ReLU -> Linear -> Softmax, i.e. the forward pass returns
  one probability per class for every input row.
  """
  def __init__(self, input_size, hidden_size, output_size):
    """
    Construct a new NN_MLP object.

    Parameters:
      input_size (int): number of inputs to the input layer.
      hidden_size (int): number of inputs to the hidden layer(s).
      output_size (int): number of outputs from model, equivalent to number of classes.
    Returns:
      : no value returned.
    """
    super(NN_MLP, self).__init__()
    # layers
    self.h1 = nn.Linear(input_size, hidden_size)
    self.h2 = nn.Linear(hidden_size, hidden_size)
    self.output = nn.Linear(hidden_size, output_size)
    # activation functions
    # the class is spelled nn.ReLU; `nn.ReLu` does not exist and raises an AttributeError on construction
    self.relu = nn.ReLU()
    # dim = 1 normalises across the class dimension of a (rows, classes) output
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, X):
    """
    Runs the forward pass.

    Parameters:
      X (Any): features to make prediction on; the last dimension must equal input_size.
    Returns:
      Any: predicted class probabilities, one row per input row (each row sums to 1).
    """
    out = self.h1(X)
    out = self.relu(out)
    out = self.h2(out)
    out = self.relu(out)
    out = self.output(out)
    out = self.softmax(out)

    return out

# Emissions

In [None]:
# stops the tracker started in the Setup section
# NOTE(review): CodeCarbon documents the return value of stop() as kg of CO₂-equivalent and may
# return None when no measurement could be taken - confirm against the installed version
emissions = tracker.stop()

if emissions is None:
  print("Emissions: unavailable (CodeCarbon did not return a measurement).")
else:
  # the value is multiplied by 1,000, so the printed unit has to be grams, not kilograms
  print(f"Emissions: {emissions * 1_000} g CO₂eq")