## Phase 4 Data Mining
In this notebook we use machine learning to analyze the detection of each of the Modbus Protocol attacks: 
- Arp-based Man in the Middle (mitm)
- TCP SYN flooding
- Modbus Query flood
- Ping flooding

In [24]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress User Warning messages
warnings.simplefilter(action='ignore', category=UserWarning)

In [25]:
# Loading the datasets and concatenating them into one df
files = ['clean.csv', 'mitm.csv', 'modbusQuery2Flooding.csv', 'modbusQueryFlooding.csv', 'pingFloodDDos.csv', 'tcpSYNFlood.csv']
data = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)


relevant_features = ['Length',
                    'Protocol',
                    'SYNFlag',
                    'ACKFlag',
                    'TimeDelta',
                    'RelativeTime', 
                    'AttackName']

data = data[relevant_features]

In [26]:
# mapping dictionary, using label encoding to convert the attack names to integers
attack_name_mapping = {
    'Clean': 1,  
    'mitm': 2,
    'modbusQuery2Flooding': 3,
    'modbusQueryFlooding': 4,
    'pingFloodDDoS': 5,
    'tcpSYNFloodDDoS': 6
}

# Apply the mapping
data['AttackName'] = data['AttackName'].map(attack_name_mapping)

# Check the results of the mapping
print("Unique values in 'AttackName' after mapping:", data['AttackName'].unique())
print("Non-null values in 'AttackName' after mapping:", data['AttackName'].notnull().sum()) # total number of rows in 'AttackName' column that don't have null values
print("Null values in 'AttackName':", data['AttackName'].isnull().sum())


Unique values in 'AttackName' after mapping: [1 2 3 4 5 6]
Non-null values in 'AttackName' after mapping: 6889976
Null values in 'AttackName': 0


In [27]:
# encoding the categorical features
categorical_features = ['Protocol', 'SYNFlag', 'ACKFlag']
label_encoder = LabelEncoder()
for col in categorical_features:
    data[col] = label_encoder.fit_transform(data[col].astype(str)) # convert the column value to string before encoding

# Impute missing values for the rest of the dataset
imputer = SimpleImputer(strategy='most_frequent')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Convert all columns to numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Split the dataset into features (X) and target (y)
X = data.drop(['AttackName'], axis=1)  
y = data['AttackName'].astype(int)

In [28]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

selector = SelectKBest(score_func=f_regression, k=5)
X_new = selector.fit_transform(X, y)

# Get the selected feature indices
selected_indices = selector.get_support(indices=True)
print(selector.scores_)
featuresArr = [];
print(selected_indices)
for col in data.columns:
    featuresArr.append(col);
for i in selected_indices:
    print(featuresArr[i])

[4.34395187e+05 2.47506960e+05 8.10151768e+04 1.46548656e+06
 5.09698962e-02 6.74046217e+05]
[0 1 2 3 5]
Length
Protocol
SYNFlag
ACKFlag
RelativeTime
