Author: Shangyuan Liu

Username: acp21sl

UCard: 001768913

Module: COM6013 - Cybersecurity and Artificial Intelligence Dissertation Project

Project Name: Malicious Endpoint Detection and Response

Step 01: Data pre-processing

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import EditedNearestNeighbours

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

import sklearn
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

加载数据集

In [2]:
train_data_path = "../Dataset/MQTTset/train70_reduced.csv"  # the training dataset
test_data_path = "../Dataset/MQTTset/test30_reduced.csv"  # the testing dataset

dataset_train = pd.read_csv(train_data_path)  # read the training dataset into data-frame
dataset_test = pd.read_csv(test_data_path)  # read the training dataset into data-frame

In [3]:
df_raw = pd.concat([dataset_train, dataset_test])  # concat training set and testing set

对Dataframe进行统计分析, 查看正常数据与异常数据的比例

统计攻击类型和对应的数量

In [4]:
print('The number of rows in raw dataset', df_raw.shape[0])  # rows  -- 1438157
print('The number of columns in raw dataset', df_raw.shape[1])  # columns  --  86

label_statistics = df_raw["target"].value_counts()  # the statistics of Label features

print('\nThe statistics of Label feature \n', label_statistics)

The number of rows in raw dataset 231646
The number of columns in raw dataset 34

The statistics of Label feature 
 legitimate    115824
dos            91156
bruteforce     10150
malformed       7646
slowite         6441
flood            429
Name: target, dtype: int64


清理无穷大和无穷小的值

In [5]:
# df_repZero = df_raw.replace(0, np.nan)   # replace all 0 values to NaN
    
# def missing_rate(df):
#     """
#     calculate the rate of missing values (NaN) in each feature
#     Args:
#         df (_data-frame_): df_raw
#     Returns:
#         _float_: percentage of missing values in each feature data
#     """
#     # statistics on the number and percentage of missing values
#     nan_percent = round((df.isnull().sum() / len(df)) * 100, 3)
#     # Get the percentage of missing values in each column, sorted in ascending order
#     # >0 is to screen out columns without missing values and return only those with missing values
#     nan_percent = nan_percent[nan_percent >= 0].sort_values()
#     return nan_percent

# missingVal_feature = missing_rate(df_repZero)

# # print the rate of NaN value
# print("The percentage of each feature's missing value\n", missingVal_feature)

Set a threshold value to remove any features with a percentage of missing value above the threshold

In [6]:
# threshold = 99
# # the percentage of missing value over 90%
# missingVal_90 = missingVal_feature[missingVal_feature > threshold]

# # set a list to store any features should be removed
# delete_list = missingVal_90.index.tolist()

# # features in delete_list are deleted to create a new data-frame
# df_raw = df_raw.drop(delete_list, axis=1)

# df_raw

In [5]:
class_names = df_raw.target.unique()
df_raw = df_raw.astype('category')
cat_columns = df_raw.select_dtypes(['category']).columns
df_raw[cat_columns] = df_raw[cat_columns].apply(lambda x: x.cat.codes)

In [7]:
dataset_test

Unnamed: 0,tcp.flags,tcp.time_delta,tcp.len,mqtt.conack.flags,mqtt.conack.flags.reserved,mqtt.conack.flags.sp,mqtt.conack.val,mqtt.conflag.cleansess,mqtt.conflag.passwd,mqtt.conflag.qos,...,mqtt.qos,mqtt.retain,mqtt.sub.qos,mqtt.suback.qos,mqtt.ver,mqtt.willmsg,mqtt.willmsg_len,mqtt.willtopic,mqtt.willtopic_len,target
0,5,2574,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
1,6,1,45,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,6,4020,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,4,2420,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,2,121,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99286,6,120,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
99287,2,1021,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99288,2,26,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Remove all meaningless features

映射 特征转换

legitimate     0
dos            1
bruteforce     2
malformed      3
slowite        4
flood          5

In [10]:
# converting text data to numeric type data
df_convert = df_raw.replace(
    ['legitimate', 'dos', 'bruteforce', 'malformed', 'slowite', 'flood'], 
    [0, 1, 2, 3, 4, 5]).replace(['MQTT'], [1])

df_convert

In [None]:
df_convert.to_csv("../Dataset/MQTTset/MQTTset_total.csv", index = False)

In [11]:
X, y = df_convert.iloc[:, :-1], df_convert.iloc[:, -1]  # split feature, target arrays

In [11]:
X, y = df_convert.iloc[:, :-1], df_convert.iloc[:, -1]  # split feature, target arrays

# split dataset: training set-80% testing set-20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=2022)

In [17]:
def RF_model(X_train, X_test, y_train, y_test):
    """
    Build a random forests model
    Args:
        X_train (_type_): training dataset - input features
        X_test (_type_): testing dateset - input feature
        y_train (_type_): training dataset - output label
        y_test (_type_): testing dateset - output label
    """
    # build the RF model
    classifier = RandomForestClassifier(random_state=2022, n_jobs=-1, n_estimators=50, criterion='entropy')
    # training model
    rf = classifier.fit(X_train, y_train)
    # predict test dataset
    y_pred = classifier.predict(X_test)
    
    print(f"Training Score: {classifier.score(X_train, y_train)}") 
    print(f"Test Score: {classifier.score(X_test, y_test)}")
    return rf, y_pred
    

def model_evaluate(y_test, y_pred):
    """
    Evaluate the model using predicted labels
    Args:
        y_test (_type_): testing dateset - output label
        y_pred (_type_): predicted labels
    """
    # report classification results for each category
    eval_result1 = classification_report(y_test, y_pred)
    print("Classification Report: \n", eval_result1)

    eval_result2 = accuracy_score(y_test, y_pred)
    print("Accuracy:", eval_result2)
    return eval_result1, eval_result2

In [18]:
rf_model, y_pred = RF_model(X_train, X_test, y_train, y_test)  # training model
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # evaluate model

Training Score: 0.9457966034380045
Test Score: 0.9029308087420687
Classification Report: 
               precision    recall  f1-score   support

           0       0.73      0.74      0.74      4351
           1       0.90      0.91      0.90     39077
           2       1.00      0.48      0.65       184
           3       0.92      0.94      0.93     49639
           4       0.71      0.47      0.57      3278
           5       1.00      1.00      1.00      2761

    accuracy                           0.90     99290
   macro avg       0.88      0.76      0.80     99290
weighted avg       0.90      0.90      0.90     99290

Accuracy: 0.9029308087420687
