In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

import warnings
# Ignore every UserWarning and DeprecationWarning for the rest of the notebook
# (presumably to keep cell outputs free of library notices).
# NOTE(review): these filters are global, so they also hide warnings that may matter — confirm this is intended.
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Import the dataset

In [3]:
data_path = "../Dataset/dataset_cleaned.csv"  # relative path of the cleaned dataset CSV
df_raw = pd.read_csv(data_path)  # load the CSV into a DataFrame
df_raw  # last expression of the cell: display the loaded DataFrame

Split training and testing dataset

In [3]:
# NOTE: the former `sklearn.utils.shuffle(df_raw)` call was removed. shuffle()
# returns a shuffled copy and that result was discarded, so the call only cost
# time and memory. train_test_split below already shuffles the rows
# (shuffle=True is its default) with a fixed seed, so the split stays reproducible.
X, y = df_raw.iloc[:, :-2], df_raw.iloc[:, -2]  # features: all but the last two columns; target: second-to-last column
# NOTE(review): the last column is left out of both X and y — presumably a second label; confirm.

# split dataset: training set-80% testing set-20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=2022)

Remove all meaningless features and copy a testing dataset sample

In [4]:
# copy a dataset sample
# columns removed from the model inputs (flow identifiers, addresses, ports, protocol, timestamp)
non_feature_cols = ["Flow_ID", "Src_IP", "Src_Port", "Dst_IP", "Dst_Port", "Protocol", "Timestamp"]

# keep an untouched copy of the test features, taken before any column is dropped
X_test_raw = X_test.copy()

# drop the same columns from both splits
X_train = X_train.drop(columns=non_feature_cols)
X_test = X_test.drop(columns=non_feature_cols)

Random Forests Method - Entropy Criterion (binary classification)

In [5]:
def RF_model(X_train, X_test, y_train, y_test, n_estimators=50, criterion='entropy', random_state=2022, n_jobs=None):
    """
    Fit a random forest classifier, print its train/test accuracy and predict the test set
    Args:
        X_train (_type_): training dataset - input features
        X_test (_type_): testing dataset - input features
        y_train (_type_): training dataset - output label
        y_test (_type_): testing dataset - output label
        n_estimators (int): number of trees in the forest (default 50)
        criterion (str): split quality measure passed to the classifier (default 'entropy')
        random_state (int): seed passed to the classifier (default 2022)
        n_jobs (int or None): number of parallel jobs passed to the classifier (default None)
    Returns:
        tuple: (fitted classifier, labels predicted for X_test)
    """
    # build the RF model
    classifier = RandomForestClassifier(
        random_state=random_state,
        n_estimators=n_estimators,
        criterion=criterion,
        n_jobs=n_jobs,
    )
    # training model (fit returns the classifier itself)
    rf = classifier.fit(X_train, y_train)
    # predict test dataset
    y_pred = rf.predict(X_test)

    print(f"Training Score: {rf.score(X_train, y_train)}")
    # score() on a classifier is the accuracy of predict(); reuse y_pred so the
    # test set is not predicted a second time
    print(f"Test Score: {accuracy_score(y_test, y_pred)}")
    return rf, y_pred
    
    
def model_evaluate(y_test, y_pred):
    """
    Print and return the classification report and the accuracy of a set of predictions
    Args:
        y_test (_type_): true labels of the evaluated samples
        y_pred (_type_): labels predicted for the same samples
    Returns:
        tuple: (classification report, accuracy)
    """
    # per-class precision / recall / f1 summary
    report = classification_report(y_test, y_pred)
    print("Classification Report: \n", report)

    # overall fraction of correctly predicted labels
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    results = (report, accuracy)
    return results

train and evaluate RF model

In [6]:
rf, y_pred = RF_model(X_train, X_test, y_train, y_test)  # fit the forest on the training split and predict the test split
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # print and keep the classification report and accuracy

Training Score: 0.9998435490039287
Test Score: 0.9952438897194312
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.99     45218
           1       1.00      1.00      1.00    242412

    accuracy                           1.00    287630
   macro avg       0.99      1.00      0.99    287630
weighted avg       1.00      1.00      1.00    287630

Accuracy: 0.9952438897194312


Calculate the importance of each feature and sort in descending order

In [7]:
feat_labels = X_train.columns  # names of the features the model was trained on
importances = rf.feature_importances_  # importance of each feature, in column order
indices = np.argsort(importances)[::-1]  # column positions ordered from most to least important

feature_drop = []  # names of the features whose importance is below 0.001
for rank, col_pos in enumerate(indices, start=1):
    name, score = feat_labels[col_pos], importances[col_pos]
    print("%2d) %-*s %f" % (rank, 30, name, score))
    if score < 0.001:
        feature_drop.append(name)

 1) Fwd_Seg_Size_Min               0.119865
 2) Fwd_IAT_Min                    0.089994
 3) SYN_Flag_Cnt                   0.086617
 4) Idle_Mean                      0.082633
 5) Init_Fwd_Win_Byts              0.071761
 6) Idle_Min                       0.068613
 7) Flow_IAT_Min                   0.065730
 8) Fwd_Header_Len                 0.057831
 9) Idle_Max                       0.043891
10) Bwd_Header_Len                 0.028092
11) Flow_IAT_Std                   0.026841
12) Fwd_IAT_Tot                    0.022645
13) Fwd_IAT_Std                    0.022121
14) Tot_Fwd_Pkts                   0.020295
15) Flow_IAT_Max                   0.020100
16) Flow_IAT_Mean                  0.019342
17) Bwd_Pkts/s                     0.015876
18) Tot_Bwd_Pkts                   0.015120
19) Fwd_IAT_Mean                   0.013964
20) Subflow_Bwd_Pkts               0.013756
21) Fwd_IAT_Max                    0.012864
22) Fwd_Pkts/s                     0.010213
23) Bwd_Seg_Size_Avg            

In [8]:
# select features
# remove the features collected in feature_drop from both splits
X_train = X_train.drop(columns=feature_drop)
X_test = X_test.drop(columns=feature_drop)

save testing dataset after features selection

In [9]:
df_test = pd.concat([X_test, y_test], axis=1)  # test features after selection, followed by the label column
df_test.to_csv("../Dataset/binary/dataset_test.csv", index = False)  # written without the index column

# X_test_raw is the copy of the test features taken before any column was dropped;
# it contains no label column.
X_test_raw.to_csv("../Dataset/binary/dataset_test_raw.csv", index = False) # the raw testing dataset

Evaluate the model - Binary 时间成本低，性能增强

In [None]:
# train a model again, now on the features kept after feature selection
rf, y_pred = RF_model(X_train, X_test, y_train, y_test)  # fit on the reduced training split and predict the test split
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # print and keep the classification report and accuracy

SMOTE - Data Oversampling (resampling is applied to the training set)

In [None]:
# Keep references to the current training split under separate names; the
# resampling cells below read data_x / data_y.
# These are aliases, not copies.
data_x = X_train
data_y = y_train

SMOTE for Imbalanced Classification - Synthetic Minority Over-sampling Technique

In [None]:
# Use a separate name for the sampler: `SMOTE = SMOTE()` replaced the imported
# class with an instance, so running this cell a second time raised
# "TypeError: 'SMOTE' object is not callable".
# A fixed random_state makes the synthetic samples reproducible across runs.
smote_sampler = SMOTE(random_state=2022)
X__SMOTE, y__SMOT = smote_sampler.fit_resample(data_x, data_y)  # oversample the training split

df_concat = pd.concat([X__SMOTE, y__SMOT], axis=1)  # resampled features followed by the label column

In [None]:
# Train on the whole SMOTE-resampled training set
X_train_smote, y_train_smote = df_concat.iloc[:, :-1], df_concat.iloc[:, -1]

# Evaluate on the held-out test rows stored in df_test (selected features + label).
# Re-splitting the resampled data scored the model on synthetic samples derived from
# its own training rows, which inflates the scores, and it also overwrote the
# original X_test / y_test.
X_holdout, y_holdout = df_test.iloc[:, :-1], df_test.iloc[:, -1]

rf, y_pred = RF_model(X_train_smote, X_holdout, y_train_smote, y_holdout)  # training model
eval_result1, eval_result2 = model_evaluate(y_holdout, y_pred)  # evaluate model

In [None]:
# # save the new binary dataset
# df_concat.to_csv("../Dataset/binary/dataset_train.csv", index = True)

SMOTETomek sampling for imbalanced classification 

In [None]:
# fixed seed: the resampled set is later written to dataset_train.csv, so it must be reproducible
smote_tomek = SMOTETomek(random_state=2022)
X_resampled, y_resampled = smote_tomek.fit_resample(data_x, data_y)  # fit and transform

# class distribution after resampling, as (label, count) pairs ordered by label
sorted(Counter(y_resampled).items())

In [None]:
df_concat_2 = pd.concat([X_resampled, y_resampled], axis=1)  # SMOTETomek-resampled features followed by the label column

In [None]:
# Train on the whole SMOTETomek-resampled training set
X_train_st, y_train_st = df_concat_2.iloc[:, :-1], df_concat_2.iloc[:, -1]

# Evaluate on the held-out test rows stored in df_test (selected features + label).
# Re-splitting the resampled data scored the model on synthetic samples derived from
# its own training rows, which inflates the scores, and it also overwrote the
# original X_test / y_test.
X_holdout, y_holdout = df_test.iloc[:, :-1], df_test.iloc[:, -1]

rf, y_pred = RF_model(X_train_st, X_holdout, y_train_st, y_holdout)  # training model
eval_result1, eval_result2 = model_evaluate(y_holdout, y_pred)  # evaluate model

In [None]:
# save the SMOTETomek-resampled training set (features + label) without the index column
df_concat_2.to_csv("../Dataset/binary/dataset_train.csv", index = False)