In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
data_path = "../Dataset/dataset_cleaned.csv"  # the original dataset
dataset = pd.read_csv(data_path)  # read the raw dataset into data-frame

data = dataset.iloc[:, 2:]
data_ip_port = dataset.iloc[:, : 2]

In [3]:
def RF_model(X_train, X_test, y_train, y_test):
    """
    Build a random forests model
    Args:
        X_train (_type_): training dataset - input features
        X_test (_type_): testing dateset - input feature
        y_train (_type_): training dataset - output label
        y_test (_type_): testing dateset - output label
    """
    # build the RF model
    classifier = RandomForestClassifier(random_state=2022, n_estimators=50, criterion='entropy')
    # training model
    rf = classifier.fit(X_train, y_train)
    # predict test dataset
    y_pred = classifier.predict(X_test)
    
    print(f"Training Score: {classifier.score(X_train, y_train)}") 
    print(f"Test Score: {classifier.score(X_test, y_test)}")
    return rf, y_pred
    
    
def model_evaluate(y_test, y_pred):
    """
    Evaluate the model using predicted labels
    Args:
        y_test (_type_): testing dateset - output label
        y_pred (_type_): predicted labels
    """
    # report classification results for each category
    eval_result1 = classification_report(y_test, y_pred)
    print("Classification Report: \n", eval_result1)

    eval_result2 = accuracy_score(y_test, y_pred)
    print("Accuracy:", eval_result2)
    return eval_result1, eval_result2

In [4]:
# Build feature, target arrays 
X, y = data.iloc[:, :-2], data.iloc[:, -1]

# Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=2022)

In [5]:
rf_model, y_pred = RF_model(X_train, X_test, y_train, y_test)  # training model
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # evaluate model

Training Score: 0.9997387516700523
Test Score: 0.9887795663410168
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     67842
           1       1.00      1.00      1.00    121633
           2       1.00      1.00      1.00     45899
           3       1.00      1.00      1.00     45910
           4       1.00      1.00      1.00     41198
           5       0.95      0.96      0.95     27583
           6       1.00      1.00      1.00     24776
           7       1.00      1.00      1.00     15153
           8       1.00      1.00      1.00      7266
           9       0.98      0.98      0.98      7024
          10       1.00      1.00      1.00      6082
          11       1.00      1.00      1.00      5575
          12       1.00      1.00      1.00      4068
          13       0.90      0.57      0.70      3940
          14       0.60      0.53      0.56      3105
          15       0.99      0.99      0.99 

In [6]:
feat_labels = data.columns[1:][:-1]

feature_drop = []  # store the dropped features 

importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    if importances[indices[f]] < 0.0011:
        feature_drop.append(feat_labels[indices[f]])

 1) Init_Bwd_Win_Byts              0.126738
 2) Active_Mean                    0.073334
 3) Pkt_Len_Min                    0.065225
 4) Flow_IAT_Min                   0.051261
 5) Idle_Min                       0.050503
 6) Subflow_Bwd_Byts               0.050496
 7) Idle_Std                       0.044846
 8) PSH_Flag_Cnt                   0.043877
 9) Fwd_Pkts/s                     0.043104
10) Flow_IAT_Mean                  0.041806
11) Flow_IAT_Std                   0.037533
12) TotLen_Fwd_Pkts                0.032867
13) Bwd_Pkts/s                     0.027123
14) Fwd_IAT_Mean                   0.026785
15) Label                          0.025970
16) Bwd_Header_Len                 0.024773
17) Down/Up_Ratio                  0.022395
18) Tot_Fwd_Pkts                   0.021044
19) Fwd_IAT_Std                    0.020025
20) Fwd_IAT_Tot                    0.017907
21) Fwd_IAT_Min                    0.013795
22) Fwd_Act_Data_Pkts              0.011761
23) Subflow_Fwd_Byts            

In [7]:
data_featDel =  data.drop(feature_drop, axis=1)

data_featDel

Unnamed: 0,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,TotLen_Fwd_Pkts,TotLen_Bwd_Pkts,Fwd_Pkt_Len_Min,Fwd_Pkt_Len_Mean,Fwd_Pkt_Len_Std,Bwd_Pkt_Len_Max,Bwd_Pkt_Len_Mean,...,Init_Fwd_Win_Byts,Init_Bwd_Win_Byts,Fwd_Act_Data_Pkts,Fwd_Seg_Size_Min,Active_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat
0,19377440.0,2.0,1.0,200.0,100.0,100.0,100.000000,0.000000,100.0,100.0,...,-1.0,512.0,2.0,0.0,0.0,1.637866e+06,1.084687e+07,8.530574e+06,1,1
1,22110440.0,3.0,1.0,300.0,100.0,100.0,100.000000,0.000000,100.0,100.0,...,-1.0,512.0,3.0,0.0,0.0,1.734992e+06,8.413620e+06,5.367339e+06,1,1
2,2851022.0,2.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,-1.0,-1.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,1,1
3,16638411.0,3.0,5.0,361.0,1124.0,0.0,120.333333,208.423447,1124.0,224.8,...,-1.0,215.0,1.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,1,1
4,24702225.0,6.0,4.0,300.0,100.0,0.0,50.000000,54.772256,100.0,25.0,...,-1.0,29200.0,3.0,0.0,146923.5,1.938831e+06,1.098504e+07,8.243123e+06,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438145,130.0,16.0,1.0,23360.0,0.0,1460.0,1460.000000,0.000000,0.0,0.0,...,1026.0,65535.0,15.0,20.0,0.0,2.941206e+01,1.626339e+15,1.626339e+15,1,12
1438146,87.0,9.0,1.0,13140.0,0.0,1460.0,1460.000000,0.000000,0.0,0.0,...,1026.0,65535.0,8.0,20.0,0.0,2.826185e+01,1.626339e+15,1.626339e+15,1,12
1438147,86.0,4.0,1.0,5840.0,0.0,1460.0,1460.000000,0.000000,0.0,0.0,...,1026.0,65535.0,3.0,20.0,0.0,4.680812e+01,1.626339e+15,1.626339e+15,1,12
1438148,142.0,2.0,1.0,2920.0,0.0,1460.0,1460.000000,0.000000,0.0,0.0,...,1026.0,65535.0,1.0,20.0,0.0,0.000000e+00,1.626339e+15,1.626339e+15,1,12


In [8]:
data_muti = pd.concat([data_ip_port, data_featDel], axis=1)  # concat data features and endpoint ip / port

Train RF model and evaluate it

In [9]:
# Build feature, target arrays 
X, y = data_muti.iloc[:, :-2], data_muti.iloc[:, -1]

# Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=2022)

rf, y_pred = RF_model(X_train, X_test, y_train, y_test)  # training model
eval_result, eval_result = model_evaluate(y_test, y_pred)  # evaluate model

Training Score: 0.9999811265465057
Test Score: 0.993473096223157
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     67842
           1       1.00      1.00      1.00    121633
           2       1.00      1.00      1.00     45899
           3       1.00      1.00      1.00     45910
           4       1.00      1.00      1.00     41198
           5       0.97      0.99      0.98     27583
           6       1.00      1.00      1.00     24776
           7       1.00      1.00      1.00     15153
           8       1.00      1.00      1.00      7266
           9       0.99      0.99      0.99      7024
          10       1.00      1.00      1.00      6082
          11       1.00      1.00      1.00      5575
          12       1.00      1.00      1.00      4068
          13       0.91      0.70      0.79      3940
          14       0.86      0.74      0.79      3105
          15       0.99      0.99      0.99  

Edited Nearest Neighbor Rule 在与另一种欠采样方法结合使用时可提供最佳结果

In [14]:
# Undersample and plot imbalanced dataset with the Edited Nearest Neighbor rule
# summarize class distribution
counter = Counter(y)
print(counter)

# define the undersampling method
undersample = EditedNearestNeighbours(n_neighbors=3)

# transform the dataset
X, y = undersample.fit_resample(X, y)

# summarize the new class distribution
counter = Counter(y)
print(counter)

Counter({1: 403711, 0: 226451, 3: 153000, 2: 153000, 4: 138283, 5: 92178, 6: 82288, 7: 50555, 8: 24492, 9: 23980, 10: 20192, 11: 18884, 12: 13780, 13: 12889, 14: 10176, 15: 8034, 16: 6257})
Counter({1: 403464, 0: 220555, 2: 152926, 3: 152914, 4: 137823, 5: 83115, 6: 82189, 7: 50403, 8: 22757, 10: 19811, 11: 16917, 9: 14078, 12: 13619, 15: 7311, 16: 6257, 13: 4811, 14: 4280})


In [15]:
df_ENN = pd.concat([X, y], axis=1)  # concat data features and labels

In [17]:
# Build feature, target arrays 
X, y = df_ENN.iloc[:, :-2], df_ENN.iloc[:, -1]

# Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=2022)

rf_model, y_pred = RF_model(X_train, X_test, y_train, y_test)  # training model
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # evaluate model

Training Score: 0.9999979492669142
Test Score: 0.998720000765607
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     66489
           1       1.00      1.00      1.00    121225
           2       1.00      1.00      1.00     45685
           3       1.00      1.00      1.00     45961
           4       1.00      1.00      1.00     41249
           5       0.99      1.00      1.00     24760
           6       1.00      1.00      1.00     24794
           7       1.00      1.00      1.00     15016
           8       1.00      1.00      1.00      6845
           9       0.99      1.00      1.00      4194
          10       1.00      1.00      1.00      5776
          11       1.00      1.00      1.00      5167
          12       1.00      1.00      1.00      4077
          13       0.98      0.83      0.90      1446
          14       0.96      0.86      0.91      1240
          15       1.00      1.00      1.00  

In [None]:
df_ENN.to_csv("../Dataset/dataset_muticlass_test.csv", index = False)

In [None]:
data_path = "../Dataset/dataset_muticlass_test.csv"  # the original dataset
df_ENN = pd.read_csv(data_path)  # read the raw dataset into data-frame
