In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

import sklearn
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# Path of the cleaned dataset CSV, relative to the notebook's working directory.
data_path = "../Dataset/dataset_cleaned.csv"  # the original dataset
df_raw = pd.read_csv(data_path)  # read the raw dataset into dataframe
# Bare last expression: the frame is rendered as this cell's output.
df_raw

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Fwd_Seg_Size_Min,Active_Mean,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat
0,6.464587e+09,3.232261e+09,64892.0,3.232261e+09,80.0,6.0,1.522984e+09,19377440.0,2.0,1.0,...,0.0,0.0,0.0,0.0,9.688720e+06,1.637866e+06,1.084687e+07,8.530574e+06,1,1
1,6.464582e+09,3.232261e+09,59992.0,3.232261e+09,80.0,6.0,1.522984e+09,22110440.0,3.0,1.0,...,0.0,0.0,0.0,0.0,7.370147e+06,1.734992e+06,8.413620e+06,5.367339e+06,1,1
2,6.464567e+09,3.232261e+09,44446.0,3.232261e+09,80.0,17.0,1.522988e+09,2851022.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1,1
3,6.464576e+09,3.232261e+09,53936.0,3.232261e+09,80.0,6.0,1.522992e+09,16638411.0,3.0,5.0,...,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1,1
4,6.464578e+09,3.232261e+09,55650.0,3.232261e+09,80.0,6.0,1.522984e+09,24702225.0,6.0,4.0,...,0.0,146923.5,152124.0,141723.0,9.614084e+06,1.938831e+06,1.098504e+07,8.243123e+06,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438145,6.464475e+09,3.232236e+09,1500.0,3.232236e+09,1883.0,6.0,1.626325e+09,130.0,16.0,1.0,...,20.0,0.0,0.0,0.0,1.626339e+15,2.941206e+01,1.626339e+15,1.626339e+15,1,12
1438146,6.464475e+09,3.232236e+09,1487.0,3.232236e+09,1883.0,6.0,1.626325e+09,87.0,9.0,1.0,...,20.0,0.0,0.0,0.0,1.626339e+15,2.826185e+01,1.626339e+15,1.626339e+15,1,12
1438147,6.464476e+09,3.232236e+09,2009.0,3.232236e+09,1883.0,6.0,1.626325e+09,86.0,4.0,1.0,...,20.0,0.0,0.0,0.0,1.626339e+15,4.680812e+01,1.626339e+15,1.626339e+15,1,12
1438148,6.464476e+09,3.232236e+09,1970.0,3.232236e+09,1883.0,6.0,1.626325e+09,142.0,2.0,1.0,...,20.0,0.0,0.0,0.0,1.626339e+15,0.000000e+00,1.626339e+15,1.626339e+15,1,12


Split training and testing dataset

In [3]:
# NOTE: `sklearn.utils.shuffle` returns a shuffled *copy* and never reorders its
# argument, so the former bare call `sklearn.utils.shuffle(df_raw)` had no effect
# (its result was discarded). Row shuffling is performed by `train_test_split`
# below: `shuffle=True` is its default and is spelled out here, and the fixed
# `random_state` keeps the split reproducible.

# split feature, target arrays: the features are every column except the last
# two (`Label`, `Cat`); the target is the last column (`Cat`)
X, y = df_raw.iloc[:, :-2], df_raw.iloc[:, -1]

# split dataset: training set-80% testing set-20%
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=2022, shuffle=True
)

Remove all meaningless features and copy a testing dataset sample

In [4]:
# copy a dataset sample
X_test_raw = pd.concat([X_test, y_test], axis=1)

# remove all meaningless features
X_train = X_train.drop(["Flow_ID", "Src_IP", "Src_Port", "Dst_IP", "Dst_Port", "Protocol", "Timestamp"], axis=1)

X_test = X_test.drop(["Flow_ID", "Src_IP", "Src_Port", "Dst_IP", "Dst_Port", "Protocol", "Timestamp"], axis=1)

Random Forests Method - multi-class classification (note: the code below sets criterion='entropy', not the Gini index)

In [None]:
def RF_model(X_train, X_test, y_train, y_test,
             n_estimators=50, criterion='entropy', random_state=2022, n_jobs=None):
    """
    Train a random forest classifier and predict the labels of the test set.

    The training and test accuracy are printed as a side effect.

    Args:
        X_train: training dataset - input features
        X_test: testing dataset - input features
        y_train: training dataset - output labels
        y_test: testing dataset - output labels
        n_estimators (int): number of trees in the forest (default 50)
        criterion (str): split-quality criterion handed to
            RandomForestClassifier (default 'entropy')
        random_state (int): seed handed to RandomForestClassifier (default 2022)
        n_jobs (int or None): handed to RandomForestClassifier; None keeps
            scikit-learn's default

    Returns:
        tuple: (fitted RandomForestClassifier, predicted labels for X_test)
    """
    # build the RF model
    classifier = RandomForestClassifier(
        random_state=random_state,
        n_estimators=n_estimators,
        criterion=criterion,
        n_jobs=n_jobs,
    )
    # training model (fit() returns the estimator itself, so `classifier` is
    # the fitted model that gets returned)
    classifier.fit(X_train, y_train)
    # predict test dataset
    y_pred = classifier.predict(X_test)

    print(f"Training Score: {classifier.score(X_train, y_train)}")
    # classifier.score(X_test, y_test) is the accuracy of predict(X_test);
    # reuse y_pred instead of predicting the test set a second time
    print(f"Test Score: {accuracy_score(y_test, y_pred)}")
    return classifier, y_pred
    
    
def model_evaluate(y_test, y_pred):
    """
    Print and return the classification report and the accuracy of predictions.

    Args:
        y_test: testing dataset - true output labels
        y_pred: labels predicted for the same samples

    Returns:
        tuple: (result of classification_report, result of accuracy_score)
    """
    # per-category report followed by the overall accuracy
    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print("Classification Report: \n", report)
    print("Accuracy:", accuracy)
    return report, accuracy

train and evaluate RF model

In [None]:
# Baseline run on the current X_train/X_test. `rf_model` is read by the
# feature-importance cell that follows.
rf_model, y_pred = RF_model(X_train, X_test, y_train, y_test)  # training model
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # evaluate model

In [None]:
# features whose importance is below this value are collected for removal
importance_threshold = 0.0011

feat_labels = X_train.columns  # names of all current features (60 per the original note)
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first

feature_drop = []  # store the dropped features
for rank, position in enumerate(indices, start=1):
    label, importance = feat_labels[position], importances[position]
    print("%2d) %-*s %f" % (rank, 30, label, importance))
    if importance < importance_threshold:
        feature_drop.append(label)

In [None]:
# select features
# remove the low-importance columns collected in `feature_drop` from both splits
X_train = X_train.drop(columns=feature_drop)
X_test = X_test.drop(columns=feature_drop)

save testing dataset after features selection

In [5]:
# NOTE(review): this cell shows execution count 5 while the feature-selection
# cells above show none - confirm the saved file was produced after feature
# selection (Restart Kernel -> Run All).
df_test = pd.concat([X_test, y_test], axis=1)  # concat data features and labels

# write the test split to CSV without the index column
df_test.to_csv("../Dataset/muticlass/dataset_test.csv", index = False)

Train RF model and evaluate it

In [None]:
# tranining a model after features selection
rf, y_pred = RF_model(X_train, X_test, y_train, y_test)  # training model
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # evaluate model

Edited Nearest Neighbor Rule (gives the best results when combined with another undersampling method)

In [None]:
# Undersample the imbalanced training set with the Edited Nearest Neighbor rule
# (class counts are printed before and after; nothing is plotted in this cell)
# summarize class distribution
counter = Counter(y_train)
print(counter)

# define the undersampling method
undersample = EditedNearestNeighbours(n_neighbors=3)

# transform the dataset
# NOTE: X_train/y_train are overwritten with the resampled data, so re-running
# this cell resamples the already-resampled data again
X_train, y_train = undersample.fit_resample(X_train, y_train)

# summarize the new class distribution
counter = Counter(y_train)
print(counter)

In [None]:
# Join the current X_train and y_train column-wise; the target becomes the last
# column of df_muti.
df_muti = pd.concat([X_train, y_train], axis=1)  # concat data features and labels

In [None]:
# Build feature, target arrays 
X, y = df_muti.iloc[:, :-1], df_muti.iloc[:, -1]

# Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=2022)

rf_model, y_pred = RF_model(X_train, X_test, y_train, y_test)  # training model
eval_result1, eval_result2 = model_evaluate(y_test, y_pred)  # evaluate model

In [None]:
# Write df_muti (features plus target as last column) to CSV without the index column.
df_muti.to_csv("../Dataset/muticlass/dataset_muticlass_train.csv", index = False)