In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import evaluate_model
import os
from sklearn.ensemble import RandomForestClassifier

# Number of CPUs for ensemble learning methods: half of the reported CPUs,
# never fewer than one. os.cpu_count() returns None when the count cannot be
# determined, so fall back to 1 instead of failing on `None // 2`.
N_ENSEMBLE_CPUS = max((os.cpu_count() or 1) // 2, 1)

In [3]:
# Load the two traffic captures from CSV.
normal_traffic_dataset = pd.read_csv("./normalTrafficTraining.csv")
anomalous_traffic_dataset = pd.read_csv("./anomalousTrafficTest.csv")

# Stack both captures into a single frame. ignore_index=True renumbers the
# rows 0..n-1; without it each input keeps its own index, so index labels
# would repeat in the combined frame.
traffic_dataset = pd.concat(
    [normal_traffic_dataset, anomalous_traffic_dataset],
    ignore_index=True,
)

In [4]:
# Columns that hold one single distinct value across the whole dataset
# (a missing value counts as a value of its own).
unique_columns = [
    column_name
    for column_name, column_values in traffic_dataset.items()
    if column_values.nunique(dropna=False) == 1
]
print(unique_columns)

['protocol', 'userAgent', 'pragma', 'cacheControl', 'accept', 'acceptEncoding', 'acceptCharset', 'acceptLanguage', 'connection']


In [5]:
# Columns with exactly two distinct values (NaN counts as one of them).
# The bound is "== 2" so that this list cannot overlap with the columns
# selected by a "more than two distinct values" rule.
binary_columns = []
for name, item in traffic_dataset.items():
    distinct_values = item.unique()  # computed once per column
    if len(distinct_values) == 2:
        print(distinct_values)
        binary_columns.append(name)
print(binary_columns)

['GET' 'POST']
['localhost:8080' 'localhost:9090']
[nan 'application/x-www-form-urlencoded']
['norm' 'anom']
['method', 'host', 'contentType', 'label']


In [6]:
# Columns with more than two distinct values (a missing value counts as a
# value of its own).
multiple_columns = [
    column_name
    for column_name, column_values in traffic_dataset.items()
    if column_values.nunique(dropna=False) > 2
]
print(multiple_columns)

['url', 'contentLength', 'cookie', 'payload']


In [7]:
# Integer codes for each categorical column used by the model.
feat_encodings = {
    "method": {"GET": 0, "POST": 1},
    "host": {"localhost:8080": 0, "localhost:9090": 1},
    # A missing content type stays NaN here and becomes 0 in the fillna below.
    "contentType": {"application/x-www-form-urlencoded": 1},
    "label": {"norm": 0, "anom": 1},
}

# Work on a copy so the column assignments below never touch traffic_dataset.
feat_df = traffic_dataset[['method', 'host', 'contentType', 'contentLength', 'label']].copy()

# Encode with an explicit map rather than DataFrame.replace: replace only
# yields numeric columns through implicit object downcasting, which recent
# pandas releases deprecate.
for column, mapping in feat_encodings.items():
    unexpected = set(feat_df[column].dropna().unique()) - set(mapping)
    if unexpected:
        # map() would turn unknown categories into NaN, which the fillna
        # below would then silently encode as 0.
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(unexpected, key=str)}"
        )
    feat_df[column] = feat_df[column].map(mapping)

# Missing values (e.g. no content type) are encoded as 0.
feat_df = feat_df.fillna(0)

feat_all = feat_df.drop(columns=["label"]).values
# Cast explicitly so the target is an integer class label.
y_all = feat_df["label"].astype(int)

In [8]:
# Hold out 40% of the rows for evaluation, using a fixed random_state.
split_parts = train_test_split(feat_all, y_all, test_size=0.4, random_state=0)
feat_train, feat_test, y_train, y_test = split_parts

In [9]:
# Fit random forests of increasing size on the same training split, then
# evaluate each one on the held-out split.
forest_sizes = (5, 40, 100)
rf_models = {
    n_trees: RandomForestClassifier(
        # Always explicit, including 100: the library default for
        # n_estimators changed from 10 to 100 in scikit-learn 0.22.
        n_estimators=n_trees,
        n_jobs=N_ENSEMBLE_CPUS,
        # Fixed seed so the reported metrics are reproducible between runs.
        random_state=0,
    ).fit(feat_train, y_train)
    for n_trees in forest_sizes
}
# Keep the individual model names available to other cells.
rf_5_model, rf_40_model, rf_100_model = (rf_models[n_trees] for n_trees in forest_sizes)

for n_trees, model in rf_models.items():
    evaluate_model(model, f"Random forest classifier ({n_trees} DTs)", feat_test, y_test)

[ Evaluation result for Random forest classifier (5 DTs) ]
Classification report:
              precision    recall  f1-score   support

           0       0.67      0.94      0.78     14373
           1       0.80      0.33      0.46      9895

    accuracy                           0.69     24268
   macro avg       0.74      0.64      0.62     24268
weighted avg       0.72      0.69      0.65     24268

Confusion matrix:
[[13581   792]
 [ 6661  3234]] 

[ Evaluation result for Random forest classifier (40 DTs) ]
Classification report:
              precision    recall  f1-score   support

           0       0.67      0.95      0.79     14373
           1       0.81      0.32      0.46      9895

    accuracy                           0.69     24268
   macro avg       0.74      0.64      0.62     24268
weighted avg       0.73      0.69      0.65     24268

Confusion matrix:
[[13648   725]
 [ 6716  3179]] 

[ Evaluation result for Random forest classifier (100 DTs) ]
Classification rep

In [10]:
# Integer codes for each categorical column used by the model.
feat_encodings = {
    "method": {"GET": 0, "POST": 1},
    "host": {"localhost:8080": 0, "localhost:9090": 1},
    # A missing content type stays NaN here and becomes 0 in the fillna below.
    "contentType": {"application/x-www-form-urlencoded": 1},
    "label": {"norm": 0, "anom": 1},
}

# Keep only the GET requests; copy so the assignments below never touch
# traffic_dataset.
feat_filter = traffic_dataset.loc[
    traffic_dataset["method"] == "GET",
    ['method', 'host', 'contentType', 'contentLength', 'label'],
].copy()

# Encode with an explicit map rather than DataFrame.replace: replace only
# yields numeric columns through implicit object downcasting, which recent
# pandas releases deprecate.
for column, mapping in feat_encodings.items():
    unexpected = set(feat_filter[column].dropna().unique()) - set(mapping)
    if unexpected:
        # map() would turn unknown categories into NaN, which the fillna
        # below would then silently encode as 0.
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(unexpected, key=str)}"
        )
    feat_filter[column] = feat_filter[column].map(mapping)

# Missing values (e.g. no content type) are encoded as 0.
feat_filter = feat_filter.fillna(0)
feat_all = feat_filter.drop(columns=["label"]).values
# Cast explicitly so the target is an integer class label.
y_all = feat_filter["label"].astype(int)

In [11]:
# Hold out 40% of the rows for evaluation, using a fixed random_state.
split_parts = train_test_split(feat_all, y_all, test_size=0.4, random_state=0)
feat_train, feat_test, y_train, y_test = split_parts

In [12]:
# Fit random forests of increasing size on the same training split, then
# evaluate each one on the held-out split.
forest_sizes = (5, 40, 100)
rf_models = {
    n_trees: RandomForestClassifier(
        # Always explicit, including 100: the library default for
        # n_estimators changed from 10 to 100 in scikit-learn 0.22.
        n_estimators=n_trees,
        n_jobs=N_ENSEMBLE_CPUS,
        # Fixed seed so the reported metrics are reproducible between runs.
        random_state=0,
    ).fit(feat_train, y_train)
    for n_trees in forest_sizes
}
# Keep the individual model names available to other cells.
rf_5_model, rf_40_model, rf_100_model = (rf_models[n_trees] for n_trees in forest_sizes)

for n_trees, model in rf_models.items():
    evaluate_model(model, f"Random forest classifier ({n_trees} DTs)", feat_test, y_test)

[ Evaluation result for Random forest classifier (5 DTs) ]
Classification report:
              precision    recall  f1-score   support

           0       0.66      1.00      0.79     11227
           1       1.00      0.02      0.03      6009

    accuracy                           0.66     17236
   macro avg       0.83      0.51      0.41     17236
weighted avg       0.78      0.66      0.53     17236

Confusion matrix:
[[11227     0]
 [ 5907   102]] 

[ Evaluation result for Random forest classifier (40 DTs) ]
Classification report:
              precision    recall  f1-score   support

           0       0.66      1.00      0.79     11227
           1       1.00      0.02      0.03      6009

    accuracy                           0.66     17236
   macro avg       0.83      0.51      0.41     17236
weighted avg       0.78      0.66      0.53     17236

Confusion matrix:
[[11227     0]
 [ 5907   102]] 

[ Evaluation result for Random forest classifier (100 DTs) ]
Classification rep

In [13]:
# Integer codes for each categorical column used by the model.
feat_encodings = {
    "method": {"GET": 0, "POST": 1},
    "host": {"localhost:8080": 0, "localhost:9090": 1},
    # A missing content type stays NaN here and becomes 0 in the fillna below.
    "contentType": {"application/x-www-form-urlencoded": 1},
    "label": {"norm": 0, "anom": 1},
}

# Keep only the POST requests; copy so the assignments below never touch
# traffic_dataset.
feat_filter = traffic_dataset.loc[
    traffic_dataset["method"] == "POST",
    ['method', 'host', 'contentType', 'contentLength', 'label'],
].copy()

# Encode with an explicit map rather than DataFrame.replace: replace only
# yields numeric columns through implicit object downcasting, which recent
# pandas releases deprecate.
for column, mapping in feat_encodings.items():
    unexpected = set(feat_filter[column].dropna().unique()) - set(mapping)
    if unexpected:
        # map() would turn unknown categories into NaN, which the fillna
        # below would then silently encode as 0.
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(unexpected, key=str)}"
        )
    feat_filter[column] = feat_filter[column].map(mapping)

# Missing values (e.g. no content type) are encoded as 0.
feat_filter = feat_filter.fillna(0)
feat_all = feat_filter.drop(columns=["label"]).values
# Cast explicitly so the target is an integer class label.
y_all = feat_filter["label"].astype(int)

In [14]:
# Hold out 40% of the rows for evaluation, using a fixed random_state.
split_parts = train_test_split(feat_all, y_all, test_size=0.4, random_state=0)
feat_train, feat_test, y_train, y_test = split_parts

In [15]:
# Fit random forests of increasing size on the same training split, then
# evaluate each one on the held-out split.
forest_sizes = (5, 40, 100)
rf_models = {
    n_trees: RandomForestClassifier(
        # Always explicit, including 100: the library default for
        # n_estimators changed from 10 to 100 in scikit-learn 0.22.
        n_estimators=n_trees,
        n_jobs=N_ENSEMBLE_CPUS,
        # Fixed seed so the reported metrics are reproducible between runs.
        random_state=0,
    ).fit(feat_train, y_train)
    for n_trees in forest_sizes
}
# Keep the individual model names available to other cells.
rf_5_model, rf_40_model, rf_100_model = (rf_models[n_trees] for n_trees in forest_sizes)

for n_trees, model in rf_models.items():
    evaluate_model(model, f"Random forest classifier ({n_trees} DTs)", feat_test, y_test)

[ Evaluation result for Random forest classifier (5 DTs) ]
Classification report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.77      3186
           1       0.81      0.81      0.81      3846

    accuracy                           0.79      7032
   macro avg       0.79      0.79      0.79      7032
weighted avg       0.79      0.79      0.79      7032

Confusion matrix:
[[2436  750]
 [ 725 3121]] 

[ Evaluation result for Random forest classifier (40 DTs) ]
Classification report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      3186
           1       0.81      0.81      0.81      3846

    accuracy                           0.79      7032
   macro avg       0.79      0.79      0.79      7032
weighted avg       0.79      0.79      0.79      7032

Confusion matrix:
[[2456  730]
 [ 749 3097]] 

[ Evaluation result for Random forest classifier (100 DTs) ]
Classification report:
   