In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
import shap
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df1 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv", encoding='utf-8')
df2 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv", encoding='utf-8')
df3 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv", encoding='utf-8')
df4 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv", encoding='utf-8')
df5 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv", encoding='utf-8')
df6 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv", encoding='utf-8')
df7 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv", encoding='utf-8')
df8 = pd.read_csv("../Dataset/CIC-IDS-2017/CSVs/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv", encoding='utf-8')

df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8], ignore_index=True)
df.columns = df.columns.str.strip()
df['Label'] = df['Label'].str.replace('�', '-', regex=False)
le = LabelEncoder()
df['Label_encoded'] = le.fit_transform(df['Label'])
class_names = le.classes_

df_major = df[df['Label'] == 'BENIGN']
df_minor = df[df['Label'] != 'BENIGN']

df_major_down = resample(df_major, 
                         replace=False, 
                         n_samples=100000, 
                         random_state=42)

df_balanced = pd.concat([df_major_down, df_minor])

In [3]:
drop_cols = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
df_balanced.drop(columns=[col for col in drop_cols if col in df_balanced.columns], inplace=True)
df_balanced.replace([np.inf, -np.inf], np.nan, inplace=True)
df_balanced.dropna(inplace=True)

df_balanced['Label'] = df_balanced['Label'].apply(lambda x: 0 if x.strip() == 'BENIGN' else 1)

X = df_balanced.drop(columns=['Label', 'Label_encoded'])
y = df_balanced['Label_encoded']

X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna()
y = y.loc[X.index]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


models = {
    'MLP': MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=1e-4,
    learning_rate='adaptive',
    max_iter=300,
    early_stopping=True,
    random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, 
        class_weight='balanced', 
        random_state=42
    ),
    'Logistic Regression': LogisticRegression(
    max_iter=3000,
    class_weight='balanced',
    solver='saga',
    random_state=42
    ),
    'K-Nearest Neighbors': KNeighborsClassifier(
        n_neighbors=5
    ),
    'Decision Tree': DecisionTreeClassifier(
    max_depth=10,
    class_weight='balanced',
    random_state=42
    )
}

In [5]:
loaded_models = {
    'MLP': {},
    'Random Forest': {},
    'Logistic Regression': {},
    'K-Nearest Neighbors': {},
    'Decision Tree': {}
}

for name, _ in loaded_models.items():
    try:
        loaded_models[name] = joblib.load(f'./Models/{name.replace(' ','_').lower()}_model.pkl')
    except:
        loaded_models[name] = {}

results = {}

for name, loaded_model in loaded_models.items():
    if loaded_model == {}:
        models[name].fit(X_train_scaled, y_train)
        y_pred = models[name].predict(X_test_scaled)
        results[name] = {
            'model': models[name],
            'y_pred': y_pred
        }
        print(f'{name} has been trained')
    else:
        y_pred = loaded_model.predict(X_test_scaled)
        results[name] = {
            'model': loaded_model,
            'y_pred': y_pred
        }
        print(f'{name} has been loaded')

MLP has been loaded
Random Forest has been loaded
Logistic Regression has been loaded
K-Nearest Neighbors has been loaded
Decision Tree has been loaded


In [6]:
for name, res in results.items():
    print(f"\n=== {name} ===")
    print(f"Accuracy: {accuracy_score(y_test, res['y_pred']):.4f}")
    print(f"Macro F1: {f1_score(y_test, res['y_pred'], average='macro'):.4f}")
    print(classification_report(y_test, res['y_pred'], target_names=class_names, zero_division=0))


=== MLP ===
Accuracy: 0.9962
Macro F1: 0.8277
                            precision    recall  f1-score   support

                    BENIGN       1.00      0.99      0.99     29979
                       Bot       0.87      0.90      0.89       587
                      DDoS       1.00      1.00      1.00     38408
             DoS GoldenEye       0.99      0.99      0.99      3088
                  DoS Hulk       1.00      1.00      1.00     69037
          DoS Slowhttptest       0.99      0.98      0.98      1650
             DoS slowloris       0.99      0.99      0.99      1739
               FTP-Patator       1.00      1.00      1.00      2380
                Heartbleed       1.00      1.00      1.00         3
              Infiltration       0.50      0.45      0.48        11
                  PortScan       1.00      1.00      1.00     47641
               SSH-Patator       0.97      0.99      0.98      1769
  Web Attack - Brute Force       0.63      0.94      0.75       452


In [7]:
summary = []

for name, res in results.items():
    acc = accuracy_score(y_test, res['y_pred'])
    macro_f1 = f1_score(y_test, res['y_pred'], average='macro')
    summary.append({'Model': name, 'Accuracy': acc, 'Macro F1': macro_f1})

df_summary = pd.DataFrame(summary).sort_values(by='Macro F1', ascending=False)
print(df_summary)

                 Model  Accuracy  Macro F1
1        Random Forest  0.998147  0.870209
0                  MLP  0.996197  0.827731
3  K-Nearest Neighbors  0.995420  0.818577
4        Decision Tree  0.943919  0.752345
2  Logistic Regression  0.930326  0.630981


In [8]:
output_dir = "Confusion Matrices Plots"
os.makedirs(output_dir, exist_ok=True)

for name, res in results.items():
    y_pred = res['y_pred']

    cm = confusion_matrix(y_test, y_pred, labels=range(len(class_names)))

    plt.figure(figsize=(14, 12))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", 
                xticklabels=class_names, 
                yticklabels=class_names,
                linewidths=0.5, linecolor='gray')

    plt.title(f"{name} – Confusion Matrix (Raw Counts)")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    filename = os.path.join(output_dir, f"{name}_confusion_matrix.png")
    plt.savefig(filename, dpi=300)
    plt.clf()
    print(f"Saved: {filename}")

Saved: Confusion Matrices Plots\MLP_confusion_matrix.png
Saved: Confusion Matrices Plots\Random Forest_confusion_matrix.png
Saved: Confusion Matrices Plots\Logistic Regression_confusion_matrix.png
Saved: Confusion Matrices Plots\K-Nearest Neighbors_confusion_matrix.png
Saved: Confusion Matrices Plots\Decision Tree_confusion_matrix.png


<Figure size 1400x1200 with 0 Axes>

<Figure size 1400x1200 with 0 Axes>

<Figure size 1400x1200 with 0 Axes>

<Figure size 1400x1200 with 0 Axes>

<Figure size 1400x1200 with 0 Axes>

In [17]:
output_dir = "Model Comparison Plots"
os.makedirs(output_dir, exist_ok=True)

df_summary.set_index('Model').plot(kind='barh', title='Model Comparison', colormap='viridis')
plt.xlabel('Score')
plt.xlim(0, 1)
plt.grid(True)
plt.tight_layout()
filename = os.path.join(output_dir, "Model_Comparison.png")
plt.savefig(filename, dpi=300)
plt.clf()
print(f"Saved: {filename}")

Saved: Model Comparison Plots\Model_Comparison.png


<Figure size 640x480 with 0 Axes>

In [14]:
output_dir = "SHAP Plots"
os.makedirs(output_dir, exist_ok=True)

for name in ['MLP', 'Logistic Regression']:
    print(f"Explaining {name} (KernelExplainer)...")
    model = results[name]['model']
    
    X_sample = pd.DataFrame(X_test_scaled[:200], columns=X.columns)

    explainer = shap.KernelExplainer(model.predict_proba, shap.kmeans(X_train_scaled, 50))

    shap_values = explainer.shap_values(X_sample, nsamples=200)

    for i, class_name in enumerate(le.classes_):
        shap_array = np.array(shap_values[i])

        shap_array = shap_array.T

        print(f"\nSHAP Summary for class: {class_name} (shape: {shap_array.shape})")
        shap.summary_plot(shap_array, X_sample, plot_type="bar", show=False)
        plt.title(f"SHAP Feature Importance – {class_name}")
        plt.tight_layout()
        filename = os.path.join(output_dir, f"{name}_shap_{class_name.replace(' ', '_').replace('/', '_')}.png")
        plt.savefig(filename, dpi=300)
        plt.clf()
        print(f"Saved: {filename}")

    shap_values_all = np.sum([np.abs(cls_shap) for cls_shap in shap_values], axis=0)
    shap_values_all = shap_values_all.T
    print(f"\nSHAP Summary for all classes (shape: {shap_values_all.shape})")
    shap.summary_plot(shap_values_all, X_sample, plot_type="bar", show=False)
    plt.title("SHAP Summary for MLP – All Classes")
    plt.tight_layout()
    filename = os.path.join(output_dir, f"{name}_shap_all_classes.png")
    plt.savefig(filename, dpi=300)
    plt.clf()
    print(f"Saved: {filename}")

Explaining MLP (KernelExplainer)...


100%|██████████| 200/200 [00:13<00:00, 14.30it/s]



SHAP Summary for class: BENIGN (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_BENIGN.png

SHAP Summary for class: Bot (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_Bot.png

SHAP Summary for class: DDoS (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_DDoS.png

SHAP Summary for class: DoS GoldenEye (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_DoS_GoldenEye.png

SHAP Summary for class: DoS Hulk (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_DoS_Hulk.png

SHAP Summary for class: DoS Slowhttptest (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_DoS_Slowhttptest.png

SHAP Summary for class: DoS slowloris (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_DoS_slowloris.png

SHAP Summary for class: FTP-Patator (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_FTP-Patator.png

SHAP Summary for class: Heartbleed (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_Heartbleed.png

SHAP Summary for class: Infiltration (shape: (15, 78))
Saved: SHAP Plots\MLP_shap_Infiltration.png

SHAP Summary for class: PortScan (shape: (15, 7

100%|██████████| 200/200 [00:08<00:00, 24.66it/s]



SHAP Summary for class: BENIGN (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_BENIGN.png

SHAP Summary for class: Bot (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_Bot.png

SHAP Summary for class: DDoS (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_DDoS.png

SHAP Summary for class: DoS GoldenEye (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_DoS_GoldenEye.png

SHAP Summary for class: DoS Hulk (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_DoS_Hulk.png

SHAP Summary for class: DoS Slowhttptest (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_DoS_Slowhttptest.png

SHAP Summary for class: DoS slowloris (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_DoS_slowloris.png

SHAP Summary for class: FTP-Patator (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_FTP-Patator.png

SHAP Summary for class: Heartbleed (shape: (15, 78))
Saved: SHAP Plots\Logistic Regression_shap_Heartbleed.png

SHA

<Figure size 800x950 with 0 Axes>