In [1]:
import os
import time
import json
import pickle
import numpy as np
import pandas as pd
from collections import OrderedDict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
)

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# -------------------------- Config --------------------------
# Seed NumPy's legacy global RNG once, up front.
np.random.seed(42)

# Dataset and artifact locations. The defaults are the original workstation
# paths; set the CICIDS_BASE_PATH / CICIDS_OUTPUT_DIR environment variables to
# run the notebook elsewhere without editing this cell.
BASE_PATH = os.environ.get(
    "CICIDS_BASE_PATH", r"D:\AutoSNortCopy\XAI-AutoSnort\Datasets\CICIDS-2017"
)
OUTPUT_DIR = os.environ.get(
    "CICIDS_OUTPUT_DIR", r"D:\AutoSNortCopy\XAI-AutoSnort\model_cic"
)
# Every artifact written by later cells goes under OUTPUT_DIR.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The eight per-capture CSV files that are loaded and concatenated.
FILE_PATHS = [
    os.path.join(BASE_PATH, "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"),
    os.path.join(BASE_PATH, "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"),
    os.path.join(BASE_PATH, "Friday-WorkingHours-Morning.pcap_ISCX.csv"),
    os.path.join(BASE_PATH, "Monday-WorkingHours.pcap_ISCX.csv"),
    os.path.join(BASE_PATH, "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"),
    os.path.join(BASE_PATH, "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv"),
    os.path.join(BASE_PATH, "Tuesday-WorkingHours.pcap_ISCX.csv"),
    os.path.join(BASE_PATH, "Wednesday-workingHours.pcap_ISCX.csv")
]

# Columns read from every CSV: 15 model features plus the target 'Label'.
REQ_COLS = [
    'Packet Length Std', 'Total Length of Bwd Packets', 'Subflow Bwd Bytes',
    'Destination Port', 'Packet Length Variance', 'Bwd Packet Length Mean',
    'Avg Bwd Segment Size', 'Bwd Packet Length Max', 'Init_Win_bytes_backward',
    'Total Length of Fwd Packets', 'Subflow Fwd Bytes', 'Init_Win_bytes_forward',
    'Average Packet Size', 'Packet Length Mean', 'Max Packet Length', 'Label'
]
# Defensive strip so the names compare equal to whitespace-stripped CSV headers.
REQ_COLS = [c.strip() for c in REQ_COLS]
# Model inputs, in REQ_COLS order, without the target column.
FEATURE_NAMES = [c for c in REQ_COLS if c != "Label"]


print("--- [Module] Imports loaded ---")
print("--- File paths defined based on user-provided folder. ---")
print(f"--- Loading specific {len(FEATURE_NAMES)} features from paper's 'Top 15' model... ---")

--- [Module] Imports loaded ---
--- File paths defined based on user-provided folder. ---
--- Loading specific 15 features from paper's 'Top 15' model... ---


In [3]:
# -------------------------- Load --------------------------
# Read only the required columns from each CSV in FILE_PATHS and stack them
# into a single frame `df` whose columns follow REQ_COLS order.
print("--- Loading and concatenating 8 CSV files... ---")
t0 = time.time()
df_list = []
for csv_path in FILE_PATHS:
    part = pd.read_csv(
        csv_path, encoding="latin1", low_memory=False,
        # Compare stripped header names so padded headers still match REQ_COLS.
        usecols=lambda col: col.strip() in REQ_COLS
    )
    # Strip header whitespace per file *before* concatenating; otherwise two
    # files whose headers differ only in padding would end up as separate,
    # NaN-filled columns after pd.concat.
    part.columns = part.columns.str.strip()
    # Fail fast instead of letting concat silently NaN-fill a missing column
    # (later preprocessing would turn those NaNs into zeros).
    missing = [c for c in REQ_COLS if c not in part.columns]
    if missing:
        raise KeyError(f"{csv_path}: missing required column(s) {missing}")
    df_list.append(part)

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df = pd.concat(df_list, ignore_index=True)
df = df[REQ_COLS]
print(f"--- Data loaded. Shape: {df.shape}. Time: {time.time()-t0:.2f}s ---")

--- Loading and concatenating 8 CSV files... ---
--- Data loaded. Shape: (2830743, 16). Time: 16.75s ---


In [4]:
# -------------------------- Labels --------------------------
print("--- Starting preprocessing pipeline... ---")

# Collapse fine-grained attack labels into coarser families. Labels that are
# not keys of this map are left unchanged by Series.replace.
# The '\x96' entries are the Web Attack labels spelled with a 0x96 separator
# character instead of '-' (the CSVs are read with encoding="latin1").
label_map = {
    'DoS GoldenEye': 'Dos/Ddos', 'DoS Hulk': 'Dos/Ddos',
    'DoS Slowhttptest': 'Dos/Ddos', 'DoS slowloris': 'Dos/Ddos',
    'Heartbleed': 'Dos/Ddos', 'DDoS': 'Dos/Ddos',
    'FTP-Patator': 'Brute Force', 'SSH-Patator': 'Brute Force',
    'Web Attack - Brute Force': 'Web Attack',
    'Web Attack - Sql Injection': 'Web Attack',
    'Web Attack - XSS': 'Web Attack',
    'Web Attack \x96 Brute Force': 'Web Attack',
    'Web Attack \x96 Sql Injection': 'Web Attack',
    'Web Attack \x96 XSS': 'Web Attack'
}
y_raw = df['Label'].replace(label_map)
X_raw = df.drop('Label', axis=1)

# -------------------------- Normalize (paper's per-column max) --------------------------
# Each feature is divided by the largest absolute finite value in its column.
# scaler_values records that divisor per column (0.0 for a column that is all
# zero / all missing, which is set to 0.0 instead of being divided).
# NOTE(review): the maxima are computed over every loaded row, i.e. before the
# train/test split done later, so test rows influence the scaling -- confirm
# this is intended.
print("--- Applying custom normalization (paper's method)... ---")
X_scaled = X_raw.copy()
scaler_values = {}
for col in X_scaled.columns:
    # Coerce non-numeric entries to NaN and treat +/-inf as missing *before*
    # taking the max; otherwise one infinite value makes max_val infinite,
    # which zeroes the whole column and stores inf as the scaler value.
    numeric_col = (
        pd.to_numeric(X_scaled[col], errors='coerce')
        .replace([np.inf, -np.inf], np.nan)
    )
    max_val = numeric_col.abs().max()
    if (max_val == 0) or pd.isna(max_val):
        X_scaled[col] = 0.0
        max_val = 0.0
    else:
        X_scaled[col] = numeric_col / max_val
    scaler_values[col] = float(max_val)

# Clean + dedupe (on normalized)
# Infinities were already converted to NaN above, so a single fillna turns
# every missing value into 0.
df_processed = X_scaled.assign(Label=y_raw).fillna(0)

before = len(df_processed)
# drop_duplicates keeps the surviving rows' original index labels.
df_processed = df_processed.drop_duplicates()
after = len(df_processed)
print(f"--- Dropped {before-after} duplicate rows; kept {after}. ---")

--- Starting preprocessing pipeline... ---
--- Applying custom normalization (paper's method)... ---
--- Dropped 1903008 duplicate rows; kept 927735. ---


In [5]:
# -------------------------- Persist Artifacts --------------------------
# Pickle the ordered feature list and the per-column scaling divisors into
# OUTPUT_DIR, one file per artifact.
artifacts = {
    "feature_names_cic.pkl": FEATURE_NAMES,
    "scaler_values_cic.pkl": scaler_values,
}
for artifact_file, artifact_obj in artifacts.items():
    with open(os.path.join(OUTPUT_DIR, artifact_file), "wb") as f:
        pickle.dump(artifact_obj, f)
print("--- Saved feature_names_cic.pkl & scaler_values_cic.pkl ---")

--- Saved feature_names_cic.pkl & scaler_values_cic.pkl ---


In [6]:
# -------------------------- Encode labels --------------------------
# Fit a LabelEncoder on the de-duplicated labels and keep the integer codes.
le = LabelEncoder()
y_enc = le.fit_transform(df_processed['Label'].values)

# Persist the fitted encoder, then an {integer code -> class name} lookup.
encoder_path = os.path.join(OUTPUT_DIR, "encoder_cic.pkl")
with open(encoder_path, "wb") as f:
    pickle.dump(le, f)

code_to_name_path = os.path.join(OUTPUT_DIR, "label_value_to_name_cic.pkl")
with open(code_to_name_path, "wb") as f:
    pickle.dump(dict(enumerate(le.classes_)), f)

print(f"--- Label classes: {list(le.classes_)} ---")

# Feature matrix: every processed column except the target.
X_final = df_processed.drop(columns='Label')

--- Label classes: ['BENIGN', 'Bot', 'Brute Force', 'Dos/Ddos', 'Infiltration', 'PortScan', 'Web Attack'] ---


In [7]:
# -------------------------- Split --------------------------
print("--- Splitting into training and testing sets (70/30)... ---")
# Stratified hold-out split on the encoded labels with a fixed seed.
split_parts = train_test_split(
    X_final,
    y_enc,
    test_size=0.30,
    random_state=42,
    stratify=y_enc,
)
X_train, X_test, y_train, y_test = split_parts
print(f"--- X_train: {X_train.shape}, X_test: {X_test.shape} ---")

# Align raw test rows with same indices (for RAW CSV)
# The lookup uses X_test.index, so it relies on X_test still carrying the row
# labels it had in X_final and on those labels matching X_raw_reset's index.
X_raw_reset = X_raw.reset_index(drop=True)
X_test_raw = X_raw_reset.loc[X_test.index]
X_test_raw = X_test_raw.reset_index(drop=True)

--- Splitting into training and testing sets (70/30)... ---
--- X_train: (649414, 15), X_test: (278321, 15) ---


In [8]:
# -------------------------- Train --------------------------
print("--- Defining & training RandomForest... ---")
# Hyper-parameters gathered in one place; class_weight='balanced' and the
# fixed random_state are passed straight through to the classifier.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
)
rf = RandomForestClassifier(**rf_params)

t0 = time.time()
rf.fit(X_train, y_train)
print(f"--- Model trained in {time.time()-t0:.2f}s ---")

# Persist the fitted model next to the other artifacts.
model_path = os.path.join(OUTPUT_DIR, "random_forest_model_cic.pkl")
with open(model_path, "wb") as f:
    pickle.dump(rf, f)
print("--- Model saved: random_forest_model_cic.pkl ---")

--- Defining & training RandomForest... ---
--- Model trained in 17.10s ---
--- Model saved: random_forest_model_cic.pkl ---


In [9]:
# -------------------------- Evaluate --------------------------
print("--- Evaluating model on test set... ---")
y_pred = rf.predict(X_test)

# Map integer codes back to class names so reports are human-readable.
y_test_labels, y_pred_labels = (
    le.inverse_transform(codes) for codes in (y_test, y_pred)
)
labels = le.classes_

print("\n--- [Report] Classification Report ---")
report_str = classification_report(
    y_test_labels, y_pred_labels, labels=labels, digits=4
)
print(report_str)

# Save classification report as CSV (per-class)
per_class_scores = precision_recall_fscore_support(
    y_test_labels, y_pred_labels, labels=labels, zero_division=0
)
prec, rec, f1, sup = per_class_scores
# One row per class, in the order given by `labels`.
cls_report_df = pd.DataFrame({"label": labels}).assign(
    precision=prec,
    recall=rec,
    f1=f1,
    support=sup.astype(int),
)
report_csv_path = os.path.join(OUTPUT_DIR, "classification_report_per_class.csv")
cls_report_df.to_csv(report_csv_path, index=False)

--- Evaluating model on test set... ---

--- [Report] Classification Report ---
              precision    recall  f1-score   support

      BENIGN     0.9998    0.9857    0.9927    255132
         Bot     0.0756    0.9830    0.1405       235
 Brute Force     0.9905    0.9858    0.9882       212
    Dos/Ddos     0.9977    0.9979    0.9978     22019
Infiltration     1.0000    0.8182    0.9000        11
    PortScan     0.6817    0.9896    0.8073       671
  Web Attack     0.0504    0.6341    0.0934        41

    accuracy                         0.9866    278321
   macro avg     0.6851    0.9135    0.7028    278321
weighted avg     0.9979    0.9866    0.9918    278321



In [10]:
# -------------------------- Class-wise metrics (TP/FP/FN/TN etc.) --------------------------
# Derive one-vs-rest counts for each class from the multi-class confusion
# matrix (rows are indexed by actual class, columns by predicted class).
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=labels)
total = cm.sum()

diag_counts = np.diag(cm)          # correctly classified samples per class
predicted_counts = cm.sum(axis=0)  # column sums: samples predicted as class
actual_counts = cm.sum(axis=1)     # row sums: samples that truly are class

rows = []
print("\n--- [Report] Per-Class Analysis (TP, FP, FN, TN, Acc, TPR, TNR, BAcc) ---")
for lbl, hit, n_pred, n_true in zip(labels, diag_counts, predicted_counts, actual_counts):
    tp = int(hit)
    fp = int(n_pred - tp)
    fn = int(n_true - tp)
    tn = int(total - (tp + fp + fn))

    # Guard each ratio against an empty denominator.
    tpr = 0.0  # recall
    if (tp + fn) > 0:
        tpr = tp / (tp + fn)
    tnr = 0.0  # specificity
    if (tn + fp) > 0:
        tnr = tn / (tn + fp)
    acc_ovr = 0.0
    if total > 0:
        acc_ovr = (tp + tn) / total
    bacc = (tpr + tnr) / 2.0

    print(f"\nClass: {lbl}")
    print(f"  TP={tp}  FP={fp}  FN={fn}  TN={tn}")
    print(f"  Classwise-Accuracy(one-vs-rest)={acc_ovr:.4f}")
    print(f"  TPR/Recall={tpr:.4f}  TNR/Specificity={tnr:.4f}  Balanced-Acc={bacc:.4f}")

    rows.append(OrderedDict([
        ("label", lbl),
        ("TP", tp),
        ("FP", fp),
        ("FN", fn),
        ("TN", tn),
        ("classwise_accuracy", acc_ovr),
        ("recall", tpr),
        ("specificity", tnr),
        ("balanced_accuracy", bacc),
    ]))

per_class_df = pd.DataFrame(rows)
metrics_csv_path = os.path.join(OUTPUT_DIR, "metrics_per_class.csv")
per_class_df.to_csv(metrics_csv_path, index=False)

overall_acc = accuracy_score(y_test_labels, y_pred_labels)
print(f"\n--- [Report] Overall Accuracy: {overall_acc*100:.4f}% ---")


--- [Report] Per-Class Analysis (TP, FP, FN, TN, Acc, TPR, TNR, BAcc) ---

Class: BENIGN
  TP=251479  FP=54  FN=3653  TN=23135
  Classwise-Accuracy(one-vs-rest)=0.9867
  TPR/Recall=0.9857  TNR/Specificity=0.9977  Balanced-Acc=0.9917

Class: Bot
  TP=231  FP=2823  FN=4  TN=275263
  Classwise-Accuracy(one-vs-rest)=0.9898
  TPR/Recall=0.9830  TNR/Specificity=0.9898  Balanced-Acc=0.9864

Class: Brute Force
  TP=209  FP=2  FN=3  TN=278107
  Classwise-Accuracy(one-vs-rest)=1.0000
  TPR/Recall=0.9858  TNR/Specificity=1.0000  Balanced-Acc=0.9929

Class: Dos/Ddos
  TP=21973  FP=51  FN=46  TN=256251
  Classwise-Accuracy(one-vs-rest)=0.9997
  TPR/Recall=0.9979  TNR/Specificity=0.9998  Balanced-Acc=0.9989

Class: Infiltration
  TP=9  FP=0  FN=2  TN=278310
  Classwise-Accuracy(one-vs-rest)=1.0000
  TPR/Recall=0.8182  TNR/Specificity=1.0000  Balanced-Acc=0.9091

Class: PortScan
  TP=664  FP=310  FN=7  TN=277340
  Classwise-Accuracy(one-vs-rest)=0.9989
  TPR/Recall=0.9896  TNR/Specificity=0.9989  Ba

In [11]:
# -------------------------- Overall Precision / Recall / F1 --------------------------
# Weighted and macro averages over the test set. labels= and zero_division=
# are passed exactly as in the per-class precision_recall_fscore_support call
# made during evaluation, so both reports average over the same class list and
# an undefined metric is scored 0 without emitting a warning.
overall_prec_weight, overall_rec_weight, overall_f1_weight, _ = precision_recall_fscore_support(
    y_test_labels, y_pred_labels, labels=labels, average='weighted', zero_division=0
)

macro_prec, macro_rec, macro_f1, _ = precision_recall_fscore_support(
    y_test_labels, y_pred_labels, labels=labels, average='macro', zero_division=0
)

print("\n--- [Report] Overall Model Performance Metrics ---")
print(f"Weighted Precision: {overall_prec_weight:.4f}")
print(f"Weighted Recall:    {overall_rec_weight:.4f}")
print(f"Weighted F1-score:  {overall_f1_weight:.4f}")
print(f"Macro Precision:    {macro_prec:.4f}")
print(f"Macro Recall:       {macro_rec:.4f}")
print(f"Macro F1-score:     {macro_f1:.4f}")

# Single-row summary; `overall_acc` is the accuracy computed alongside the
# per-class metrics.
overall_metrics_df = pd.DataFrame([{
    "accuracy": overall_acc,
    "precision_weighted": overall_prec_weight,
    "recall_weighted": overall_rec_weight,
    "f1_weighted": overall_f1_weight,
    "precision_macro": macro_prec,
    "recall_macro": macro_rec,
    "f1_macro": macro_f1
}])

overall_metrics_df.to_csv(os.path.join(OUTPUT_DIR, "overall_metrics_cic.csv"), index=False)
# Plain string: the message has no placeholders, so no f-prefix is needed.
print("--- Overall metrics saved -> overall_metrics_cic.csv ---")


--- [Report] Overall Model Performance Metrics ---
Weighted Precision: 0.9979
Weighted Recall:    0.9866
Weighted F1-score:  0.9918
Macro Precision:    0.6851
Macro Recall:       0.9135
Macro F1-score:     0.7028
--- Overall metrics saved -> overall_metrics_cic.csv ---


In [12]:
# -------------------------- Save test CSVs --------------------------
# Write the held-out rows twice: once with the normalized features the model
# was evaluated on and once with the matching raw feature values. Both get the
# readable class name in a 'Label' column.
norm_path = os.path.join(OUTPUT_DIR, "test_set_with_readable_labels.csv")
raw_path = os.path.join(OUTPUT_DIR, "test_set_raw_values.csv")

test_norm = X_test.reset_index(drop=True).assign(Label=y_test_labels)
test_norm.to_csv(norm_path, index=False)

test_raw = X_test_raw.assign(Label=y_test_labels)
test_raw.to_csv(raw_path, index=False)

print(f"\n--- Saved normalized test set -> {norm_path}")
print(f"--- Saved RAW (unnormalized) test set -> {raw_path}")


--- Saved normalized test set -> D:\AutoSNortCopy\XAI-AutoSnort\model_cic\test_set_with_readable_labels.csv
--- Saved RAW (unnormalized) test set -> D:\AutoSNortCopy\XAI-AutoSnort\model_cic\test_set_raw_values.csv


In [13]:
# -------------------------- Confusion matrix PNG --------------------------
# Draw the confusion matrix as an annotated heatmap on an explicit figure/axes
# pair, save it to OUTPUT_DIR, then close the figure.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=labels,
    yticklabels=labels,
    cmap='Blues',
    ax=ax,
)
ax.set_title('Confusion Matrix for CIC-IDS 2017')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix_cicids.png")
fig.savefig(cm_path, bbox_inches="tight")
plt.close(fig)

In [14]:
# -------------------------- Demo samples (raw medians per class) --------------------------
# One demo row per (mapped) class: the median of each raw, un-normalized
# feature over all loaded rows carrying that label.
df_with_label_raw = X_raw.assign(Label=y_raw.values)
features_by_class = df_with_label_raw.groupby("Label")[FEATURE_NAMES]
demo_rows = features_by_class.median(numeric_only=True).reset_index()

demo_csv = os.path.join(OUTPUT_DIR, "demo_samples_raw_by_class.csv")
demo_rows.to_csv(demo_csv, index=False)

# Input schema for UI/batch validators
# Every feature is declared with the type string "number".
schema = dict.fromkeys(FEATURE_NAMES, "number")
schema_path = os.path.join(OUTPUT_DIR, "feature_input_schema.json")
with open(schema_path, "w") as f:
    json.dump(schema, f, indent=2)

# cm_path is the PNG path set when the confusion matrix figure was saved.
print(f"--- Confusion matrix saved -> {cm_path}")
print(f"--- Demo samples saved -> {demo_csv}")
print("\n--- [Script] All tasks complete. ---")

--- Confusion matrix saved -> D:\AutoSNortCopy\XAI-AutoSnort\model_cic\confusion_matrix_cicids.png
--- Demo samples saved -> D:\AutoSNortCopy\XAI-AutoSnort\model_cic\demo_samples_raw_by_class.csv

--- [Script] All tasks complete. ---
