In [10]:
import numpy as np
import pandas as pd

import requests
from io import StringIO

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# -----------------------------
# 1) Download NSL-KDD (real IDS dataset)
# -----------------------------
TRAIN_URL = "https://raw.githubusercontent.com/Jehuty4949/NSL_KDD/master/KDDTrain%2B.txt"
TEST_URL  = "https://raw.githubusercontent.com/Jehuty4949/NSL_KDD/master/KDDTest%2B.txt"


def _download_text(url, timeout=60):
    """Return the body of *url* as text, raising on an HTTP error status.

    Args:
        url: Address of the file to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Connection and timeout errors raised by ``requests.get``
            propagate unchanged.
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast: without this check an error page (404, rate limiting, ...)
    # would be handed to the CSV parser as if it were dataset rows.
    response.raise_for_status()
    return response.text


train_txt = _download_text(TRAIN_URL)
test_txt  = _download_text(TEST_URL)

# Column layout of the NSL-KDD text files: 41 feature columns, then the
# class label, then the difficulty score.
cols = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

# The files are read without a header row, so the names above are supplied
# explicitly for both of them.
train_df, test_df = (
    pd.read_csv(StringIO(raw_text), header=None, names=cols)
    for raw_text in (train_txt, test_txt)
)

# Stack both downloaded files into one frame with a fresh 0..n-1 index; the
# train/test split is redone further down (cleaner for assignment).
df = pd.concat((train_df, test_df), ignore_index=True)

# -----------------------------
# 2) Binary target: Normal vs Attack
# -----------------------------
# Rows whose (whitespace-stripped) label is "normal" become class 0;
# every other label becomes class 1.
y = df["label"].astype(str).str.strip().ne("normal").astype(int)

# The feature matrix is everything except the label and difficulty columns.
X = df.drop(["label", "difficulty"], axis=1)

# Quick sanity check of dataset size and class balance.
print("Total rows:", df.shape[0])
print("Normal:", y.eq(0).sum(), "Attack:", y.eq(1).sum())

# -----------------------------
# 3) Identify numeric vs categorical columns
# -----------------------------
# These three feature columns are one-hot encoded later on.
categorical_cols = ["protocol_type", "service", "flag"]

# Every other feature column, in its original order, is treated as numeric.
numeric_cols = X.columns.drop(categorical_cols).tolist()

# -----------------------------
# 4) Preprocessing pipelines
# -----------------------------
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array. Categories not seen during fit are ignored
# instead of raising.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Numeric branch, scaled variant: median imputation, then standardisation.
num_scaled = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Numeric branch, unscaled variant: median imputation only.
num_noscale = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Used by the Naive Bayes and ANN models (scaling helps)
preprocess_scaled = ColumnTransformer([
    ("num", num_scaled, numeric_cols),
    ("cat", cat_pipe, categorical_cols),
])

# Used by the Decision Tree model (no scaling needed)
preprocess_tree = ColumnTransformer([
    ("num", num_noscale, numeric_cols),
    ("cat", cat_pipe, categorical_cols),
])

# -----------------------------
# 5) Train-test split
# -----------------------------
# Hold out 25% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so that it is repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

# -----------------------------
# 6) Model A: Bayesian (Gaussian Naive Bayes)
# -----------------------------
# Scaled preprocessing followed by a Gaussian Naive Bayes classifier.
nb_model = Pipeline([
    ("prep", preprocess_scaled),
    ("clf", GaussianNB()),
])

# Pipeline.fit returns the fitted pipeline, so prediction can be chained.
nb_pred = nb_model.fit(X_tr, y_tr).predict(X_te)

# Report accuracy, confusion matrix and per-class metrics on the held-out set.
print("\n=== Gaussian Naive Bayes (IDS) ===")
for heading, value in (
    ("Accuracy:", accuracy_score(y_te, nb_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_te, nb_pred)),
    ("Classification Report:\n", classification_report(
        y_te, nb_pred, target_names=["Normal", "Attack"], zero_division=0
    )),
):
    print(heading, value)

# -----------------------------
# 7) Model B: Decision Tree (Entropy)
# -----------------------------
# Entropy-criterion tree; depth and leaf size are capped, and the seed is
# fixed so the fitted tree is repeatable.
dt_clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=15,
    min_samples_leaf=10,
    random_state=42,
)
dt_model = Pipeline([("prep", preprocess_tree), ("clf", dt_clf)])

# Pipeline.fit returns the fitted pipeline, so prediction can be chained.
dt_pred = dt_model.fit(X_tr, y_tr).predict(X_te)

# Report accuracy, confusion matrix and per-class metrics on the held-out set.
print("\n=== Decision Tree (Entropy) (IDS) ===")
for heading, value in (
    ("Accuracy:", accuracy_score(y_te, dt_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_te, dt_pred)),
    ("Classification Report:\n", classification_report(
        y_te, dt_pred, target_names=["Normal", "Attack"], zero_division=0
    )),
):
    print(heading, value)

# -----------------------------
# 8) Model C: ANN (MLP)
# -----------------------------
# Two hidden layers (64 then 32 units) with ReLU activations and a fixed seed.
# NOTE(review): max_iter=30 caps the number of training iterations; if
# scikit-learn emits a ConvergenceWarning here, consider raising it - confirm.
ann_clf = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation="relu",
    max_iter=30,
    random_state=42,
)
ann_model = Pipeline([("prep", preprocess_scaled), ("clf", ann_clf)])

# Pipeline.fit returns the fitted pipeline, so prediction can be chained.
ann_pred = ann_model.fit(X_tr, y_tr).predict(X_te)

# Report accuracy, confusion matrix and per-class metrics on the held-out set.
print("\n=== ANN (MLP) (IDS) ===")
for heading, value in (
    ("Accuracy:", accuracy_score(y_te, ann_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_te, ann_pred)),
    ("Classification Report:\n", classification_report(
        y_te, ann_pred, target_names=["Normal", "Attack"], zero_division=0
    )),
):
    print(heading, value)


Total rows: 148517
Normal: 77054 Attack: 71463

=== Gaussian Naive Bayes (IDS) ===
Accuracy: 0.8149474818206303
Confusion Matrix:
 [[19181    83]
 [ 6788 11078]]
Classification Report:
               precision    recall  f1-score   support

      Normal       0.74      1.00      0.85     19264
      Attack       0.99      0.62      0.76     17866

    accuracy                           0.81     37130
   macro avg       0.87      0.81      0.81     37130
weighted avg       0.86      0.81      0.81     37130


=== Decision Tree (Entropy) (IDS) ===
Accuracy: 0.9927821168866146
Confusion Matrix:
 [[19155   109]
 [  159 17707]]
Classification Report:
               precision    recall  f1-score   support

      Normal       0.99      0.99      0.99     19264
      Attack       0.99      0.99      0.99     17866

    accuracy                           0.99     37130
   macro avg       0.99      0.99      0.99     37130
weighted avg       0.99      0.99      0.99     37130






=== ANN (MLP) (IDS) ===
Accuracy: 0.9914085645030972
Confusion Matrix:
 [[19082   182]
 [  137 17729]]
Classification Report:
               precision    recall  f1-score   support

      Normal       0.99      0.99      0.99     19264
      Attack       0.99      0.99      0.99     17866

    accuracy                           0.99     37130
   macro avg       0.99      0.99      0.99     37130
weighted avg       0.99      0.99      0.99     37130

