In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from pathlib import Path

# Ensure the metrics and figures output directories exist. Missing parent
# directories are created, and an already-existing directory is not an error.
for out_dir in ("../outputs/metrics", "../outputs/figures"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Load the processed dataset. Change this path to swap between the basic and
# CICIDS datasets.
df = pd.read_csv("../data/processed/cicids_processed.csv")

# Features (X): every column except the label columns. errors="ignore" lets the
# drop succeed when a listed column (e.g. "attack_type") is absent.
X = df.drop(columns=["is_attack", "attack_type"], errors="ignore")
# Target (y): the "is_attack" label column.
y = df["is_attack"]

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions; the seed is fixed for repeatability.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scale features.
# The scaler is fitted on the training split only and the same fitted
# parameters are then applied to the test split, so no test-set statistics
# leak into preprocessing. with_mean=False disables centering: features are
# only divided by their standard deviation.
scaler = StandardScaler(with_mean=False)
# Fit + transform the training data; this defines Xtr_s, which the models
# below train on (without a fit, transform() raises NotFittedError).
Xtr_s = scaler.fit_transform(Xtr)
Xte_s = scaler.transform(Xte)

# Logistic regression baseline: train on the scaled training split, then
# predict on the scaled test split. fit() returns the estimator itself, so the
# construction and training can be chained.
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(Xtr_s, ytr)
pred_lr = logreg.predict(Xte_s)

# Report per-class precision/recall/F1 and the confusion matrix on the test
# split. zero_division=0 reports 0 (instead of warning) for undefined metrics.
print("\nLogistic Regression Results")
print(
    classification_report(yte, pred_lr, zero_division=0)
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(yte, pred_lr),
)

# Sweep a few neighbourhood sizes for KNN and remember the best one.
# best_knn holds (f1, k, predictions) for the highest test-set F1 seen so far;
# on ties the earlier (smaller) k is kept because the comparison is strict.
best_knn = None
for k in (3, 5, 7):
    # Train on the scaled training split and predict the scaled test split.
    knn = KNeighborsClassifier(n_neighbors=k).fit(Xtr_s, ytr)
    pred_knn = knn.predict(Xte_s)

    f1 = f1_score(yte, pred_knn, zero_division=0)
    print(f"\nKNN (k={k}) Results")
    print(classification_report(yte, pred_knn, zero_division=0))

    # Skip the update unless this k strictly beats the current best F1.
    if best_knn is not None and not f1 > best_knn[0]:
        continue
    best_knn = (f1, k, pred_knn)

print(f"\nBest KNN was k={best_knn[1]} with F1={best_knn[0]:.3f}")


Logistic Regression Results
              precision    recall  f1-score   support

           0       0.98      0.99      0.98    163715
           1       0.96      0.92      0.94     46000

    accuracy                           0.97    209715
   macro avg       0.97      0.96      0.96    209715
weighted avg       0.97      0.97      0.97    209715

Confusion Matrix:
 [[162034   1681]
 [  3615  42385]]

KNN (k=3) Results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    163715
           1       1.00      1.00      1.00     46000

    accuracy                           1.00    209715
   macro avg       1.00      1.00      1.00    209715
weighted avg       1.00      1.00      1.00    209715

