In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import xgboost as xgb
from sklearn.pipeline import Pipeline

#Loading dataset
data_path = r"C:\Users\cmhub\Desktop\network-anomaly-detector-starter\data\MachineLearningCSV\MachineLearningCVE\CICIDS2017_clean_binary.csv"
df = pd.read_csv(data_path)


# Compute absolute correlation with Label
corrs = df.corr(numeric_only=True)['Label'].abs().sort_values(ascending=False)

# Identify columns above a correlation threshold (e.g., 0.15)
leaky_cols = corrs[corrs > 0.15].index.drop('Label', errors='ignore').tolist()

print("Highly correlated features:", leaky_cols)

# Drop them safely
df = df.drop(columns=leaky_cols, errors='ignore')
print(f"✅ Dropped {len(leaky_cols)} leaky features.")


print("Dataset loaded successfully.")
print(df['Label'].value_counts(normalize=True))

Highly correlated features: ['Fwd IAT Std', 'Packet Length Variance', 'Bwd Packet Length Std', 'Idle Max', 'Idle Mean', 'Idle Min', 'Bwd Packet Length Max', 'Fwd IAT Max', 'Flow IAT Max', 'Packet Length Std', 'Bwd Packet Length Mean', 'Avg Bwd Segment Size', 'Max Packet Length', 'Flow IAT Std', 'Packet Length Mean', 'Average Packet Size', 'Min Packet Length', 'Bwd Packet Length Min']
✅ Dropped 18 leaky features.
Dataset loaded successfully.
Label
0    0.835079
1    0.164921
Name: proportion, dtype: float64


In [2]:
corrs_after = df.corr(numeric_only=True)['Label'].abs().sort_values(ascending=False)
print(corrs_after.head(10))


Label                    1.000000
Fwd IAT Total            0.242776
Flow Duration            0.240939
FIN Flag Count           0.229412
Bwd IAT Std              0.210041
Flow IAT Mean            0.183500
Fwd Packet Length Min    0.174271
Fwd IAT Mean             0.169882
PSH Flag Count           0.165872
Bwd IAT Max              0.160225
Name: Label, dtype: float64


In [None]:
# --- Day 5 Final Verification Cell ---

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import xgboost as xgb

print("🧩 Running Day 5 Final Verification...\n")

# --- 1️⃣ Prepare Features/Labels ---
X = df.drop(columns=['Label'], errors='ignore')
y = df['Label']

# --- 2️⃣ Split Dataset ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- 3️⃣ Dummy Baseline (Majority Class) ---
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
dummy_acc = dummy.score(X_test, y_test)
print(f"Dummy baseline accuracy: {dummy_acc:.3f}")

# --- 4️⃣ Sanity Shuffle (Randomized Labels) ---
y_perm = np.random.permutation(y_train)
sanity_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    eval_metric='auc',
    random_state=42
)
sanity_model.fit(X_train, y_perm)
sanity_acc = sanity_model.score(X_test, y_test)
print(f"Sanity test accuracy: {sanity_acc:.3f}")

# --- 5️⃣ Real Model (True Labels) ---
real_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]),
    n_jobs=-1,
    eval_metric='auc',
    random_state=42
)
real_model.fit(X_train, y_train)
y_pred = real_model.predict(X_test)
y_proba = real_model.predict_proba(X_test)[:,1]

real_acc = accuracy_score(y_test, y_pred)
real_auc = roc_auc_score(y_test, y_proba)

# --- 6️⃣ Determine Leakage Status ---
if sanity_acc < 0.6:
    status = "✅ PASS (No leakage detected)"
else:
    status = "❌ FAIL (Potential leakage remains!)"

# --- 7️⃣ Print Summary ---
print("\n--- Verification Summary ---")
print(f"Majority baseline: {dummy_acc:.3f}")
print(f"Sanity test (random labels): {sanity_acc:.3f}")
print(f"Real model accuracy: {real_acc:.3f}")
print(f"Real model ROC-AUC: {real_auc:.3f}")
print(f"\nLeakage Status: {status}\n")

# Optional: classification report
print("Classification Report:\n", classification_report(y_test, y_pred))




In [9]:
from sklearn.utils import shuffle
import numpy as np

X = df.drop('Label', axis=1)
y = df['Label']

# Stratified split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Randomize labels
y_perm = np.random.permutation(y_train)

import xgboost as xgb
model = xgb.XGBClassifier(
    max_depth=8, learning_rate=0.05, n_estimators=500, n_jobs=-1
)
model.fit(X_train, y_perm)
print("Sanity test accuracy:", model.score(X_test, y_test))


Sanity test accuracy: 0.8350652591170825


In [10]:
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
print("Majority baseline:", dummy.score(X_test, y_test))


Majority baseline: 0.8350786948176584


In [5]:
#Computes the ratio of negative (benign) to positive (malicious) samples in the training set
neg, pos = np.bincount(y_train)
scales_pos_weight = neg / pos
print(f"Scale pos weight: {scales_pos_weight}")

#Parameters for XGBoost model
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'max_depth': 8, 
    'learning_rate': 0.05,
    'n_estimators': 500,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': scales_pos_weight,
    'n_jobs': -1,
    'random_state': 42
}

Scale pos weight: 5.063509797931306


In [6]:
#Training the XGBoost model
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [7]:
y_pred  = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # Probability of "attack" class

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9991401151631478
ROC-AUC: 0.9999688002346221

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    435076
           1       1.00      1.00      1.00     85924

    accuracy                           1.00    521000
   macro avg       1.00      1.00      1.00    521000
weighted avg       1.00      1.00      1.00    521000


Confusion Matrix:
 [[434645    431]
 [    17  85907]]


In [8]:
import os
print("Loaded from:", os.path.abspath(data_path))
print(df.shape)
print(df.columns[:15])


Loaded from: C:\Users\cmhub\Desktop\network-anomaly-detector-starter\data\MachineLearningCSV\MachineLearningCVE\CICIDS2017_clean_binary.csv
(2604998, 61)
Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Min', 'Fwd IAT Total'],
      dtype='object')
