In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.utils import resample

print("Starting leakage-safe XGBoost pipeline...\n")

# 1. Load dataset
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
data_path = r"C:\Users\cmhub\Desktop\network-anomaly-detector-starter\data\MachineLearningCSV\MachineLearningCVE\CICIDS2017_clean_binary.csv"
df = pd.read_csv(data_path)
original_shape = df.shape
print("Original shape:", original_shape)

# 2. Remove duplicates
# Two rows are duplicates when every column other than 'Label' matches, so
# rows with identical features but different labels collapse to the first
# occurrence.
feature_cols = df.columns.difference(['Label'])
df = df.drop_duplicates(subset=feature_cols, keep='first')
print("After de-duplication:", len(df))

# 3. Handle class imbalance (downsample benign)
ratio = 3  # keep 3 benign for each attack
attack_df = df[df['Label'] == 1]
benign_df = df[df['Label'] == 0]

# Never request more benign rows than actually exist.
n_benign_keep = min(len(attack_df) * ratio, len(benign_df))
benign_down = resample(
    benign_df,
    replace=False,  # sample without replacement: no benign row is repeated
    n_samples=n_benign_keep,
    random_state=42
)

# Stack both classes, shuffle the rows, and renumber the index from zero.
stacked = pd.concat([benign_down, attack_df])
shuffled = stacked.sample(frac=1, random_state=42)
df_balanced = shuffled.reset_index(drop=True)

class_counts = df_balanced['Label'].value_counts().to_dict()
print(f"Balanced dataset shape: {df_balanced.shape}")
print(f"Class distribution: {class_counts}")

# 4. Split features and labels
y = df_balanced['Label']
X = df_balanced.drop(columns=['Label'])

# 5. Stratified train/test split
# 80/20 split; stratifying on y keeps the class ratio the same in both parts.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# 6. Verify no overlap between train/test
print("\nChecking for overlap between train/test...")
# With no join keys given, merge joins on every column the two frames share,
# so the inner join keeps only feature rows present in both splits.
common_rows = X_train.merge(X_test, how='inner')
n_common = len(common_rows)
print(f"Common rows: {n_common}")
if n_common:
    print("Overlap detected — additional cleaning may be required.")
else:
    print("No overlapping rows detected.")

# 7. Drop highly correlated features
# Any feature whose absolute Pearson correlation with the label exceeds the
# threshold is treated as suspect and removed from both splits. Correlations
# are computed on the training split only, so the test split never influences
# which features are kept.
corr_threshold = 0.15

# corrwith computes just the feature-vs-label correlations; building the full
# feature-by-feature matrix (and a copy of the training frame) only to read a
# single column is wasted work on a dataset this size.
# errstate silences the divide/invalid warnings raised for constant columns;
# their correlation is NaN, which never exceeds the threshold.
with np.errstate(divide='ignore', invalid='ignore'):
    corrs = (
        X_train.corrwith(y_train, numeric_only=True)
        .abs()
        .sort_values(ascending=False)
    )
leaky_cols = corrs[corrs > corr_threshold].index.tolist()
print(f"\nDropping {len(leaky_cols)} highly correlated features:")
print(leaky_cols[:10], "..." if len(leaky_cols) > 10 else "")
X_train = X_train.drop(columns=leaky_cols, errors='ignore')
X_test = X_test.drop(columns=leaky_cols, errors='ignore')

# 8. Baseline model
# Always predicts the most frequent training label; its test accuracy is the
# floor the real model is compared against in the summary.
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_acc = accuracy_score(y_test, dummy.predict(X_test))
print(f"\nDummy baseline accuracy: {dummy_acc:.3f}")

# 9. Define XGBoost pipeline
# The positive class is weighted by the negative/positive count ratio of the
# training labels.
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])
xgb_params = dict(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=n_negative / n_positive,
    n_jobs=-1,
    eval_metric='auc',
    random_state=42,
)
xgb_clf = xgb.XGBClassifier(**xgb_params)

# Standardise the features, then fit the classifier.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', xgb_clf),
]
pipeline = Pipeline(pipeline_steps)

# 10. Sanity test with random labels
# Fit a pipeline on shuffled training labels and score it against the true
# test labels. Shuffling breaks the feature/label relationship, so the
# resulting ROC-AUC is expected to sit near 0.5.
# The shuffle uses its own seeded generator: every other random step in this
# script is seeded with 42, and an unseeded permutation would make the sanity
# AUC (and the PASS/FAIL verdict derived from it) change from run to run.
perm_rng = np.random.default_rng(42)
y_perm = perm_rng.permutation(np.asarray(y_train))
sanity_model = Pipeline([
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        eval_metric='auc',
        random_state=42
    ))
])
sanity_model.fit(X_train, y_perm)
# Column 1 of predict_proba is the probability of the second class.
y_proba_perm = sanity_model.predict_proba(X_test)[:, 1]
sanity_auc = roc_auc_score(y_test, y_proba_perm)
print(f"\nSanity test ROC-AUC (random labels): {sanity_auc:.3f}")

# 11. Real model training
pipeline.fit(X_train, y_train)

# Second-class probabilities feed ROC-AUC; hard predictions feed accuracy
# and the classification report.
y_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)

real_auc = roc_auc_score(y_test, y_proba)
real_acc = accuracy_score(y_test, y_pred)

# 12. Evaluate leakage status
# A model fitted on shuffled labels should score at chance level (ROC-AUC
# around 0.5). A score far from 0.5 in EITHER direction means the shuffled-label
# model still separates the test classes, so the check is a symmetric band
# rather than only an upper bound.
sanity_auc_low, sanity_auc_high = 0.4, 0.6
sanity_near_chance = sanity_auc_low < sanity_auc < sanity_auc_high
status = "PASS (No leakage detected)" if sanity_near_chance else "FAIL (Potential leakage remains!)"

# 13. Summary
print("\n--- Verification Summary ---")
print(f"Majority baseline accuracy: {dummy_acc:.3f}")
print(f"Sanity test ROC-AUC: {sanity_auc:.3f}")
print(f"Model accuracy: {real_acc:.3f}")
print(f"Model ROC-AUC: {real_auc:.3f}")
print(f"Leakage status: {status}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))


🧹 Starting full leakage-safe balanced XGBoost pipeline...

Original shape: (2604998, 79)
✅ After full de-duplication: 2303903
✅ Balanced dataset shape: (1190380, 79) | Benign:Attack = {0: 892785, 1: 297595}
Train size: (952304, 78), Test size: (238076, 78)

🔎 Checking for overlap between train/test...
Common rows between train/test: 0
✅ No overlapping rows found.

🧠 Dropping 29 highly correlated features: ['Fwd IAT Std', 'Bwd Packet Length Std', 'Idle Max', 'Packet Length Std', 'Idle Mean', 'Bwd Packet Length Max', 'Fwd IAT Max', 'Flow IAT Max', 'Idle Min', 'Bwd Packet Length Mean']...
📊 Dummy baseline accuracy: 0.750

🧠 Sanity test ROC-AUC (random labels): 0.530

--- Verification Summary ---
Majority baseline (accuracy): 0.750
Sanity test ROC-AUC (random labels): 0.530
Real model accuracy: 0.999
Real model ROC-AUC: 1.000

Leakage Status: ✅ PASS (No leakage detected)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1