In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# --- 1️⃣ Load Fresh Data ---
# Read the combined CSV from the local path below into a single DataFrame.
data_path = r"C:\Users\cmhub\Desktop\network-anomaly-detector-starter\data\MachineLearningCSV\MachineLearningCVE\CICIDS2017_combined.csv"
full_df = pd.read_csv(data_path)
print("Loaded dataset:", full_df.shape)

# --- 2️⃣ Shuffle to avoid sequential label blocks ---
# Full-fraction sample == seeded row permutation; the index is rebuilt as 0..n-1.
full_df = (
    full_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# --- 3️⃣ Identify obvious duplicates / extra label columns ---
# Case-insensitive substring match on the column names.
extra_labels = [
    name
    for name in full_df.columns
    if any(token in name.lower() for token in ('label', 'attack'))
]
print("Possible duplicate label columns:", extra_labels)

# --- 4️⃣ Drop all known leak-prone or identifier columns ---
known_leaks = [
    # IDs / metadata
    'Day',
    'Timestamp',
    'Flow ID',
    'Source IP',
    'Destination IP',
    # Post-hoc rates and byte metrics
    'Flow Bytes/s',
    'Flow Packets/s',
    # Packet-length and header-length stats
    'Bwd Packet Length Std',
    'Bwd Packet Length Max',
    'Bwd Packet Length Mean',
    'Avg Bwd Segment Size',
    'Packet Length Std',
    'Max Packet Length',
    'Packet Length Variance',
    'Fwd IAT Std',
    'Packet Length Mean',
    'Average Packet Size',
    'Idle Max',
    'Idle Mean',
    'Flow IAT Max',
    'Fwd IAT Max',
    'Idle Min',
    'Flow IAT Std',
    'Min Packet Length',
    'Fwd Header Length',
    'Bwd Header Length',
    # Bulk / flag features often deterministic for attacks
    'Fwd Avg Bytes/Bulk',
    'Bwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Bwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bulk Rate',
    'URG Flag Count',
    'PSH Flag Count',
    'ECE Flag Count',
    'RST Flag Count',
]
# De-duplicated union of everything to remove; the target itself is always included.
drop_cols = list({*extra_labels, *known_leaks, 'Label'})
# errors='ignore' tolerates names in drop_cols that are absent from this CSV.
X = full_df.drop(columns=drop_cols, errors='ignore').copy()
y = full_df.loc[:, 'Label'].copy()

# --- 5️⃣ Quick correlation check (should now be low) ---
# 'Label' is a member of drop_cols, so a frame with drop_cols removed has no
# 'Label' column and indexing its correlation matrix by 'Label' raises KeyError.
# Correlate the kept feature columns (X) against the separately held target (y)
# instead. This also avoids building the full feature-by-feature matrix.
# NOTE(review): assumes y is numeric, as the recorded output suggests — confirm.
corrs = (
    X.corrwith(y, numeric_only=True)
     .abs()
     .sort_values(ascending=False)  # NaN correlations (constant columns) sort last
)
print("\nTop remaining correlations with Label:\n", corrs.head(10))

# --- 6️⃣ Basic numeric sanitization ---
# Infinite entries become NaN, then remaining values are bounded to [-1e10, 1e10].
X = (
    X.replace([np.inf, -np.inf], np.nan)
     .clip(lower=-1e10, upper=1e10)
)

# --- 7️⃣ Split ---
# 80/20 hold-out, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# No row label may appear on both sides of the split.
assert not X_train.index.isin(X_test.index).any(), "Train/test overlap!"

# --- 8️⃣ Build pipeline ---
# Final estimator: class-weighted random forest with a fixed seed.
forest_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=20,
    min_samples_split=3,
    min_samples_leaf=2,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Median imputation -> standardisation -> forest, fitted together as one estimator.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
    ('clf', forest_clf),
])

# --- 9️⃣ Train & evaluate ---
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Second column of predict_proba is used as the score for ROC-AUC.
proba = pipe.predict_proba(X_test)[:, 1]

# (heading, metric function, predictions fed to it), evaluated and printed in order.
report_plan = (
    ("\nAccuracy:", accuracy_score, pred),
    ("ROC-AUC:", roc_auc_score, proba),
    ("\nClassification Report:\n", classification_report, pred),
    ("Confusion Matrix:\n", confusion_matrix, pred),
)
for heading, metric, estimate in report_plan:
    print(heading, metric(y_test, estimate))

# --- 🔟 Optional sanity check (random labels) ---
# Fit a *clone* of the pipeline on permuted labels. Refitting `pipe` itself
# would overwrite the model trained on the real labels, leaving every later
# use of `pipe` pointing at a model trained on noise.
from sklearn.base import clone
from sklearn.utils import shuffle  # not used here; kept in case later cells rely on the name

# Seeded generator so the sanity score is reproducible across re-runs.
y_perm = np.random.default_rng(42).permutation(y_train.to_numpy())
sanity_pipe = clone(pipe).fit(X_train, y_perm)
print("\nSanity accuracy (should be ~0.5):", sanity_pipe.score(X_test, y_test))


Loaded dataset: (2830743, 80)
Label                     1.000000
Bwd Packet Length Std     0.510216
Bwd Packet Length Max     0.492007
Bwd Packet Length Mean    0.484189
Avg Bwd Segment Size      0.484189
Packet Length Std         0.470252
Max Packet Length         0.454054
Packet Length Variance    0.453847
Fwd IAT Std               0.422755
Packet Length Mean        0.414059
Average Packet Size       0.413037
Idle Max                  0.394220
Idle Mean                 0.390470
Flow IAT Max              0.388666
Fwd IAT Max               0.388642
Idle Min                  0.380651
Flow IAT Std              0.336720
Fwd IAT Total             0.215468
Flow Duration             0.213864
FIN Flag Count            0.188632
Name: Label, dtype: float64
Columns identical / near-identical to Label: []
Highly correlated with Label: ['Bwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Avg Bwd Segment Size', 'Packet Length Std', 'Max Packet Length', 'Packet Length Varian

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


# Hyperparameter search space sampled by RandomizedSearchCV
param_grid = {'n_estimators': [200, 300, 500],
              'max_depth': [10, 20, None],
              'min_samples_split': [2, 4, 6],
              'min_samples_leaf': [1, 2, 3]}

# The forest parallelises its own tree building across all cores.
# NOTE(review): unlike `pipe`, this estimator has no imputer step — confirm
# X_train holds no NaN, or that the installed scikit-learn accepts them here.
rf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)

# Candidates are evaluated one at a time (n_jobs=1), because each fit already
# uses every core through the forest. Stacking n_jobs=-1 at both levels
# oversubscribes the CPU and multiplies the per-worker copies of the training
# data. Scores are unaffected: the sampled candidates and each forest are fixed
# by random_state.
rand_search = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=20,       # tries 20 random combos
    scoring='f1_weighted',
    cv=3,
    n_jobs=1,
    random_state=42
)

rand_search.fit(X_train, y_train)
print("Best Params:", rand_search.best_params_)
print("Best F1 Score:", rand_search.best_score_)


In [None]:
# Show the first ten feature names, then the total number of feature columns.
print(X.columns[:10], len(X.columns), sep="\n")

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std'],
      dtype='object')
78


In [None]:
# Feature Importance Analysis
# Requires the hyperparameter-search cell to have been run first (defines rand_search).
best_rf = rand_search.best_estimator_

# Prefer the feature names recorded by the fitted model, so the labels match
# what it was trained on even if X has been rebuilt since. Fall back to
# X.columns when the estimator does not expose them.
feature_names = getattr(best_rf, "feature_names_in_", X.columns)
importances = pd.Series(best_rf.feature_importances_, index=feature_names)

# Draw through the Axes returned by pandas rather than the pyplot state
# machine: no visible cell imports `plt`, so the pyplot calls would raise
# NameError. The figure renders at the end of the cell in the notebook.
ax = importances.sort_values().tail(15).plot(
    kind='barh', color="steelblue", figsize=(8, 6)
)
ax.set_title("Top 15 Important Features for Anomaly Detection")
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Features")
ax.figure.tight_layout()

NameError: name 'rand_search' is not defined