In [1]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
from pathlib import Path

# --- Scikit-learn Modules ---
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# --- Misc ---
import warnings
warnings.filterwarnings('ignore')


In [5]:
# Path to your dataset folder
data_dir = Path(r"C:\Users\cmhub\Desktop\network-anomaly-detector-starter\data\MachineLearningCSV\MachineLearningCVE")

csv_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"
]

dfs = []
for f in csv_files:
    df = pd.read_csv(data_dir / f)
    df.columns = df.columns.str.strip().str.replace('\ufeff', '', regex=True)
    # unify label column name
    label_cols = [c for c in df.columns if 'label' in c.lower() or 'attack' in c.lower()]
    if label_cols:
        df = df.rename(columns={label_cols[0]: 'Label'})
    else:
        raise ValueError(f"No label column found in {f}")
    dfs.append(df)

# Merge all days together
full_df = pd.concat(dfs, ignore_index=True)

# Shuffle to avoid day-order bias
full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)

print("âœ… Merged dataset shape:", full_df.shape)
print("âœ… Label column exists:", 'Label' in full_df.columns)
print(full_df['Label'].value_counts(normalize=True))


âœ… Merged dataset shape: (2604998, 79)
âœ… Label column exists: True
Label
BENIGN                        0.835079
DoS Hulk                      0.088704
PortScan                      0.061010
DoS GoldenEye                 0.003951
FTP-Patator                   0.003047
SSH-Patator                   0.002264
DoS slowloris                 0.002225
DoS Slowhttptest              0.002111
Bot                           0.000755
Web Attack ï¿½ Brute Force      0.000579
Web Attack ï¿½ XSS              0.000250
Infiltration                  0.000014
Web Attack ï¿½ Sql Injection    0.000008
Heartbleed                    0.000004
Name: proportion, dtype: float64


In [6]:
# Drop identifiers and metadata
X = full_df.drop(['Label', 'Flow ID', 'Source IP', 'Destination IP', 'Timestamp'], axis=1, errors='ignore')
y = full_df['Label']

# Replace inf/nan, clip extremes
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)
X = X.clip(lower=-1e10, upper=1e10)

print("âœ… Cleaned feature matrix shape:", X.shape)


âœ… Cleaned feature matrix shape: (2604998, 78)


In [7]:
# Convert to binary: 0 = BENIGN, 1 = ATTACK
y = y.apply(lambda x: 0 if x == 'BENIGN' else 1)
print(y.value_counts(normalize=True))


Label
0    0.835079
1    0.164921
Name: proportion, dtype: float64


In [None]:
# Train/test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

assert not any(X_train.index.isin(X_test.index)), "ðŸš¨ Train/test overlap detected!"
print("âœ… Train/Test split successful")


âœ… Train/Test split successful


In [None]:
#Pipeline with imputation, scaling, and RandomForest
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(
        n_estimators=500,
        max_depth=20,
        min_samples_split=3,
        min_samples_leaf=2,
        class_weight='balanced_subsample',
        n_jobs=-1,
        random_state=42
    ))
])

pipe.fit(X_train, y_train)


0,1,2
,steps,"[('imputer', ...), ('scale', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,20
,min_samples_split,3
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:, 1]

print("âœ… Accuracy:", accuracy_score(y_test, pred))
print("âœ… ROC-AUC:", roc_auc_score(y_test, proba))
print("\nClassification Report:\n", classification_report(y_test, pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))


âœ… Accuracy: 0.998911708253359
âœ… ROC-AUC: 0.9999232341063363

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    435076
           1       0.99      1.00      1.00     85924

    accuracy                           1.00    521000
   macro avg       1.00      1.00      1.00    521000
weighted avg       1.00      1.00      1.00    521000


Confusion Matrix:
 [[434578    498]
 [    69  85855]]


In [11]:
from sklearn.utils import shuffle
y_perm = np.random.permutation(y_train)
pipe.fit(X_train, y_perm)
print("\nSanity-check accuracy (should be ~0.5):", pipe.score(X_test, y_test))



Sanity-check accuracy (should be ~0.5): 0.6128694817658349


In [12]:
full_df_clean = pd.concat([X, y.rename('Label')], axis=1)
full_df_clean.to_csv("CICIDS2017_clean_binary.csv", index=False)
print("âœ… Saved clean, leak-free dataset.")


âœ… Saved clean, leak-free dataset.


In [None]:
#XGB