In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import xgboost as xgb
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense
import joblib
import os


In [9]:
base_dir = Path(r"C:\Users\cmhub\Desktop\network-anomaly-detector-starter\data\MachineLearningCSV\MachineLearningCVE") 

files = {"Monday": "Monday-WorkingHours.pcap_ISCX.csv",
         "Tuesday": "Tuesday-WorkingHours.pcap_ISCX.csv",
         "Wednesday": "Wednesday-workingHours.pcap_ISCX.csv",
         "Thursday_Morning": "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
         "Thursday_Afternoon": "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
         "Friday_Morning": "Friday-WorkingHours-Morning.pcap_ISCX.csv",
         "Friday_Afternoon": "Friday-WorkingHours-Morning.pcap_ISCX.csv"}
dfs = []
for day, filename in files.items():
    df_day = pd.read_csv(base_dir / filename)

    # Normalize column names (remove accidental spaces)
    df_day.columns = df_day.columns.str.strip()

    # Rename ' Label' to 'Label' if needed
    if " Label" in df_day.columns:
        df_day = df_day.rename(columns={" Label": "Label"})

    df_day.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Clip extreme numeric outliers (prevents scaler overflow)
    num_cols = df_day.select_dtypes(include=[np.number]).columns
    df_day[num_cols] = df_day[num_cols].clip(lower=-1e10, upper=1e10)

    
    # Fill any remaining NaN with 0 (for simplicity)
    df_day = df_day.fillna(0)
    
    df_day["Day"] = day
    dfs.append(df_day)
full_df = pd.concat(dfs, ignore_index=True)
print(f"Combined dataset shape: {full_df.shape}")
print("Days in dataset:", full_df['Day'].unique())
print("Count of samples per day:")
print(full_df['Day'].value_counts())

print("Columns in dataset:", full_df.columns.tolist())
full_df["Label"] = full_df["Label"].apply(lambda x: 0 if x == "BENIGN" else 1)


Combined dataset shape: (2509564, 80)
Days in dataset: ['Monday' 'Tuesday' 'Wednesday' 'Thursday_Morning' 'Thursday_Afternoon'
 'Friday_Morning' 'Friday_Afternoon']
Count of samples per day:
Day
Wednesday             692703
Monday                529918
Tuesday               445909
Thursday_Afternoon    288602
Friday_Morning        191033
Friday_Afternoon      191033
Thursday_Morning      170366
Name: count, dtype: int64
Columns in dataset: ['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean

In [10]:
print(full_df.groupby("Day")["Label"].agg(
    Attacks=lambda s: (s==1).sum(),
    Benign=lambda s: (s==0).sum(),
    Total="count"
))

                    Attacks  Benign   Total
Day                                        
Friday_Afternoon       1966  189067  191033
Friday_Morning         1966  189067  191033
Monday                    0  529918  529918
Thursday_Afternoon       36  288566  288602
Thursday_Morning       2180  168186  170366
Tuesday               13835  432074  445909
Wednesday            252672  440031  692703


In [None]:
#Final alignment before train test split


#Day Based Train/Test Split
train_days = ["Monday", "Tuesday", "Wednesday", "Thursday_Morning", "Thursday_Afternoon", "Friday_Morning"]
test_days = ["Friday_Afternoon"]

train_df = full_df[full_df["Day"].isin(train_days)].copy()
test_df = full_df[full_df["Day"].isin(test_days)].copy()

# Drop the 'Day' column as it's no longer needed
X_train = train_df.drop(columns=["Label", "Day"])
y_train = train_df["Label"]
X_test = test_df.drop(columns=["Label", "Day"])
y_test = test_df["Label"]

X_train, y_train = X_train.align(y_train, join='inner', axis=0)
X_test, y_test = X_test.align(y_test, join='inner', axis=0)


print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"TRAIN: {len(train_df)} rows | Attacks: {train_df['Label'].sum()} | Benign: {(train_df['Label']==0).sum()}")
print(f"TEST : {len(test_df)} rows | Attacks: {test_df['Label'].sum()} | Benign: {(test_df['Label']==0).sum()}")

Training set shape: (2318531, 78), Test set shape: (191033, 78)
TRAIN: 2318531 rows | Attacks: 270689 | Benign: 2047842
TEST : 191033 rows | Attacks: 1966 | Benign: 189067


In [15]:
#Building and training the autoencoder
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Simple symmetric autoencoder for now
input_dim = X_train_scaled.shape[1]
encoding_dim = input_dim // 2

autoencoder = Sequential([
    Dense(encoding_dim, activation="relu", input_shape=(input_dim,)),
    Dense(encoding_dim // 2, activation="relu"),
    Dense(encoding_dim, activation="relu"),
    Dense(input_dim, activation="linear")
])

autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=15,
                batch_size=512,
                shuffle=True,
                validation_data=(X_test_scaled, X_test_scaled))
autoencoder.save(r"C:\Users\cmhub\Desktop\network-anomaly-detector-starter\models\autoencoder_model.keras")
print("Model saved successfully.")
joblib.dump(scaler, r"C:\Users\cmhub\Desktop\network-anomaly-detector-starter\models\autoencoder_scaler.pkl")
print("Scaler saved successfully.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.1564 - val_loss: 0.0674
Epoch 2/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0595 - val_loss: 0.0259
Epoch 3/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0458 - val_loss: 0.0238
Epoch 4/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0398 - val_loss: 0.0189
Epoch 5/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0330 - val_loss: 0.0157
Epoch 6/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0272 - val_loss: 0.0138
Epoch 7/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0222 - val_loss: 0.0173
Epoch 8/15
[1m4529/4529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0211 - val_loss: 0.0114
Epoch 9/15
[1m4529/4529

In [6]:
train_recon = np.mean(np.square(X_train_scaled - autoencoder.predict(X_train_scaled)), axis=1)
test_recon = np.mean(np.square(X_test_scaled - autoencoder.predict(X_test_scaled)), axis=1)

X_train_hybrid = X_train.copy()
X_test_hybrid = X_test.copy()
X_train_hybrid["reconstruction_error"] = train_recon
X_test_hybrid["reconstruction_error"] = test_recon


[1m66485/66485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 778us/step
[1m11940/11940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 735us/step


In [12]:
train_combined = pd.concat([X_train_hybrid, y_train], axis=1)
attack_df = train_combined[train_combined["Label"] == 1]
benign_df = train_combined[train_combined["Label"] == 0]

# --- Safety check ---
if len(attack_df) == 0 or len(benign_df) == 0:
    print(f"Warning: No attack or benign samples found in training data. "
          f"Attacks={len(attack_df)}, Benign={len(benign_df)}")
    train_balanced = train_combined.copy()
else:
    ratio = 3  # keep 3 benign per attack
    n_samples = min(len(attack_df) * ratio, len(benign_df))
    
    benign_down = resample(
        benign_df,
        replace=False,
        n_samples=n_samples,
        random_state=42
    )

    train_balanced = pd.concat([attack_df, benign_down]).sample(frac=1, random_state=42)

y_train_bal = train_balanced["Label"]
X_train_bal = train_balanced.drop(columns=["Label"])



In [16]:
xgb_hybrid = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train_bal[y_train_bal==0]) / len(y_train_bal[y_train_bal==1]),
    n_jobs=-1,
    eval_metric="auc",
    random_state=42
)

xgb_hybrid.fit(X_train_bal, y_train_bal)
y_pred = xgb_hybrid.predict(X_test_hybrid)
y_proba = xgb_hybrid.predict_proba(X_test_hybrid)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



ValueError: Found input variables with inconsistent numbers of samples: [191033, 382066]