In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import tensorflow as tf

# 1. Load and preprocess dataset
# Both splits are read without a header row, so pandas labels the columns
# with consecutive integers starting at 0.
df, df_test = (
    pd.read_csv(path, header=None)
    for path in ("KDDTrain+.txt", "KDDTest+.txt")
)

In [None]:
# Column positions are 0-based because the files are read with header=None.
# NOTE(review): the indices below assume the standard NSL-KDD layout
# (41 features, then the attack label, then the difficulty score) -- confirm
# against the data files.
NUM_OUTBOUND_CMDS_COL = 19  # 20th feature => 0-based index 19 (index 20 would be is_host_login)
LABEL_COL = 41              # attack name; turned into the binary target in a later cell
DIFFICULTY_COL = 42         # difficulty score; not a traffic feature, so it must not reach the model

# One-hot encode protocol, service, flag (cols: 1, 2, 3)
categorical_cols = [1, 2, 3]
# handle_unknown="ignore" encodes a category that only appears in the test
# file as an all-zero group instead of raising inside transform().
encoder = OneHotEncoder(handle_unknown="ignore")
encoded = encoder.fit_transform(df[categorical_cols]).toarray()
encoded_test = encoder.transform(df_test[categorical_cols]).toarray()

# Build the numeric part without mutating df / df_test, so this cell can be
# re-run safely and later cells can still read the label column from them.
# errors="ignore" tolerates a column that is already absent (for example the
# difficulty score in a file that does not carry it).
non_feature_cols = categorical_cols + [NUM_OUTBOUND_CMDS_COL, LABEL_COL, DIFFICULTY_COL]
numeric = df.drop(columns=non_feature_cols, errors="ignore")
numeric_test = df_test.drop(columns=non_feature_cols, errors="ignore")

# Final layout: one-hot columns first, then the remaining numeric columns.
df_encoded = np.concatenate([encoded, numeric.values], axis=1)
df_test_encoded = np.concatenate([encoded_test, numeric_test.values], axis=1)



In [None]:
# Normalize
# The min/max statistics are learned from the training matrix only and then
# applied unchanged to the test matrix.
scaler = MinMaxScaler()
scaler.fit(df_encoded)
X_train = scaler.transform(df_encoded)
X_test = scaler.transform(df_test_encoded)

# Binary labels
# 0 for rows labelled 'normal', 1 for every other label in column 41.
y_train = df[41].ne('normal').astype('int64').to_numpy()
y_test = df_test[41].ne('normal').astype('int64').to_numpy()

In [None]:
# 2. Build SSAE model
def build_ssae(input_dim=None, hidden_units=(100, 85, 55), bottleneck_units=5):
    """Build a symmetric dense autoencoder and the encoder half that shares its layers.

    Every layer is a sigmoid ``Dense`` layer. The decoder mirrors the encoder's
    hidden widths in reverse order and ends in a layer as wide as the input.
    No sparsity regulariser is attached to any layer in this function.

    Args:
        input_dim: Number of input features. When omitted, falls back to
            ``X_train.shape[1]`` from the notebook namespace, which is what
            the original zero-argument call relied on.
        hidden_units: Widths of the encoder's hidden layers, outermost first.
        bottleneck_units: Width of the code layer returned by the encoder.

    Returns:
        A tuple ``(autoencoder, encoder_model)``. ``autoencoder`` maps inputs
        to their reconstruction and is compiled with Adam and MSE loss.
        ``encoder_model`` maps inputs to the bottleneck activations and reuses
        the same layer objects, so training the autoencoder also trains it.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    hidden_units = tuple(hidden_units)  # allow any iterable; needed for reversed()

    inputs = tf.keras.Input(shape=(input_dim,))

    # Encoder: progressively narrower layers down to the bottleneck.
    x = inputs
    for units in hidden_units:
        x = tf.keras.layers.Dense(units, activation='sigmoid')(x)
    bottleneck = tf.keras.layers.Dense(bottleneck_units, activation='sigmoid')(x)

    # Decoder: same widths in reverse, then back to the input width.
    x = bottleneck
    for units in reversed(hidden_units):
        x = tf.keras.layers.Dense(units, activation='sigmoid')(x)
    outputs = tf.keras.layers.Dense(input_dim, activation='sigmoid')(x)

    autoencoder = tf.keras.Model(inputs, outputs)
    encoder_model = tf.keras.Model(inputs, bottleneck)

    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder, encoder_model

# Seed Python's random module, NumPy and TensorFlow in one call so the weight
# initialisation and the per-epoch batch shuffling repeat across kernel restarts.
tf.keras.utils.set_random_seed(42)

autoencoder, encoder_model = build_ssae()

# The autoencoder is trained to reproduce its own input.
# NOTE(review): the test matrix is passed as validation data, so val_loss is
# reported on the test split. No callback is attached to this call.
# The result is kept in `history` so the loss curves stay available and the
# cell does not end with a bare History repr.
history = autoencoder.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=128,
    validation_data=(X_test, X_test),
    verbose=1,
)

Epoch 1/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 0.0492 - val_loss: 0.0353
Epoch 2/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - loss: 0.0350 - val_loss: 0.0354
Epoch 3/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.0349 - val_loss: 0.0350
Epoch 4/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.0348 - val_loss: 0.0235
Epoch 5/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0143 - val_loss: 0.0145
Epoch 6/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 0.0099 - val_loss: 0.0132
Epoch 7/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.0083 - val_loss: 0.0125
Epoch 8/50
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 0.0078 - val_loss: 0.0121
Epoch 9/50
[1m985/985[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x7ccca516efd0>

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
import numpy as np

# 5-Fold Cross-Validation (no leakage version)
# NOTE(review): labels never cross the fold boundary here, but encoder_model
# is used as-is for every fold and is not refitted inside the loop. If it (or
# the scaler that produced X_train) was fitted on all of X_train, the
# validation rows' features have already been seen by those steps.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_scores = []

print("\nStarting 5-Fold Cross-Validation using SSAE+SVM (no data leakage)...\n")

# The encoder does not change between folds, so encode the whole training
# matrix once and slice per fold instead of running predict() twice per fold.
# verbose=0 keeps Keras progress bars out of the fold reports.
X_train_encoded = encoder_model.predict(X_train, verbose=0)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\n--- Fold {fold} ---")

    # Per-fold views of the encoded features and of the labels
    X_fold_train_encoded = X_train_encoded[train_idx]
    X_fold_val_encoded = X_train_encoded[val_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

    # OPTIONAL: to retrain the SSAE on each training fold, fit a fresh model on
    # X_train[train_idx] here and encode X_train[train_idx] / X_train[val_idx]
    # with that model instead of slicing X_train_encoded.

    # SVM with class balancing
    svm = SVC(kernel='rbf', C=10, gamma=0.01, class_weight='balanced')
    svm.fit(X_fold_train_encoded, y_fold_train)
    y_pred = svm.predict(X_fold_val_encoded)

    # Evaluation
    acc = accuracy_score(y_fold_val, y_pred)
    acc_scores.append(acc)

    print(f"Fold {fold} Accuracy: {acc:.4f}")
    print(classification_report(y_fold_val, y_pred, digits=4))

# Final average result
print("\nAverage Cross-Validation Accuracy: {:.4f}".format(np.mean(acc_scores)))



Starting 5-Fold Cross-Validation using SSAE+SVM (no data leakage)...


--- Fold 1 ---
[1m3150/3150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 1 Accuracy: 0.8852
              precision    recall  f1-score   support

           0     0.8274    0.9922    0.9023     13469
           1     0.9884    0.7622    0.8607     11726

    accuracy                         0.8852     25195
   macro avg     0.9079    0.8772    0.8815     25195
weighted avg     0.9023    0.8852    0.8830     25195


--- Fold 2 ---
[1m3150/3150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 2 Accuracy: 0.8829
              precision    recall  f1-score   support

           0     0.8245    0.9921    0.9006     13469
           1     0.9881    0.7575    0.8575     11726

    accuracy                         0.8829     25195
