#### 1. Setup

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score
)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns

# Single seed shared by every stochastic component in this notebook
# (Isolation Forest, autoencoder weight init, batch shuffling).
RANDOM_STATE = 42

# Seeds Python's `random`, NumPy and TensorFlow in one call, so it covers the
# previous `np.random.seed` / `tf.random.set_seed` pair and additionally the
# Python-level generator, which was left unseeded before.
# NOTE(review): Keras 3 appears to draw default initializer seeds from
# Python's `random`; confirm that this makes the autoencoder run-to-run stable.
tf.keras.utils.set_random_seed(RANDOM_STATE)


#### 2. Load data


In [None]:
# Location of the preprocessed arrays, relative to the notebook's directory.
PROC_DIR = Path("../data/processed")

# Load the three arrays in a fixed order: training features, test features,
# test labels. The training file presumably holds only normal samples, as its
# name suggests — verify against the preprocessing notebook.
X_train_normal, X_test, y_test = (
    np.load(PROC_DIR / fname)
    for fname in ("X_train_normal.npy", "X_test.npy", "y_test.npy")
)

# Quick sanity report: array shapes and the per-class count of test labels.
train_shape = X_train_normal.shape
test_shape = X_test.shape
label_counts = np.bincount(y_test)

print("Train (normal only):", train_shape)
print("Test:", test_shape)
print("Test label distribution:", label_counts)


Train (normal only): (67343, 118)
Test: (22544, 118)
Test label distribution: [ 9711 12833]


#### 3. Train Isolation Forest

In [65]:
# Isolation Forest hyper-parameters, gathered in one place for easy tuning.
# NOTE(review): `contamination` only sets the offset of `decision_function`;
# the scores are later min-max normalised, which cancels a constant shift, so
# this value likely has no effect on the fused score — confirm.
iforest_params = dict(
    n_estimators=200,
    contamination=0.1,     # tuned for high recall
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
iforest = IsolationForest(**iforest_params)

# Fit on the training array only; the test set is never seen during fitting.
iforest.fit(X_train_normal)


#### 4. Isolation Forest Anomaly Scores

In [66]:
# `decision_function` is negated so that a larger score means "more anomalous",
# matching the orientation of the autoencoder reconstruction error.
score_if_train = np.negative(iforest.decision_function(X_train_normal))
score_if_test = np.negative(iforest.decision_function(X_test))


#### 5. Train Autoencoder

In [67]:
# Number of input features; the autoencoder reconstructs a vector of this size.
input_dim = X_train_normal.shape[1]

# Symmetric dense autoencoder: input_dim -> 64 -> 32 -> 64 -> input_dim.
# The input shape is declared with an explicit `Input` object instead of
# `input_shape=` on the first Dense layer; Keras 3 warns about the latter.
autoencoder = Sequential([
    tf.keras.Input(shape=(input_dim,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),   # bottleneck
    Dense(64, activation='relu'),
    Dense(input_dim, activation='linear')   # linear output for MSE reconstruction
])

autoencoder.compile(
    optimizer='adam',
    loss='mse'
)

# Stop when the training loss has not improved for 5 consecutive epochs and
# roll back to the best weights seen.
# NOTE(review): this monitors the *training* loss because no validation data
# is passed to `fit`; consider `validation_split` + `monitor='val_loss'` if
# over-fitting to the training set becomes a concern.
early_stop = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True
)

# Inputs double as targets (reconstruction task). The returned History is kept
# so the loss curve can be inspected later, and so the cell does not echo the
# object's repr as its output.
history = autoencoder.fit(
    X_train_normal,
    X_train_normal,
    epochs=50,
    batch_size=256,
    shuffle=True,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.5121
Epoch 2/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2393  
Epoch 3/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step - loss: 0.1669
Epoch 4/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 962us/step - loss: 0.1315
Epoch 5/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 995us/step - loss: 0.1082
Epoch 6/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0898  
Epoch 7/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 957us/step - loss: 0.0761
Epoch 8/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 962us/step - loss: 0.0660
Epoch 9/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0548
Epoch 10/50
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 957us/step

<keras.src.callbacks.history.History at 0x18aeae314c0>

#### 6. Autoencoder Reconstruction Scores

In [68]:
def reconstruction_error(X, model):
    """Return the per-row mean squared reconstruction error.

    Parameters
    ----------
    X : array-like, 2-D
        Samples to reconstruct, one per row.
    model : object with a ``predict(X)`` method
        Its output is subtracted element-wise from ``X``.

    Returns
    -------
    One value per row of ``X``: the mean over axis 1 of the squared
    difference between ``X`` and ``model.predict(X)``.
    """
    reconstructed = model.predict(X)
    squared_residual = np.square(X - reconstructed)
    return np.mean(squared_residual, axis=1)

# Reconstruction error of the trained autoencoder on each split
# (training split first, then the test split).
score_ae_train = reconstruction_error(X=X_train_normal, model=autoencoder)
score_ae_test = reconstruction_error(X=X_test, model=autoencoder)


[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 363us/step
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357us/step


#### 7. Score Normalization

In [69]:
def minmax_norm(train_score, test_score):
    """Min-max scale two score arrays using statistics of the first one only.

    The minimum and maximum are taken from ``train_score``; both arrays are
    then shifted by that minimum and divided by ``(max - min + 1e-8)``. The
    small constant keeps the division finite when all training scores are
    equal. Because the test scores reuse the training statistics, they may
    fall outside ``[0, 1]``.

    Returns
    -------
    (train_norm, test_norm) : tuple of scaled arrays.
    """
    lo = train_score.min()
    span = train_score.max() - lo + 1e-8
    return (train_score - lo) / span, (test_score - lo) / span

# Scale each detector's scores with its own training-set minimum and maximum
# so the two detectors can be combined on a comparable scale.
score_if_train_n, score_if_test_n = minmax_norm(
    train_score=score_if_train,
    test_score=score_if_test,
)
score_ae_train_n, score_ae_test_n = minmax_norm(
    train_score=score_ae_train,
    test_score=score_ae_test,
)


#### 8. Score-level Fusion

In [None]:
alpha = 0.5   # weight for Isolation Forest (tune later)


def _fuse_scores(if_score, ae_score, if_weight):
    """Weighted sum: ``if_weight`` on the Isolation Forest score, the rest on the autoencoder score."""
    return if_weight * if_score + (1 - if_weight) * ae_score


# Same weighting on both splits so the training-derived threshold applies to
# the test scores.
final_score_train = _fuse_scores(score_if_train_n, score_ae_train_n, alpha)
final_score_test = _fuse_scores(score_if_test_n, score_ae_test_n, alpha)


#### 9. Threshold

In [89]:
# Percentile of the fused *training* scores used as the decision boundary:
# scores above it are flagged, so roughly 15% of the training samples would be
# flagged by construction.
THRESHOLD_PERCENTILE = 85
THRESHOLD = np.percentile(final_score_train, THRESHOLD_PERCENTILE)

print("Hybrid anomaly threshold:", THRESHOLD)


Hybrid anomaly threshold: 0.004066797835814209


#### 10. Final Prediction

In [90]:
# 1 = flagged as anomaly (fused score strictly above the threshold), 0 = not flagged.
y_pred_hybrid = np.greater(final_score_test, THRESHOLD).astype(int)


#### 11. Evaluation

In [91]:
# Compute every metric first, then report them together.
# The confusion matrix and classification report use the thresholded labels;
# ROC-AUC uses the continuous fused score, so it does not depend on THRESHOLD.
cm = confusion_matrix(y_test, y_pred_hybrid)
report = classification_report(y_test, y_pred_hybrid)
roc = roc_auc_score(y_test, final_score_test)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:")
print(report)
print("ROC-AUC (Hybrid Parallel):", roc)


Confusion Matrix:
 [[ 8701  1010]
 [ 2231 10602]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.90      0.84      9711
           1       0.91      0.83      0.87     12833

    accuracy                           0.86     22544
   macro avg       0.85      0.86      0.86     22544
weighted avg       0.86      0.86      0.86     22544

ROC-AUC (Hybrid Parallel): 0.9543283757283056


#### 12. Diagnostic Outputs

In [92]:
# Fraction of test samples flagged as anomalous.
anomaly_rate = y_pred_hybrid.mean()
print("Final anomaly rate:", anomaly_rate)

# Average fused score per ground-truth class (label 0 first, then label 1).
mean_score_label0 = final_score_test[y_test == 0].mean()
mean_score_label1 = final_score_test[y_test == 1].mean()
print("Mean score (normal):", mean_score_label0)
print("Mean score (attack):", mean_score_label1)


Final anomaly rate: 0.5150816181689141
Mean score (normal): 0.001739785869572815
Mean score (attack): 0.01941114738619844
