In [3]:
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# Load preprocessed data
X_processed = np.load("dataset/X_processed.npy")
y_true = np.load("dataset/y_true.npy")

# Initialize and train Isolation Forest
iso_forest = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
    verbose=1
)
iso_forest.fit(X_processed)

# Predict anomaly scores (-1 for anomaly, 1 for normal)
y_pred_raw = iso_forest.predict(X_processed)

# Convert to binary format (1 = anomaly, 0 = normal)
y_pred = (y_pred_raw == -1).astype(int)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.7s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.7s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.4s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.7s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.7s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.5s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.5s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.2s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    3.2s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.3s finished
[Parallel(n_j

Confusion Matrix:
[[ 544464  428317]
 [3577414  348236]]

Classification Report:
              precision    recall  f1-score   support

           0     0.1321    0.5597    0.2137    972781
           1     0.4484    0.0887    0.1481   3925650

    accuracy                         0.1822   4898431
   macro avg     0.2903    0.3242    0.1809   4898431
weighted avg     0.3856    0.1822    0.1611   4898431



##### Trained an unsupervised Isolation Forest model on the preprocessed data. The baseline showed **low accuracy (\~18%)** and **poor recall for attacks (\~9%)**, indicating it missed many anomalies. Since performance was insufficient, we proceeded to **Step 4A.2: Hyperparameter tuning** to improve detection.


In [7]:
# Define hyperparameter grid
contamination_vals = [0.1, 0.2, 0.3]
max_samples_vals = ['auto', 0.2, 0.5]
n_estimators_vals = [100, 200]

# Track best configuration
best_f1 = 0
best_params = {}
all_results = []

# Manual tuning loop
for contamination in contamination_vals:
    for max_samples in max_samples_vals:
        for n_estimators in n_estimators_vals:
            print(f"\nTraining with contamination={contamination}, max_samples={max_samples}, n_estimators={n_estimators}")

            # Train model
            model = IsolationForest(
                contamination=contamination,
                max_samples=max_samples,
                n_estimators=n_estimators,
                random_state=42,
                verbose=0,
                n_jobs=-1

            )
            model.fit(X_processed)

            # Predict: -1 = anomaly, 1 = normal
            y_pred_raw = model.predict(X_processed)
            y_pred = (y_pred_raw == -1).astype(int)

            # Evaluate performance
            report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
            f1 = report['1']['f1-score']
            recall = report['1']['recall']
            precision = report['1']['precision']

            # Store results
            all_results.append({
                'contamination': contamination,
                'max_samples': max_samples,
                'n_estimators': n_estimators,
                'f1': f1,
                'recall': recall,
                'precision': precision
            })

            # Update best
            if f1 > best_f1:
                best_f1 = f1
                best_params = {
                    'contamination': contamination,
                    'max_samples': max_samples,
                    'n_estimators': n_estimators
                }

# Display best configuration
print("\n Best Configuration Found:")
print(best_params)
print(f"Best F1-score (attack class): {best_f1:.4f}")



Training with contamination=0.1, max_samples=auto, n_estimators=100

Training with contamination=0.1, max_samples=auto, n_estimators=200

Training with contamination=0.1, max_samples=0.2, n_estimators=100

Training with contamination=0.1, max_samples=0.2, n_estimators=200

Training with contamination=0.1, max_samples=0.5, n_estimators=100

Training with contamination=0.1, max_samples=0.5, n_estimators=200

Training with contamination=0.2, max_samples=auto, n_estimators=100

Training with contamination=0.2, max_samples=auto, n_estimators=200

Training with contamination=0.2, max_samples=0.2, n_estimators=100

Training with contamination=0.2, max_samples=0.2, n_estimators=200

Training with contamination=0.2, max_samples=0.5, n_estimators=100

Training with contamination=0.2, max_samples=0.5, n_estimators=200

Training with contamination=0.3, max_samples=auto, n_estimators=100

Training with contamination=0.3, max_samples=auto, n_estimators=200

Training with contamination=0.3, max_samp

In [9]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Best parameters from tuning
final_model = IsolationForest(
    contamination=best_params['contamination'],
    max_samples=best_params['max_samples'],
    n_estimators=best_params['n_estimators'],
    random_state=42,
    n_jobs=-1
)

# Train on full data
final_model.fit(X_processed)

# Predict anomalies
y_pred_final_raw = final_model.predict(X_processed)
y_pred_final = (y_pred_final_raw == -1).astype(int)

# Evaluate
print("\n Final Evaluation on Full Data:")
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred_final))

print("\nClassification Report:")
print(classification_report(y_true, y_pred_final, zero_division=0))

# Optional: Save the model
joblib.dump(final_model, 'models/isolation_forest_final.pkl')



 Final Evaluation on Full Data:
Confusion Matrix:
[[ 391241  581540]
 [3038633  887017]]

Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.40      0.18    972781
           1       0.60      0.23      0.33   3925650

    accuracy                           0.26   4898431
   macro avg       0.36      0.31      0.25   4898431
weighted avg       0.51      0.26      0.30   4898431



['models/isolation_forest_final.pkl']

In [None]:
from sklearn.ensemble import IsolationForest

# Best hyperparameters found from tuning
best_params = {
    'contamination': 0.3,
    'max_samples': 'auto',
    'n_estimators': 200
}

# Train Isolation Forest
iso_model = IsolationForest(
    contamination=best_params['contamination'],
    max_samples=best_params['max_samples'],
    n_estimators=best_params['n_estimators'],
    random_state=42,
    n_jobs=-1
)
iso_model.fit(X_processed)


y_pred_iso = (iso_model.predict(X_processed) == -1).astype(int)

# Evaluation
print("Isolation Forest Final Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred_iso))
print("\nClassification Report:")
print(classification_report(y_true, y_pred_iso, zero_division=0))


📊 Isolation Forest Final Evaluation:
Confusion Matrix:
[[ 391241  581540]
 [3038633  887017]]

Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.40      0.18    972781
           1       0.60      0.23      0.33   3925650

    accuracy                           0.26   4898431
   macro avg       0.36      0.31      0.25   4898431
weighted avg       0.51      0.26      0.30   4898431

