# 03 – Isolation Forest Base Model (OOF + Test)  
This notebook trains an **Isolation Forest** anomaly detector on the scaled + one-hot encoded dataset produced by:

- `01_preprocessing_real_time_split.ipynb`

It produces:
- `oof_iforest.csv` (out-of-fold anomaly *risk scores* for the **train** period)
- `test_iforest.csv` (risk scores for the **future test** period)

Key requirements implemented:
- **TimeSeriesSplit** for OOF (no leakage)
- **Optuna** hyperparameter tuning to maximize **F2-score** at a fixed threshold
- Converts raw anomaly scores to a **0–1 risk score** via **MinMax scaling** fit on the training fold only (validation/test scores that fall outside the training range can therefore land outside 0–1)
- **Confusion matrix** + metrics on the **test** period (labels used only for evaluation)
- **SHAP** explanation (TreeExplainer if available; otherwise Kernel fallback)

> The threshold defaults to `0.05`. Change `THRESHOLD` once in the global config cell and re-run the cells below it so the new value is used everywhere.

In [None]:
# --- Optional dependency bootstrap (safe to run on Colab).
# Each package is imported first; pip is only invoked when that import fails.
import subprocess
import sys


def _pip_install_quietly(package):
    """Install *package* with the pip of the running interpreter (quiet mode)."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])


try:
    import optuna  # noqa: F401
except Exception:
    _pip_install_quietly("optuna")

try:
    import shap  # noqa: F401
except Exception:
    _pip_install_quietly("shap")


In [None]:
import os, sys, json, random, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import optuna

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    fbeta_score,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    roc_curve,
)

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# -----------------------------
# Global config (EDIT HERE)
# -----------------------------
SEED = 42          # seed applied to the global RNGs below
N_SPLITS = 5       # number of time-ordered CV folds
N_TRIALS = 30      # number of hyperparameter-search trials
THRESHOLD = 0.05   # cut-off applied to the scaled risk score

# Seed both global random number generators.
random.seed(SEED)
np.random.seed(SEED)

# Inside Google Colab: mount Drive and keep all artifacts there.
# Anywhere else: work out of the current directory.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    from google.colab import drive  # type: ignore

    drive.mount("/content/drive")
    datapath = "/content/drive/MyDrive/RThesis/"
else:
    datapath = "./"

os.makedirs(datapath, exist_ok=True)
print("datapath =", datapath)


In [None]:
# -----------------------------
# Load preprocessed scaled datasets
# -----------------------------
def _data_file(filename):
    """Return the path of *filename* inside the data directory."""
    return os.path.join(datapath, filename)


def _row_ids(keys, n_rows):
    """Use the `row_id` column of *keys* when present, else 0..n_rows-1."""
    if "row_id" in keys.columns:
        return keys["row_id"].values
    return np.arange(n_rows)


X_train_path, X_test_path = _data_file("X_train_scaled.csv"), _data_file("X_test_scaled.csv")
y_train_path, y_test_path = _data_file("y_train.csv"), _data_file("y_test.csv")
train_keys_path, test_keys_path = _data_file("train_keys.csv"), _data_file("test_keys.csv")

# Fail fast, naming the first required input that is missing.
required_inputs = (
    X_train_path, X_test_path,
    y_train_path, y_test_path,
    train_keys_path, test_keys_path,
)
for p in required_inputs:
    assert os.path.exists(p), f"Missing file: {p}"

# Feature matrices.
X_train = pd.read_csv(X_train_path)
X_test = pd.read_csv(X_test_path)

# Labels: only the `isFraud` column is kept, cast to integers.
y_train = pd.read_csv(y_train_path)["isFraud"].astype(int)
y_test = pd.read_csv(y_test_path)["isFraud"].astype(int)

# Row identifiers used when exporting predictions.
train_keys = pd.read_csv(train_keys_path)
test_keys = pd.read_csv(test_keys_path)
row_id_train = _row_ids(train_keys, len(X_train))
row_id_test = _row_ids(test_keys, len(X_test))

# Features, labels and ids must all describe the same number of rows.
assert len({len(X_train), len(y_train), len(row_id_train)}) == 1, "Train alignment mismatch"
assert len({len(X_test), len(y_test), len(row_id_test)}) == 1, "Test alignment mismatch"

print("X_train:", X_train.shape, " | fraud rate:", y_train.mean())
print("X_test :", X_test.shape,  " | fraud rate:", y_test.mean())


In [None]:
# -----------------------------
# Helper functions
# -----------------------------
def compute_f2(y_true, y_score, threshold=THRESHOLD, beta=2.0):
    """Return the F-beta score (F2 by default) of `y_score` cut at `threshold`.

    Scores greater than or equal to `threshold` are labelled 1, all others 0.
    `zero_division=0` is forwarded to `fbeta_score`.
    """
    flagged = y_score >= threshold
    return fbeta_score(
        y_true,
        flagged.astype(int),
        beta=beta,
        zero_division=0,
    )

def evaluate_binary(y_true, y_score, threshold=THRESHOLD, title="Model"):
    """Print and plot binary-classification diagnostics for thresholded scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = non-fraud, 1 = fraud).
    y_score : array-like supporting ``>=`` and ``.astype`` (e.g. numpy array)
        Continuous scores; values ``>= threshold`` are predicted as 1.
    threshold : float
        Cut-off applied to `y_score`.
    title : str
        Label used in the printed header and in every plot title.

    Returns
    -------
    dict
        Keys ``precision``, ``recall``, ``f2``, ``roc_auc``, ``pr_auc``, ``cm``.
        ``roc_auc`` / ``pr_auc`` are NaN when they cannot be computed.
        ``cm`` is always a 2x2 matrix ordered [non-fraud, fraud].
    """
    y_pred = (y_score >= threshold).astype(int)

    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    f2   = fbeta_score(y_true, y_pred, beta=2.0, zero_division=0)

    # Ranking metrics are best-effort: if sklearn cannot compute one for this
    # slice, report NaN instead of aborting the evaluation.
    try:
        auc = roc_auc_score(y_true, y_score)
    except Exception:
        auc = np.nan
    try:
        ap = average_precision_score(y_true, y_score)
    except Exception:
        ap = np.nan

    print(f"\n[{title}]  threshold={threshold}")
    print(f"Precision: {prec:.4f} | Recall: {rec:.4f} | F2: {f2:.4f} | ROC-AUC: {auc:.4f} | PR-AUC: {ap:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    # Pin both classes explicitly so the matrix stays 2x2 (and matches the
    # tick labels below) even when only one class appears in this slice.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print("Confusion matrix:\n", cm)

    fig, ax = plt.subplots()
    ax.imshow(cm)
    ax.set_title(f"{title} – Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_xticks([0,1]); ax.set_yticks([0,1])
    ax.set_xticklabels(["Non-Fraud","Fraud"])
    ax.set_yticklabels(["Non-Fraud","Fraud"])
    # Write each cell count on top of its tile (x = column j, y = row i).
    for (i, j), v in np.ndenumerate(cm):
        ax.text(j, i, str(v), ha="center", va="center")
    plt.show()

    # ROC + PR curves (each one is skipped with a message if it cannot be drawn)
    try:
        fpr, tpr, _ = roc_curve(y_true, y_score)
        plt.figure()
        plt.plot(fpr, tpr)
        plt.plot([0,1],[0,1], linestyle="--")  # chance diagonal
        plt.title(f"{title} – ROC Curve")
        plt.xlabel("FPR"); plt.ylabel("TPR")
        plt.show()
    except Exception as e:
        print("ROC curve skipped:", e)

    try:
        p, r, _ = precision_recall_curve(y_true, y_score)
        plt.figure()
        plt.plot(r, p)
        plt.title(f"{title} – Precision-Recall Curve")
        plt.xlabel("Recall"); plt.ylabel("Precision")
        plt.show()
    except Exception as e:
        print("PR curve skipped:", e)

    return {"precision": prec, "recall": rec, "f2": f2, "roc_auc": auc, "pr_auc": ap, "cm": cm}

def raw_anomaly_score(model: IsolationForest, X: pd.DataFrame) -> np.ndarray:
    """Return per-row anomaly scores where larger means more anomalous.

    `decision_function` ranks inliers above outliers, so its sign is flipped
    here to make the result usable as a fraud-risk signal.
    """
    inlier_score = model.decision_function(X)
    return np.negative(inlier_score).astype(float)

def scaled_score_from_train(train_raw: np.ndarray, other_raw: np.ndarray):
    """Min-max scale `other_raw` using the range observed in `train_raw`.

    Returns `(scaler, scaled)`: the fitted `MinMaxScaler` and the transformed
    `other_raw`, flattened back to 1-D. The scaler is fitted on `train_raw`
    only and never sees `other_raw`.

    NOTE(review): no clipping is requested from the scaler, so values of
    `other_raw` outside the train range presumably map outside [0, 1] — confirm
    that downstream consumers are fine with that.
    """
    train_column = train_raw.reshape(-1, 1)
    other_column = other_raw.reshape(-1, 1)
    scaler = MinMaxScaler().fit(train_column)
    scaled = scaler.transform(other_column).ravel()
    return scaler, scaled


In [None]:
# -----------------------------
# Optuna hyperparameter tuning (maximize mean CV F2)
# -----------------------------
def make_params(trial):
    """Build `IsolationForest` keyword arguments for one Optuna trial.

    Five hyperparameters are sampled from the trial; the seed, parallelism and
    verbosity are fixed.

    NOTE(review): `contamination` appears to only shift `decision_function` by
    a constant offset, which a min-max rescaling of the scores would cancel —
    confirm whether tuning it can influence the objective at all.
    """
    tuned = {}
    tuned["n_estimators"] = trial.suggest_int("n_estimators", 100, 800)
    tuned["max_samples"] = trial.suggest_float("max_samples", 0.4, 1.0)
    tuned["max_features"] = trial.suggest_float("max_features", 0.4, 1.0)
    tuned["contamination"] = trial.suggest_float("contamination", 0.001, 0.1, log=True)
    tuned["bootstrap"] = trial.suggest_categorical("bootstrap", [False, True])

    fixed = {"random_state": SEED, "n_jobs": -1, "verbose": 0}
    return {**tuned, **fixed}

def objective(trial):
    """Optuna objective: mean F2 across the time-ordered validation folds.

    For every `TimeSeriesSplit` fold a fresh forest is fitted on the earlier
    rows, the later rows are scored, and the scores are min-max scaled with the
    range seen on the fitting rows before being cut at `THRESHOLD`.
    """
    params = make_params(trial)
    splitter = TimeSeriesSplit(n_splits=N_SPLITS)
    fold_scores = []

    for train_rows, valid_rows in splitter.split(X_train):
        X_fit = X_train.iloc[train_rows]
        X_valid = X_train.iloc[valid_rows]
        y_valid = y_train.iloc[valid_rows].values

        forest = IsolationForest(**params)
        forest.fit(X_fit)

        # Scale validation scores with the fitting rows' range only.
        _, valid_risk = scaled_score_from_train(
            raw_anomaly_score(forest, X_fit),
            raw_anomaly_score(forest, X_valid),
        )
        fold_scores.append(compute_f2(y_valid, valid_risk, threshold=THRESHOLD))

    return float(np.mean(fold_scores))

# Seed the sampler explicitly: seeding numpy/random alone does not make the
# Optuna search repeatable, because the sampler draws from its own generator.
# TPE is the sampler Optuna uses for a single-objective study when none is
# given, so only reproducibility changes here, not the search strategy.
sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(direction="maximize", study_name="iforest_f2", sampler=sampler)
study.optimize(objective, n_trials=N_TRIALS)

# Report the winning trial (tuned hyperparameters only).
best_params = study.best_trial.params
print("Best CV F2:", study.best_value)
print("Best params:\n", json.dumps(best_params, indent=2))


In [None]:
# -----------------------------
# Final params + save
# -----------------------------
# Tuned values first; the fixed, non-tuned settings are then applied on top.
final_params = dict(best_params)
final_params.update(random_state=SEED, n_jobs=-1, verbose=0)

# Persist the full parameter set next to the other artifacts.
best_params_path = os.path.join(datapath, "best_params_iforest.json")
with open(best_params_path, "w") as f:
    f.write(json.dumps(final_params, indent=2))
print("Saved best params to:", best_params_path)


In [None]:
# -----------------------------
# Generate OOF anomaly risk scores (0–1) with best params
# Note: TimeSeriesSplit does NOT give OOF for the very first chunk.
# Those rows remain NaN and are excluded from meta training (no leakage).
# -----------------------------
n_train = len(X_train)
tss = TimeSeriesSplit(n_splits=N_SPLITS)

# Per-row outputs; rows that never land in a validation fold keep their
# initial values (NaN score, fold -1, has_oof False).
oof_score = np.full(n_train, np.nan, dtype=float)
oof_fold = np.full(n_train, -1, dtype=int)
has_oof = np.zeros(n_train, dtype=bool)

for fold, (tr_idx, va_idx) in enumerate(tss.split(X_train), start=1):
    X_tr = X_train.iloc[tr_idx]
    X_va = X_train.iloc[va_idx]
    y_va = y_train.iloc[va_idx]

    model = IsolationForest(**final_params).fit(X_tr)

    # Validation scores are scaled with the range seen on this fold's
    # training rows only.
    tr_raw = raw_anomaly_score(model, X_tr)
    va_raw = raw_anomaly_score(model, X_va)
    va_score = scaled_score_from_train(tr_raw, va_raw)[1]

    oof_score[va_idx], oof_fold[va_idx], has_oof[va_idx] = va_score, fold, True

    fold_f2 = compute_f2(y_va.values, va_score)
    print(f"Fold {fold}/{N_SPLITS} F2={fold_f2:.4f}")

# Only rows that actually received an out-of-fold prediction are evaluated.
mask = has_oof
_ = evaluate_binary(
    y_train.values[mask],
    oof_score[mask],
    threshold=THRESHOLD,
    title="IsolationForest OOF (Train)",
)
print("OOF coverage:", mask.mean(), "(fraction of train rows used for meta)")


In [None]:
# -----------------------------
# Fit final model on FULL train (unsupervised), score test
# -----------------------------
final_model = IsolationForest(**final_params).fit(X_train)

train_raw_full = raw_anomaly_score(final_model, X_train)
test_raw = raw_anomaly_score(final_model, X_test)

# Same scaling recipe as in the folds: fit the scaler on train scores only,
# then apply it to the test scores.
scaler_full, test_score = scaled_score_from_train(train_raw_full, test_raw)

_ = evaluate_binary(y_test.values, test_score, threshold=THRESHOLD, title="IsolationForest (Test)")

# ---- Export: out-of-fold predictions for the train period.
# Rows without an OOF prediction get NaN for both label and threshold.
oof_pred = (oof_score >= THRESHOLD).astype(int)
oof_columns = {
    "row_id": row_id_train,
    "y_true": y_train.values,
    "has_oof": has_oof,
    "fold": oof_fold,
    "iforest_score": oof_score,
    "pred_label": np.where(has_oof, oof_pred, np.nan),
    "threshold_used": np.where(has_oof, THRESHOLD, np.nan),
}
oof_df = pd.DataFrame(oof_columns)

# ---- Export: predictions for the test period.
test_df = pd.DataFrame({
    "row_id": row_id_test,
    "y_true": y_test.values,
    "iforest_score": test_score,
})
test_df["pred_label"] = (test_score >= THRESHOLD).astype(int)
test_df["threshold_used"] = THRESHOLD

oof_path = os.path.join(datapath, "oof_iforest.csv")
test_path = os.path.join(datapath, "test_iforest.csv")
oof_df.to_csv(oof_path, index=False)
test_df.to_csv(test_path, index=False)

print("Saved:", oof_path)
print("Saved:", test_path)


In [None]:
# -----------------------------
# SHAP – explain anomaly risk score
# -----------------------------
import shap

# Explain a random subset of the test rows to keep the run time manageable.
SAMPLE_SIZE = min(500, len(X_test))
shap_sample = X_test.sample(n=SAMPLE_SIZE, random_state=SEED)

# Preferred path: TreeExplainer. Any failure in this path (including the
# plotting calls) switches to the slower KernelExplainer below.
# NOTE(review): TreeExplainer is given the forest itself, whereas the fallback
# explains the min-max scaled risk score — presumably the two are on different
# scales (possibly opposite sign); confirm before comparing the plots.
try:
    explainer = shap.TreeExplainer(final_model)
    shap_values = explainer.shap_values(shap_sample)

    # Some SHAP versions return a list; plot its first entry in that case.
    is_nonempty_list = isinstance(shap_values, list) and len(shap_values) > 0
    shap_vals_to_plot = shap_values[0] if is_nonempty_list else shap_values

    # Two views of the same values: the default summary plot, then the bar plot.
    for plot_kwargs, plot_title in (
        ({}, "IsolationForest – SHAP Summary (Test Sample)"),
        ({"plot_type": "bar"}, "IsolationForest – SHAP Importance (Test Sample)"),
    ):
        shap.summary_plot(shap_vals_to_plot, shap_sample, show=False, **plot_kwargs)
        plt.title(plot_title)
        plt.show()

except Exception as e:
    print("TreeExplainer failed; using KernelExplainer fallback. Reason:", repr(e))

    n_background = min(100, len(X_train))
    background = X_train.sample(n=n_background, random_state=SEED)

    def score_fn(X_np):
        """Map a plain feature matrix to the final model's scaled risk score."""
        frame = pd.DataFrame(X_np, columns=X_train.columns)
        raw_column = raw_anomaly_score(final_model, frame).reshape(-1, 1)
        return scaler_full.transform(raw_column).ravel()

    explainer = shap.KernelExplainer(score_fn, background)
    shap_values = explainer.shap_values(shap_sample, nsamples=200)

    shap.summary_plot(shap_values, shap_sample, show=False)
    plt.title("IsolationForest – SHAP Summary (KernelExplainer, Test Sample)")
    plt.show()
