# 05 – XGBoost Meta-Learner (Stacking)  
This notebook trains the **stacking meta model** using only the **signals** produced by the 4 base models (CatBoost is required: the loading cell fails if its files are missing):

- LightGBM probability (`lgb_proba`)
- IsolationForest anomaly risk score (`iforest_score`)
- Autoencoder reconstruction-error risk score (`ae_score`)
- CatBoost probability (`cat_proba`)

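Inputs (all read from `datapath`; the score files are produced by the base notebooks):
- `oof_lgb.csv`, `test_lgb.csv`
- `oof_iforest.csv`, `test_iforest.csv`
- `oof_ae.csv`, `test_ae.csv`
- `oof_cat.csv`, `test_cat.csv`
- `y_train.csv`, `y_test.csv` (labels, column `isFraud`)
- `train_keys.csv`, `test_keys.csv` (`TransactionDT`, and `row_id` if present)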

It produces:
- `oof_meta.csv` (OOF meta probabilities on the train period; used for analysis)
- `test_meta.csv` (meta probabilities on the future test period)
- `best_params_meta_xgb.json` (the final XGBoost parameters, including the tuned values)

Key requirements implemented:
- Time-aware training: meta train is sorted by `TransactionDT`
- **Optuna** tuning to maximize **F2-score** at a fixed threshold
- **Confusion matrix** + metrics on the test period
- **SHAP** explanation of meta model

> Threshold is set to `0.05` by default (change once at the top and it updates everywhere).

In [None]:
# --- (Optional) Install dependencies (Colab-safe)
# Each package is probed with a plain import and pip-installed into the running
# interpreter only when the import fails with ImportError. Other import-time
# failures come from a package that is already installed; re-running
# "pip install" would be a no-op there and would only hide the real error, so
# those are left to propagate.
try:
    import optuna  # noqa: F401
except ImportError:
    import importlib
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "optuna"])
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()

try:
    import shap  # noqa: F401
except ImportError:
    import importlib
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "shap"])
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()


In [None]:
import os, sys, json, random, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import optuna
import xgboost as xgb

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    fbeta_score,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    roc_curve,
)

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# -----------------------------
# Global config (EDIT HERE)
# -----------------------------
SEED = 42          # seed for Python's and NumPy's RNGs and for the models
N_SPLITS = 5       # number of TimeSeriesSplit folds
N_TRIALS = 50      # number of Optuna trials
THRESHOLD = 0.05   # decision threshold applied to the meta probabilities

# Seed both global random generators.
random.seed(SEED)
np.random.seed(SEED)

# Detect Colab + mount Drive; the data directory depends on the environment.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")
    datapath = "/content/drive/MyDrive/RThesis/"
else:
    datapath = "./"

# Create the data directory when it does not exist yet.
os.makedirs(datapath, exist_ok=True)

print("datapath =", datapath)


In [None]:
# -----------------------------
# Load base model outputs
# -----------------------------
# Every file this notebook reads; all are expected directly under `datapath`.
paths_needed = [
    "oof_lgb.csv", "test_lgb.csv",
    "oof_iforest.csv", "test_iforest.csv",
    "oof_ae.csv", "test_ae.csv",
    "oof_cat.csv", "test_cat.csv",
    "y_train.csv", "y_test.csv",
    "train_keys.csv", "test_keys.csv",
]

# Fail fast with the complete list of missing files. An explicit exception is
# raised (instead of `assert`) so the check also runs under `python -O`.
missing_files = [
    fn for fn in paths_needed
    if not os.path.exists(os.path.join(datapath, fn))
]
if missing_files:
    raise FileNotFoundError(
        f"Missing file(s) under {datapath!r}: {', '.join(missing_files)}"
    )

# Out-of-fold (train period) scores of the four base models.
oof_lgb = pd.read_csv(os.path.join(datapath, "oof_lgb.csv"))
oof_if  = pd.read_csv(os.path.join(datapath, "oof_iforest.csv"))
oof_ae  = pd.read_csv(os.path.join(datapath, "oof_ae.csv"))
oof_cat = pd.read_csv(os.path.join(datapath, "oof_cat.csv"))

# Test-period scores of the four base models.
test_lgb = pd.read_csv(os.path.join(datapath, "test_lgb.csv"))
test_if  = pd.read_csv(os.path.join(datapath, "test_iforest.csv"))
test_ae  = pd.read_csv(os.path.join(datapath, "test_ae.csv"))
test_cat = pd.read_csv(os.path.join(datapath, "test_cat.csv"))

# Binary labels, read from the `isFraud` column.
y_train = pd.read_csv(os.path.join(datapath, "y_train.csv"))["isFraud"].astype(int)
y_test  = pd.read_csv(os.path.join(datapath, "y_test.csv"))["isFraud"].astype(int)

train_keys = pd.read_csv(os.path.join(datapath, "train_keys.csv"))
test_keys  = pd.read_csv(os.path.join(datapath, "test_keys.csv"))

# The labels are attached to the key tables by position later on, so a length
# mismatch means the files do not describe the same rows.
for _split, _labels, _keys in (
    ("train", y_train, train_keys),
    ("test", y_test, test_keys),
):
    if len(_labels) != len(_keys):
        raise ValueError(
            f"{_split}: {len(_labels)} labels but {len(_keys)} key rows; "
            "the label and key files must be row-aligned."
        )

# Ensure row_id exists (fall back to the positional index of the key table).
if "row_id" not in train_keys.columns:
    train_keys["row_id"] = np.arange(len(train_keys))
if "row_id" not in test_keys.columns:
    test_keys["row_id"] = np.arange(len(test_keys))

print("Loaded base outputs.")


In [None]:
# -----------------------------
# Build meta train/test tables (merge by row_id)
# -----------------------------
# Keep only row_id and the required score column (plus has_oof / fold if present)
def slim(df, score_col):
    """Return a copy of ``df`` reduced to the columns the meta table needs.

    The result always holds ``row_id`` and ``score_col`` (in that order),
    followed by ``has_oof`` and ``fold`` whenever ``df`` has them.
    """
    keep = ["row_id", score_col]
    for optional in ("has_oof", "fold"):
        if optional in df.columns:
            keep.append(optional)
    return df[keep].copy()

def _tag_oof_columns(df, tag):
    """Rename optional ``has_oof`` / ``fold`` columns to ``<name>_<tag>``.

    The base frames may all carry these bookkeeping columns under the same
    names. Merging four such frames makes pandas fall back to its ``_x`` /
    ``_y`` suffixes twice, so the last merge re-creates ``has_oof_x`` /
    ``fold_x``; depending on the pandas version that raises a MergeError or
    yields duplicate column names. Model-specific names avoid the clash.
    """
    renames = {c: f"{c}_{tag}" for c in ("has_oof", "fold") if c in df.columns}
    return df.rename(columns=renames)


def _merge_signals(keys, frames):
    """Left-join every frame in ``frames`` onto the key table by ``row_id``.

    ``validate="one_to_one"`` makes pandas raise if ``row_id`` is duplicated
    on either side, which would otherwise silently multiply rows and break
    the positional label assignment that follows.
    """
    merged = keys[["row_id", "TransactionDT"]].copy()
    for frame in frames:
        merged = merged.merge(frame, on="row_id", how="left", validate="one_to_one")
    return merged


oof_lgb_s = _tag_oof_columns(slim(oof_lgb, "lgb_proba"), "lgb")
oof_if_s  = _tag_oof_columns(slim(oof_if, "iforest_score"), "iforest")
oof_ae_s  = _tag_oof_columns(slim(oof_ae, "ae_score"), "ae")
oof_cat_s = _tag_oof_columns(slim(oof_cat, "cat_proba"), "cat")

test_lgb_s = _tag_oof_columns(slim(test_lgb, "lgb_proba"), "lgb")
test_if_s  = _tag_oof_columns(slim(test_if, "iforest_score"), "iforest")
test_ae_s  = _tag_oof_columns(slim(test_ae, "ae_score"), "ae")
test_cat_s = _tag_oof_columns(slim(test_cat, "cat_proba"), "cat")

# Merge OOF
meta_train = _merge_signals(train_keys, [oof_lgb_s, oof_if_s, oof_ae_s, oof_cat_s])

# Labels are attached by position: y_train must be row-aligned with train_keys.
meta_train["y_true"] = y_train.values

# Filter to rows where ALL base models provide OOF (no leakage / no missing)
score_cols = ["lgb_proba", "iforest_score", "ae_score", "cat_proba"]
mask = meta_train[score_cols].notnull().all(axis=1)

# If has_oof columns exist (now named has_oof_<model>), also enforce them
for c in [c for c in meta_train.columns if c.startswith("has_oof")]:
    mask &= meta_train[c].fillna(False).astype(bool)

meta_train = meta_train[mask].copy()
# Time-aware ordering: later cells split this table with TimeSeriesSplit.
meta_train = meta_train.sort_values("TransactionDT").reset_index(drop=True)

print("Meta train shape:", meta_train.shape)
print("Meta train fraud rate:", meta_train["y_true"].mean())
print("Dropped rows (no full OOF):", int((~mask).sum()))

# Merge TEST (no row filtering here: every test row is kept)
meta_test = _merge_signals(test_keys, [test_lgb_s, test_if_s, test_ae_s, test_cat_s])
meta_test["y_true"] = y_test.values
meta_test = meta_test.sort_values("TransactionDT").reset_index(drop=True)

print("Meta test shape:", meta_test.shape)


In [None]:
# -----------------------------
# Prepare matrices
# -----------------------------
# The meta-learner sees only the four base-model signals.
feature_cols = ["lgb_proba", "iforest_score", "ae_score", "cat_proba"]

# Train period: float32 feature matrix and integer labels.
X_meta_train = meta_train[feature_cols].to_numpy().astype(np.float32)
y_meta_train = meta_train["y_true"].to_numpy().astype(int)

# Test period: same layout.
X_meta_test = meta_test[feature_cols].to_numpy().astype(np.float32)
y_meta_test = meta_test["y_true"].to_numpy().astype(int)

# scale_pos_weight base: negatives per positive, never below 1 and guarded
# against a train set without positives.
n_pos = int(y_meta_train.sum())
n_neg = int((1 - y_meta_train).sum())
base_spw = max(1.0, n_neg / max(1, n_pos))

print("Meta base scale_pos_weight:", base_spw)


In [None]:
# -----------------------------
# Helper functions
# -----------------------------
def compute_f2(y_true, y_score, threshold=THRESHOLD, beta=2.0):
    """Return the F-beta score (F2 by default) of ``y_score`` cut at ``threshold``.

    A row is predicted positive when its score is greater than or equal to
    the threshold. ``zero_division=0`` is passed to ``fbeta_score`` for the
    case where the score is undefined.
    """
    flagged = y_score >= threshold
    return fbeta_score(y_true, flagged.astype(int), beta=beta, zero_division=0)

def evaluate_binary(y_true, y_score, threshold=THRESHOLD, title="Model"):
    """Print and plot a binary-classification evaluation at a fixed threshold.

    Scores are binarised with ``y_score >= threshold``. Precision, recall, F2,
    ROC-AUC and PR-AUC are printed together with the classification report and
    the confusion matrix; the confusion matrix, the ROC curve and the
    precision-recall curve are then drawn with matplotlib.

    Args:
        y_true: Ground-truth binary labels (0 = non-fraud, 1 = fraud).
        y_score: Array of scores; must support ``>=`` and ``.astype``.
        threshold: Cut-off at or above which a row is predicted as fraud.
        title: Label used in the printed header and in the plot titles.

    Returns:
        dict with keys ``precision``, ``recall``, ``f2``, ``roc_auc``,
        ``pr_auc`` and ``cm`` (the 2x2 confusion matrix). ``roc_auc`` /
        ``pr_auc`` are NaN when the corresponding metric raises ValueError.
    """
    y_pred = (y_score >= threshold).astype(int)

    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    f2   = fbeta_score(y_true, y_pred, beta=2.0, zero_division=0)

    # The ranking metrics reject invalid input (e.g. a single class in y_true
    # on some scikit-learn versions) with ValueError; report NaN in that case
    # and let any other, unexpected error surface.
    try:
        auc = roc_auc_score(y_true, y_score)
    except ValueError:
        auc = np.nan
    try:
        ap = average_precision_score(y_true, y_score)
    except ValueError:
        ap = np.nan

    print(f"\n[{title}]  threshold={threshold}")
    print(f"Precision: {prec:.4f} | Recall: {rec:.4f} | F2: {f2:.4f} | ROC-AUC: {auc:.4f} | PR-AUC: {ap:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    # Pin the label set so the matrix is always 2x2, matching the two tick
    # labels below even when only one class occurs in y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print("Confusion matrix:\n", cm)

    fig, ax = plt.subplots()
    ax.imshow(cm)
    ax.set_title(f"{title} – Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(["Non-Fraud", "Fraud"])
    ax.set_yticklabels(["Non-Fraud", "Fraud"])
    # Write each count into its cell (x = predicted column, y = actual row).
    for (i, j), v in np.ndenumerate(cm):
        ax.text(j, i, str(v), ha="center", va="center")
    plt.show()

    # ROC + PR curves: best effort, a failure is reported and skipped.
    try:
        fpr, tpr, _ = roc_curve(y_true, y_score)
        plt.figure()
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], linestyle="--")  # chance diagonal
        plt.title(f"{title} – ROC Curve")
        plt.xlabel("FPR")
        plt.ylabel("TPR")
        plt.show()
    except Exception as e:
        print("ROC curve skipped:", e)

    try:
        p, r, _ = precision_recall_curve(y_true, y_score)
        plt.figure()
        plt.plot(r, p)
        plt.title(f"{title} – Precision-Recall Curve")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.show()
    except Exception as e:
        print("PR curve skipped:", e)

    return {"precision": prec, "recall": rec, "f2": f2, "roc_auc": auc, "pr_auc": ap, "cm": cm}


In [None]:
# -----------------------------
# Optuna tuning (maximize mean CV F2)
# -----------------------------
def make_params(trial):
    """Sample one XGBoost hyper-parameter set from the Optuna ``trial``.

    The ``scale_pos_weight`` range spans half to twice the module-level
    ``base_spw`` on a log scale; all other ranges are fixed.
    """
    space = {}
    space["n_estimators"] = trial.suggest_int("n_estimators", 200, 3000)
    space["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    space["max_depth"] = trial.suggest_int("max_depth", 2, 8)
    space["min_child_weight"] = trial.suggest_float("min_child_weight", 1.0, 20.0)
    space["subsample"] = trial.suggest_float("subsample", 0.6, 1.0)
    space["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    space["gamma"] = trial.suggest_float("gamma", 0.0, 5.0)
    space["reg_alpha"] = trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True)
    space["reg_lambda"] = trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True)
    space["scale_pos_weight"] = trial.suggest_float(
        "scale_pos_weight", 0.5 * base_spw, 2.0 * base_spw, log=True
    )
    return space

def objective(trial):
    """Optuna objective: mean F2 at ``THRESHOLD`` over the time-series folds.

    One XGBoost classifier is fitted per ``TimeSeriesSplit`` fold of the meta
    train matrix with the parameters sampled for this trial, and scored on
    that fold's validation slice.
    """
    params = make_params(trial)
    splitter = TimeSeriesSplit(n_splits=N_SPLITS)
    fold_scores = []

    for train_idx, valid_idx in splitter.split(X_meta_train):
        clf = xgb.XGBClassifier(
            objective="binary:logistic",
            random_state=SEED,
            n_jobs=-1,
            tree_method="hist",
            eval_metric="aucpr",
            **params
        )
        clf.fit(X_meta_train[train_idx], y_meta_train[train_idx])

        # Probability of the positive class on the held-out slice.
        valid_proba = clf.predict_proba(X_meta_train[valid_idx])[:, 1]
        fold_scores.append(
            compute_f2(y_meta_train[valid_idx], valid_proba, threshold=THRESHOLD)
        )

    return float(np.mean(fold_scores))

# Seed the sampler explicitly: Optuna's default sampler draws its own random
# seed, so without this the search is not reproducible even though SEED is
# set for NumPy, `random` and the models. TPE is Optuna's default
# single-objective sampler, so only the seeding changes.
sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(
    direction="maximize",
    study_name="meta_xgb_f2",
    sampler=sampler,
)
study.optimize(objective, n_trials=N_TRIALS)

# Parameters of the trial with the highest mean CV F2.
best_params = study.best_trial.params
print("Best CV F2:", study.best_value)
print("Best params:\n", json.dumps(best_params, indent=2))


In [None]:
# -----------------------------
# Train final meta model on full meta-train
# -----------------------------
# Fixed settings first, then the tuned values from the Optuna study.
final_params = dict(
    objective="binary:logistic",
    random_state=SEED,
    n_jobs=-1,
    tree_method="hist",
    eval_metric="aucpr",
)
final_params.update(best_params)

# Persist the exact parameter set used for the final model.
best_params_path = os.path.join(datapath, "best_params_meta_xgb.json")
with open(best_params_path, "w") as f:
    f.write(json.dumps(final_params, indent=2))
print("Saved best params to:", best_params_path)

# Fit on the complete meta-train table.
meta_model = xgb.XGBClassifier(**final_params)
meta_model.fit(X_meta_train, y_meta_train)

# Test evaluation: positive-class probabilities on the test period.
meta_test_proba = meta_model.predict_proba(X_meta_test)[:, 1]
_ = evaluate_binary(
    y_meta_test,
    meta_test_proba,
    threshold=THRESHOLD,
    title="Meta XGBoost (Test)",
)


In [None]:
# -----------------------------
# OOF predictions for meta model (optional but useful for analysis)
# -----------------------------
tss = TimeSeriesSplit(n_splits=N_SPLITS)

# Per-row OOF buffers: NaN / -1 / False mark rows no fold ever validated on.
oof_meta = np.full(len(X_meta_train), np.nan, dtype=float)
oof_fold = np.full(len(X_meta_train), -1, dtype=int)
has_oof = np.zeros(len(X_meta_train), dtype=bool)

for fold, (tr_idx, va_idx) in enumerate(tss.split(X_meta_train), start=1):
    X_tr, X_va = X_meta_train[tr_idx], X_meta_train[va_idx]
    y_tr, y_va = y_meta_train[tr_idx], y_meta_train[va_idx]

    # Fresh model per fold, using the final (tuned) parameter set.
    m = xgb.XGBClassifier(**final_params)
    m.fit(X_tr, y_tr)
    proba_va = m.predict_proba(X_va)[:, 1]

    # Record the predictions and the fold that produced them.
    has_oof[va_idx] = True
    oof_fold[va_idx] = fold
    oof_meta[va_idx] = proba_va

    fold_f2 = compute_f2(y_va, proba_va)
    print(f"Fold {fold}/{N_SPLITS} F2={fold_f2:.4f}")

# Evaluate only on the rows that actually received an OOF prediction.
mask = has_oof
_ = evaluate_binary(
    y_meta_train[mask],
    oof_meta[mask],
    threshold=THRESHOLD,
    title="Meta XGBoost OOF (Train)",
)
print("OOF coverage:", mask.mean())


In [None]:
# -----------------------------
# Save meta predictions
# -----------------------------
# OOF table: label and threshold are left NaN for rows without an OOF score.
oof_columns = {
    "row_id": meta_train["row_id"].values,
    "TransactionDT": meta_train["TransactionDT"].values,
    "y_true": y_meta_train,
    "has_oof": has_oof,
    "fold": oof_fold,
    "meta_proba": oof_meta,
    "pred_label": np.where(has_oof, (oof_meta >= THRESHOLD).astype(int), np.nan),
    "threshold_used": np.where(has_oof, THRESHOLD, np.nan),
}
oof_out = pd.DataFrame(oof_columns)

# Test table: every row has a prediction from the final meta model.
test_columns = {
    "row_id": meta_test["row_id"].values,
    "TransactionDT": meta_test["TransactionDT"].values,
    "y_true": y_meta_test,
    "meta_proba": meta_test_proba,
    "pred_label": (meta_test_proba >= THRESHOLD).astype(int),
    "threshold_used": THRESHOLD,
}
test_out = pd.DataFrame(test_columns)

# Write both tables next to the inputs, without the DataFrame index.
oof_path = os.path.join(datapath, "oof_meta.csv")
oof_out.to_csv(oof_path, index=False)

test_path = os.path.join(datapath, "test_meta.csv")
test_out.to_csv(test_path, index=False)

print("Saved:", oof_path)
print("Saved:", test_path)


In [None]:
# -----------------------------
# SHAP for meta model (very interpretable: the only features are the 4 base-model signals, CatBoost included)
# -----------------------------
import shap

# Use a small, seeded sample of the meta-train rows for plotting.
SAMPLE_SIZE = min(2000, len(X_meta_train))
rng = np.random.RandomState(SEED)
idx = rng.choice(len(X_meta_train), size=SAMPLE_SIZE, replace=False)
X_shap = pd.DataFrame(X_meta_train[idx], columns=feature_cols)

explainer = shap.TreeExplainer(meta_model)
shap_values = explainer.shap_values(X_shap)

# shap_values for binary can be a two-element list; keep the second entry then.
is_pair = isinstance(shap_values, list) and len(shap_values) == 2
shap_vals_to_plot = shap_values[1] if is_pair else shap_values

# Default summary plot first, then the bar variant.
for extra_kwargs, plot_title in (
    ({}, "Meta XGBoost – SHAP Summary (Train Sample)"),
    ({"plot_type": "bar"}, "Meta XGBoost – SHAP Importance (Train Sample)"),
):
    shap.summary_plot(shap_vals_to_plot, X_shap, show=False, **extra_kwargs)
    plt.title(plot_title)
    plt.show()
