# 06 — CatBoost Base Model (TimeSeries OOF + Optuna + SHAP)

This notebook:

- Loads CatBoost-ready datasets from **01b_preprocessing_catboost_dataset.ipynb**
- Trains **CatBoost** with a strict **time-aware split** (TimeSeriesSplit)
- Tunes hyperparameters with **Optuna** to maximize **F2** at a fixed **threshold = 0.05**
- Produces:
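  - `oof_cat.csv` (train OOF predictions; rows in the earliest time block are never used as a validation fold by TimeSeriesSplit and keep the placeholder value 0)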
  - `test_cat.csv` (test predictions)
- Reports confusion matrices on **OOF** and **TEST**
- Shows a **Precision–Recall curve**
- Shows a **SHAP** summary plot (CatBoost `ShapValues` feature importance) computed on a random sample of the training set


In [None]:
# --- Install deps (Colab-safe)
import sys, subprocess, importlib

def _pip(pkg: str):
    """Quietly pip-install ``pkg`` using the interpreter that runs this kernel.

    Raises ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    install_cmd = [sys.executable, "-m", "pip", "install", "-q"]
    install_cmd.append(pkg)
    subprocess.check_call(install_cmd)

# pip distribution name -> module name used to probe whether it is importable.
required_modules = {
    "catboost": "catboost",
    "optuna": "optuna",
    "tqdm": "tqdm",
    "shap": "shap",
    "scikit-learn": "sklearn",
}

for pkg, module_name in required_modules.items():
    try:
        importlib.import_module(module_name)
    except Exception:
        # Any import failure triggers an install attempt for that distribution.
        _pip(pkg)

print("Installed.")


In [None]:
import os, sys, json, random, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier, Pool
import optuna

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    fbeta_score,
    average_precision_score,
    precision_recall_curve,
)

from tqdm.auto import tqdm

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

THRESHOLD = 0.05  # fixed threshold requested; change later if needed

# Seed the stdlib and NumPy global RNGs with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Choose datapath automatically (works in Colab or local)
# Colab is detected by the presence of the `google.colab` module in this process.
IN_COLAB = "google.colab" in sys.modules

def _find_datapath(paths):
    """Return the first entry of ``paths`` that contains ``X_train_cat.csv``.

    Entries are checked in order; ``None`` is returned when no entry holds the file
    (including when ``paths`` is empty).
    """
    marker_file = "X_train_cat.csv"
    matches = (p for p in paths if os.path.exists(os.path.join(p, marker_file)))
    return next(matches, None)

# First try local / Colab working directory without Drive
candidate_paths = ["./", "/content/"]
datapath = _find_datapath(candidate_paths)

# If not found and running in Colab, try mounting Drive and searching there
if datapath is None and IN_COLAB:
    try:
        from google.colab import drive  # type: ignore
        drive.mount("/content/drive")
    except Exception as mount_error:
        # Best-effort: keep going, but say why Drive was skipped so the
        # FileNotFoundError below is not the only clue the user gets.
        print(f"Google Drive mount failed ({type(mount_error).__name__}: {mount_error}); skipping Drive lookup.")
    else:
        candidate_paths.append("/content/drive/MyDrive/RThesis/")
        datapath = _find_datapath(candidate_paths)

if datapath is None:
    raise FileNotFoundError(
        "Could not find X_train_cat.csv. Run 01b_preprocessing_catboost_dataset.ipynb first, "
        "and ensure its outputs are in one of: " + ", ".join(candidate_paths)
    )

print("datapath =", datapath)

# --- GPU detection (CatBoost uses GPU, not TPU)
# Any failure while importing or querying CatBoost is treated as "no GPU".
gpu_count = 0
try:
    from catboost.utils import get_gpu_device_count
    gpu_count = get_gpu_device_count()
except Exception:
    gpu_count = 0

_device_label = "GPU" if gpu_count > 0 else "CPU"
print("CatBoost GPU devices:", gpu_count, "| Using:", _device_label)


In [None]:
# --- Load CatBoost-ready data
X_train_path = os.path.join(datapath, "X_train_cat.csv")
X_test_path  = os.path.join(datapath, "X_test_cat.csv")
y_train_path = os.path.join(datapath, "y_train.csv")
y_test_path  = os.path.join(datapath, "y_test.csv")
train_keys_path = os.path.join(datapath, "train_keys.csv")
test_keys_path  = os.path.join(datapath, "test_keys.csv")
cat_cols_path   = os.path.join(datapath, "cat_feature_cols.json")  # optional; dtype fallback below

# Report every missing input at once instead of stopping at the first one.
required_paths = [X_train_path, X_test_path, y_train_path, y_test_path, train_keys_path, test_keys_path]
missing_paths = [p for p in required_paths if not os.path.exists(p)]
if missing_paths:
    raise FileNotFoundError("Missing: " + ", ".join(missing_paths))

X_train = pd.read_csv(X_train_path)
X_test  = pd.read_csv(X_test_path)

y_train = pd.read_csv(y_train_path)["isFraud"]
y_test  = pd.read_csv(y_test_path)["isFraud"]

train_keys = pd.read_csv(train_keys_path)
test_keys  = pd.read_csv(test_keys_path)

# --- Fail fast on misaligned inputs.
# Features, labels and keys are combined positionally when the OOF/test outputs are
# written, so a mismatch would otherwise only surface after all training has finished.
if not (len(X_train) == len(y_train) == len(train_keys)):
    raise ValueError(
        f"Train row counts differ: X={len(X_train)}, y={len(y_train)}, keys={len(train_keys)}"
    )
if not (len(X_test) == len(y_test) == len(test_keys)):
    raise ValueError(
        f"Test row counts differ: X={len(X_test)}, y={len(y_test)}, keys={len(test_keys)}"
    )
for keys_name, keys_df in (("train_keys", train_keys), ("test_keys", test_keys)):
    missing_key_cols = {"row_id", "TransactionID"} - set(keys_df.columns)
    if missing_key_cols:
        raise ValueError(f"{keys_name} is missing columns: {sorted(missing_key_cols)}")

# Categorical positions (cat_idx) are derived from X_train, so X_test must share its layout.
if set(X_test.columns) != set(X_train.columns):
    raise ValueError(
        "X_train and X_test have different columns: "
        f"{sorted(set(X_train.columns) ^ set(X_test.columns))}"
    )
if list(X_test.columns) != list(X_train.columns):
    X_test = X_test[X_train.columns]

# categorical columns
if os.path.exists(cat_cols_path):
    with open(cat_cols_path, "r") as f:
        cat_cols = json.load(f)
else:
    # No saved list: treat every object-dtype column as categorical.
    cat_cols = [c for c in X_train.columns if X_train[c].dtype == "object"]

# Drop listed columns that are not present, then resolve positional indices for CatBoost.
cat_cols = [c for c in cat_cols if c in X_train.columns]
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]


# Ensure categorical columns are strings (CatBoost expects strings/ints for categories)
for c in cat_cols:
    X_train[c] = X_train[c].astype("string").fillna("missing")
    X_test[c]  = X_test[c].astype("string").fillna("missing")

print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)
print("Train fraud%:", float(y_train.mean())*100, "| Test fraud%:", float(y_test.mean())*100)
print("Categorical cols:", len(cat_cols))


In [None]:
# --- Utilities
def evaluate_at_threshold(y_true, proba, thr=0.05):
    """Score probability predictions at a fixed decision threshold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Predictions are produced as 0/1, so labels are
        expected to use the same 0/1 encoding.
    proba : numpy.ndarray
        Predicted probability of the positive class, one value per row.
    thr : float, default 0.05
        Rows with ``proba >= thr`` are predicted positive.

    Returns
    -------
    dict
        ``"cm"`` (2x2 confusion matrix, rows = true class, columns = predicted class),
        ``"precision"``, ``"recall"``, ``"f2"`` (F-beta with beta=2) and
        ``"ap"`` (average precision computed from the raw probabilities).
    """
    pred = (proba >= thr).astype(int)
    # Pin the label order so the matrix stays 2x2 even when only one class
    # shows up in both y_true and pred (otherwise it collapses to 1x1).
    cm = confusion_matrix(y_true, pred, labels=[0, 1])
    prec = precision_score(y_true, pred, zero_division=0)
    rec = recall_score(y_true, pred, zero_division=0)
    f2 = fbeta_score(y_true, pred, beta=2, zero_division=0)
    # Average precision is threshold-free: it uses the probabilities, not `pred`.
    ap = average_precision_score(y_true, proba)
    return {"cm": cm, "precision": prec, "recall": rec, "f2": f2, "ap": ap}

def plot_pr_curve(y_true, proba, title="Precision–Recall"):
    """Plot precision (y-axis) against recall (x-axis) for ``proba`` and show the figure.

    ``y_true`` and ``proba`` are passed straight to ``precision_recall_curve``;
    ``title`` becomes the plot title. Nothing is returned.
    """
    precision_vals, recall_vals, _ = precision_recall_curve(y_true, proba)
    _, ax = plt.subplots()
    ax.plot(recall_vals, precision_vals)
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(title)
    ax.grid(True)
    plt.show()

def build_cb_params(trial=None, best_params=None):
    """Assemble the keyword arguments for ``CatBoostClassifier``.

    The result always contains the fixed base settings plus device settings chosen
    from the module-level ``gpu_count``. On top of that:

    - if ``trial`` is given, tunable hyperparameters are sampled from it
      (``best_params`` is ignored in that case);
    - otherwise, if ``best_params`` is given, it is merged in and overrides
      any key already present.

    Returns a new dict on every call.
    """
    # GPU if available, otherwise let CatBoost use all CPU threads.
    if gpu_count > 0:
        device_params = {"task_type": "GPU", "devices": "0"}
    else:
        device_params = {"thread_count": -1}

    params = {
        "loss_function": "Logloss",
        "eval_metric": "PRAUC",
        "auto_class_weights": "Balanced",  # fast imbalance handling (replaces SMOTE)
        "random_seed": SEED,
        "allow_writing_files": False,
        "verbose": False,
        **device_params,
    }

    if trial is None:
        if best_params is not None:
            params.update(best_params)
        return params

    # Suggestions are requested one by one, in a fixed order.
    params["iterations"] = trial.suggest_int("iterations", 600, 2500)
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
    params["depth"] = trial.suggest_int("depth", 4, 9)
    params["l2_leaf_reg"] = trial.suggest_float("l2_leaf_reg", 1e-2, 50.0, log=True)
    params["random_strength"] = trial.suggest_float("random_strength", 1e-3, 10.0, log=True)
    params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.0, 3.0)
    params["border_count"] = trial.suggest_int("border_count", 64, 200)
    return params


In [None]:
# --- Optuna tuning (time-aware CV)
# Tip: if still slow, set N_SPLITS_OPTUNA = 2 and/or reduce n_trials.
# Number of TimeSeriesSplit folds evaluated inside every Optuna trial; each fold fits one CatBoost model.
N_SPLITS_OPTUNA = 3
# Splitter shared by all trials (consumed by `objective` via tscv_opt.split(X_train)).
# NOTE(review): a time-aware split is only meaningful if X_train rows are in chronological order — confirm in the preprocessing notebook.
tscv_opt = TimeSeriesSplit(n_splits=N_SPLITS_OPTUNA)

def objective(trial: optuna.Trial) -> float:
    """Optuna objective: F2 at ``THRESHOLD`` over time-ordered validation folds.

    For each ``tscv_opt`` split a CatBoost model is trained with the trial's
    hyperparameters (early stopping on the validation fold) and its validation
    probabilities are collected. After each fold the validation PRAUC is reported
    to Optuna so the study's pruner can stop unpromising trials.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample hyperparameters and to report intermediate values.

    Returns
    -------
    float
        F2 score at ``THRESHOLD`` computed on the rows that received a
        validation prediction.

    Raises
    ------
    optuna.TrialPruned
        When the pruner decides to stop the trial after a fold.
    """
    params = build_cb_params(trial=trial)

    oof = np.zeros(len(X_train), dtype=float)
    # TimeSeriesSplit never places the earliest block of rows in a validation fold.
    # Track which rows were actually predicted so the untouched zeros are not
    # scored as confident "not fraud" predictions.
    scored = np.zeros(len(X_train), dtype=bool)

    for fold, (tr_idx, va_idx) in enumerate(tqdm(tscv_opt.split(X_train), total=N_SPLITS_OPTUNA, leave=False, desc="Optuna folds")):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx].values, y_train.iloc[va_idx].values

        train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
        val_pool   = Pool(X_va, y_va, cat_features=cat_idx)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=150)

        oof[va_idx] = model.predict_proba(X_va)[:, 1]
        scored[va_idx] = True

        # Prune bad trials early using fold PRAUC.
        # The metric is looked up by prefix so a suffixed key (e.g. "PRAUC:...") still matches.
        val_scores = (model.best_score_ or {}).get("validation", {})
        fold_prauc = next(
            (score for name, score in val_scores.items() if name.startswith("PRAUC")),
            None,
        )
        if fold_prauc is not None:
            trial.report(float(fold_prauc), step=fold)
            # TrialPruned is an Exception subclass, so it must be raised outside any
            # broad `except Exception` handler or the pruning request is silently lost.
            if trial.should_prune():
                raise optuna.TrialPruned()

    metrics = evaluate_at_threshold(y_train.values[scored], oof[scored], thr=THRESHOLD)
    return float(metrics["f2"])  # maximize F2 at fixed threshold

# Number of hyperparameter configurations the study will evaluate.
N_TRIALS = 25

# Median pruner configured with 2 startup trials and 1 warm-up step; TPE sampler seeded with SEED.
pruner  = optuna.pruners.MedianPruner(n_startup_trials=2, n_warmup_steps=1)
sampler = optuna.samplers.TPESampler(seed=SEED)

study = optuna.create_study(
    sampler=sampler,
    pruner=pruner,
    direction="maximize",
)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

for label, value in (("Best F2:", study.best_value), ("Best params:", study.best_params)):
    print(label, value)


In [None]:
# --- Train full OOF with best params (5 folds)
best = study.best_params.copy()
best_params = build_cb_params(best_params=best)

N_SPLITS_OOF = 5
tscv = TimeSeriesSplit(n_splits=N_SPLITS_OOF)

oof_proba = np.zeros(len(X_train), dtype=float)
# TimeSeriesSplit never validates the earliest block of rows, so those entries of
# oof_proba stay at their 0.0 placeholder. oof_covered marks rows with a real prediction.
oof_covered = np.zeros(len(X_train), dtype=bool)

for fold, (tr_idx, va_idx) in enumerate(tqdm(tscv.split(X_train), total=N_SPLITS_OOF, desc="OOF folds")):
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx].values, y_train.iloc[va_idx].values

    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    val_pool   = Pool(X_va, y_va, cat_features=cat_idx)

    # Early stopping monitors the validation fold; the best iteration's model is kept.
    model = CatBoostClassifier(**best_params)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=150)

    oof_proba[va_idx] = model.predict_proba(X_va)[:, 1]
    oof_covered[va_idx] = True

print(f"OOF coverage: {int(oof_covered.sum())}/{len(oof_covered)} train rows have an out-of-fold prediction")

# OOF evaluation — restricted to covered rows. Scoring the placeholder zeros would
# count every fraud in the never-validated block as a false negative.
y_oof_eval = y_train.values[oof_covered]
proba_oof_eval = oof_proba[oof_covered]

oof_metrics = evaluate_at_threshold(y_oof_eval, proba_oof_eval, thr=THRESHOLD)
print(f"\nOOF @ threshold {THRESHOLD} metrics:", {k: v for k, v in oof_metrics.items() if k != "cm"})
print("Confusion matrix (OOF):\n", oof_metrics["cm"])
plot_pr_curve(y_oof_eval, proba_oof_eval, title="CatBoost OOF Precision–Recall")


In [None]:
# --- Train final model (fit best_iteration on last fold) and evaluate on test
# Use the last time split as a validation set to choose best_iteration, then refit on full train.
last_split = list(tscv.split(X_train))[-1]
tr_idx, va_idx = last_split

X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
y_tr, y_va = y_train.iloc[tr_idx].values, y_train.iloc[va_idx].values

tmp_model = CatBoostClassifier(**best_params)
tmp_model.fit(
    Pool(X_tr, y_tr, cat_features=cat_idx),
    eval_set=Pool(X_va, y_va, cat_features=cat_idx),
    use_best_model=True,
    early_stopping_rounds=150,
)

# get_best_iteration() is a zero-based index: reproducing the selected model
# takes best_iter + 1 trees, not best_iter.
best_iter = tmp_model.get_best_iteration()
if best_iter is None or best_iter <= 0:
    # No usable early-stopping result: fall back to the configured iteration budget.
    final_iterations = int(best_params.get("iterations", 1500))
else:
    final_iterations = int(best_iter) + 1
print("Using best_iteration =", best_iter, "| final iterations =", final_iterations)

final_params = best_params.copy()
final_params["iterations"] = final_iterations

# Refit on the whole training set with the fixed tree count (no eval set, so no early stopping).
final_model = CatBoostClassifier(**final_params)
final_model.fit(Pool(X_train, y_train.values, cat_features=cat_idx), verbose=False)

test_proba = final_model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= THRESHOLD).astype(int)  # hard labels at the fixed threshold
test_metrics = evaluate_at_threshold(y_test.values, test_proba, thr=THRESHOLD)

print(f"\nTEST @ threshold {THRESHOLD} metrics:", {k: v for k, v in test_metrics.items() if k != "cm"})
print("Confusion matrix (TEST):\n", test_metrics["cm"])
print("\nClassification report (TEST):\n", classification_report(y_test.values, test_pred, zero_division=0))
plot_pr_curve(y_test.values, test_proba, title="CatBoost TEST Precision–Recall")


In [None]:
# --- Save OOF + TEST outputs for stacking (aligned by row_id)
# TimeSeriesSplit never validates the earliest block of train rows, so their entries in
# oof_proba are still the 0.0 placeholder. Recompute which rows were validated and flag
# them, so consumers of oof_cat.csv can tell real predictions from placeholders.
oof_has_pred = np.zeros(len(X_train), dtype=bool)
for fold_tr_idx, fold_va_idx in tscv.split(X_train):
    oof_has_pred[fold_va_idx] = True

oof_df = pd.DataFrame({
    "row_id": train_keys["row_id"].values,
    "TransactionID": train_keys["TransactionID"].values,
    "y_true": y_train.values,
    "cat_proba": oof_proba,
    "pred_label": (oof_proba >= THRESHOLD).astype(int),
    "threshold_used": THRESHOLD,
    "has_oof_pred": oof_has_pred,  # False => cat_proba is a placeholder, not a model output
})
test_df = pd.DataFrame({
    "row_id": test_keys["row_id"].values,
    "TransactionID": test_keys["TransactionID"].values,
    "y_true": y_test.values,
    "cat_proba": test_proba,
    "pred_label": (test_proba >= THRESHOLD).astype(int),
    "threshold_used": THRESHOLD,
})

# Outputs are written next to the input datasets.
oof_path  = os.path.join(datapath, "oof_cat.csv")
test_path = os.path.join(datapath, "test_cat.csv")
oof_df.to_csv(oof_path, index=False)
test_df.to_csv(test_path, index=False)

print("Saved:", oof_path)
print("Saved:", test_path)
print(f"OOF rows without a prediction (has_oof_pred == False): {int((~oof_has_pred).sum())}")


In [None]:
# --- SHAP (CatBoost SHAPValues) on a sample for speed
import shap

# Explain at most 3000 training rows, drawn without replacement from NumPy's global RNG.
SAMPLE_N = min(3000, len(X_train))
sample_idx = np.random.choice(len(X_train), size=SAMPLE_N, replace=False)

X_shap = X_train.iloc[sample_idx].copy()
y_shap = y_train.iloc[sample_idx].values
pool_shap = Pool(data=X_shap, label=y_shap, cat_features=cat_idx)

# CatBoost returns SHAP values with an extra last column = expected value
shap_vals = final_model.get_feature_importance(data=pool_shap, type="ShapValues")
n_shap_features = shap_vals.shape[1] - 1
shap_values = shap_vals[:, :n_shap_features]  # keep per-feature columns only

plt.figure()
shap.summary_plot(shap_values, X_shap, max_display=25, show=True)
