# Importing Libs

In [48]:
# ---- toggles ----
# Notebook-wide switches read by main().
LOG_WANDB   = True    # when True, main() opens a Weights & Biases run and logs metrics/figures to it
SAVE_MODELS = True    # when True, main() is asked to persist the CatBoost model and the Platt calibrator

# optional (only used if LOG_WANDB=True)
# The import is best-effort: if wandb cannot be imported for any reason the name
# is bound to None so the rest of the notebook still loads.
try:
    import wandb
except Exception:
    wandb = None  # harmless when LOG_WANDB=False
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from typing import Dict, List
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, log_loss
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV
import argparse, time, os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.impute import SimpleImputer

from catboost import CatBoostClassifier, Pool
import joblib


# Initialization & Helper Functions

In [None]:
"""
Q6 — Final single-model (CatBoost) — LOCKED 17-FEATURE SET
- Target: is_goal
- Split: stratified 80/20, seed 42
- Features (exact 17):
  BASE (7): distance_from_net, rebound, period, last_event_distance,
            shot_angle, shot_type, period_time_seconds
  RAW (4):  time_since_last_event, angle_change, event_speed, last_event_type
  DERIVED (6): log_distance, abs_angle, cos_angle,
               dist_x_abs_angle, rush, big_turn
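- W&B logging and model saving are controlled by the LOG_WANDB / SAVE_MODELS toggles.
- Validation figures are written to figs_q6/; AUC, PR-AUC, Brier score, log-loss
  and training time are printed.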
"""

'\nQ6 — Best Shot (Expected Goals)\n- data: advanced_train.csv (train/val split), test is untouched here\n- target: is_goal (1 = goal)\n- split: stratified 80/20, seed 42 (single split reused by all models)\n- preprocessing: train-only imputation (numeric=median, categorical=most_frequent),\n  OneHotEncoder(handle_unknown=\'ignore\', dense output), booleans -> 0/1\n- models:\n    1) q6_lr_baseline         : LogisticRegression(class_weight=\'balanced\')\n    2) q6_hgb_tuned           : HistGradientBoostingClassifier (light randomized search)\n    3) q6_rf_tuned            : RandomForestClassifier (light randomized search)\n    4) q6_hgb_calibrated      : CalibratedClassifierCV(sigmoid) over best HGB\n- evaluation (validation only):\n    - ROC + AUC\n    - Goal rate vs model percentile\n    - Cumulative % of goals vs percentile\n    - Reliability (calibration) diagram\n- W&B:\n    - reinit=True, resume="never"\n    - group=\'q6\', run names start with \'q6_...\'\n    - log only val_auc p

In [50]:
# ----------------------------- constants ----------------------------- #

# Reproducibility / data split
SEED = 42
VAL_SIZE = 0.20

# Weights & Biases identifiers
PROJECT = "milestone_2"
ENTITY = None        # or set your team entity explicitly, e.g., "IFT6758_team4"
GROUP = "q6"

# Default number of percentile bins for the percentile-based curves
N_BINS_PERCENTILES = 100

# Name of the binary target column
TARGET = "is_goal"

# Feature groups; FEATURES is their concatenation in BASE, RAW, DERIVED order.
BASE = [
    "distance_from_net",
    "rebound",
    "period",
    "last_event_distance",
    "shot_angle",
    "shot_type",
    "period_time_seconds",
]
RAW = [
    "time_since_last_event",
    "angle_change",
    "event_speed",
    "last_event_type",
]
DERIVED = [
    "log_distance",
    "abs_angle",
    "cos_angle",
    "dist_x_abs_angle",
    "rush",
    "big_turn",
]
FEATURES = [*BASE, *RAW, *DERIVED]

# Columns treated as categoricals
CAT_COLS = ["shot_type", "last_event_type", "period"]


In [51]:
def compute_goalrate_vs_percentile(y_true, y_prob, n_bins=N_BINS_PERCENTILES):
    """Goal rate inside each model-probability percentile bin, highest bin first.

    The probability range is cut into ``n_bins`` equal-width percentile bins.
    Each bin is labelled by its upper percentile edge, so with the default of
    100 bins the labels are 100, 99, ..., 1.

    Parameters
    ----------
    y_true : array-like
        Binary outcomes (1 = goal), one per shot.
    y_prob : array-like
        Predicted probabilities, aligned with ``y_true``.
    n_bins : int
        Number of percentile bins (must be >= 1).

    Returns
    -------
    x_percentiles : np.ndarray of length ``n_bins``
        Upper percentile edge of each bin, in descending order.
    rates : np.ndarray of length ``n_bins``
        Mean of ``y_true`` inside each bin; NaN for bins that contain no shot
        (e.g. when many predictions are tied) or when the input is empty.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be >= 1")

    y = np.asarray(y_true)
    p = np.asarray(y_prob)

    # Upper edge of every bin. Integer labels are kept whenever the bin width
    # is a whole number of percentiles (this includes the default 100 bins).
    if 100 % n_bins == 0:
        x_percentiles = np.arange(100, 0, -(100 // n_bins))
    else:
        x_percentiles = np.linspace(100.0, 0.0, n_bins, endpoint=False)

    rates = np.full(n_bins, np.nan, dtype=float)
    if p.size == 0:
        return x_percentiles, rates

    width = 100.0 / n_bins
    upper_q = x_percentiles / 100.0
    lower_q = np.clip(x_percentiles - width, 0.0, None) / 100.0

    # One vectorised quantile call per edge set instead of two calls per bin.
    hi = np.quantile(p, upper_q, method="linear")
    lo = np.quantile(p, lower_q, method="linear")

    last = n_bins - 1
    for i in range(n_bins):
        # Bins are (lo, hi]; the lowest bin is closed on both sides so the
        # minimum prediction is not dropped.
        if i == last:
            mask = (p <= hi[i]) & (p >= lo[i])
        else:
            mask = (p <= hi[i]) & (p > lo[i])
        if mask.any():
            rates[i] = y[mask].mean()
    return x_percentiles, rates


def compute_cum_goals_vs_percentile(y_true, y_prob, n_bins=N_BINS_PERCENTILES):
    """Cumulative share of goals captured while sweeping from high to low probability.

    Shots are ranked by predicted probability (highest first) and split into
    ``n_bins`` equal-sized percentile bins, each labelled by its upper
    percentile edge (100, 99, ..., 1 for the default 100 bins). The value at a
    label is the fraction of all goals found in that bin and every
    higher-probability bin, so the curve starts near 0 at the 100th percentile
    and reaches 1.0 at the lowest bin.

    Parameters
    ----------
    y_true : array-like
        Binary outcomes (1 = goal), one per shot.
    y_prob : array-like
        Predicted probabilities, aligned with ``y_true``.
    n_bins : int
        Number of percentile bins (must be >= 1).

    Returns
    -------
    x_percentiles : np.ndarray of length ``n_bins``
        Upper percentile edge of each bin, in descending order.
    curve : np.ndarray of length ``n_bins``
        Cumulative proportion of goals; all zeros when the input is empty.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be >= 1")

    y = np.asarray(y_true)
    p = np.asarray(y_prob)

    # Same labelling as the goal-rate curve: integer labels when the bin width
    # is a whole number of percentiles, fractional labels otherwise.
    if 100 % n_bins == 0:
        x_percentiles = np.arange(100, 0, -(100 // n_bins))
    else:
        x_percentiles = np.linspace(100.0, 0.0, n_bins, endpoint=False)

    n = len(y)
    if n == 0:
        return x_percentiles, np.zeros(n_bins, dtype=float)

    # Stable sort keeps tied predictions in input order, so the curve is
    # reproducible from run to run.
    order = np.argsort(-p, kind="stable")
    cum_goals = np.cumsum(y[order])
    total_goals = max(1, y.sum())  # avoid 0/0 when there are no goals at all

    # Number of top-ranked shots covered once the first (i + 1) bins are swept.
    # Integer arithmetic avoids floor() surprises from float rounding.
    covered = (np.arange(1, n_bins + 1) * n) // n_bins
    covered = np.clip(covered, 1, n)

    curve = cum_goals[covered - 1] / total_goals
    return x_percentiles, curve.astype(float)

In [52]:
def plot_roc(ax, y_true, curves: Dict[str, np.ndarray]):
    """Draw a ROC curve per entry of ``curves`` on ``ax``, plus the chance diagonal.

    ``curves`` maps a legend label to the predicted scores for ``y_true``;
    each legend entry also shows that model's AUC.
    """
    # Reference line for a random classifier.
    ax.plot([0, 1], [0, 1], linestyle="--", label="chance (45°)")

    for model_name in curves:
        scores = curves[model_name]
        false_pos, true_pos, _thresholds = roc_curve(y_true, scores)
        area = roc_auc_score(y_true, scores)
        ax.plot(false_pos, true_pos, label=f"{model_name} (AUC={area:.3f})")

    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC curve (validation)")
    ax.legend(loc="lower right")


def plot_goalrate(ax, y_true, curves: Dict[str, np.ndarray]):
    """Plot goal rate per model-percentile bin on ``ax``, one line per entry of ``curves``.

    The x-axis is inverted after plotting.
    """
    for model_name in curves:
        percentiles, goal_rates = compute_goalrate_vs_percentile(
            y_true, curves[model_name]
        )
        ax.plot(percentiles, goal_rates, label=model_name)

    ax.set_xlabel("Shot probability model percentile (high→low)")
    ax.set_ylabel("Goals / Shots")
    ax.set_title("Goal Rate vs Model Percentile (validation)")
    ax.invert_xaxis()
    ax.legend()


def plot_cum_goals(ax, y_true, curves: Dict[str, np.ndarray]):
    """Plot the cumulative proportion of goals vs model percentile on ``ax``.

    One line is drawn per entry of ``curves`` (legend label -> predicted
    probabilities). The x-axis is inverted after plotting.
    """
    for model_name in curves:
        percentiles, captured = compute_cum_goals_vs_percentile(
            y_true, curves[model_name]
        )
        ax.plot(percentiles, captured, label=model_name)

    ax.set_xlabel("Shot probability model percentile (high→low)")
    ax.set_ylabel("Proportion of goals")
    ax.set_title("Cumulative % of Goals vs Percentile (validation)")
    ax.invert_xaxis()
    ax.legend(loc="lower right")


def plot_calibration(ax, y_true, curves: Dict[str, np.ndarray], n_bins=10):
    """Draw a reliability curve per entry of ``curves`` on ``ax``.

    Each curve is produced by ``CalibrationDisplay.from_predictions`` with
    ``n_bins`` bins; the dictionary key is used as the legend name.
    """
    for model_name in curves:
        CalibrationDisplay.from_predictions(
            y_true,
            curves[model_name],
            n_bins=n_bins,
            name=model_name,
            ax=ax,
        )

    ax.set_xlabel("Predicted probability")
    ax.set_ylabel("Observed frequency")
    ax.set_title("Reliability (Calibration) — validation")

In [53]:
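# NOTE: superseded draft of add_features / make_pool, kept commented out for reference only.
# The active definitions are in the next cell.
# # === your helper functions must be defined/imported above these calls ===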
# def add_features(df: pd.DataFrame) -> pd.DataFrame:
#     out = df.copy()
#     out["rebound"] = out["rebound"].astype(float)
#     out["log_distance"] = np.log1p(out["distance_from_net"])
#     out["abs_angle"] = np.abs(out["shot_angle"])
#     out["cos_angle"] = np.cos(np.deg2rad(out["shot_angle"]))
#     out["dist_x_abs_angle"] = out["distance_from_net"] * out["abs_angle"]
#     out["rush"] = (out["time_since_last_event"] <= 2).astype(float)
#     out["big_turn"] = (out["angle_change"] >= 30).astype(float)
#     return out

# def make_pool(X: pd.DataFrame, y=None):
#     # avoid SettingWithCopy warnings
#     X = X.copy()

#     # Impute missing values in categorical columns (optional; CatBoost can handle NaNs)
#     imp = SimpleImputer(strategy="most_frequent")
#     for col in CAT_COLS:
#         if col in X.columns:
#             X.loc[:, col] = imp.fit_transform(X[[col]])[:, 0]

#     cat_idx = [X.columns.get_loc(c) for c in CAT_COLS if c in X.columns]
#     return Pool(X, label=y, cat_features=cat_idx)


In [54]:
# === your helper functions must be defined/imported above these calls ===
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the derived features added.

    ``rebound`` is cast to float, six derived columns are appended
    (``log_distance``, ``abs_angle``, ``cos_angle``, ``dist_x_abs_angle``,
    ``rush``, ``big_turn``), and every column listed in ``CAT_COLS`` that is
    present has its missing values replaced by the string ``"missing"`` before
    being cast to ``str``. The input frame is not modified.
    """
    distance = df["distance_from_net"]
    angle = df["shot_angle"]
    abs_angle = np.abs(angle)

    enriched = df.assign(
        rebound=df["rebound"].astype(float),
        log_distance=np.log1p(distance),
        abs_angle=abs_angle,
        cos_angle=np.cos(np.deg2rad(angle)),  # shot_angle is converted from degrees
        dist_x_abs_angle=distance * abs_angle,
        rush=(df["time_since_last_event"] <= 2).astype(float),
        big_turn=(df["angle_change"] >= 30).astype(float),
    )

    # Categoricals are normalised to plain strings here so make_pool can pass
    # them on without further cleaning.
    present_cats = [name for name in CAT_COLS if name in enriched.columns]
    for name in present_cats:
        enriched[name] = enriched[name].fillna("missing").astype(str)
    return enriched

def make_pool(X: pd.DataFrame, y=None):
    """Wrap a feature frame (and optional labels) in a CatBoost ``Pool``.

    Columns named in ``CAT_COLS`` that exist in ``X`` are declared as
    categorical features by position. No cleaning of those columns is done
    here; add_features already converts them to strings.
    """
    frame = X.copy()  # work on a copy so the caller's frame is left alone

    cat_positions = []
    for name in CAT_COLS:
        if name in frame.columns:
            cat_positions.append(frame.columns.get_loc(name))

    return Pool(frame, label=y, cat_features=cat_positions)

# Model Building & Evaluation

In [55]:
def cat_params():
    """Return a fresh dict of keyword arguments for ``CatBoostClassifier``.

    A new dict is built on every call, so callers may modify the result freely.
    """
    params = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "auto_class_weights": "Balanced",
        "iterations": 800,
        "early_stopping_rounds": 50,
        "depth": 8,
        "learning_rate": 0.05,
        "l2_leaf_reg": 3,
        "random_seed": SEED,
        "use_best_model": True,
        "verbose": False,
        "bagging_temperature": 0.0,
    }
    return params

In [56]:
def _save_val_figure(plot_fn, y_true, curves, out_path, **plot_kwargs):
    """Render one figure with ``plot_fn(ax, y_true, curves, ...)`` and save it to ``out_path``."""
    fig, ax = plt.subplots(figsize=(7, 5))
    try:
        plot_fn(ax, y_true, curves, **plot_kwargs)
        fig.savefig(out_path, dpi=160, bbox_inches="tight")
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)


def main(args):
    """Train, calibrate, evaluate and (optionally) log/save the final CatBoost model.

    Steps:
      1. Read ``args.train_csv``, check the required columns, add derived features.
      2. Stratified train/validation split (``VAL_SIZE``, ``SEED``).
      3. Fit CatBoost on the training split and score the validation split.
      4. Fit a Platt (logistic) calibrator on 3-fold out-of-fold training
         predictions and apply it to the validation scores.
      5. Fit a HistGradientBoosting comparator for the figures.
      6. Print metrics, write four validation figures to ``figs_q6/``.
      7. If ``SAVE_MODELS``: write the model and calibrator to disk.
         If ``LOG_WANDB``: log metrics, figures and (when saved) a model artifact.

    Parameters
    ----------
    args : object with a ``train_csv`` attribute
        Path to the training CSV.

    Raises
    ------
    ImportError
        If ``LOG_WANDB`` is True but ``wandb`` could not be imported.
    ValueError
        If the CSV lacks any of the BASE / RAW / TARGET columns.
    """
    # Fail fast with a clear message instead of an AttributeError on None later.
    if LOG_WANDB and wandb is None:
        raise ImportError(
            "LOG_WANDB=True but the 'wandb' package could not be imported; "
            "install wandb or set LOG_WANDB=False."
        )

    path = Path(args.train_csv)
    df = pd.read_csv(path)

    needed = set(BASE + RAW + [TARGET])
    missing = needed - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns for locked 17-feature setup: {missing}")

    df = add_features(df)
    Xy = df[FEATURES + [TARGET]].copy()

    train_df, val_df = train_test_split(
        Xy, test_size=VAL_SIZE, random_state=SEED, stratify=Xy[TARGET]
    )
    X_tr, y_tr = train_df[FEATURES], train_df[TARGET].values
    X_va, y_va = val_df[FEATURES], val_df[TARGET].values

    # ---------------- W&B init (optional) ----------------
    run = None
    if LOG_WANDB:
        run = wandb.init(
            project=PROJECT,
            entity=ENTITY,
            group=GROUP,
            name="q6_final_cat_calibrated_locked17",
            tags=["q6", "final", "catboost", "calibrated", "locked17"],
            config={"seed": SEED, "val_size": VAL_SIZE, "features": FEATURES},
            resume="never",
        )

    # The run is closed in the ``finally`` below; exit_code stays 1 unless
    # everything succeeds, so a failed run is not reported as finished cleanly.
    exit_code = 1
    try:
        # ---------------- CatBoost final (raw) ----------------
        dtr = make_pool(X_tr, y_tr)
        dva = make_pool(X_va, y_va)

        # NOTE(review): the validation pool is also the early-stopping /
        # best-iteration eval set, so the validation metrics reported below
        # may be somewhat optimistic — confirm this is acceptable.
        cb_final = CatBoostClassifier(**cat_params())
        t0 = time.time()
        cb_final.fit(dtr, eval_set=dva)
        p_cat_raw = cb_final.predict_proba(dva)[:, 1]
        dt_cb = time.time() - t0  # covers fit + validation prediction

        auc_raw = roc_auc_score(y_va, p_cat_raw)
        ap_raw = average_precision_score(y_va, p_cat_raw)

        # ---------------- OOF Platt calibration (train only) ----------------
        cat_idx = [X_tr.columns.get_loc(c) for c in CAT_COLS if c in X_tr.columns]
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
        oof = np.zeros(len(X_tr), dtype=float)

        for tr_idx, oof_idx in skf.split(X_tr, y_tr):
            Xt, yt = X_tr.iloc[tr_idx], y_tr[tr_idx]
            Xo, yo = X_tr.iloc[oof_idx], y_tr[oof_idx]

            cb = CatBoostClassifier(**cat_params())
            cb.fit(Xt, yt, cat_features=cat_idx, eval_set=(Xo, yo))
            oof[oof_idx] = cb.predict_proba(Xo)[:, 1]

        # Very large C makes the logistic fit effectively unregularised.
        platt = LogisticRegression(
            penalty="l2", C=1e6, solver="lbfgs", max_iter=1000
        )
        platt.fit(oof.reshape(-1, 1), y_tr)

        p_cat_cal = platt.predict_proba(p_cat_raw.reshape(-1, 1))[:, 1]

        auc_cal = roc_auc_score(y_va, p_cat_cal)
        ap_cal = average_precision_score(y_va, p_cat_cal)
        brier = brier_score_loss(y_va, p_cat_cal)
        lloss = log_loss(y_va, p_cat_cal, labels=[0, 1])

        # ---------------- fast HGB comparator (optional curve) ----------------
        num_cols = [c for c in FEATURES if c not in CAT_COLS]
        cat_cols = [c for c in FEATURES if c in CAT_COLS]
        pre = ColumnTransformer(
            transformers=[
                ("num", SimpleImputer(strategy="median"), num_cols),
                ("cat", Pipeline([
                    ("imp", SimpleImputer(strategy="most_frequent")),
                    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False, min_frequency=0.01))
                ]), cat_cols),
            ],
            remainder="drop",
        )
        hgb = HistGradientBoostingClassifier(
            early_stopping=True, random_state=SEED,
            max_leaf_nodes=63, learning_rate=0.08,
            min_samples_leaf=50, l2_regularization=0.01,
        )
        pipe_hgb = Pipeline([("pre", pre), ("clf", hgb)])
        pipe_hgb.fit(X_tr, y_tr)
        p_hgb = pipe_hgb.predict_proba(X_va)[:, 1]

        # ---------------- prints ----------------
        print("\n===== Q6 Final CatBoost — Calibrated (LOCKED 17) =====")
        print(f"train: {len(train_df):,} | val: {len(val_df):,} | features: {len(FEATURES)}")
        print(f"CatBoost RAW   AUC={auc_raw:.4f}  PR-AUC={ap_raw:.4f}   time={dt_cb:.1f}s")
        print(f"CatBoost CAL   AUC={auc_cal:.4f}  PR-AUC={ap_cal:.4f}   Brier={brier:.4f}  LogLoss={lloss:.4f}")
        print("======================================================\n")

        # ---------------- figures (your helpers) ----------------
        os.makedirs("figs_q6", exist_ok=True)

        curves = {
            "catboost (calibrated)": p_cat_cal,
            "hgb": p_hgb,
            "random": np.random.RandomState(SEED).rand(len(y_va)),
        }

        # W&B log key -> (plot helper, output file, extra keyword arguments)
        figure_specs = {
            "roc_val": (plot_roc, "figs_q6/q6_roc_val.png", {}),
            "goalrate_val": (plot_goalrate, "figs_q6/q6_goalrate_val.png", {}),
            "cum_goals_val": (plot_cum_goals, "figs_q6/q6_cum_goals_val.png", {}),
            "calibration_val": (plot_calibration, "figs_q6/q6_calibration_val.png", {"n_bins": 10}),
        }
        for plot_fn, out_path, plot_kwargs in figure_specs.values():
            _save_val_figure(plot_fn, y_va, curves, out_path, **plot_kwargs)

        # ---------------- model saving (independent of W&B) ----------------
        model_path = "q6_catboost_final.cbm"
        platt_path = "q6_platt.joblib"
        if SAVE_MODELS:
            cb_final.save_model(model_path)
            joblib.dump(platt, platt_path)

        # ---------------- W&B logging + artifact (optional) ----------------
        if run is not None:
            payload = {
                "val/auc_cat_raw": auc_raw,
                "val/ap_cat_raw": ap_raw,
                "val/auc_cat_cal": auc_cal,
                "val/ap_cat_cal": ap_cal,
                "val/brier_cat_cal": brier,
                "val/logloss_cat_cal": lloss,
            }
            for key, (_plot_fn, out_path, _kwargs) in figure_specs.items():
                payload[key] = wandb.Image(out_path)
            wandb.log(payload)

            if SAVE_MODELS:
                art = wandb.Artifact("q6_catboost_final_calibrated", type="model",
                                     metadata={"features": FEATURES, "locked": "17", "calibration": "platt_oof"})
                art.add_file(model_path)
                art.add_file(platt_path)
                run.log_artifact(art)

        exit_code = 0
    finally:
        if run is not None:
            wandb.finish(exit_code=exit_code)

In [57]:
# Entry point. parse_args is given an empty list so the notebook kernel's own
# command-line arguments are not parsed; the default CSV path is therefore used
# unless the default below is edited.
if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("--train_csv", type=str, default="../ift6758/data/milestone2/advanced_train.csv",
                    help="Path to the training CSV (default: %(default)s)")
    args = ap.parse_args(args=[]) # Pass an empty list to avoid notebook arguments
    main(args)

wandb: Currently logged in as: aftabgazali003 (IFT6758_team4) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin



===== Q6 Final CatBoost — Calibrated (LOCKED 17) =====
train: 253,232 | val: 63,308 | features: 17
CatBoost RAW   AUC=0.7511  PR-AUC=0.2324   time=42.1s
CatBoost CAL   AUC=0.7511  PR-AUC=0.2324   Brier=0.0792  LogLoss=0.2774



0,1
val/ap_cat_cal,▁
val/ap_cat_raw,▁
val/auc_cat_cal,▁
val/auc_cat_raw,▁
val/brier_cat_cal,▁
val/logloss_cat_cal,▁

0,1
val/ap_cat_cal,0.23237
val/ap_cat_raw,0.23237
val/auc_cat_cal,0.75111
val/auc_cat_raw,0.75111
val/brier_cat_cal,0.0792
val/logloss_cat_cal,0.27741
