# Importing Libraries

In [None]:
import argparse
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_curve, roc_auc_score, log_loss, brier_score_loss, accuracy_score
)
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibrationDisplay
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import wandb

# Initialization & Helper Functions

In [2]:

# ------------------------------ config ------------------------------ #

# Weights & Biases destination
PROJECT = "milestone_2"   # W&B project name; keep or change if you prefer
ENTITY = None             # set to your team/entity to force it; None leaves it unset
GROUP = "q2-baselines"    # group label passed to every wandb.init call

# Reproducible validation split
SEED = 42                 # default seed for the split and the random baseline
VAL_SIZE = 0.20           # fraction of rows held out for validation

# Custom curve resolution
N_BINS = 100              # default number of percentile bins for the custom curves


In [3]:
# --------------------------- helper funcs --------------------------- #

def stratified_split(df, target, seed=SEED, val_size=VAL_SIZE):
    """Split ``df`` once into train and validation frames, stratified on ``target``.

    Args:
        df: DataFrame that contains the ``target`` column.
        target: Name of the column passed to ``train_test_split`` as ``stratify``.
        seed: Forwarded to ``train_test_split`` as ``random_state``.
        val_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``(train_df, val_df)``, each with its index reset (old index dropped).
    """
    pieces = train_test_split(
        df,
        stratify=df[target],
        test_size=val_size,
        random_state=seed,
    )
    # Both halves get a fresh index so positional and label access agree.
    train_part, val_part = (piece.reset_index(drop=True) for piece in pieces)
    return train_part, val_part


def fit_logreg(train_df, feature_cols, target_col):
    """Fit a median-imputation + default logistic-regression pipeline.

    The imputer is a pipeline step, so its medians are learned only from the
    rows of ``train_df`` given here (no leakage from other data).

    Args:
        train_df: Training DataFrame.
        feature_cols: List of column names used as model inputs.
        target_col: Name of the label column.

    Returns:
        The fitted ``Pipeline`` (steps ``"imputer"`` and ``"clf"``).
    """
    features = train_df[feature_cols].values
    labels = train_df[target_col].values

    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("clf", LogisticRegression()),  # library defaults, no tuning
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(features, labels)
    return pipeline


def predict_proba(model, df, feature_cols):
    """Return column 1 of ``model.predict_proba`` evaluated on ``df[feature_cols]``.

    Args:
        model: Object exposing ``predict_proba(X)`` that returns a 2-D array.
        df: DataFrame holding the feature columns.
        feature_cols: List of column names fed to the model, in order.

    Returns:
        1-D array with one value per row of ``df`` (the second output column).
    """
    feature_matrix = df[feature_cols].values
    class_probs = model.predict_proba(feature_matrix)
    return class_probs[:, 1]

def random_baseline_probs(n, seed=SEED):
    """Draw ``n`` pseudo-random scores from a freshly seeded NumPy generator.

    The same ``seed`` and ``n`` always yield the same array because a new
    generator is created on every call.
    """
    return np.random.default_rng(seed).random(n)

In [4]:
def compute_goalrate_vs_percentile(y_true, y_prob, n_bins=N_BINS):
    """
    Goal rate within each percentile bin of predicted probability.

    The predicted probabilities are cut into ``n_bins`` quantile bins of equal
    percentile width. Each bin is half-open, ``(lo, hi]``, except the lowest
    one, which also includes its lower edge so the minimum score is counted.

    Args:
        y_true: Array-like of 0/1 outcomes, aligned with ``y_prob``.
        y_prob: Array-like of predicted probabilities.
        n_bins: Number of percentile bins (positive integer).

    Returns:
        ``(x, y)`` where ``x`` holds the upper percentile of each bin, from 100
        downwards (``100, 99, ..., 1`` for 100 bins), and ``y`` holds the mean of
        ``y_true`` inside that bin. A bin that contains no samples (possible when
        many scores are tied) is ``NaN``. Empty input yields all-``NaN`` rates.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")

    probs = np.asarray(y_prob)
    y = np.asarray(y_true)

    # Upper percentile edge of each bin, highest first. Integer labels are kept
    # whenever the bin width is a whole number of percentiles (e.g. 100 or 10 bins).
    if 100 % n_bins == 0:
        x_percentiles = np.arange(100, 0, -(100 // n_bins))
    else:
        x_percentiles = np.linspace(100.0, 100.0 / n_bins, n_bins)

    rates = np.full(n_bins, np.nan, dtype=float)
    if probs.size == 0:
        return x_percentiles, rates

    # All bin edges in one call: edges[j] is the (j / n_bins) quantile.
    edges = np.quantile(probs, np.arange(n_bins + 1) / n_bins, method="linear")

    for i in range(n_bins):
        hi = edges[n_bins - i]
        lo = edges[n_bins - i - 1]
        if i == n_bins - 1:
            # Lowest bin is closed on both sides so the minimum is not dropped.
            mask = (probs >= lo) & (probs <= hi)
        else:
            mask = (probs > lo) & (probs <= hi)
        if mask.any():
            rates[i] = y[mask].mean()
    return x_percentiles, rates


def compute_cum_goals_vs_percentile(y_true, y_prob, n_bins=N_BINS):
    """
    Cumulative proportion of GOALS captured as we sweep from top probabilities down.

    Samples are ranked by descending predicted probability. The value paired
    with the i-th percentile label (counting from the top) is the share of all
    goals found among the top ``i / n_bins`` fraction of samples. The curve is
    therefore non-decreasing along the returned order: it starts at the share
    captured by the highest-scored slice (label 100) and reaches the share
    captured by every sample at the last label.

    Args:
        y_true: Array-like of 0/1 outcomes, aligned with ``y_prob``.
        y_prob: Array-like of predicted probabilities.
        n_bins: Number of percentile steps (positive integer).

    Returns:
        ``(x, y)`` where ``x`` holds percentile labels from 100 downwards
        (``100, 99, ..., 1`` for 100 bins) and ``y`` the cumulative fraction of
        all goals. With no goals the divisor falls back to 1, so the curve is
        all zeros; empty input also yields all zeros.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")

    y = np.asarray(y_true)
    probs = np.asarray(y_prob)

    # Percentile labels, highest first; integers when the step is a whole percent.
    if 100 % n_bins == 0:
        x_percentiles = np.arange(100, 0, -(100 // n_bins))
    else:
        x_percentiles = np.linspace(100.0, 100.0 / n_bins, n_bins)

    n = len(y)
    if n == 0:
        return x_percentiles, np.zeros(n_bins, dtype=float)

    # Stable sort keeps tied probabilities in input order, so the curve is deterministic.
    order = np.argsort(-probs, kind="stable")  # descending by prob
    cum_goals = np.cumsum(y[order])
    total_goals = max(1, y.sum())  # avoid dividing by zero when there are no goals

    # Number of top-ranked samples included at each step (integer arithmetic,
    # at least one sample): step i covers the top i / n_bins of the samples.
    top_counts = np.maximum(1, (np.arange(1, n_bins + 1) * n) // n_bins)
    curve = cum_goals[top_counts - 1] / total_goals
    return x_percentiles, curve

def evaluate_probs(y_true, y_prob):
    """Score predicted probabilities against the true labels.

    Returns:
        Dict with a single entry, ``"val_auc"``, holding ``roc_auc_score(y_true, y_prob)``.
    """
    metrics = {}
    metrics["val_auc"] = roc_auc_score(y_true, y_prob)
    return metrics

def plot_roc(ax, y_true, curves_dict):
    """Draw the chance diagonal and one ROC curve per entry of ``curves_dict``.

    Args:
        ax: Matplotlib axes to draw on.
        y_true: True binary labels shared by every curve.
        curves_dict: Mapping of legend label -> predicted probabilities.
    """
    # Reference line for a classifier with no skill.
    ax.plot([0, 1], [0, 1], linestyle="--", label="chance (45°)")

    for name in curves_dict:
        scores = curves_dict[name]
        false_pos, true_pos, _thresholds = roc_curve(y_true, scores)
        area = roc_auc_score(y_true, scores)
        ax.plot(false_pos, true_pos, label=f"{name} (AUC={area:.3f})")

    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC curve (validation)")
    ax.legend(loc="lower right")


def plot_goalrate(ax, y_true, curves_dict):
    """Plot goal rate per probability-percentile bin, one line per model.

    Args:
        ax: Matplotlib axes to draw on.
        y_true: True binary labels shared by every curve.
        curves_dict: Mapping of legend label -> predicted probabilities.
    """
    for name, scores in curves_dict.items():
        percentiles, goal_rates = compute_goalrate_vs_percentile(y_true, scores)
        ax.plot(percentiles, goal_rates, label=name)

    ax.set(
        title="Goal Rate vs Shot-Probability Percentile (validation)",
        xlabel="Shot probability model percentile (high→low)",
        ylabel="Goals / Shots",
    )
    # Flip the axis so the highest percentiles sit on the left.
    ax.invert_xaxis()
    ax.legend()


def plot_cum_goals(ax, y_true, curves_dict):
    """Plot the cumulative share of goals against percentile, one line per model.

    Args:
        ax: Matplotlib axes to draw on.
        y_true: True binary labels shared by every curve.
        curves_dict: Mapping of legend label -> predicted probabilities.
    """
    for name in curves_dict:
        percentiles, goal_share = compute_cum_goals_vs_percentile(
            y_true, curves_dict[name]
        )
        ax.plot(percentiles, goal_share, label=name)

    ax.set_ylabel("Proportion of goals")
    ax.set_xlabel("Shot probability model percentile (high→low)")
    ax.set_title("Cumulative % of Goals vs Percentile (validation)")
    # Flip the axis so the highest percentiles sit on the left.
    ax.invert_xaxis()
    ax.legend(loc="lower right")


def plot_calibration(ax, y_true, curves_dict, n_bins=10):
    """Draw a reliability diagram with one calibration curve per model.

    Args:
        ax: Matplotlib axes shared by all curves.
        y_true: True binary labels shared by every curve.
        curves_dict: Mapping of legend label -> predicted probabilities.
        n_bins: Number of bins forwarded to ``CalibrationDisplay.from_predictions``.
    """
    for name in curves_dict:
        CalibrationDisplay.from_predictions(
            y_true,
            curves_dict[name],
            n_bins=n_bins,
            name=name,
            ax=ax,
        )

    # Set after drawing so these labels are the ones left on the axes.
    ax.set(
        title="Reliability Diagram (validation)",
        xlabel="Predicted probability",
        ylabel="Observed frequency",
    )


In [5]:
def save_and_log_model(run, model, name, meta):
    """Persist ``model`` with joblib and attach the file to ``run`` as a W&B artifact.

    Args:
        run: W&B run that receives the artifact.
        model: Object to serialize.
        name: Used both as the file stem and as the artifact name.
        meta: Metadata dict stored on the artifact.

    Returns:
        Path of the written ``artifacts/<name>.joblib`` file.
    """
    out_dir = "artifacts"
    os.makedirs(out_dir, exist_ok=True)

    model_path = os.path.join(out_dir, f"{name}.joblib")
    joblib.dump(model, model_path)

    artifact = wandb.Artifact(name=name, type="model", metadata=meta)
    artifact.add_file(model_path)
    run.log_artifact(artifact, aliases=["v0", "baseline"])
    return model_path

In [6]:
# Quick preview of the baseline training CSV. The bare `df.shape` on the last
# line is the cell's displayed output (rows, columns). main() reads the CSV
# again from args.train_csv rather than using this `df`.
df = pd.read_csv('../ift6758/data/milestone2/baseline_train.csv')
df.shape

(316540, 4)

# Model Building & Evaluation

In [7]:
def _log_comparison_figure(run, key, plot_fn, y_true, curves_dict, **plot_kwargs):
    """Render one comparison figure with ``plot_fn`` and log it to ``run`` under ``key``."""
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        plot_fn(ax, y_true, curves_dict=curves_dict, **plot_kwargs)
        run.log({key: wandb.Image(fig)})
    finally:
        # Close even when plotting or logging fails so figures do not accumulate.
        plt.close(fig)


def main(args):
    """Train the three logistic-regression baselines and log them to W&B.

    Steps:
      1. Read ``args.train_csv`` and make one stratified train/validation split.
      2. For each feature set, open a W&B run, fit the model, log the
         validation AUC and store the model as an artifact.
      3. Add a seeded uniform-random baseline.
      4. Open a "q2-comparison" run and log four figures (ROC, goal rate,
         cumulative goals, reliability) plus a metrics summary table.

    Args:
        args: Object with a ``train_csv`` attribute (path to the training CSV).

    Raises:
        AssertionError: If the CSV lacks one of the required columns.
    """
    # Load data
    df = pd.read_csv(args.train_csv)
    assert {"distance_from_net", "shot_angle", "is_goal"}.issubset(df.columns), \
        "CSV must include distance_from_net, shot_angle, is_goal"

    # Single split (reused by every model so the curves are comparable)
    train_df, val_df = stratified_split(df, target="is_goal", seed=SEED, val_size=VAL_SIZE)
    y_val = val_df["is_goal"].values

    feature_sets = {
        "lr-distance": ["distance_from_net"],
        "lr-angle": ["shot_angle"],
        "lr-both": ["distance_from_net", "shot_angle"],
    }

    val_probs = {}           # name -> probabilities on validation set
    metrics_by_model = {}    # name -> metrics dict

    for run_name, feats in feature_sets.items():
        config = {
            "features": feats,
            "target_col": "is_goal",
            "split": {"type": "stratified", "val_size": VAL_SIZE, "seed": SEED},
            "scaling": "none",
            "model": "logreg-default",
            "data_version": os.path.basename(args.train_csv),
            "notes": "q2 baseline, no tuning",
        }

        # The context manager finishes the run on exit, including when an
        # exception is raised inside the block.
        with wandb.init(
            project=PROJECT, entity=ENTITY, group=GROUP,
            name=run_name, config=config, reinit=True, resume="never"
        ) as run:
            # train
            model = fit_logreg(train_df, feats, "is_goal")
            p_val = predict_proba(model, val_df, feats)

            # evaluate
            summary = evaluate_probs(y_val, p_val)
            run.log(summary)

            # save model
            model_meta = {"features": feats, "seed": SEED, "solver": "default", **summary}
            save_and_log_model(run, model, name=run_name, meta=model_meta)

        # keep results for the comparison step
        val_probs[run_name] = p_val
        metrics_by_model[run_name] = summary

    # random baseline probs on validation set (deterministic)
    val_probs["random-uniform"] = random_baseline_probs(n=len(y_val), seed=SEED)

    # ------------------ combined comparison figures ------------------ #
    with wandb.init(
        project=PROJECT, entity=ENTITY, group=GROUP,
        name="q2-comparison", config={
            "includes": list(feature_sets.keys()) + ["random"],
            "split": {"type": "stratified", "val_size": VAL_SIZE, "seed": SEED},
            "data_version": os.path.basename(args.train_csv),
            "notes": "four figures with four curves each (validation only)"
        }, reinit=True, resume="never"
    ) as comp_run:
        # figure 1: ROC
        _log_comparison_figure(comp_run, "fig_roc_auc", plot_roc, y_val, val_probs)

        # figure 2: Goal rate vs percentile
        _log_comparison_figure(
            comp_run, "fig_goalrate_vs_percentile", plot_goalrate, y_val, val_probs
        )

        # figure 3: Cumulative % of goals vs percentile
        _log_comparison_figure(
            comp_run, "fig_cum_goals_vs_percentile", plot_cum_goals, y_val, val_probs
        )

        # figure 4: Reliability (calibration) diagram
        _log_comparison_figure(
            comp_run, "fig_reliability_diagram", plot_calibration, y_val, val_probs,
            n_bins=10,
        )

        # log a compact metrics summary table across models
        metrics_table = (
            pd.DataFrame(metrics_by_model).T
            .reset_index()
            .rename(columns={"index": "model"})
        )
        comp_run.log({"metrics_summary": wandb.Table(dataframe=metrics_table)})

In [8]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_csv",
        type=str,
        default="../ift6758/data/milestone2/baseline_train.csv",
        help="Path to the training CSV; it must include the columns "
             "distance_from_net, shot_angle and is_goal.",
    )
    # test.csv intentionally unused here (kept for later milestones)
    # An explicit empty argument list means sys.argv is never read, so the
    # default path above is always used (presumably so this cell runs unchanged
    # inside a notebook kernel — confirm before reusing as a CLI script).
    args = parser.parse_args(args=[])
    main(args)

wandb: Currently logged in as: aftabgazali003 (IFT6758_team4) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


0,1
val_auc,▁

0,1
val_auc,0.69657


0,1
val_auc,▁

0,1
val_auc,0.5681


0,1
val_auc,▁

0,1
val_auc,0.71451
