# 06 - Model Comparison (Full 2025 Test)

This notebook loads the saved models (`.joblib`) and evaluates them on **all available 2025 race data** in the processed dataset.
It produces publication-ready plots and a comprehensive set of numerical metrics for the paper.

Outputs are saved to `reports/notebooks/06_model_comparison_full/`.


In [None]:
import sys
from pathlib import Path

# Resolve the project root: when the notebook runs from a directory named
# "notebooks" the root is its parent, otherwise the working directory itself.
_here = Path.cwd()
ROOT = _here.resolve()
if _here.name == "notebooks":
    ROOT = ROOT.parent

# Put the root at the front of the import path so project modules resolve.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
print(f"Using project root: {ROOT}")


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import joblib

from _common import load_dataset, prepare_features
from src.split import SplitConfig
from src.plots import plot_actual_vs_pred, plot_error_distribution, save_plot

# Every artefact written by this notebook lands in one report directory,
# created on demand (including missing parents).
OUTPUT_DIR = ROOT.joinpath("reports", "notebooks", "06_model_comparison_full")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed NumPy's global random state.
np.random.seed(42)


In [None]:
# Load dataset and build the 2025 test split
split_config = SplitConfig(test_rounds=None)
df, metadata = load_dataset()
(train_df, val_df, trainval_df, test_df, features) = prepare_features(
    df,
    metadata,
    split_config=split_config,
)

# Model inputs and the regression target taken from the test split.
X_test = test_df[features]
y_test = test_df["LapTimeSeconds"].to_numpy()

# One row per distinct (season, round, event) present in the test split.
round_keys = ["Season", "RoundNumber", "EventName"]
summary = test_df[round_keys].drop_duplicates().sort_values(round_keys[:2])

test_seasons = sorted(test_df["Season"].unique())
test_rounds = summary["RoundNumber"].tolist()
print(f"Test rows: {len(test_df):,}")
print(f"Test seasons: {test_seasons}")
print(f"Test rounds: {test_rounds}")
summary.head(12)


In [None]:
# Load saved models
MODELS_DIR = ROOT / "reports" / "models"
MODEL_PATHS = {
    label: MODELS_DIR / filename
    for label, filename in (
        ("Linear", "linear.joblib"),
        ("XGBoost", "xgboost.joblib"),
        ("Deep MLP", "deep_mlp.joblib"),
    )
}

# Fail early, naming every absent artefact, before any model is deserialised.
missing = [label for label, model_path in MODEL_PATHS.items() if not model_path.exists()]
if missing:
    raise FileNotFoundError(f"Missing saved models: {missing}. Expected in {MODELS_DIR}")

models = {label: joblib.load(model_path) for label, model_path in MODEL_PATHS.items()}
models


In [None]:
# Generate predictions
# Each prediction is flattened to 1-D before use. If an estimator returned a
# column vector of shape (n, 1), `y_pred - y_test` below would broadcast to an
# (n, n) matrix instead of a per-lap residual.
preds = {}
for name, model in models.items():
    y_hat = np.asarray(model.predict(X_test)).ravel()
    if len(y_hat) != len(y_test):
        raise ValueError(
            f"Model {name!r} returned {len(y_hat)} predictions for {len(y_test)} test rows"
        )
    preds[name] = y_hat

# Shared error frame (for group-level analysis and plots)
# Only metadata columns that actually exist in the test split are carried over.
meta_cols = [
    c for c in [
        "Season", "RoundNumber", "EventName", "Driver", "Team", "Compound",
        "Circuit", "TireAgeCategory", "LapNumber", "Stint", "TyreLife"
    ] if c in test_df.columns
]

# One long-format frame: the test rows are repeated once per model.
frames = []
for name, y_pred in preds.items():
    temp = test_df[meta_cols].copy()
    temp["model"] = name
    temp["y_true"] = y_test
    temp["y_pred"] = y_pred
    temp["error"] = y_pred - y_test  # signed residual: prediction minus truth
    temp["abs_error"] = np.abs(temp["error"])
    frames.append(temp)

errors_df = pd.concat(frames, ignore_index=True)
errors_df.head()


In [None]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
    median_absolute_error,
    max_error,
    explained_variance_score,
    mean_squared_log_error,
)

def style_plot(fig, *, height=480, width=None, title=None):
    """Apply the notebook's shared figure styling in place and return ``fig``.

    Args:
        fig: Figure exposing ``update_layout``, ``update_xaxes``,
            ``update_yaxes`` and ``layout.title.text``.
        height: Figure height passed to the layout.
        width: Figure width passed to the layout (``None`` is passed through).
        title: Replacement title; when falsy the figure's current title text
            is re-applied.

    Returns:
        The same ``fig`` object, after styling.
    """
    # Both axes receive the identical frame/tick/grid treatment.
    axis_style = {
        "showline": True,
        "linewidth": 1,
        "linecolor": "black",
        "mirror": True,
        "ticks": "outside",
        "showgrid": True,
        "gridcolor": "rgba(0,0,0,0.1)",
    }
    resolved_title = title if title else fig.layout.title.text

    fig.update_layout(
        template="plotly_white",
        height=height,
        width=width,
        title=resolved_title,
        font={"size": 14},
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    return fig

def compute_full_metrics(y_true, y_pred, n_features):
    """Compute a broad set of regression metrics for one set of predictions.

    Residuals are defined as ``y_pred - y_true`` (positive means
    over-prediction).

    Args:
        y_true: Observed target values (array-like, numeric).
        y_pred: Predicted values with the same shape as ``y_true``. A column
            vector of shape ``(n, 1)`` is flattened to ``(n,)``.
        n_features: Number of predictors; used only for adjusted R^2.

    Returns:
        dict mapping metric name to value. ``adj_r2`` is NaN unless
        ``n > n_features + 1``; ``pearson_r`` and ``spearman_r`` are NaN
        unless ``n > 1``; ``msle`` and ``rmsle`` are NaN if scikit-learn
        rejects the inputs.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in shape after
            column vectors have been flattened.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # An (n, 1) column against an (n,) vector would broadcast the residual to
    # an (n, n) matrix and silently corrupt every hand-computed metric below.
    if y_true.ndim == 2 and y_true.shape[1] == 1:
        y_true = y_true.ravel()
    if y_pred.ndim == 2 and y_pred.shape[1] == 1:
        y_pred = y_pred.ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    resid = y_pred - y_true
    abs_err = np.abs(resid)
    n = len(y_true)
    eps = 1e-6  # floor for denominators and log arguments

    mse = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # Adjusted R^2 is only defined with positive residual degrees of freedom.
    adj_r2 = (
        1 - (1 - r2) * (n - 1) / (n - n_features - 1)
        if n > n_features + 1 else np.nan
    )

    # Percentage errors share one denominator, |y_true| floored at eps.
    # Clipping the signed value instead would turn every negative target into
    # eps and blow the ratio up.
    abs_true = np.clip(np.abs(y_true), eps, None)
    mape = float(np.mean(abs_err / abs_true) * 100)
    smape = float(np.mean(2 * abs_err / (np.abs(y_true) + np.abs(y_pred) + eps)) * 100)
    rmspe = float(np.sqrt(np.mean((resid / abs_true) ** 2)) * 100)

    try:
        # Inputs are floored at eps so the log-based metric sees positive values.
        msle = mean_squared_log_error(
            np.clip(y_true, eps, None),
            np.clip(y_pred, eps, None),
        )
        rmsle = float(np.sqrt(msle))
    except ValueError:
        # scikit-learn signals invalid input with ValueError; report the
        # metric as unavailable rather than abort the whole table.
        msle = np.nan
        rmsle = np.nan

    bias = float(resid.mean())
    std_err = float(resid.std())  # population std (NumPy default ddof=0)
    med_err = float(np.median(resid))
    # Median absolute deviation of the residuals around their median.
    mad = float(np.median(np.abs(resid - np.median(resid))))

    pearson = float(np.corrcoef(y_true, y_pred)[0, 1]) if n > 1 else np.nan
    spearman = float(pd.Series(y_true).corr(pd.Series(y_pred), method="spearman")) if n > 1 else np.nan

    return {
        "n": n,
        "mae": mae,
        "mse": mse,
        "rmse": rmse,
        "r2": r2,
        "adj_r2": adj_r2,
        "explained_variance": explained_variance_score(y_true, y_pred),
        "mape_pct": mape,
        "smape_pct": smape,
        "rmspe_pct": rmspe,
        "medae": median_absolute_error(y_true, y_pred),
        "max_error": max_error(y_true, y_pred),
        "msle": msle,
        "rmsle": rmsle,
        "bias_mean_error": bias,
        "median_error": med_err,
        "std_error": std_err,
        "mad_error": mad,
        "pearson_r": pearson,
        "spearman_r": spearman,
        "p50_abs_error": float(np.percentile(abs_err, 50)),
        "p90_abs_error": float(np.percentile(abs_err, 90)),
        "p95_abs_error": float(np.percentile(abs_err, 95)),
        "p99_abs_error": float(np.percentile(abs_err, 99)),
        # Share of observations whose absolute error is at most 1, 2 and 5
        # target units.
        "within_1s_pct": float(np.mean(abs_err <= 1.0) * 100),
        "within_2s_pct": float(np.mean(abs_err <= 2.0) * 100),
        "within_5s_pct": float(np.mean(abs_err <= 5.0) * 100),
        "mean_true": float(np.mean(y_true)),
        "std_true": float(np.std(y_true)),
        "mean_pred": float(np.mean(y_pred)),
        "std_pred": float(np.std(y_pred)),
    }

def plot_residuals_vs_pred(y_true, y_pred, title):
    """Scatter the residuals (``y_pred - y_true``) against the predictions.

    Args:
        y_true: Observed values.
        y_pred: Predicted values; must support ``y_pred - y_true``.
        title: Figure title.

    Returns:
        The Plotly Express scatter figure, with a dashed red zero line.
    """
    residuals = y_pred - y_true
    scatter_data = pd.DataFrame({"Predicted": y_pred, "Residual": residuals})

    fig = px.scatter(scatter_data, x="Predicted", y="Residual", title=title, opacity=0.5)
    # Reference line at zero residual.
    fig.add_hline(y=0, line_dash="dash", line_color="red")
    fig.update_xaxes(title="Predicted lap time (s)")
    fig.update_yaxes(title="Residual (s)")
    return fig

def plot_calibration(y_true, y_pred, title, bins=20):
    """Plot mean actual against mean predicted value per prediction quantile bin.

    Predictions are cut into ``bins`` quantile bins with ``pd.qcut``
    (duplicate edges dropped); each bin contributes one point. A dashed red
    identity line spanning the plotted range is drawn for reference.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, aligned with ``y_true``.
        title: Figure title.
        bins: Requested number of quantile bins.

    Returns:
        A ``go.Figure`` with the binned-mean curve and the identity line.
    """
    frame = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
    frame["bin"] = pd.qcut(frame["y_pred"], q=bins, duplicates="drop")

    per_bin = (
        frame.groupby("bin", observed=True)
        .agg(
            pred_mean=("y_pred", "mean"),
            true_mean=("y_true", "mean"),
            n=("y_true", "size"),
        )
        .reset_index()
    )
    pred_means = per_bin["pred_mean"]
    true_means = per_bin["true_mean"]

    fig = go.Figure()
    fig.add_scatter(x=pred_means, y=true_means, mode="lines+markers", name="Binned mean")

    # Identity line covering the combined range of both binned means.
    lower = float(min(pred_means.min(), true_means.min()))
    upper = float(max(pred_means.max(), true_means.max()))
    identity = go.Scatter(
        x=[lower, upper],
        y=[lower, upper],
        mode="lines",
        line=dict(color="red", dash="dash"),
        name="Perfect",
    )
    fig.add_trace(identity)

    fig.update_layout(
        title=title,
        xaxis_title="Predicted (binned mean, s)",
        yaxis_title="Actual (binned mean, s)",
    )
    return fig

def plot_error_by_category_custom(df, category_col, title, top_n=12):
    """Horizontal bar chart of the categories with the highest mean absolute error.

    Args:
        df: Frame containing ``category_col`` and an ``abs_error`` column.
        category_col: Column to group by (missing values form their own group).
        title: Figure title.
        top_n: Number of highest-error categories to keep.

    Returns:
        The Plotly Express bar figure.
    """
    mean_abs_error = df.groupby(category_col, dropna=False)["abs_error"].mean()
    worst = mean_abs_error.sort_values(ascending=False).head(top_n).reset_index()

    axis_labels = {"abs_error": "MAE (s)", category_col: category_col}
    return px.bar(
        worst,
        x="abs_error",
        y=category_col,
        orientation="h",
        title=title,
        labels=axis_labels,
    )


In [None]:
# Overall metrics table
n_features = len(features)
rows = []
for name, y_pred in preds.items():
    scores = {**compute_full_metrics(y_test, y_pred, n_features=n_features), "model": name}
    rows.append(scores)

# Best (lowest MAE) model first.
metrics_df = pd.DataFrame(rows).sort_values("mae").reset_index(drop=True)

# Reorder for readability
_identity_cols = ("model", "n")
_fit_cols = ("mae", "rmse", "r2", "adj_r2", "explained_variance")
_percentage_cols = ("mape_pct", "smape_pct", "rmspe_pct")
_robust_cols = ("medae", "max_error")
_residual_cols = ("bias_mean_error", "median_error", "std_error", "mad_error")
_correlation_cols = ("pearson_r", "spearman_r")
_quantile_cols = ("p50_abs_error", "p90_abs_error", "p95_abs_error", "p99_abs_error")
_tolerance_cols = ("within_1s_pct", "within_2s_pct", "within_5s_pct")
_squared_cols = ("mse", "msle", "rmsle")
_distribution_cols = ("mean_true", "std_true", "mean_pred", "std_pred")
cols = [
    *_identity_cols,
    *_fit_cols,
    *_percentage_cols,
    *_robust_cols,
    *_residual_cols,
    *_correlation_cols,
    *_quantile_cols,
    *_tolerance_cols,
    *_squared_cols,
    *_distribution_cols,
]
metrics_df = metrics_df[cols]
metrics_df


In [None]:
# Save metrics to CSV for the paper
metrics_csv_path = OUTPUT_DIR / "metrics_overall_2025.csv"
metrics_df.to_csv(metrics_csv_path, index=False)

# Also save per-lap predictions for reproducibility
predictions_path = OUTPUT_DIR / "predictions_2025.parquet"
errors_df.to_parquet(predictions_path, index=False)
errors_df.head()


In [None]:
# Bootstrap confidence intervals for key metrics
def bootstrap_ci(y_true, y_pred, n_boot=200, seed=42):
    """Percentile-bootstrap intervals for MAE, RMSE, R^2 and MAPE.

    Observations are resampled individually, with replacement, ``n_boot``
    times; each interval runs from the 2.5th to the 97.5th percentile of the
    resampled metric values.

    Args:
        y_true: Observed values (array-like; a pandas Series is accepted).
        y_pred: Predicted values with the same shape as ``y_true``.
        n_boot: Number of bootstrap resamples.
        seed: Seed for the NumPy random generator, making the result
            reproducible.

    Returns:
        DataFrame with one row per metric and columns ``metric``, ``ci_low``
        and ``ci_high``.

    Raises:
        ValueError: If the inputs differ in shape or are empty.
    """
    # Coerce to plain arrays: `series[idx]` with an integer array is a
    # label-based lookup on a pandas Series, not the positional resample
    # intended here.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.ndim == 2 and y_true.shape[1] == 1:
        y_true = y_true.ravel()
    if y_pred.ndim == 2 and y_pred.shape[1] == 1:
        y_pred = y_pred.ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    n = len(y_true)
    if n == 0:
        raise ValueError("bootstrap_ci requires at least one observation")

    rng = np.random.default_rng(seed)
    stats = {"mae": [], "rmse": [], "r2": [], "mape_pct": []}
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)  # n positions drawn with replacement
        yt = y_true[idx]
        yp = y_pred[idx]
        stats["mae"].append(mean_absolute_error(yt, yp))
        stats["rmse"].append(np.sqrt(mean_squared_error(yt, yp)))
        stats["r2"].append(r2_score(yt, yp))
        # Denominator floored at 1e-6 to avoid division by zero.
        stats["mape_pct"].append(float(np.mean(np.abs(yp - yt) / np.clip(np.abs(yt), 1e-6, None)) * 100))

    rows = []
    for metric, values in stats.items():
        low, high = np.percentile(values, [2.5, 97.5])
        rows.append({
            "metric": metric,
            "ci_low": float(low),
            "ci_high": float(high),
        })
    return pd.DataFrame(rows)

# One interval table per model, tagged with the model name, then stacked.
ci_frames = []
for name, y_pred in preds.items():
    ci = bootstrap_ci(y_test, y_pred, n_boot=200, seed=42).assign(model=name)
    ci_frames.append(ci)

ci_df = pd.concat(ci_frames, ignore_index=True)
ci_csv_path = OUTPUT_DIR / "metrics_bootstrap_ci_2025.csv"
ci_df.to_csv(ci_csv_path, index=False)
ci_df


In [None]:
# Model comparison plots
def _long_metrics(metric_cols):
    """Reshape the chosen metric columns to long form (one row per model and metric)."""
    return metrics_df[["model", *metric_cols]].melt(
        id_vars="model", var_name="metric", value_name="value"
    )


def _publish(figure, stem, *, height):
    """Style the figure, save it under OUTPUT_DIR using ``stem``, then display it."""
    style_plot(figure, height=height)
    save_plot(figure, OUTPUT_DIR / stem)
    figure.show()


# Grouped bars: absolute errors in seconds.
mae_rmse = _long_metrics(["mae", "rmse"])
fig = px.bar(
    mae_rmse,
    x="model",
    y="value",
    color="metric",
    barmode="group",
    title="Model Comparison: MAE vs RMSE (2025)",
    labels={"value": "Seconds", "model": "Model"},
)
_publish(fig, "model_comparison_mae_rmse", height=480)

# Grouped bars: percentage errors.
mape_smape = _long_metrics(["mape_pct", "smape_pct"])
fig = px.bar(
    mape_smape,
    x="model",
    y="value",
    color="metric",
    barmode="group",
    title="Model Comparison: MAPE vs sMAPE (2025)",
    labels={"value": "Percent", "model": "Model"},
)
_publish(fig, "model_comparison_mape_smape", height=480)

# One bar per model for R2.
fig = px.bar(
    metrics_df,
    x="model",
    y="r2",
    color="model",
    title="Model Comparison: R2 (2025)",
    labels={"r2": "R2", "model": "Model"},
)
_publish(fig, "model_comparison_r2", height=450)

# One labelled point per model in the MAE/RMSE plane.
fig = px.scatter(
    metrics_df,
    x="mae",
    y="rmse",
    color="model",
    text="model",
    title="MAE vs RMSE (Bias-Variance Indicator)",
    labels={"mae": "MAE (s)", "rmse": "RMSE (s)"},
)
fig.update_traces(textposition="top center")
_publish(fig, "mae_vs_rmse", height=480)


In [None]:
# Detailed diagnostics per model
for name, y_pred in preds.items():
    # File-name-safe form of the model name, shared by all four outputs.
    slug = name.lower().replace(" ", "_")
    model_tag = f"{name} (2025)"

    # Actual vs Predicted
    fig = plot_actual_vs_pred(
        y_test,
        y_pred,
        title=f"Actual vs Predicted - {model_tag}",
        sample_size=6000,
    )
    fig.update_xaxes(title="Actual lap time (s)")
    fig.update_yaxes(title="Predicted lap time (s)")
    style_plot(fig, height=520)
    save_plot(fig, OUTPUT_DIR / f"actual_vs_pred_{slug}")
    fig.show()

    # Residual distribution
    fig = plot_error_distribution(
        y_test,
        y_pred,
        title=f"Residual Distribution - {model_tag}",
        bins=70,
    )
    fig.update_xaxes(title="Residual (s)")
    fig.update_yaxes(title="Count")
    style_plot(fig, height=420)
    save_plot(fig, OUTPUT_DIR / f"residuals_{slug}")
    fig.show()

    # Residuals vs Predicted
    fig = plot_residuals_vs_pred(
        y_test,
        y_pred,
        title=f"Residuals vs Predicted - {model_tag}",
    )
    style_plot(fig, height=480)
    save_plot(fig, OUTPUT_DIR / f"residuals_vs_pred_{slug}")
    fig.show()

    # Calibration plot
    fig = plot_calibration(
        y_test,
        y_pred,
        title=f"Calibration (Binned Means) - {model_tag}",
        bins=20,
    )
    style_plot(fig, height=480)
    save_plot(fig, OUTPUT_DIR / f"calibration_{slug}")
    fig.show()


In [None]:
# Error by category (top 12 worst) for each model
def _rmse_of(errors):
    """Root-mean-square of a group of signed errors."""
    return float(np.sqrt(np.mean(np.square(errors))))


# Keep only the grouping columns that made it into the error frame.
category_cols = [
    c
    for c in ("EventName", "Circuit", "Driver", "Team", "Compound", "TireAgeCategory")
    if c in errors_df.columns
]

for name in models:
    slug = name.lower().replace(" ", "_")
    subset = errors_df[errors_df["model"] == name]
    for col in category_cols:
        col_slug = col.lower()

        # Save numeric group metrics
        group_df = (
            subset.groupby(col, dropna=False)
            .agg(
                n=("abs_error", "size"),
                mae=("abs_error", "mean"),
                rmse=("error", _rmse_of),
            )
            .sort_values("mae", ascending=False)
            .reset_index()
        )
        group_df.to_csv(OUTPUT_DIR / f"metrics_by_{col_slug}_{slug}.csv", index=False)

        # Plot worst categories
        fig = plot_error_by_category_custom(
            subset,
            category_col=col,
            title=f"MAE by {col} (Top 12 Worst) - {name} (2025)",
            top_n=12,
        )
        fig.update_xaxes(title="MAE (s)")
        fig.update_yaxes(title=col)
        style_plot(fig, height=520)
        save_plot(fig, OUTPUT_DIR / f"mae_by_{col_slug}_{slug}")
        fig.show()


In [None]:
# Per-round metrics (2025) for each model
# Both grouping keys are optional in the error frame, so check them once up
# front rather than only "RoundNumber" inside the loop.
round_keys = ["RoundNumber", "EventName"]
round_columns = ["model", "RoundNumber", "EventName", "n", "mae", "rmse", "r2"]

round_frames = []
if all(key in errors_df.columns for key in round_keys):
    for name in models:
        subset = errors_df[errors_df["model"] == name]
        for (rnd, event), grp in subset.groupby(round_keys, dropna=False):
            y_true_g = grp["y_true"].to_numpy()
            y_pred_g = grp["y_pred"].to_numpy()
            round_frames.append({
                "model": name,
                "RoundNumber": rnd,
                "EventName": event,
                "n": len(grp),
                "mae": mean_absolute_error(y_true_g, y_pred_g),
                "rmse": float(np.sqrt(mean_squared_error(y_true_g, y_pred_g))),
                # R^2 is reported as NaN for single-lap groups.
                "r2": r2_score(y_true_g, y_pred_g) if len(y_true_g) > 1 else np.nan,
            })

# Explicit columns keep the frame well-formed when nothing was collected;
# otherwise sorting an empty, column-less frame raises KeyError.
round_df = pd.DataFrame(round_frames, columns=round_columns).sort_values(["model", "RoundNumber"])
round_df.to_csv(OUTPUT_DIR / "metrics_by_round_2025.csv", index=False)
round_df.head()


In [None]:
# Plot MAE by round for each model
# Skipped entirely when no per-round rows were produced.
if not round_df.empty:
    axis_labels = {"mae": "MAE (s)", "RoundNumber": "Round"}
    fig = px.line(
        round_df,
        x="RoundNumber",
        y="mae",
        color="model",
        markers=True,
        title="MAE by Round (2025)",
        labels=axis_labels,
    )
    style_plot(fig, height=450)
    save_plot(fig, OUTPUT_DIR / "mae_by_round_2025")
    fig.show()
