# 02 - Linear Model
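Train a linear baseline, evaluate it on the validation and test splits, visualize the results, and optionally save the model and log the run to MLflow.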


In [ ]:
from pathlib import Path
import sys

# Resolve the parent of the current working directory to an absolute path and
# append it to the import search path once. Presumably this is the project
# root that holds the `src` package -- confirm against the repository layout.
ROOT = Path("..").resolve()
if sys.path.count(str(ROOT)) == 0:
    sys.path.append(str(ROOT))

import numpy as np
import pandas as pd


In [ ]:
from pathlib import Path

from src.models import make_linear_model, make_pipeline, build_search
from src.eval import evaluate_models, compute_full_metrics
from src.plots import plot_actual_vs_pred, plot_error_distribution, plot_model_comparison
from _common import load_dataset, prepare_features, ROOT
from src.split import SplitConfig

# Seed passed to the model factory and to the hyper-parameter search.
SEED = 42

# Run-mode switches.
# SMALL_MODE: toggle for quick iteration.
SMALL_MODE = True
# TUNE_MODE: one of "off", "fast" or "full".
TUNE_MODE = "off"
# SEARCH_VERBOSE: sklearn CV logging level.
SEARCH_VERBOSE = 2
# SEARCH_N_ITER: only used for randomized search.
SEARCH_N_ITER = None

# MLflow settings; the tracking store is the "mlruns" directory under ROOT,
# expressed as a file URI.
MLFLOW_ENABLED = True
MLFLOW_EXPERIMENT = "f1-laptime"
MLFLOW_RUN_NAME = "linear_notebook"
MLFLOW_TRACKING_URI = ROOT.joinpath("mlruns").as_uri()

# Verify MLflow availability up front. On any failure MLFLOW_ENABLED is
# switched off so the logging cell at the end of the notebook is skipped.
if MLFLOW_ENABLED:
    try:
        import mlflow

        print(f"MLflow available: {mlflow.__version__}")
    except ImportError:
        # The package (or one of its dependencies) cannot be imported.
        print("MLflow not installed; set MLFLOW_ENABLED=False or install mlflow.")
        MLFLOW_ENABLED = False
    except Exception as exc:
        # Stay best-effort for any other import-time failure, but report the
        # real cause instead of claiming the package is not installed.
        print(f"MLflow import failed ({exc!r}); disabling MLflow logging.")
        MLFLOW_ENABLED = False

# Model saving: whether to persist the final estimator, and where.
SAVE_MODEL = True
MODEL_PATH = ROOT.joinpath("reports", "models", "linear.joblib")

# Base alpha (used even when tuning is off) and the tuning grid (used when
# tuning is on). Both stay None unless SMALL_MODE is set.
LINEAR_ALPHA = 1.0 if SMALL_MODE else None
LINEAR_PARAM_GRID = {"model__alpha": [0.1, 0.5, 1.0, 5.0]} if SMALL_MODE else None
if SMALL_MODE:
    # Small mode also fixes the number of randomized-search iterations.
    SEARCH_N_ITER = 6

# Load the dataset and split it into train / validation / train+validation /
# test frames, together with the feature selection used to index them.
split_config = SplitConfig(test_rounds=None)
df, metadata = load_dataset()
(
    train_df,
    val_df,
    trainval_df,
    test_df,
    features,
) = prepare_features(df, metadata, split_config=split_config)

# Feature frames and "LapTimeSeconds" target arrays for fitting and validation.
X_train, X_val = train_df[features], val_df[features]
y_train, y_val = (
    frame["LapTimeSeconds"].to_numpy() for frame in (train_df, val_df)
)

# Wrap the linear model in the project pipeline, then pass it to build_search
# along with the tuning mode, parameter grid and iteration settings.
base = make_pipeline(make_linear_model(SEED, alpha=LINEAR_ALPHA), features)
model = build_search(
    "Linear",
    base,
    random_state=SEED,
    mode=TUNE_MODE,
    param_grid=LINEAR_PARAM_GRID,
    n_iter=SEARCH_N_ITER,
    search_verbose=SEARCH_VERBOSE,
)

# Fit on the training split and score on the validation split; the trailing
# expression makes the notebook render the metrics.
metrics, preds, fitted = evaluate_models(
    dict(Linear=model), X_train, y_train, X_val, y_val
)
metrics


In [ ]:
# `best` is the same object as fitted["Linear"], so the refit below also
# updates the entry held in `fitted`.
best = fitted["Linear"]

# Train+validation data for the final fit, test data for the held-out check.
X_trainval, X_test = trainval_df[features], test_df[features]
y_trainval, y_test = (
    frame["LapTimeSeconds"].to_numpy() for frame in (trainval_df, test_df)
)

best.fit(X_trainval, y_trainval)
test_pred = best.predict(X_test)

plot_actual_vs_pred(y_test, test_pred, title="Linear: Predicted vs Actual")


In [ ]:
# Pass the test targets and the refit model's test predictions to the
# project's error-distribution plot helper, titled as the linear residuals.
plot_error_distribution(y_test, test_pred, title="Linear: Residuals")


In [None]:
import joblib

# Metrics on the held-out test split. The original note describes this split
# as the full 2025 season -- TODO confirm against the split configuration.
linear_test_metrics = compute_full_metrics(y_test, test_pred, n_features=len(features))
# A bare expression in the middle of a cell is not rendered (by default only a
# cell's final expression is displayed), so print the metrics explicitly.
print(linear_test_metrics)

# Persist the refit estimator for later inference, creating the target
# directory (and any missing parents) first.
if SAVE_MODEL:
    MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
    joblib.dump(best, MODEL_PATH)
    print(f"Saved model to {MODEL_PATH}")

# MLflow logging: record the run configuration, the validation and test
# metrics, and the saved model file. Skipped when MLFLOW_ENABLED is false or
# when mlflow cannot be imported.
if MLFLOW_ENABLED:
    try:
        import mlflow
    except ImportError:
        print("MLflow not installed; skipping MLflow logging.")
    else:
        def _coerce_params(params):
            """Return a new dict with every value of *params* converted to ``str``."""
            return dict((name, str(value)) for name, value in params.items())

        if MLFLOW_TRACKING_URI:
            mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
        mlflow.set_experiment(MLFLOW_EXPERIMENT)

        with mlflow.start_run(run_name=MLFLOW_RUN_NAME or "linear_notebook"):
            # Run configuration.
            mlflow.log_param("tune_mode", TUNE_MODE)
            mlflow.log_param("small_mode", SMALL_MODE)
            mlflow.log_param("test_rounds", split_config.test_rounds)

            # Model parameters: use the search result when the fitted object
            # exposes one, otherwise fall back to the fixed alpha (if set).
            if not hasattr(fitted["Linear"], "best_params_"):
                if LINEAR_ALPHA is not None:
                    mlflow.log_param("alpha", LINEAR_ALPHA)
            else:
                mlflow.log_params(_coerce_params(fitted["Linear"].best_params_))

            # Validation metrics come from the first "Linear" row of `metrics`.
            row = metrics.loc[metrics["model"] == "Linear"].iloc[0]
            for metric in ("mae", "rmse", "r2"):
                mlflow.log_metric("val_" + metric, float(row[metric]))

            # Test metrics: log only the keys that were actually computed.
            for metric in ("mae", "rmse", "r2", "mape_pct", "smape_pct"):
                if metric not in linear_test_metrics:
                    continue
                mlflow.log_metric("test_" + metric, float(linear_test_metrics[metric]))

            # Attach the saved model file when saving is on and the file exists.
            if SAVE_MODEL and MODEL_PATH.exists():
                mlflow.log_artifact(str(MODEL_PATH), artifact_path="models")
