# 03 - XGBoost
Train XGBoost with optional hyperparameter tuning and visualize results.


In [9]:
from pathlib import Path
import sys

# Resolve the parent of the working directory and make sure it is on sys.path
# (appended once only) so the `src` package imports used below can be found.
ROOT = Path("..").resolve()
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path.append(str(ROOT))

import numpy as np
import pandas as pd


In [10]:
from pathlib import Path

from src.models import make_xgboost_model, make_pipeline, build_search
from src.eval import evaluate_models
from src.plots import plot_actual_vs_pred, plot_error_distribution
from _common import load_dataset, prepare_features, ROOT
from src.split import SplitConfig

# Run configuration. SEED is passed to both make_xgboost_model and
# build_search(random_state=...) further down in this cell.
SEED = 42

SMALL_MODE = True  # toggle for quick iteration
TUNE_MODE = "fast"  # off | fast | full
SEARCH_VERBOSE = 2  # sklearn CV logging
SEARCH_N_ITER = None  # only used for randomized search

# MLflow
MLFLOW_ENABLED = True
MLFLOW_EXPERIMENT = "f1-laptime"
MLFLOW_TRACKING_URI = (ROOT / "mlruns").as_uri()
MLFLOW_RUN_NAME = "xgboost_notebook"

# Verify MLflow availability. Any failure switches MLFLOW_ENABLED off so the
# logging cell is skipped, but a missing package and a broken installation are
# reported separately instead of both being labelled "not installed".
if MLFLOW_ENABLED:
    try:
        import mlflow
        print(f"MLflow available: {mlflow.__version__}")
    except ImportError:
        print("MLflow not installed; set MLFLOW_ENABLED=False or install mlflow.")
        MLFLOW_ENABLED = False
    except Exception as exc:
        # Best-effort: keep the notebook running, but surface the real cause.
        print(f"MLflow import failed ({type(exc).__name__}: {exc}); MLflow logging disabled.")
        MLFLOW_ENABLED = False

# Model saving
SAVE_MODEL = True
MODEL_PATH = ROOT / "reports" / "models" / "xgboost.joblib"

# SMALL_MODE selects a fixed preset; otherwise every override stays None, which
# means no explicit value is forwarded for that setting later in this cell.
if SMALL_MODE:
    # Base params (used even when tuning is off)
    XGB_MAX_DEPTH = 4
    XGB_LEARNING_RATE = 0.05
    XGB_N_ESTIMATORS = 300
    XGB_SUBSAMPLE = 0.8
    XGB_COLSAMPLE_BYTREE = 0.8
    XGB_MIN_CHILD_WEIGHT = 3
    XGB_REG_ALPHA = 0.1
    XGB_REG_LAMBDA = 2.0

    # Number of sampled candidates (only used for randomized search)
    SEARCH_N_ITER = 8

    # Tuning grid (used when tuning is on)
    XGB_PARAM_GRID = {
        "model__max_depth": [3, 4, 5],
        "model__learning_rate": [0.03, 0.05, 0.1],
        "model__subsample": [0.7, 0.8],
        "model__colsample_bytree": [0.7, 0.8],
        "model__min_child_weight": [1, 3],
        "model__reg_alpha": [0.0, 0.1],
        "model__reg_lambda": [1.0, 2.0],
        "model__n_estimators": [300, 600],
    }
else:
    XGB_MAX_DEPTH = XGB_LEARNING_RATE = XGB_N_ESTIMATORS = XGB_SUBSAMPLE = None
    XGB_COLSAMPLE_BYTREE = XGB_MIN_CHILD_WEIGHT = None
    XGB_REG_ALPHA = XGB_REG_LAMBDA = None
    XGB_PARAM_GRID = None

# Load the dataset and let prepare_features produce the train / validation /
# train+validation / test frames together with the list of feature columns.
split_config = SplitConfig(test_rounds=6)
df, metadata = load_dataset()
(
    train_df,
    val_df,
    trainval_df,
    test_df,
    features,
) = prepare_features(df, metadata, split_config=split_config)

# Model selection fits on the train frame and scores on the validation frame;
# the target column is LapTimeSeconds.
X_train, y_train = train_df[features], train_df["LapTimeSeconds"].to_numpy()
X_val, y_val = val_df[features], val_df["LapTimeSeconds"].to_numpy()

# Collect only the overrides that were actually set: each entry is
# (keyword for make_xgboost_model, cast applied to the value, configured value),
# and settings left at None are omitted from the keyword arguments.
xgb_kwargs = {
    keyword: cast(configured)
    for keyword, cast, configured in (
        ("max_depth", int, XGB_MAX_DEPTH),
        ("learning_rate", float, XGB_LEARNING_RATE),
        ("n_estimators", int, XGB_N_ESTIMATORS),
        ("subsample", float, XGB_SUBSAMPLE),
        ("colsample_bytree", float, XGB_COLSAMPLE_BYTREE),
        ("min_child_weight", float, XGB_MIN_CHILD_WEIGHT),
        ("reg_alpha", float, XGB_REG_ALPHA),
        ("reg_lambda", float, XGB_REG_LAMBDA),
    )
    if configured is not None
}

# Options forwarded to build_search; the same SEED drives the estimator and the search.
search_options = {
    "random_state": SEED,
    "mode": TUNE_MODE,
    "param_grid": XGB_PARAM_GRID,
    "n_iter": SEARCH_N_ITER,
    "search_verbose": SEARCH_VERBOSE,
}

# Build the estimator, combine it with the feature list via make_pipeline, then
# hand the pipeline to build_search.
xgb_estimator = make_xgboost_model(SEED, **xgb_kwargs)
base = make_pipeline(xgb_estimator, features)
model = build_search("XGBoost", base, **search_options)

# Fit on the train split and score on the validation split; `fitted` maps the
# model name to its fitted object and is reused by the later cells.
candidates = {"XGBoost": model}
metrics, preds, fitted = evaluate_models(candidates, X_train, y_train, X_val, y_val)
metrics


Models:   0%|          | 0/1 [00:00<?, ?it/s]

Train size: 40,303 | Eval size: 23,256
Features: 45
Training XGBoost...
Fitting 4 folds for each of 8 candidates, totalling 32 fits


Models: 100%|██████████| 1/1 [00:27<00:00, 27.61s/it]

XGBoost -> MAE: 0.8835, R2: 0.9705 (27.6s)
XGBoost best_params: {'model__subsample': 0.6, 'model__reg_lambda': 1.0, 'model__reg_alpha': 0.1, 'model__n_estimators': 800, 'model__min_child_weight': 3, 'model__max_depth': 8, 'model__learning_rate': 0.1, 'model__colsample_bytree': 0.7}





Unnamed: 0,mae,rmse,r2,model
0,0.883468,1.738775,0.970546,XGBoost


In [11]:
import copy

# Pick the tuned estimator when a search was run (it exposes best_estimator_),
# otherwise the fitted model itself. The final refit happens on an independent
# copy so that fitted["XGBoost"] stays exactly as it was when the validation
# metrics were computed, instead of being silently retrained in place on
# train+validation data.
selected = getattr(fitted["XGBoost"], "best_estimator_", fitted["XGBoost"])
best = copy.deepcopy(selected)

X_trainval = trainval_df[features]
y_trainval = trainval_df["LapTimeSeconds"].to_numpy()
X_test = test_df[features]
y_test = test_df["LapTimeSeconds"].to_numpy()

# Refit on train+validation, then predict the test split.
best.fit(X_trainval, y_trainval)
test_pred = best.predict(X_test)

plot_actual_vs_pred(y_test, test_pred, title="XGBoost: Predicted vs Actual")


In [12]:
# Error distribution of the test-split predictions made by the refit model.
plot_error_distribution(
    y_test,
    test_pred,
    title="XGBoost: Residuals",
)


In [None]:
from src.eval import compute_metrics
import joblib

# Test metrics for the predictions of the model refit on train+validation.
xgb_test_metrics = compute_metrics(y_test, test_pred)
# A bare expression is only rendered when it is the last statement of a cell,
# so the metrics are printed explicitly to make them visible here.
print(f"Test metrics: {xgb_test_metrics}")

# Save model for inference: create the target directory if needed, then
# serialize the refit estimator with joblib.
if SAVE_MODEL:
    model_dir = MODEL_PATH.parent
    model_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(best, MODEL_PATH)
    print(f"Saved model to {MODEL_PATH}")

# MLflow logging
if MLFLOW_ENABLED:
    try:
        import mlflow
    except ImportError:
        print("MLflow not installed; skipping MLflow logging.")
    else:
        def _coerce_params(params):
            """Return a new dict with every value of ``params`` converted to ``str``."""
            return dict((key, str(value)) for key, value in params.items())

        if MLFLOW_TRACKING_URI:
            mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
        mlflow.set_experiment(MLFLOW_EXPERIMENT)

        run_name = MLFLOW_RUN_NAME or "xgboost_notebook"
        with mlflow.start_run(run_name=run_name):
            # Run-level settings
            mlflow.log_param("tune_mode", TUNE_MODE)
            mlflow.log_param("small_mode", SMALL_MODE)
            mlflow.log_param("test_rounds", split_config.test_rounds)

            # Hyperparameters: the search winner when one exists, otherwise the
            # explicit overrides that were passed to the estimator.
            search_result = fitted["XGBoost"]
            if hasattr(search_result, "best_params_"):
                logged_params = search_result.best_params_
            else:
                logged_params = xgb_kwargs
            mlflow.log_params(_coerce_params(logged_params))

            # Validation metrics come from the evaluation table, test metrics
            # from compute_metrics; all validation values are logged first.
            is_xgb = metrics["model"] == "XGBoost"
            row = metrics[is_xgb].iloc[0]
            for prefix, scores in (("val", row), ("test", xgb_test_metrics)):
                for metric in ("mae", "rmse", "r2"):
                    mlflow.log_metric(f"{prefix}_{metric}", float(scores[metric]))

            # Attach the saved model file only if it was written.
            if SAVE_MODEL and MODEL_PATH.exists():
                mlflow.log_artifact(str(MODEL_PATH), artifact_path="models")
