# 03 - XGBoost
Train XGBoost with optional hyperparameter tuning and visualize results.


In [1]:
from pathlib import Path
import sys

ROOT = Path("..").resolve()
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

import numpy as np
import pandas as pd


In [2]:
from pathlib import Path

from src.models import make_xgboost_model, make_pipeline, build_search, make_tree_preprocessor
from src.eval import evaluate_models, compute_full_metrics
from src.plots import plot_actual_vs_pred, plot_error_distribution
from _common import load_dataset, prepare_features, ROOT
from src.split import SplitConfig

SEED = 42

SMALL_MODE = False  # toggle for quick iteration
TUNE_MODE = "fast"  # off | fast | full
SEARCH_VERBOSE = 2  # sklearn CV logging
SEARCH_N_ITER = None  # only used for randomized search

EARLY_STOPPING = True
EARLY_STOPPING_ROUNDS = 50
EARLY_STOPPING_METRIC = "mae"
EARLY_STOPPING_VERBOSE = True

# MLflow
MLFLOW_ENABLED = True
MLFLOW_EXPERIMENT = "f1-laptime"
MLFLOW_TRACKING_URI = (ROOT / "mlruns").as_uri()
MLFLOW_RUN_NAME = "xgboost_notebook"

# Verify MLflow availability
if MLFLOW_ENABLED:
    try:
        import mlflow  # noqa: F401
        print(f"MLflow available: {mlflow.__version__}")
    except Exception:
        print("MLflow not installed; set MLFLOW_ENABLED=False or install mlflow.")
        MLFLOW_ENABLED = False

# Model saving
SAVE_MODEL = True
MODEL_PATH = ROOT / "reports" / "models" / "xgboost.joblib"

# Base params (used even when tuning is off)
XGB_MAX_DEPTH = None
XGB_LEARNING_RATE = None
XGB_N_ESTIMATORS = None
XGB_SUBSAMPLE = None
XGB_COLSAMPLE_BYTREE = None
XGB_MIN_CHILD_WEIGHT = None
XGB_GAMMA = None
XGB_REG_ALPHA = None
XGB_REG_LAMBDA = None

# Tuning grid (used when tuning is on)
XGB_PARAM_GRID = {
    "model__max_depth": [3, 4, 5, 6, 8],
    "model__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "model__subsample": [0.7, 0.8, 0.9],
    "model__colsample_bytree": [0.7, 0.8, 0.9],
    "model__min_child_weight": [1, 3, 5],
    "model__gamma": [0.0, 0.5, 1.0],
    "model__reg_alpha": [0.0, 0.1, 0.5],
    "model__reg_lambda": [1.0, 2.0, 5.0],
    "model__n_estimators": [300, 600, 900, 1200],
}
if SMALL_MODE:
    XGB_MAX_DEPTH = 12
    XGB_LEARNING_RATE = 0.05
    XGB_N_ESTIMATORS = 600
    XGB_SUBSAMPLE = 0.8
    XGB_COLSAMPLE_BYTREE = 0.7
    XGB_MIN_CHILD_WEIGHT = 3
    XGB_GAMMA = 0.0
    XGB_REG_ALPHA = 0.1
    XGB_REG_LAMBDA = 2.0
    SEARCH_N_ITER = 12
    XGB_PARAM_GRID = {
        "model__max_depth": [3, 4, 5, 12],
        "model__learning_rate": [0.03, 0.05, 0.1],
        "model__subsample": [0.7, 0.8],
        "model__colsample_bytree": [0.7, 0.8],
        "model__min_child_weight": [1, 3],
        "model__gamma": [0.0, 0.5],
        "model__reg_alpha": [0.0, 0.1],
        "model__reg_lambda": [1.0, 2.0],
        "model__n_estimators": [300, 600, 800],
        "model__learning_rate": [0.03, 0.05, 0.1],
    }

split_config = SplitConfig(test_rounds=None)
df, metadata = load_dataset()
train_df, val_df, trainval_df, test_df, features = prepare_features(df, metadata, split_config=split_config)

X_train = train_df[features]
y_train = train_df["LapTimeSeconds"].to_numpy()
X_val = val_df[features]
y_val = val_df["LapTimeSeconds"].to_numpy()

xgb_kwargs = {}
if XGB_MAX_DEPTH is not None:
    xgb_kwargs["max_depth"] = int(XGB_MAX_DEPTH)
if XGB_LEARNING_RATE is not None:
    xgb_kwargs["learning_rate"] = float(XGB_LEARNING_RATE)
if XGB_N_ESTIMATORS is not None:
    xgb_kwargs["n_estimators"] = int(XGB_N_ESTIMATORS)
if XGB_SUBSAMPLE is not None:
    xgb_kwargs["subsample"] = float(XGB_SUBSAMPLE)
if XGB_COLSAMPLE_BYTREE is not None:
    xgb_kwargs["colsample_bytree"] = float(XGB_COLSAMPLE_BYTREE)
if XGB_MIN_CHILD_WEIGHT is not None:
    xgb_kwargs["min_child_weight"] = float(XGB_MIN_CHILD_WEIGHT)
if XGB_GAMMA is not None:
    xgb_kwargs["gamma"] = float(XGB_GAMMA)
if XGB_REG_ALPHA is not None:
    xgb_kwargs["reg_alpha"] = float(XGB_REG_ALPHA)
if XGB_REG_LAMBDA is not None:
    xgb_kwargs["reg_lambda"] = float(XGB_REG_LAMBDA)

base = make_pipeline(make_xgboost_model(SEED, **xgb_kwargs), features)
model = build_search(
    "XGBoost",
    base,
    random_state=SEED,
    mode=TUNE_MODE,
    param_grid=XGB_PARAM_GRID,
    n_iter=SEARCH_N_ITER,
    search_verbose=SEARCH_VERBOSE,
)
metrics, preds, fitted = evaluate_models({"XGBoost": model}, X_train, y_train, X_val, y_val)
metrics


MLflow available: 3.8.1


Models:   0%|          | 0/1 [00:00<?, ?it/s]

Train size: 39,247 | Eval size: 23,199
Features: 49
Training XGBoost...
Fitting 4 folds for each of 12 candidates, totalling 48 fits


Models: 100%|██████████| 1/1 [00:36<00:00, 36.05s/it]

XGBoost -> MAE: 0.9862, R2: 0.9669 (36.0s)
XGBoost best_params: {'model__subsample': 0.7, 'model__reg_lambda': 5.0, 'model__reg_alpha': 0.1, 'model__n_estimators': 600, 'model__min_child_weight': 5, 'model__max_depth': 8, 'model__learning_rate': 0.05, 'model__gamma': 1.0, 'model__colsample_bytree': 0.8}





Unnamed: 0,mae,rmse,r2,model
0,0.986169,1.838638,0.966892,XGBoost


In [3]:
best_search = fitted["XGBoost"]
if hasattr(best_search, "best_params_"):
    best_params = {k.replace("model__", ""): v for k, v in best_search.best_params_.items()}
else:
    best_params = xgb_kwargs

base_n_estimators = int(best_params.get("n_estimators", 1200))

# Avoid passing n_estimators twice
best_params = dict(best_params)
best_params.pop("n_estimators", None)

X_trainval = trainval_df[features]
y_trainval = trainval_df["LapTimeSeconds"].to_numpy()
X_test = test_df[features]
y_test = test_df["LapTimeSeconds"].to_numpy()

if EARLY_STOPPING:
    import xgboost as xgb

    # Early stopping on train/val to pick best n_estimators
    preprocess = make_tree_preprocessor(features)
    X_train_p = preprocess.fit_transform(X_train)
    X_val_p = preprocess.transform(X_val)

    early_model = make_xgboost_model(
        SEED,
        **best_params,
        n_estimators=max(base_n_estimators, 1200),
        eval_metric=EARLY_STOPPING_METRIC,
        callbacks=[
            xgb.callback.EarlyStopping(
                rounds=EARLY_STOPPING_ROUNDS,
                metric_name=EARLY_STOPPING_METRIC,
                data_name="validation_0",
                save_best=True,
                maximize=False,
            )
        ],
    )
    early_model.fit(
        X_train_p,
        y_train,
        eval_set=[(X_val_p, y_val)],
        verbose=EARLY_STOPPING_VERBOSE,
    )

    best_iteration = getattr(early_model, "best_iteration", None)
    if best_iteration is not None:
        best_n_estimators = int(best_iteration) + 1
    else:
        best_n_estimators = max(base_n_estimators, 1200)

    # Refit on train+val with selected n_estimators
    preprocess_final = make_tree_preprocessor(features)
    X_trainval_p = preprocess_final.fit_transform(X_trainval)
    X_test_p = preprocess_final.transform(X_test)

    final_model = make_xgboost_model(SEED, **best_params, n_estimators=best_n_estimators)
    final_model.fit(X_trainval_p, y_trainval, verbose=False)

    from sklearn.pipeline import Pipeline
    best = Pipeline([("preprocess", preprocess_final), ("model", final_model)])
    test_pred = best.predict(X_test)
else:
    best = best_search.best_estimator_ if hasattr(best_search, "best_estimator_") else best_search
    best.fit(X_trainval, y_trainval)
    test_pred = best.predict(X_test)

plot_actual_vs_pred(y_test, test_pred, title="XGBoost: Predicted vs Actual")


[0]	validation_0-mae:8.30561
[1]	validation_0-mae:7.89482
[2]	validation_0-mae:7.50698
[3]	validation_0-mae:7.14156
[4]	validation_0-mae:6.79643
[5]	validation_0-mae:6.46115
[6]	validation_0-mae:6.13170
[7]	validation_0-mae:5.83904
[8]	validation_0-mae:5.56072
[9]	validation_0-mae:5.29955
[10]	validation_0-mae:5.05149
[11]	validation_0-mae:4.81108
[12]	validation_0-mae:4.58128
[13]	validation_0-mae:4.37326
[14]	validation_0-mae:4.16932
[15]	validation_0-mae:3.97572
[16]	validation_0-mae:3.79627
[17]	validation_0-mae:3.62943
[18]	validation_0-mae:3.47239
[19]	validation_0-mae:3.31625
[20]	validation_0-mae:3.17919
[21]	validation_0-mae:3.04031
[22]	validation_0-mae:2.90841
[23]	validation_0-mae:2.79159
[24]	validation_0-mae:2.68334
[25]	validation_0-mae:2.57277
[26]	validation_0-mae:2.47235
[27]	validation_0-mae:2.38237
[28]	validation_0-mae:2.29452
[29]	validation_0-mae:2.21320
[30]	validation_0-mae:2.13634
[31]	validation_0-mae:2.06157
[32]	validation_0-mae:1.98816
[33]	validation_0-ma

In [4]:
plot_error_distribution(y_test, test_pred, title="XGBoost: Residuals")


In [5]:
import plotly.express as px

error_df = test_df.copy()
error_df["y_true"] = y_test
error_df["y_pred"] = test_pred
error_df["abs_error"] = (error_df["y_pred"] - error_df["y_true"]).abs()

group_cols = ["EventName", "Circuit", "Compound", "Team", "Driver"]
group_cols = [c for c in group_cols if c in error_df.columns]

for col in group_cols:
    grp = (
        error_df.groupby(col, dropna=False)
        .agg(n=("abs_error", "size"), mae=("abs_error", "mean"))
        .sort_values("mae", ascending=False)
        .reset_index()
    )
    grp = grp[grp["n"] >= 50].head(15)
    fig = px.bar(
        grp,
        x="mae",
        y=col,
        orientation="h",
        title=f"MAE by {col} (Top 15, n>=50)",
    )
    fig.update_layout(xaxis_title="MAE (s)")
    fig


In [6]:
import joblib

# Test metrics (full 2025 season)
xgb_test_metrics = compute_full_metrics(y_test, test_pred, n_features=len(features))
xgb_test_metrics

# Save model for inference
if SAVE_MODEL:
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(best, MODEL_PATH)
    print("Saved model to {}".format(MODEL_PATH))

# MLflow logging
if MLFLOW_ENABLED:
    try:
        import mlflow
    except ImportError:
        print("MLflow not installed; skipping MLflow logging.")
    else:
        def _coerce_params(params):
            return {k: str(v) for k, v in params.items()}

        if MLFLOW_TRACKING_URI:
            mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
        mlflow.set_experiment(MLFLOW_EXPERIMENT)
        with mlflow.start_run(run_name=MLFLOW_RUN_NAME or "xgboost_notebook"):
            mlflow.log_param("tune_mode", TUNE_MODE)
            mlflow.log_param("small_mode", SMALL_MODE)
            mlflow.log_param("test_rounds", split_config.test_rounds)

            if hasattr(fitted["XGBoost"], "best_params_"):
                mlflow.log_params(_coerce_params(fitted["XGBoost"].best_params_))
            else:
                mlflow.log_params(_coerce_params(xgb_kwargs))

            row = metrics[metrics["model"] == "XGBoost"].iloc[0]
            for metric in ("mae", "rmse", "r2"):
                mlflow.log_metric("val_" + metric, float(row[metric]))
            for metric in ("mae", "rmse", "r2", "mape_pct", "smape_pct"):
                if metric in xgb_test_metrics:
                    mlflow.log_metric("test_" + metric, float(xgb_test_metrics[metric]))

            if SAVE_MODEL and MODEL_PATH.exists():
                mlflow.log_artifact(str(MODEL_PATH), artifact_path="models")


Saved model to C:\Users\tvcar\Desktop\FOM\2. Semester\Maschinelles Lernen\ml_f1\reports\models\xgboost.joblib



The filesystem tracking backend (e.g., './mlruns') will be deprecated in February 2026. Consider transitioning to a database backend (e.g., 'sqlite:///mlflow.db') to take advantage of the latest MLflow features. See https://github.com/mlflow/mlflow/issues/18534 for more details and migration guidance. For migrating existing data, https://github.com/mlflow/mlflow-export-import can be used.

