# 03 - XGBoost
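Train an XGBoost model with optional hyperparameter tuning, evaluate it on the validation split, then refit on train + validation and visualize its predictions and residuals on the test split.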


In [ ]:
from pathlib import Path
import sys

# Make the parent of the current working directory importable, so that the
# ``src`` package used by the following cells can be resolved.
ROOT = Path("..").resolve()
root_entry = str(ROOT)
if root_entry not in sys.path:
    # Append only once so re-running this cell does not duplicate the entry.
    sys.path.append(root_entry)

import numpy as np
import pandas as pd


In [ ]:
from src.models import make_xgboost_model, make_pipeline, build_search
from src.eval import evaluate_models
from src.plots import plot_actual_vs_pred, plot_error_distribution
from _common import load_dataset, prepare_features
from src.split import SplitConfig

# --- Configuration ----------------------------------------------------------
SEED = 42
TUNE_MODE = "fast"  # off | fast | full
split_config = SplitConfig(test_rounds=6)

# --- Data -------------------------------------------------------------------
df, metadata = load_dataset()
(train_df, val_df, trainval_df, test_df, features) = prepare_features(
    df,
    metadata,
    split_config=split_config,
)

# Column holding the regression target.
TARGET = "LapTimeSeconds"
X_train, y_train = train_df[features], train_df[TARGET].to_numpy()
X_val, y_val = val_df[features], val_df[TARGET].to_numpy()

# --- Model ------------------------------------------------------------------
# The same seed is passed to both the XGBoost model and the search wrapper.
xgb_estimator = make_xgboost_model(SEED)
base = make_pipeline(xgb_estimator, features)
model = build_search("XGBoost", base, random_state=SEED, mode=TUNE_MODE)

# evaluate_models receives the train and validation splits and returns
# (metrics, preds, fitted); ``fitted`` is reused by the next cell.
metrics, preds, fitted = evaluate_models(
    {"XGBoost": model},
    X_train,
    y_train,
    X_val,
    y_val,
)
metrics  # last expression in the cell, so the notebook displays it


In [ ]:
# Use the tuned estimator when the fitted object exposes ``best_estimator_``;
# otherwise fall back to the fitted object itself.
xgb_fitted = fitted["XGBoost"]
best = getattr(xgb_fitted, "best_estimator_", xgb_fitted)

# Final fit uses the combined train + validation frame; the test frame is only
# used for prediction and plotting.
X_trainval, y_trainval = trainval_df[features], trainval_df["LapTimeSeconds"].to_numpy()
X_test, y_test = test_df[features], test_df["LapTimeSeconds"].to_numpy()

# NOTE(review): ``best`` is the same object that ``fitted`` holds, so this
# refit presumably replaces the validation-time fit in place - confirm that is
# acceptable before reusing ``fitted`` afterwards.
best.fit(X_trainval, y_trainval)
test_pred = best.predict(X_test)

plot_actual_vs_pred(
    y_test,
    test_pred,
    title="XGBoost: Predicted vs Actual",
)


In [ ]:
# Plot the error distribution for the test-set predictions made by the
# refitted model in the previous cell.
plot_error_distribution(
    y_test,
    test_pred,
    title="XGBoost: Residuals",
)
