# F1 Lap Time Model: XGBoost
Train and evaluate a tuned XGBoost regressor with a strict chronological split.


In [1]:
from pathlib import Path
import sys

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upward from ``start``.

    A directory counts as the root when it contains both a ``src`` directory
    and a ``requirements.txt`` entry. ``start`` itself is checked first, then
    each of its parents from nearest to farthest.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The closest matching directory, or ``start`` unchanged when no
        directory on the way up matches.
    """
    search_order = (start, *start.parents)
    matches = (
        folder
        for folder in search_order
        if (folder / "src").is_dir() and (folder / "requirements.txt").exists()
    )
    # Best-effort lookup: fall back to the starting directory instead of raising.
    return next(matches, start)

# Resolve the project root from the current working directory and put it at
# the front of sys.path so the `src` package can be imported below.
project_root = find_project_root(Path.cwd().resolve())
# Only insert when the root is not already the first entry, so re-running this
# cell does not keep stacking duplicate entries onto sys.path.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))


Note: This notebook requires the `xgboost` package, and the model search can take a while to run.


In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import random

from src.data_loader import load_laps_for_seasons, clean_laps
from src.features import build_feature_table
from src.models import set_global_seed, make_xgboost_search
from src.evaluator import (
    chronological_split,
    train_and_evaluate,
    format_metrics_markdown,
    plot_feature_importance,
    plot_predicted_vs_actual,
)


In [3]:
# Seed handed to set_global_seed here and to the XGBoost search further down.
RANDOM_STATE = 42
set_global_seed(RANDOM_STATE)

# Anchor data paths at the project root resolved in the first cell instead of
# the current working directory, so the cached feature table lives in the same
# place no matter which folder Jupyter was launched from.
DATA_DIR = project_root / "data"
FEATURES_PATH = DATA_DIR / "processed" / "feature_table.parquet"


In [4]:
def load_features():
    """Return the feature table together with the model's column lists.

    The table is read from ``FEATURES_PATH`` when that file exists. Otherwise
    it is rebuilt for seasons ``[2022, 2023]`` (load laps, clean them, build
    the feature table) and written to ``FEATURES_PATH`` for later runs.

    Returns:
        A ``(feature_df, numeric_features, categorical_features)`` tuple: the
        feature table and the two lists of column names used as model inputs.

    Raises:
        ValueError: If the feature table lacks any of the listed columns, for
            example because a cached file predates a feature change.
    """
    numeric_features = [
        "LapNumber",
        "Stint",
        "TyreLife",
        "LapTimeLag1",
        "LapTimeLag2",
        "LapTimeLag3",
        "RollingMean3",
    ]
    categorical_features = [
        "Driver",
        "Team",
        "Compound",
        "TrackStatusFlag",
        "Circuit",
    ]

    loaded_from_cache = FEATURES_PATH.exists()
    if loaded_from_cache:
        feature_df = pd.read_parquet(FEATURES_PATH)
    else:
        raw_laps = load_laps_for_seasons([2022, 2023])
        clean_laps_df = clean_laps(raw_laps)
        # The extra return values are ignored; the column lists above are used
        # instead so the cached and rebuilt paths return the same lists.
        feature_df, _, _ = build_feature_table(clean_laps_df)

    # Fail here with a clear message rather than deep inside model training.
    missing_columns = [
        column
        for column in numeric_features + categorical_features
        if column not in feature_df.columns
    ]
    if missing_columns:
        raise ValueError(
            f"Feature table is missing expected columns {missing_columns}; "
            f"if it was read from {FEATURES_PATH}, delete that file to rebuild it."
        )

    if not loaded_from_cache:
        # Cache only after validation so a bad table is never persisted.
        FEATURES_PATH.parent.mkdir(parents=True, exist_ok=True)
        feature_df.to_parquet(FEATURES_PATH, index=False)

    return feature_df, numeric_features, categorical_features


In [5]:
# Load the feature table and split it chronologically with split_season=2023
# (the exact boundary rule lives in chronological_split).
feature_df, numeric_features, categorical_features = load_features()
train_df, test_df = chronological_split(feature_df, split_season=2023)

# Build the XGBoost search and evaluate it on the split under the "xgboost" key.
model = make_xgboost_search(numeric_features, categorical_features, RANDOM_STATE)
feature_columns = numeric_features + categorical_features
metrics_df, predictions, fitted_models = train_and_evaluate(
    {"xgboost": model}, train_df, test_df, feature_columns
)

# Last expression of the cell: display the metrics table.
metrics_df


Unnamed: 0,mae,rmse,r2,model
0,1.330697,3.22417,0.910988,xgboost


In [6]:
# Render the metrics as Markdown table text and print it as plain text.
metrics_markdown = format_metrics_markdown(metrics_df)
print(metrics_markdown)


| Model | MAE | RMSE | R2 |
|---|---|---|---|
| xgboost | 1.3307 | 3.2242 | 0.9110 |


In [8]:
# Save the top-10 feature importances of the fitted XGBoost model.
plot_feature_importance(
    fitted_models["xgboost"],
    "reports/feature_importance_xgboost_top10.png",
    top_n=10,
)

# The plotted event and the output file name come from the same two values, so
# the saved figure is always labelled with the race it actually shows.
EVENT_NAME = "Japan"
EVENT_SEASON = 2023

plot_predicted_vs_actual(
    test_df,
    predictions["xgboost"],
    event_name=EVENT_NAME,
    season=EVENT_SEASON,
    output_path=f"reports/{EVENT_NAME.lower()}_{EVENT_SEASON}_pred_vs_actual.png",
)
