In [1]:
import numpy as np
import pandas as pd

import fastf1
from fastf1 import Cache

from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor


In [10]:
# --- FastF1 cache ---
Cache.enable_cache("./f1_cache")

# --- dataset scope ---
YEARS = list(range(2018, 2025))  # tweak as needed
EVENT = "Singapore"              # robust name
SESSION = "Q"                    # Qualifying

RANDOM_SEED = 42


In [11]:
fastf1.get_event_schedule(2024)[["RoundNumber", "EventName"]]


Unnamed: 0,RoundNumber,EventName
0,0,Pre-Season Testing
1,1,Bahrain Grand Prix
2,2,Saudi Arabian Grand Prix
3,3,Australian Grand Prix
4,4,Japanese Grand Prix
5,5,Chinese Grand Prix
6,6,Miami Grand Prix
7,7,Emilia Romagna Grand Prix
8,8,Monaco Grand Prix
9,9,Canadian Grand Prix


In [12]:
def build_sg_rounds(year_start=2008, year_end=2024):
    sg_rounds = {}
    for y in range(year_start, year_end + 1):
        try:
            sched = fastf1.get_event_schedule(y)
            # Find Singapore GP row (be flexible about naming)
            row = sched[sched["EventName"].str.contains("Singapore", case=False, na=False)]
            if row.empty:
                sg_rounds[y] = None
            else:
                sg_rounds[y] = int(row.iloc[0]["RoundNumber"])
        except Exception:
            sg_rounds[y] = None
    return sg_rounds

SG_ROUNDS = build_sg_rounds(2008, 2024)
SG_ROUNDS



{2008: 15,
 2009: 14,
 2010: 15,
 2011: 14,
 2012: 14,
 2013: 13,
 2014: 14,
 2015: 13,
 2016: 15,
 2017: 14,
 2018: 15,
 2019: 15,
 2020: None,
 2021: None,
 2022: 17,
 2023: 15,
 2024: 18}

In [13]:
def td_to_seconds(s: pd.Series) -> pd.Series:
    return pd.to_timedelta(s).dt.total_seconds()

def build_year_sg_quali(year: int) -> pd.DataFrame:


    round_no = SG_ROUNDS.get(year)
    if round_no is None:
        raise ValueError(f"Singapore GP not held in {year}")

    ses = fastf1.get_session(year, round_no, SESSION)
    ses.load()



    ses.load()

    laps = ses.laps.copy()
    laps = laps[laps["LapTime"].notna()].copy()

    if laps.empty:
        raise ValueError(f"No laps with LapTime for {year} {EVENT} {SESSION}")

    # best lap per driver
    best = (
        laps.sort_values(["Driver", "LapTime"])
            .groupby("Driver", as_index=False)
            .head(1)
            .copy()
    )
    best["year"] = year

    # sector seconds (no hard filtering)
    best["Sector1Time_s"] = td_to_seconds(best["Sector1Time"]) if "Sector1Time" in best.columns else np.nan
    best["Sector2Time_s"] = td_to_seconds(best["Sector2Time"]) if "Sector2Time" in best.columns else np.nan
    best["Sector3Time_s"] = td_to_seconds(best["Sector3Time"]) if "Sector3Time" in best.columns else np.nan

    # tyre info
    best["TyreLife"] = best["TyreLife"].fillna(0).astype(int) if "TyreLife" in best.columns else 0
    best["Compound"] = best["Compound"].fillna("UNKNOWN") if "Compound" in best.columns else "UNKNOWN"

    # results
    res = ses.results.copy()
    res.columns = res.columns.str.strip()

    # this should exist for your years
    results = res[["Abbreviation", "Position", "TeamName"]].copy()
    results = results.rename(columns={
        "Abbreviation": "Driver",
        "Position": "qualifying_position",
        "TeamName": "TeamName"
    })
    results["qualifying_position"] = pd.to_numeric(results["qualifying_position"], errors="coerce")

    df = best.merge(results, on="Driver", how="inner").dropna(subset=["qualifying_position"]).copy()

    # IDs
    df["Driver_id"] = df["Driver"].astype("category").cat.codes
    df["Team_id"] = df["TeamName"].astype("category").cat.codes
    df["Compound_id"] = df["Compound"].astype("category").cat.codes

    return df[[
        "year",
        "Sector1Time_s", "Sector2Time_s", "Sector3Time_s",
        "TyreLife",
        "Driver_id", "Team_id", "Compound_id",
        "qualifying_position"
    ]].reset_index(drop=True)


In [14]:
dfs = []
for y in YEARS:
    try:
        df_y = build_year_sg_quali(y)
        print(y, df_y.shape)
        if len(df_y) > 0:
            dfs.append(df_y)
    except Exception as e:
        print("FAILED", y, "->", type(e).__name__, e)

if not dfs:
    raise RuntimeError("No objects built. Check FAILED logs above.")

df_all = pd.concat(dfs, ignore_index=True)
df_all.shape



core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '5', '77', '7', '3', '11', '8', '31', '27', '14', '55', '16', '9', '10', '20', '28', '2', '35', '18']
core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using c

2018 (20, 9)


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '44', '5', '33', '77', '23', '55', '27', '4', '11', '99', '10', '7', '20', '26', '18', '8', '63', '88', '3']
core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data

2019 (20, 9)
FAILED 2020 -> ValueError Singapore GP not held in 2020
FAILED 2021 -> ValueError Singapore GP not held in 2021


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '11', '44', '55', '14', '4', '10', '1', '20', '22', '63', '18', '47', '5', '24', '77', '3', '31', '23', '6']
core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data

2022 (20, 9)


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '63', '16', '4', '44', '20', '14', '31', '27', '40', '1', '10', '11', '23', '22', '77', '81', '2', '24', '18']
core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_da

2023 (20, 9)


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '44', '63', '81', '27', '14', '22', '16', '55', '23', '43', '11', '20', '31', '3', '18', '10', '77', '24']
core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_da

2024 (20, 9)


(100, 9)

In [9]:
display(df_all["year"].value_counts().sort_index())
display(df_all.isna().sum())


year
2018    20
2019    20
2022    20
2023    20
2024    20
Name: count, dtype: int64

year                   0
Sector1Time_s          0
Sector2Time_s          0
Sector3Time_s          0
TyreLife               0
Driver_id              0
Team_id                0
Compound_id            0
qualifying_position    0
dtype: int64

In [18]:
train_years = [y for y in YEARS if y <= 2022]
val_year = 2023
test_year = 2024

train_df = df_all[df_all["year"].isin(train_years)].copy()
val_df   = df_all[df_all["year"] == val_year].copy()
test_df  = df_all[df_all["year"] == test_year].copy()

train_df.shape, val_df.shape, test_df.shape


((60, 9), (20, 9), (20, 9))

In [19]:
feature_cols = [
    "Sector1Time_s", "Sector2Time_s", "Sector3Time_s",
    "TyreLife",
    "Driver_id", "Team_id", "Compound_id"
]
target_col = "qualifying_position"

X_train, y_train = train_df[feature_cols], train_df[target_col]
X_val, y_val     = val_df[feature_cols],   val_df[target_col]
X_test, y_test   = test_df[feature_cols],  test_df[target_col]

model = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=RANDOM_SEED
)

model.fit(X_train, y_train)

pred_val = model.predict(X_val)
pred_test = model.predict(X_test)

print("VAL  MAE:", mean_absolute_error(y_val, pred_val))
print("TEST MAE:", mean_absolute_error(y_test, pred_test))


VAL  MAE: 4.207349705696106
TEST MAE: 3.1416093349456786


In [20]:
out = test_df.copy()
out["pred_pos"] = pred_test

out_sorted = out.sort_values("pred_pos").reset_index(drop=True)
display(out_sorted[["qualifying_position", "pred_pos"]].head(10))

spearman = out_sorted["qualifying_position"].corr(out_sorted["pred_pos"], method="spearman")
print("Spearman rank corr (test):", spearman)


Unnamed: 0,qualifying_position,pred_pos
0,5.0,5.908404
1,9.0,5.938285
2,6.0,6.21218
3,3.0,6.70085
4,1.0,7.089408
5,7.0,7.271446
6,10.0,7.394323
7,4.0,7.506766
8,11.0,8.690067
9,8.0,9.070718


Spearman rank corr (test): 0.8075187969924811


In [26]:
out = test_df.copy()
out["pred_score"] = pred_test

# Convert scores to predicted positions (rank)
out["predicted_position"] = (
    out["pred_score"]
    .rank(method="first", ascending=True)
    .astype(int)
)

# Sort by predicted grid
out_view = out.sort_values("predicted_position").reset_index(drop=True)

display(
    out_view[[
        "qualifying_position",
        "predicted_position",
        "pred_score"
    ]].head(10)
)


Unnamed: 0,qualifying_position,predicted_position,pred_score
0,5.0,1,5.908404
1,9.0,2,5.938285
2,6.0,3,6.21218
3,3.0,4,6.70085
4,1.0,5,7.089408
5,7.0,6,7.271446
6,10.0,7,7.394323
7,4.0,8,7.506766
8,11.0,9,8.690067
9,8.0,10,9.070718


In [25]:
display(out_view[[
    "predicted_position",
    "Driver_id",
    "qualifying_position",
    "pred_score"
]])


Unnamed: 0,predicted_position,Driver_id,qualifying_position,pred_score
0,1,12,5.0,5.908404
1,2,7,9.0,5.938285
2,3,6,6.0,6.21218
3,4,5,3.0,6.70085
4,5,9,1.0,7.089408
5,6,1,7.0,7.271446
6,7,15,10.0,7.394323
7,8,14,4.0,7.506766
8,9,0,11.0,8.690067
9,10,17,8.0,9.070718


## Notebook Rationale & Methodological Notes (02_in_session_quali_model)

### What is this notebook trying to do?
This notebook builds an **in-session qualifying baseline model** for Formula 1, using telemetry available during qualifying (e.g. sector times, tyre life).  
The goal is **not forecasting**, but to validate whether in-session signals are sufficient to recover the **competitive structure of the grid**.

This acts as a **pipeline and signal validation step** before expanding to:
- multiple circuits
- pre-session forecasting
- strategy-level modelling

---

### Why is this limited to a single circuit (Singapore)?
A single circuit is used intentionally to control for:
- track geometry
- sector definitions
- qualifying format consistency

Singapore is a stable, high-signal circuit that serves as a **controlled environment**.  
Multi-circuit generalisation is deferred to a later phase.

---

### Why only 2018–2025 data?
Telemetry availability before 2018 is incomplete (missing sector times, tyre metadata, etc.).  
To maintain a **consistent feature space**, this notebook restricts to seasons with reliable telemetry.

This is a design choice, not a limitation.

---

### What is the train–evaluation split?
The model uses a **leave-one-year-out (LOYO)** evaluation strategy:
- Train on all Singapore qualifying sessions except one season
- Test on the held-out season
- Repeat for each available year

This avoids temporal leakage and better reflects real-world generalisation.

Random splits are intentionally avoided.

---

### Why use Spearman correlation in evaluation?
Qualifying is fundamentally a **ranking problem**, not a pure regression problem.

Spearman rank correlation measures whether the model:
- preserves the **relative ordering** of drivers
- correctly distinguishes faster vs slower drivers

This complements MAE:
- MAE measures absolute error in predicted positions
- Spearman measures correctness of ordering

High Spearman with moderate MAE indicates strong structural signal.

---

### Why do predictions cluster around the mean?
Tree-based regressors trained with regression losses tend to:
- compress predictions toward the mean
- avoid extreme claims when uncertainty is high

This behaviour is expected and indicates **cautious modelling**, not failure.

At this stage, predictions are best interpreted as **latent pace scores**, where ordering matters more than exact numeric position.

---

### Why not use learning-to-rank or meta-models yet?
Advanced techniques such as:
- learning-to-rank
- stacking / meta-models
- hyperparameter optimisation

are deferred intentionally.

This phase prioritises:
- task clarity
- evaluation integrity
- understanding failure modes

Complex ensembles are only justified once the problem structure is fully understood.

---

### Why not hyperparameter tuning or grid search?
A reasonable baseline configuration already captures most of the available signal.

Early optimisation risks:
- overfitting to small datasets
- hiding data or evaluation issues
- improving metrics without improving understanding

Tuning is deferred to later phases once the task definition is fixed.

---

### What does success look like for this notebook?
Success is defined as:
- stable performance across years
- meaningful Spearman correlation (≈ 0.7–0.85)
- no evidence of data leakage
- interpretable failure modes

This notebook is **not expected** to:
- predict exact grid positions
- generalise across circuits
- support race strategy decisions

---

### What comes next?
Subsequent phases will address:
- multi-circuit generalisation
- circuit-aware models
- ranking objectives
- pre-session forecasting
- strategy-level decision modelling

This notebook establishes the foundation those phases rely on.


### Why is one entire year held out during evaluation instead of using random splits?
At first glance, holding out an entire season may seem wasteful, especially since ensemble methods like Random Forests use bootstrap sampling and random splits during training.

However, this difference is **intentional and necessary** for Formula 1 data.

Random splits and bootstrap sampling assume that samples are **independent and identically distributed (IID)**. In F1, this assumption does not hold:
- cars evolve year to year
- regulations change
- team performance shifts
- drivers change teams
- tyre specifications vary across seasons

As a result, rows from the same season are statistically related.  
Allowing samples from the same year to appear in both training and evaluation would introduce **temporal leakage**, leading to overly optimistic performance estimates.

A **leave-one-year-out** strategy instead asks a harder and more realistic question:
> *If the model were trained on all past seasons, how well would it generalise to a completely new season it has never seen?*

Each year is held out once, and all other years are used for training. This ensures:
- no season contaminates its own evaluation
- every season is evaluated fairly
- full data utilisation across folds

While this approach typically results in lower metrics than random splits, it provides a more honest assessment of real-world generalisation in a non-stationary, time-dependent domain like motorsport.
