In [1]:
import fastf1 as ff1 
ff1.Cache.enable_cache('../data/raw')
print("Cache path data/raw")

Cache path data/raw


In [28]:
# object for the las vegas grand prix
session = ff1.get_session(2025,'Las Vegas','R')
qualify_session = ff1.get_session(2025,"Las Vegas",'Q')
print(session)

# loads the data for Las Vegas grand prix(laps,time,pit,results...)
session.load()

# laoding the laps data
laps = session.laps
info_laps = laps['Driver'].unique()
info_laps_sorted = laps.sort_values(['Driver','LapNumber','LapTime'],ascending=[True,True,False])
info_laps_sorted[['Driver','LapNumber','LapTime']].head(20)

# specific driver info
laps_Ham = laps.pick_drivers('HAM')
print(laps_Ham.head()[['Driver','LapNumber','LapTime']])
laps_Ver = laps.pick_drivers('VER')
print(laps_Ver.head()[['Driver','LapNumber','LapTime']])

#session result 
result = session.results
print("Result of the Las Vegas Grand prix: ")
print(result)


core           INFO 	Loading data for Las Vegas Grand Prix - Race [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


2025 Season Round 22: Las Vegas Grand Prix - Race


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '63', '12', '16', '55', '6', '27', '44', '31', '87', '14', '22', '10', '30', '43', '23', '5', '18', '4', '81']


    Driver  LapNumber                LapTime
350    HAM        1.0 0 days 00:01:50.124000
351    HAM        2.0 0 days 00:01:58.576000
352    HAM        3.0 0 days 00:02:09.181000
353    HAM        4.0 0 days 00:01:40.526000
354    HAM        5.0 0 days 00:01:37.668000
  Driver  LapNumber                LapTime
0    VER        1.0 0 days 00:01:42.062000
1    VER        2.0 0 days 00:01:47.622000
2    VER        3.0 0 days 00:02:13.534000
3    VER        4.0 0 days 00:01:41.671000
4    VER        5.0 0 days 00:01:37.421000
Result of the Las Vegas Grand prix: 
   DriverNumber BroadcastName Abbreviation        DriverId         TeamName  \
1             1  M VERSTAPPEN          VER  max_verstappen  Red Bull Racing   
63           63     G RUSSELL          RUS         russell         Mercedes   
12           12   K ANTONELLI          ANT       antonelli         Mercedes   
16           16     C LECLERC          LEC         leclerc          Ferrari   
55           55       C SAINZ          S

In [29]:
import pandas as pd
import numpy as np

# Checking if the there any missing values in the laps_time
laps_time_missing = laps['LapTime'].isnull().sum()
print("Total number of missig values from the Laps time: ",laps_time_missing)

# time of the laps for prediction
laps['LapTime_s'] = laps['LapTime'].dt.total_seconds()
driver_laps = laps['Driver'].unique()
print(driver_laps)
print(laps['Driver'].value_counts())
laps[['Driver','LapTime_s']].sample(20,random_state=42)
lap_time_mean = laps['LapTime_s'].mean()
laps['LapTime_s'].fillna(lap_time_mean)

# dropping invalid data 
laps = laps[laps['PitInTime'].isna() & laps['PitOutTime'].isna()] # to train model only on the normal laps speed and time and not on pit in and out 
laps = laps.dropna(subset=['LapTime_s'])
print("Checking is any NAN still exists: ",laps['LapTime_s'].isnull().sum())

#def feature and target column
feature_column = ['LapNumber','Stint','TyreLife','Compound','TrackStatus','Driver']
target_column = 'LapTime_s'


data = laps[feature_column + [target_column]].copy()
print(data.head(10))
print(data.shape)

Total number of missig values from the Laps time:  2
['VER' 'RUS' 'ANT' 'LEC' 'SAI' 'HAD' 'HUL' 'HAM' 'OCO' 'BEA' 'ALO' 'TSU'
 'GAS' 'LAW' 'COL' 'ALB' 'BOR' 'STR' 'NOR' 'PIA']
Driver
VER    50
RUS    50
ANT    50
LEC    50
SAI    50
HAD    50
HUL    50
HAM    50
OCO    50
BEA    50
ALO    50
TSU    50
GAS    50
PIA    50
NOR    50
LAW    49
COL    49
ALB    35
BOR     2
STR     1
Name: count, dtype: int64
Checking is any NAN still exists:  0
   LapNumber  Stint  TyreLife Compound TrackStatus Driver  LapTime_s
0        1.0    1.0       1.0   MEDIUM          12    VER    102.062
1        2.0    1.0       2.0   MEDIUM         216    VER    107.622
2        3.0    1.0       3.0   MEDIUM           6    VER    133.534
3        4.0    1.0       4.0   MEDIUM         671    VER    101.671
4        5.0    1.0       5.0   MEDIUM           1    VER     97.421
5        6.0    1.0       6.0   MEDIUM           1    VER     96.594
6        7.0    1.0       7.0   MEDIUM           1    VER     96.383
7 

In [32]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

# 1. Data prep (already done)
X = data[feature_column]
Y = data[target_column]

numeric_feature = ['LapNumber','Stint']
categorical_feature = ['TyreLife','Compound','TrackStatus']

# 2. Data checks
print("NaNs in X:", X.isna().sum().sum())
print("NaNs in y:", Y.isna().sum())
print("Infs in y:", np.isinf(Y).sum())
print("LapTime_s min/max:", Y.min(), Y.max())

# 3. Split data FIRST
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 4. Preprocessing
pre_processing = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_feature), 
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_feature)
    ]
)

# SOLUTION 2: Early stopping WITHOUT Pipeline
print("Training with early stopping ...")
pre_processing.fit(X_train, Y_train)
X_train_proc = pre_processing.transform(X_train)
X_test_proc = pre_processing.transform(X_test)

xgb_early = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    nthread=-1,
    early_stopping_rounds=50
)

xgb_early.fit(X_train_proc, Y_train,
              eval_set=[(X_test_proc, Y_test)],
              verbose=False)

print(f"Best iteration (optimal trees): {xgb_early.best_iteration}")
y_pred_early = xgb_early.predict(X_test_proc)
mae_early = mean_absolute_error(Y_test, y_pred_early)
print(f"Early stopping MAE: {mae_early:.3f}s")

# 5. Pipeline for hyperparameter tuning (no early stopping here)
xgb = XGBRegressor(
    n_estimators=300, 
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    nthread=-1
)

model = Pipeline(steps=[("pre-processing", pre_processing), ("model", xgb)])
model.fit(X_train, Y_train)

y_pred_pipeline = model.predict(X_test)
mae_pipeline = mean_absolute_error(Y_test, y_pred_pipeline)
print(f"Pipeline baseline MAE: {mae_pipeline:.3f}s")

# 6. Hyperparameter tuning
par_tune = {
    "model__learning_rate": [0.01, 0.03, 0.05, 0.08],
    "model__max_depth": [4, 6, 8],
    "model__subsample": [0.8, 0.9],
    "model__colsample_bytree": [0.8, 0.9],
    "model__n_estimators": [200, 300, 400]  
}

print("Starting hyperparameter tuning...")
search = RandomizedSearchCV(
    model, par_tune, n_iter=20, 
    scoring="neg_mean_absolute_error", 
    random_state=42, 
    n_jobs=-1,
    cv=3
)
search.fit(X_train, Y_train)

# 7. Results comparison
print("\n=== RESULTS COMPARISON ===")
print(f"Early stopping MAE: {mae_early:.3f}s (best_iteration={xgb_early.best_iteration})")
print(f"Pipeline baseline MAE: {mae_pipeline:.3f}s")
print(f"Best tuned params: {search.best_params_}")
print(f"Best CV MAE: {-search.best_score_:.3f}s")
print(f"Test MAE with best model: {mean_absolute_error(Y_test, search.predict(X_test)):.3f}s")


NaNs in X: 0
NaNs in y: 0
Infs in y: 0
LapTime_s min/max: 93.365 133.563
Training with early stopping ...
Best iteration (optimal trees): 126
Early stopping MAE: 0.780s
Pipeline baseline MAE: 0.781s
Starting hyperparameter tuning...

=== RESULTS COMPARISON ===
Early stopping MAE: 0.780s (best_iteration=126)
Pipeline baseline MAE: 0.781s
Best tuned params: {'model__subsample': 0.9, 'model__n_estimators': 200, 'model__max_depth': 4, 'model__learning_rate': 0.03, 'model__colsample_bytree': 0.8}
Best CV MAE: 0.763s
Test MAE with best model: 0.762s
