In [1]:
import fastf1 as ff1 
ff1.Cache.enable_cache('../data/raw')
print("Cache path data/raw")

Cache path data/raw


In [2]:
# object for the las vegas grand prix
session = ff1.get_session(2025,'Las Vegas','R')
qualify_session = ff1.get_session(2025,"Las Vegas",'Q')
print(session)

# loads the data for Las Vegas grand prix(laps,time,pit,results...)
session.load()

# laoding the laps data
laps = session.laps
info_laps = laps['Driver'].unique()
info_laps_sorted = laps.sort_values(['Driver','LapNumber','LapTime'],ascending=[True,True,False])
info_laps_sorted[['Driver','LapNumber','LapTime']].head(20)

# specific driver info
laps_Ham = laps.pick_drivers('HAM')
print(laps_Ham.head()[['Driver','LapNumber','LapTime']])
laps_Ver = laps.pick_drivers('VER')
print(laps_Ver.head()[['Driver','LapNumber','LapTime']])

#session result 
result = session.results
print("Result of the Las Vegas Grand prix: ")
print(result)


core           INFO 	Loading data for Las Vegas Grand Prix - Race [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


2025 Season Round 22: Las Vegas Grand Prix - Race


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '63', '12', '16', '55', '6', '27', '44', '31', '87', '14', '22', '10', '30', '43', '23', '5', '18', '4', '81']


    Driver  LapNumber                LapTime
350    HAM        1.0 0 days 00:01:50.124000
351    HAM        2.0 0 days 00:01:58.576000
352    HAM        3.0 0 days 00:02:09.181000
353    HAM        4.0 0 days 00:01:40.526000
354    HAM        5.0 0 days 00:01:37.668000
  Driver  LapNumber                LapTime
0    VER        1.0 0 days 00:01:42.062000
1    VER        2.0 0 days 00:01:47.622000
2    VER        3.0 0 days 00:02:13.534000
3    VER        4.0 0 days 00:01:41.671000
4    VER        5.0 0 days 00:01:37.421000
Result of the Las Vegas Grand prix: 
   DriverNumber BroadcastName Abbreviation        DriverId         TeamName  \
1             1  M VERSTAPPEN          VER  max_verstappen  Red Bull Racing   
63           63     G RUSSELL          RUS         russell         Mercedes   
12           12   K ANTONELLI          ANT       antonelli         Mercedes   
16           16     C LECLERC          LEC         leclerc          Ferrari   
55           55       C SAINZ          S

In [11]:
import pandas as pd
import numpy as np

# Checking if the there any missing values in the laps_time
laps_time_missing = laps['LapTime'].isnull().sum()
print("Total number of missig values from the Laps time: ",laps_time_missing)

# time of the laps for prediction
laps['LapTime_s'] = laps['LapTime'].dt.total_seconds()
driver_laps = laps['Driver'].unique()
print(driver_laps)
print(laps['Driver'].value_counts())
laps[['Driver','LapTime_s']].sample(20,random_state=42)
lap_time_mean = laps['LapTime_s'].mean()
laps['LapTime_s'].fillna(lap_time_mean)

# dropping invalid data 
laps = laps[laps['PitInTime'].isna() & laps['PitOutTime'].isna()] # to train model only on the normal laps speed and time and not on pit in and out 
laps = laps.dropna(subset=['LapTime_s'])
print("Checking is any NAN still exists: ",laps['LapTime_s'].isnull().sum())

#def feature and target column
feature_column = ['LapNumber','Stint','TyreLife','Compound','TrackStatus']
target_column = 'LapTime_s'


data = laps[feature_column + [target_column]].copy()
print(data.head(10))
print(data.shape)

Total number of missig values from the Laps time:  0
['VER' 'RUS' 'ANT' 'LEC' 'SAI' 'HAD' 'HUL' 'HAM' 'OCO' 'BEA' 'ALO' 'TSU'
 'GAS' 'LAW' 'COL' 'ALB' 'NOR' 'PIA']
Driver
VER    48
RUS    48
ANT    48
LEC    48
SAI    48
HAD    48
HUL    48
HAM    48
OCO    48
BEA    48
ALO    48
GAS    48
NOR    48
PIA    48
COL    47
TSU    46
LAW    45
ALB    30
Name: count, dtype: int64
Checking is any NAN still exists:  0
   LapNumber  Stint  TyreLife Compound TrackStatus  LapTime_s
0        1.0    1.0       1.0   MEDIUM          12    102.062
1        2.0    1.0       2.0   MEDIUM         216    107.622
2        3.0    1.0       3.0   MEDIUM           6    133.534
3        4.0    1.0       4.0   MEDIUM         671    101.671
4        5.0    1.0       5.0   MEDIUM           1     97.421
5        6.0    1.0       6.0   MEDIUM           1     96.594
6        7.0    1.0       7.0   MEDIUM           1     96.383
7        8.0    1.0       8.0   MEDIUM           1     96.602
8        9.0    1.0       9.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

X = data[feature_column]
Y = data[target_column]

numeric_feature = ['LapNumber','Stint']
categorical_feature = ['TyreLife','Compound','TrackStatus']

# pre-processing
pre_processing = ColumnTransformer(transformers=[('num','passthrough',numeric_feature), ("cat",OneHotEncoder(handle_unknown="ignore"),categorical_feature)])

# model defining
xgb = XGBRegressor(n_estimators=300,learning_rate=0.05,max_depth=6,subsample=0.8,colsample_bytree=0.8,random_state=42,nthread=-1)
model = Pipeline(steps=[("pre-processing",pre_processing),("model",xgb)])

print("NaNs in X:", X.isna().sum().sum())
print("NaNs in y:", Y.isna().sum())  # Must be 0
print("Infs in y:", np.isinf(Y).sum())  # Must be 0

# Check LapTime range (should be reasonable, like 80-120 seconds)
print("LapTime_s min/max:", Y.min(), Y.max())

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(Y_test,y_pred)
print(f"XGBoost MAE (s): {mae:.3f}")


NaNs in X: 0
NaNs in y: 0
Infs in y: 0
LapTime_s min/max: 93.365 133.563
XGBoost MAE (s): 0.822
