In [1]:
import numpy as np
import pandas
from f1winnerprediction import (
   config, 
	io_fastf1
)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score, roc_curve, auc, mean_absolute_error, r2_score, accuracy_score, f1_score
import fastf1
import fastf1.core
import xgboost as xgb

fastf1.Cache.enable_cache(config.FASTF1_RAW_CACHE_DIR)

# Load sessions

In [2]:
sessions: dict[int, list[fastf1.core.Session]] = io_fastf1.load_sessions_from_years()
sessions

{2021: [2021 Season Round 1: Bahrain Grand Prix - Race,
  2021 Season Round 2: Emilia Romagna Grand Prix - Race,
  2021 Season Round 3: Portuguese Grand Prix - Race,
  2021 Season Round 4: Spanish Grand Prix - Race,
  2021 Season Round 5: Monaco Grand Prix - Race,
  2021 Season Round 6: Azerbaijan Grand Prix - Race,
  2021 Season Round 7: French Grand Prix - Race,
  2021 Season Round 8: Styrian Grand Prix - Race,
  2021 Season Round 9: Austrian Grand Prix - Race,
  2021 Season Round 10: British Grand Prix - Race,
  2021 Season Round 11: Hungarian Grand Prix - Race,
  2021 Season Round 12: Belgian Grand Prix - Race,
  2021 Season Round 13: Dutch Grand Prix - Race,
  2021 Season Round 14: Italian Grand Prix - Race,
  2021 Season Round 15: Russian Grand Prix - Race,
  2021 Season Round 16: Turkish Grand Prix - Race,
  2021 Season Round 17: United States Grand Prix - Race,
  2021 Season Round 18: Mexico City Grand Prix - Race,
  2021 Season Round 19: São Paulo Grand Prix - Race,
  2021 Sea

In [3]:
driver_mapping = io_fastf1.build_drivers_dict(sessions)
driver_mapping

{'RUS': {'index': 19},
 'VER': {'index': 19},
 'BOT': {'index': 23},
 'VET': {'index': 21},
 'LAT': {'index': 21},
 'ALO': {'index': 19},
 'STR': {'index': 19},
 'HAM': {'index': 19},
 'LEC': {'index': 19},
 'NOR': {'index': 19},
 'MSC': {'index': 21},
 'PER': {'index': 23},
 'GAS': {'index': 19},
 'RAI': {'index': 21},
 'RIC': {'index': 17},
 'SAI': {'index': 19},
 'TSU': {'index': 19},
 'OCO': {'index': 19},
 'MAZ': {'index': 21},
 'GIO': {'index': 21},
 'KUB': {'index': 13},
 'ALB': {'index': 19},
 'ZHO': {'index': 23},
 'MAG': {'index': 23},
 'HUL': {'index': 19},
 'DEV': {'index': 9},
 'PIA': {'index': 19},
 'SAR': {'index': 14},
 'LAW': {'index': 19},
 'BEA': {'index': 19},
 'COL': {'index': 19},
 'DOO': {'index': 5},
 'HAD': {'index': 19},
 'BOR': {'index': 19},
 'ANT': {'index': 19}}

# Prepare raceonly, qulionly and qualirace DataFrames

In [4]:
df_raceonly, df_qualionly, df_qualirace = io_fastf1.aggregate_driver_race__quali_results(sessions, driver_mapping)

In [5]:
df_raceonly

Unnamed: 0,Abbreviation,Bahrain Grand Prix_2021_Position,Emilia Romagna Grand Prix_2021_Position,Portuguese Grand Prix_2021_Position,Spanish Grand Prix_2021_Position,Monaco Grand Prix_2021_Position,Azerbaijan Grand Prix_2021_Position,French Grand Prix_2021_Position,Styrian Grand Prix_2021_Position,Austrian Grand Prix_2021_Position,...,Dutch Grand Prix_2025_Position,Italian Grand Prix_2025_Position,Azerbaijan Grand Prix_2025_Position,Singapore Grand Prix_2025_Position,United States Grand Prix_2025_Position,Mexico City Grand Prix_2025_Position,São Paulo Grand Prix_2025_Position,Las Vegas Grand Prix_2025_Position,Qatar Grand Prix_2025_Position,Abu Dhabi Grand Prix_2025_Position
0,RUS,14.0,19.0,16.0,14.0,14.0,17.0,12.0,19.0,11.0,...,4.0,5.0,2.0,1.0,6.0,7.0,,,,
1,VER,2.0,1.0,2.0,2.0,1.0,18.0,1.0,1.0,1.0,...,2.0,1.0,1.0,2.0,1.0,3.0,,,,
2,BOT,3.0,18.0,3.0,3.0,19.0,12.0,4.0,3.0,2.0,...,,,,,,,,,,
3,VET,15.0,15.0,13.0,13.0,5.0,2.0,9.0,12.0,17.0,...,,,,,,,,,,
4,LAT,18.0,20.0,18.0,16.0,15.0,16.0,18.0,17.0,16.0,...,,,,,,,,,,
5,ALO,19.0,10.0,8.0,17.0,13.0,6.0,8.0,9.0,10.0,...,8.0,19.0,15.0,7.0,10.0,18.0,,,,
6,STR,10.0,8.0,14.0,11.0,8.0,19.0,10.0,8.0,13.0,...,7.0,18.0,17.0,13.0,12.0,14.0,,,,
7,HAM,1.0,2.0,1.0,1.0,7.0,15.0,2.0,2.0,4.0,...,20.0,6.0,8.0,8.0,4.0,8.0,,,,
8,LEC,6.0,4.0,6.0,4.0,20.0,4.0,16.0,7.0,8.0,...,19.0,4.0,9.0,6.0,3.0,2.0,,,,
9,NOR,4.0,3.0,5.0,8.0,3.0,5.0,5.0,5.0,3.0,...,18.0,2.0,7.0,3.0,2.0,1.0,,,,


In [6]:
df_qualionly

Unnamed: 0,Abbreviation,Bahrain Grand Prix_2021_GridPosition,Emilia Romagna Grand Prix_2021_GridPosition,Portuguese Grand Prix_2021_GridPosition,Spanish Grand Prix_2021_GridPosition,Monaco Grand Prix_2021_GridPosition,Azerbaijan Grand Prix_2021_GridPosition,French Grand Prix_2021_GridPosition,Styrian Grand Prix_2021_GridPosition,Austrian Grand Prix_2021_GridPosition,...,Dutch Grand Prix_2025_GridPosition,Italian Grand Prix_2025_GridPosition,Azerbaijan Grand Prix_2025_GridPosition,Singapore Grand Prix_2025_GridPosition,United States Grand Prix_2025_GridPosition,Mexico City Grand Prix_2025_GridPosition,São Paulo Grand Prix_2025_GridPosition,Las Vegas Grand Prix_2025_GridPosition,Qatar Grand Prix_2025_GridPosition,Abu Dhabi Grand Prix_2025_GridPosition
0,RUS,15.0,12.0,11.0,15.0,15.0,15.0,14.0,10.0,8.0,...,5.0,5.0,5.0,1.0,4.0,4.0,,,,
1,VER,1.0,3.0,3.0,2.0,2.0,3.0,1.0,1.0,1.0,...,3.0,1.0,1.0,2.0,1.0,5.0,,,,
2,BOT,3.0,8.0,1.0,3.0,3.0,10.0,3.0,5.0,5.0,...,,,,,,,,,,
3,VET,20.0,0.0,10.0,13.0,8.0,11.0,12.0,14.0,11.0,...,,,,,,,,,,
4,LAT,17.0,14.0,18.0,19.0,18.0,16.0,16.0,16.0,18.0,...,,,,,,,,,,
5,ALO,9.0,15.0,13.0,10.0,17.0,8.0,9.0,8.0,14.0,...,10.0,8.0,11.0,10.0,10.0,14.0,,,,
6,STR,10.0,10.0,17.0,11.0,13.0,19.0,19.0,9.0,9.0,...,19.0,16.0,14.0,15.0,19.0,19.0,,,,
7,HAM,2.0,1.0,2.0,1.0,7.0,2.0,2.0,2.0,4.0,...,7.0,10.0,12.0,6.0,5.0,3.0,,,,
8,LEC,4.0,4.0,8.0,4.0,1.0,1.0,7.0,7.0,12.0,...,6.0,4.0,10.0,7.0,3.0,2.0,,,,
9,NOR,7.0,7.0,7.0,9.0,5.0,9.0,8.0,3.0,2.0,...,2.0,2.0,7.0,5.0,2.0,1.0,,,,


In [7]:
df_qualirace

Unnamed: 0,Abbreviation,Bahrain Grand Prix_2021_GridPosition,Bahrain Grand Prix_2021_Position,Emilia Romagna Grand Prix_2021_GridPosition,Emilia Romagna Grand Prix_2021_Position,Portuguese Grand Prix_2021_GridPosition,Portuguese Grand Prix_2021_Position,Spanish Grand Prix_2021_GridPosition,Spanish Grand Prix_2021_Position,Monaco Grand Prix_2021_GridPosition,...,Mexico City Grand Prix_2025_GridPosition,Mexico City Grand Prix_2025_Position,São Paulo Grand Prix_2025_GridPosition,São Paulo Grand Prix_2025_Position,Las Vegas Grand Prix_2025_GridPosition,Las Vegas Grand Prix_2025_Position,Qatar Grand Prix_2025_GridPosition,Qatar Grand Prix_2025_Position,Abu Dhabi Grand Prix_2025_GridPosition,Abu Dhabi Grand Prix_2025_Position
0,RUS,15.0,14.0,12.0,19.0,11.0,16.0,15.0,14.0,15.0,...,4.0,7.0,,,,,,,,
1,VER,1.0,2.0,3.0,1.0,3.0,2.0,2.0,2.0,2.0,...,5.0,3.0,,,,,,,,
2,BOT,3.0,3.0,8.0,18.0,1.0,3.0,3.0,3.0,3.0,...,,,,,,,,,,
3,VET,20.0,15.0,0.0,15.0,10.0,13.0,13.0,13.0,8.0,...,,,,,,,,,,
4,LAT,17.0,18.0,14.0,20.0,18.0,18.0,19.0,16.0,18.0,...,,,,,,,,,,
5,ALO,9.0,19.0,15.0,10.0,13.0,8.0,10.0,17.0,17.0,...,14.0,18.0,,,,,,,,
6,STR,10.0,10.0,10.0,8.0,17.0,14.0,11.0,11.0,13.0,...,19.0,14.0,,,,,,,,
7,HAM,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,7.0,...,3.0,8.0,,,,,,,,
8,LEC,4.0,6.0,4.0,4.0,8.0,6.0,4.0,4.0,1.0,...,2.0,2.0,,,,,,,,
9,NOR,7.0,4.0,7.0,3.0,7.0,5.0,9.0,8.0,5.0,...,1.0,1.0,,,,,,,,


In [8]:
df_raceonly[df_raceonly.isin([0])].dropna()

Unnamed: 0,Abbreviation,Bahrain Grand Prix_2021_Position,Emilia Romagna Grand Prix_2021_Position,Portuguese Grand Prix_2021_Position,Spanish Grand Prix_2021_Position,Monaco Grand Prix_2021_Position,Azerbaijan Grand Prix_2021_Position,French Grand Prix_2021_Position,Styrian Grand Prix_2021_Position,Austrian Grand Prix_2021_Position,...,Dutch Grand Prix_2025_Position,Italian Grand Prix_2025_Position,Azerbaijan Grand Prix_2025_Position,Singapore Grand Prix_2025_Position,United States Grand Prix_2025_Position,Mexico City Grand Prix_2025_Position,São Paulo Grand Prix_2025_Position,Las Vegas Grand Prix_2025_Position,Qatar Grand Prix_2025_Position,Abu Dhabi Grand Prix_2025_Position


In [9]:
df_qualionly[df_qualionly.isin([0])].dropna()

Unnamed: 0,Abbreviation,Bahrain Grand Prix_2021_GridPosition,Emilia Romagna Grand Prix_2021_GridPosition,Portuguese Grand Prix_2021_GridPosition,Spanish Grand Prix_2021_GridPosition,Monaco Grand Prix_2021_GridPosition,Azerbaijan Grand Prix_2021_GridPosition,French Grand Prix_2021_GridPosition,Styrian Grand Prix_2021_GridPosition,Austrian Grand Prix_2021_GridPosition,...,Dutch Grand Prix_2025_GridPosition,Italian Grand Prix_2025_GridPosition,Azerbaijan Grand Prix_2025_GridPosition,Singapore Grand Prix_2025_GridPosition,United States Grand Prix_2025_GridPosition,Mexico City Grand Prix_2025_GridPosition,São Paulo Grand Prix_2025_GridPosition,Las Vegas Grand Prix_2025_GridPosition,Qatar Grand Prix_2025_GridPosition,Abu Dhabi Grand Prix_2025_GridPosition


In [10]:
df_qualirace[df_qualirace.isin([0])].dropna()

Unnamed: 0,Abbreviation,Bahrain Grand Prix_2021_GridPosition,Bahrain Grand Prix_2021_Position,Emilia Romagna Grand Prix_2021_GridPosition,Emilia Romagna Grand Prix_2021_Position,Portuguese Grand Prix_2021_GridPosition,Portuguese Grand Prix_2021_Position,Spanish Grand Prix_2021_GridPosition,Spanish Grand Prix_2021_Position,Monaco Grand Prix_2021_GridPosition,...,Mexico City Grand Prix_2025_GridPosition,Mexico City Grand Prix_2025_Position,São Paulo Grand Prix_2025_GridPosition,São Paulo Grand Prix_2025_Position,Las Vegas Grand Prix_2025_GridPosition,Las Vegas Grand Prix_2025_Position,Qatar Grand Prix_2025_GridPosition,Qatar Grand Prix_2025_Position,Abu Dhabi Grand Prix_2025_GridPosition,Abu Dhabi Grand Prix_2025_Position


# Aggregate data and create windows

In [11]:
df_windows_raceonly = io_fastf1.create_columns_windows_raceonly(df_raceonly)
df_windows_raceonly

Skipping window window_106 due to NaN at position(s) [4], jumping to window_index 110
Skipping window window_106 due to NaN at position(s) [4], jumping to window_index 110
Skipping window window_86 due to NaN at position(s) [4], jumping to window_index 90
Skipping window window_91 due to NaN at position(s) [0 1 2 3 4], jumping to window_index 95
Skipping window window_96 due to NaN at position(s) [0 1 2 3 4], jumping to window_index 100
Skipping window window_101 due to NaN at position(s) [0 1 2 3 4], jumping to window_index 105
Skipping window window_106 due to NaN at position(s) [0 1 2 3 4], jumping to window_index 110
Skipping window window_18 due to NaN at position(s) [4], jumping to window_index 22
Skipping window window_23 due to NaN at position(s) [0], jumping to window_index 23
Skipping window window_40 due to NaN at position(s) [4], jumping to window_index 44
Skipping window window_45 due to NaN at position(s) [0 1 2 3 4], jumping to window_index 49
Skipping window window_50 d

Unnamed: 0,race0,race1,race2,race3,race4
0,14.0,19.0,16.0,14.0,14.0
1,19.0,16.0,14.0,14.0,17.0
2,16.0,14.0,14.0,17.0,12.0
3,14.0,14.0,17.0,12.0,19.0
4,14.0,17.0,12.0,19.0,11.0
...,...,...,...,...,...
1990,16.0,16.0,10.0,16.0,9.0
1991,16.0,10.0,16.0,9.0,4.0
1992,10.0,16.0,9.0,4.0,5.0
1993,16.0,9.0,4.0,5.0,13.0


In [12]:
df_windows_qualirace = io_fastf1.create_columns_windows_race_quali(df_qualirace)
df_windows_qualirace

Skipping window window_106 due to NaN before position 9, jumping to window_index 115
Skipping window window_106 due to NaN before position 9, jumping to window_index 115
Skipping window window_86 due to NaN before position 9, jumping to window_index 95
Skipping window window_96 due to NaN before position 9, jumping to window_index 105
Skipping window window_106 due to NaN before position 9, jumping to window_index 115
Skipping window window_18 due to NaN before position 9, jumping to window_index 27
Skipping window window_40 due to NaN before position 9, jumping to window_index 49
Skipping window window_50 due to NaN before position 9, jumping to window_index 59
Skipping window window_60 due to NaN before position 9, jumping to window_index 69
Skipping window window_70 due to NaN before position 9, jumping to window_index 79
Skipping window window_80 due to NaN before position 9, jumping to window_index 89
Skipping window window_90 due to NaN before position 9, jumping to window_index 

Unnamed: 0,quali0,race0,quali1,race1,quali2,race2,quali3,race3,quali4,race4
0,15.0,14.0,12.0,19.0,11.0,16.0,15.0,14.0,15.0,14.0
1,12.0,19.0,11.0,16.0,15.0,14.0,15.0,14.0,15.0,17.0
2,11.0,16.0,15.0,14.0,15.0,14.0,15.0,17.0,14.0,12.0
3,15.0,14.0,15.0,14.0,15.0,17.0,14.0,12.0,10.0,19.0
4,15.0,14.0,15.0,17.0,14.0,12.0,10.0,19.0,8.0,11.0
...,...,...,...,...,...,...,...,...,...,...
1915,10.0,16.0,19.0,16.0,15.0,10.0,11.0,16.0,6.0,9.0
1916,19.0,16.0,15.0,10.0,11.0,16.0,6.0,9.0,4.0,4.0
1917,15.0,10.0,11.0,16.0,6.0,9.0,4.0,4.0,4.0,5.0
1918,11.0,16.0,6.0,9.0,4.0,4.0,4.0,5.0,7.0,13.0


# Train Models

## Prepare Data for model training

In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def data_label_split(df_windows: pd.DataFrame):
	X = df_windows.iloc[:, :-1]
	y = df_windows.iloc[:, -1]
	return X, y

def normalize_features(X: pd.DataFrame):
   scaler = StandardScaler()
   X_normalized = scaler.fit_transform(X)
   return X_normalized

## XGBoost Models

### XGBoost Regressor

#### Race only

In [58]:
X, y = data_label_split(df_windows_raceonly)
X_normalized = normalize_features(X)

In [59]:
y.value_counts().sort_index()

race4
1.0     105
2.0     106
3.0     105
4.0     105
5.0     102
6.0     104
7.0     100
8.0      98
9.0      99
10.0    100
11.0     94
12.0     98
13.0     94
14.0     95
15.0     96
16.0     97
17.0     96
18.0    103
19.0    102
20.0     96
Name: count, dtype: int64

In [60]:
y = y - 1

In [61]:
y.value_counts().sort_index()

race4
0.0     105
1.0     106
2.0     105
3.0     105
4.0     102
5.0     104
6.0     100
7.0      98
8.0      99
9.0     100
10.0     94
11.0     98
12.0     94
13.0     95
14.0     96
15.0     97
16.0     96
17.0    103
18.0    102
19.0     96
Name: count, dtype: int64

In [69]:
X_train, X_test, y_train, y_test = train_test_split(
	 X_normalized, y, test_size=0.2, stratify=y
)

In [70]:
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,

)

In [71]:
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [72]:
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MAE: 3.8957786560058594
R² Score: 0.24843478202819824


In [118]:
def train():
	X_train, X_test, y_train, y_test = train_test_split(
	 X_normalized, y, test_size=0.2, stratify=y
	)
	model = xgb.XGBRegressor(
	 n_estimators=100,
	 learning_rate=0.02,
	 max_depth=12,
	 subsample=0.8,
	 colsample_bytree=0.8,

	)
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("MAE:", mean_absolute_error(y_test, y_pred))
	print("R² Score:", r2_score(y_test, y_pred))
train()

MAE: 4.098126411437988
R² Score: 0.2586515545845032


In [None]:
from sklearn.model_selection import GridSearchCV
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
# Note: This is a small grid for demonstration. A real search would have more values.
param_grid = {
    'max_depth': range(3, 24),
    'n_estimators': range(50, 300, 10),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# scoring='accuracy' is the metric to optimize
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

# 6. Print the best parameters found
print("Best hyperparameters found:")
print(grid_search.best_params_)

# 7. Get the best model and evaluate it on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nAccuracy on test set with best model: {accuracy:.4f}")

Fitting 5 folds for each of 8400 candidates, totalling 42000 fits


#### Quali Race

In [57]:
X, y = data_label_split(df_windows_qualirace)
X_normalized = normalize_features(X)

In [18]:
y = y -1

In [19]:
y.value_counts().sort_index()

race4
0.0     105
1.0     106
2.0     103
3.0     105
4.0      99
5.0     101
6.0      97
7.0      97
8.0      97
9.0      95
10.0     91
11.0     92
12.0     90
13.0     89
14.0     89
15.0     93
16.0     91
17.0     98
18.0     93
19.0     89
Name: count, dtype: int64

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
	 X_normalized, y, test_size=0.2, stratify=y
)

In [21]:
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,

)

In [22]:
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MAE: 3.2987377643585205
R² Score: 0.44992828369140625
