# Data Loading




In [16]:
import pandas as pd
import  numpy as np
from sklearn.model_selection import train_test_split , GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [17]:
df = pd.read_csv(r'C:\Users\bekho\OneDrive\Desktop\the ultimat study\F1_Prediction_2025\F1_RaceResult_2019_2025.csv')
df.drop(columns=["Driver Number"], inplace=True)
df.head(10)

Unnamed: 0,Year,Circuit Name,Driver Name,Team,Starting Grid,Final Position
0,2019,Australia,Valtteri Bottas,Mercedes,2.0,1
1,2019,Australia,Lewis Hamilton,Mercedes,1.0,2
2,2019,Australia,Max Verstappen,Red Bull Racing Honda,4.0,3
3,2019,Australia,Sebastian Vettel,Ferrari,3.0,4
4,2019,Australia,Charles Leclerc,Ferrari,5.0,5
5,2019,Australia,Kevin Magnussen,Haas Ferrari,7.0,6
6,2019,Australia,Nico Hulkenberg,Renault,11.0,7
7,2019,Australia,Kimi Raikkönen,Alfa Romeo Racing Ferrari,9.0,8
8,2019,Australia,Lance Stroll,Racing Point BWT Mercedes,16.0,9
9,2019,Australia,Daniil Kvyat,Scuderia Toro Rosso Honda,15.0,10


In [18]:
df.dropna(axis=0,inplace=True)
df.isnull().sum()

Year              0
Circuit Name      0
Driver Name       0
Team              0
Starting Grid     0
Final Position    0
dtype: int64

In [19]:
df = df[(df["Final Position"] != "DQ") & (df["Final Position"] != "NC")& (df["Final Position"] != "DNF") &  (df["Starting Grid"]!="Pit Lane")&  (df["Final Position"]!="DNS")]
df["Final Position"] = df["Final Position"].astype(int)
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 2520 entries, 0 to 2876
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Year            2520 non-null   int64 
 1   Circuit Name    2520 non-null   object
 2   Driver Name     2520 non-null   object
 3   Team            2520 non-null   object
 4   Starting Grid   2520 non-null   object
 5   Final Position  2520 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 137.8+ KB


Unnamed: 0,Year,Final Position
count,2520.0,2520.0
mean,2022.000794,9.338095
std,1.940971,5.183031
min,2019.0,1.0
25%,2020.0,5.0
50%,2022.0,9.0
75%,2024.0,14.0
max,2025.0,20.0


In [20]:
categorical_cols = ["Circuit Name", "Driver Name", "Team"]
numeric_cols = ["Year", "Starting Grid"]

In [21]:
categorical_transformer = OneHotEncoder(
    sparse_output=False, handle_unknown="ignore"
)

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_cols),
        ("num", numeric_transformer, numeric_cols),
    ]
)

In [22]:
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",  # regression objective
    n_estimators=200,
    random_state=42,
    learning_rate=0.1,
    gamma=0,
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", xgb_model)
])

help(xgb.XGBRegressor)

Help on class XGBRegressor in module xgboost.sklearn:

class XGBRegressor(sklearn.base.RegressorMixin, XGBModel)
 |  XGBRegressor(*, objective: Union[str, xgboost.sklearn._SklObjWProto, Callable[[Any, Any], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'reg:squarederror', **kwargs: Any) -> None
 |  
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  See :doc:`/python/sklearn_estimator` for more information.
 |  
 |  Parameters
 |  ----------
 |  
 |      n_estimators : typing.Optional[int]
 |          Number of gradient boosted trees.  Equivalent to number of boosting
 |          rounds.
 |  
 |      max_depth :  typing.Optional[int]
 |  
 |          Maximum tree depth for base learners.
 |  
 |      max_leaves : typing.Optional[int]
 |  
 |          Maximum number of leaves; 0 indicates no limit.
 |  
 |      max_bin : typing.Optional[int]
 |  
 |          If using histogram-based algorithm, maximum number of bins per feature
 |  
 |      grow_policy : typing.O

In [23]:
X = df.drop("Final Position", axis=1)
y = df["Final Position"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# Define explicit parameter grid
param_grid = {
    "regressor__n_estimators": [200, 300, 400, 500],      # number of trees
    "regressor__max_depth": [3, 5, 10],                  # tree depth
    "regressor__learning_rate": [0.01, 0.05, 0.1],   # learning rate
    "regressor__subsample": [0.5, 0.7, 1.0],              # subsample ratio
    "regressor__gamma": [0,0.3, 0.5],               # pruning
    "regressor__colsample_bytree": [0.5, 0.7, 1.0],       # features per tree
    "regressor__min_child_weight": [1, 3, 5]                # regularization
}

# Grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_absolute_error",
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Evaluate on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
error = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on test set: {error}")

Fitting 3 folds for each of 2916 candidates, totalling 8748 fits
Best parameters found: {'regressor__colsample_bytree': 0.5, 'regressor__gamma': 0.5, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 500, 'regressor__subsample': 0.7}
Best score: -2.325798749923706
Mean Absolute Error on test set: 2.3464913368225098


In [25]:
import joblib
# Save model
joblib.dump(best_model, "podium.pkl")

['podium.pkl']

In [26]:
df["Circuit Name"].unique()

array(['Australia', 'Bahrain', 'China', 'Azerbaijan', 'Spain', 'Monaco',
       'Canada', 'France', 'Austria', 'Great Britain', 'Germany',
       'Hungary', 'Belgium', 'Italy', 'Singapore', 'Russia', 'Japan',
       'Mexico', 'United States', 'Brazil', 'Abu Dhabi', 'Styria',
       '70th Anniversary', 'Tuscany', 'Eifel', 'Portugal',
       'Emilia Romagna', 'Turkey', 'Sakhir', 'Netherlands', 'Qatar',
       'Saudi Arabia', 'Miami', 'Las Vegas', 'Emilia-Romagna'],
      dtype=object)

In [27]:
df["Driver Name"].unique()

array(['Valtteri Bottas', 'Lewis Hamilton', 'Max Verstappen',
       'Sebastian Vettel', 'Charles Leclerc', 'Kevin Magnussen',
       'Nico Hulkenberg', 'Kimi Raikkönen', 'Lance Stroll',
       'Daniil Kvyat', 'Pierre Gasly', 'Lando Norris', 'Sergio Perez',
       'Alexander Albon', 'Antonio Giovinazzi', 'George Russell',
       'Robert Kubica', 'Daniel Ricciardo', 'Carlos Sainz',
       'Romain Grosjean', 'Esteban Ocon', 'Nicholas Latifi',
       'Jack Aitken', 'Pietro Fittipaldi', 'Yuki Tsunoda',
       'Mick Schumacher', 'Fernando Alonso', 'Nikita Mazepin',
       'Guanyu Zhou', 'Nyck De Vries', 'Logan Sargeant', 'Oscar Piastri',
       'Liam Lawson', 'Oliver Bearman', 'Franco Colapinto', 'Jack Doohan',
       'Kimi Antonelli', 'Isack Hadjar', 'Gabriel Bortoleto',
       'Alex Albon'], dtype=object)

In [28]:
df["Team"].unique() 

array(['Mercedes', 'Red Bull Racing Honda', 'Ferrari', 'Haas Ferrari',
       'Renault', 'Alfa Romeo Racing Ferrari',
       'Racing Point BWT Mercedes', 'Scuderia Toro Rosso Honda',
       'McLaren Renault', 'Williams Mercedes', 'AlphaTauri Honda',
       'McLaren Mercedes', 'Aston Martin Mercedes', 'Alpine Renault',
       'Alfa Romeo Ferrari', 'AlphaTauri RBPT',
       'Aston Martin Aramco Mercedes', 'Red Bull Racing RBPT',
       'Red Bull Racing Honda RBPT', 'AlphaTauri Honda RBPT',
       'Kick Sauber Ferrari', 'RB Honda RBPT', 'Racing Bulls Honda RBPT',
       'Sauber Ferrari', 'Red Bull Racing'], dtype=object)