In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df=pd.read_csv("../data/f1_2023_engineered.csv")

df.head()

Unnamed: 0,number,position,positionText,points,grid,laps,status,Driver.driverId,Driver.permanentNumber,Driver.code,...,podium,constructor_cum_points,driver_avg_last5,driver_podiums_last5,driver_best_last5,driver_worst_last5,teammate_grid,teammate_position,grid_delta_vs_teammate,position_delta_vs_teammate
0,77,8,8,4.0,12,57,Finished,bottas,77,BOT,...,0,0.0,3.2,4.0,3.0,4.0,13,16,-1,-8
1,24,16,16,0.0,13,56,Lapped,zhou,24,ZHO,...,0,4.0,1.4,5.0,1.0,2.0,12,8,1,8
2,22,11,11,0.0,14,57,Finished,tsunoda,22,TSU,...,0,4.0,9.8,0.0,4.0,20.0,19,14,-5,-3
3,21,14,14,0.0,19,56,Lapped,de_vries,21,DEV,...,0,0.0,13.6,0.0,8.0,18.0,14,11,5,3
4,31,18,R,0.0,9,41,Retired,ocon,31,OCO,...,0,2.0,13.2,0.0,6.0,17.0,20,9,-11,9


In [2]:
numeric_features = [
    'grid',
    'constructor_cum_points',
    'driver_avg_last5',
    'driver_podiums_last5',
    'driver_best_last5',
    'driver_worst_last5',
    'grid_delta_vs_teammate'   # optional
]

categorical_features = [
    'Driver.code',
    'Constructor.name'
]

target = 'podium'

df[numeric_features + categorical_features + [target]].head()

Unnamed: 0,grid,constructor_cum_points,driver_avg_last5,driver_podiums_last5,driver_best_last5,driver_worst_last5,grid_delta_vs_teammate,Driver.code,Constructor.name,podium
0,12,0.0,3.2,4.0,3.0,4.0,-1,BOT,Alfa Romeo,0
1,13,4.0,1.4,5.0,1.0,2.0,1,ZHO,Alfa Romeo,0
2,14,4.0,9.8,0.0,4.0,20.0,-5,TSU,AlphaTauri,0
3,19,0.0,13.6,0.0,8.0,18.0,5,DEV,AlphaTauri,0
4,9,2.0,13.2,0.0,6.0,17.0,-11,OCO,Alpine F1 Team,0


In [3]:
X = df[numeric_features + categorical_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

X_train.shape, X_test.shape

((75, 9), (25, 9))

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical imputer (median works very well for skewed racing data)
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median"))
    ]
)

# Categorical transformer
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [5]:
from sklearn.linear_model import LogisticRegression

log_reg_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", LogisticRegression(max_iter=1000))
    ]
)

log_reg_model.fit(X_train, y_train)

y_pred = log_reg_model.predict(X_test)

print("LOGISTIC REGRESSION ACCURACY:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

LOGISTIC REGRESSION ACCURACY: 0.96

Confusion Matrix:
 [[21  0]
 [ 1  3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.75      0.86         4

    accuracy                           0.96        25
   macro avg       0.98      0.88      0.92        25
weighted avg       0.96      0.96      0.96        25



In [6]:
from sklearn.ensemble import RandomForestClassifier

rf_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=300,
            max_depth=12,
            min_samples_split=4,
            random_state=42
        ))
    ]
)

rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

print("RF ACCURACY:", accuracy_score(y_test, rf_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

RF ACCURACY: 0.96

Confusion Matrix:
 [[21  0]
 [ 1  3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.75      0.86         4

    accuracy                           0.96        25
   macro avg       0.98      0.88      0.92        25
weighted avg       0.96      0.96      0.96        25



In [18]:
import joblib

joblib.dump(rf, "../model/f1_podium_predictor.pkl")

['../model/f1_podium_predictor.pkl']

In [32]:
df = pd.read_csv("../data/f1_2023_engineered.csv")
df.head()

Unnamed: 0,number,position,positionText,points,grid,laps,status,Driver.driverId,Driver.permanentNumber,Driver.code,...,podium,constructor_cum_points,driver_avg_last5,driver_podiums_last5,driver_best_last5,driver_worst_last5,teammate_grid,teammate_position,grid_delta_vs_teammate,position_delta_vs_teammate
0,77,8,8,4.0,12,57,Finished,bottas,77,BOT,...,0,0.0,3.2,4.0,3.0,4.0,13,16,-1,-8
1,24,16,16,0.0,13,56,Lapped,zhou,24,ZHO,...,0,4.0,1.4,5.0,1.0,2.0,12,8,1,8
2,22,11,11,0.0,14,57,Finished,tsunoda,22,TSU,...,0,4.0,9.8,0.0,4.0,20.0,19,14,-5,-3
3,21,14,14,0.0,19,56,Lapped,de_vries,21,DEV,...,0,0.0,13.6,0.0,8.0,18.0,14,11,5,3
4,31,18,R,0.0,9,41,Retired,ocon,31,OCO,...,0,2.0,13.2,0.0,6.0,17.0,20,9,-11,9


In [33]:
df.columns.tolist()

['number',
 'position',
 'positionText',
 'points',
 'grid',
 'laps',
 'status',
 'Driver.driverId',
 'Driver.permanentNumber',
 'Driver.code',
 'Driver.url',
 'Driver.givenName',
 'Driver.familyName',
 'Driver.dateOfBirth',
 'Driver.nationality',
 'Constructor.constructorId',
 'Constructor.url',
 'Constructor.name',
 'Constructor.nationality',
 'Time.millis',
 'Time.time',
 'FastestLap.rank',
 'FastestLap.lap',
 'FastestLap.Time.time',
 'FastestLap.AverageSpeed.units',
 'FastestLap.AverageSpeed.speed',
 'raceName',
 'round',
 'season',
 'podium',
 'constructor_cum_points',
 'driver_avg_last5',
 'driver_podiums_last5',
 'driver_best_last5',
 'driver_worst_last5',
 'teammate_grid',
 'teammate_position',
 'grid_delta_vs_teammate',
 'position_delta_vs_teammate']

In [34]:
df.corr(numeric_only=True)['podium'].sort_values(ascending=False)

podium                           1.000000e+00
points                           8.365340e-01
driver_podiums_last5             4.011365e-01
constructor_cum_points           2.597422e-01
laps                             1.348881e-01
FastestLap.lap                   1.219636e-01
FastestLap.AverageSpeed.speed    7.095881e-02
Time.millis                      5.044597e-02
round                           -1.294713e-17
Driver.permanentNumber          -1.536834e-01
grid_delta_vs_teammate          -2.004698e-01
position_delta_vs_teammate      -2.571172e-01
teammate_grid                   -2.938356e-01
number                          -3.130686e-01
teammate_position               -3.666874e-01
driver_best_last5               -3.695693e-01
driver_worst_last5              -4.471252e-01
driver_avg_last5                -4.471252e-01
grid                            -5.123911e-01
FastestLap.rank                 -5.129317e-01
position                        -6.192404e-01
season                            