In [109]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from models import init_db, Circuit, Season, RacingWeekend, Driver, Session, SessionResult, Lap


## Collect data into df

In [110]:
def create_dataframe():
    # initialize db connection and session
    db_engine, db_session = init_db()

    # query data from the database
    query = db_session.query(
        RacingWeekend.year,
        RacingWeekend.round,
        Circuit.circuit_name,
        Driver.driver_name,
        Driver.driver_short,
        Lap.lap_num,
        Lap.lap_time,
        Lap.tyre,
        Lap.pit,
        Session.session_type
    ).join(RacingWeekend.circuit) \
     .join(RacingWeekend.sessions) \
     .join(Session.laps) \
     .join(Lap.driver) \
     .join(RacingWeekend.season) \
     .all()

    # convert result to list of dicts
    data = []
    for row in query:
        data.append({
            'year': row.year,
            'round': row.round,
            'circuit_name': row.circuit_name,
            'driver_name': row.driver_name,
            'driver_short': row.driver_short,
            'lap_num': row.lap_num,
            'lap_time': row.lap_time,
            'tyre': row.tyre,
            'pit': row.pit,
            'session_type': row.session_type
        })

    # create dataframe
    df = pd.DataFrame(data)

    # encode 'tyre' and 'pit' as categories
    df['tyre'] = df['tyre'].astype('category')
    df['pit'] = df['pit'].astype('category')

    # one-hot encode categorical variables
    df = pd.get_dummies(df, columns=['circuit_name', 'driver_name', 'driver_short', 'tyre', 'session_type'], drop_first=True)

    return df

# create the dataframe and print it
df = create_dataframe()
print(df)


        year  round  lap_num  lap_time    pit  circuit_name_Baku  \
0       2019      1        2   135.190   True              False   
1       2019      1        3    88.612  False              False   
2       2019      1        4   113.581  False              False   
3       2019      1        5    86.424  False              False   
4       2019      1        6   123.898  False              False   
...      ...    ...      ...       ...    ...                ...   
290690  2024     24       19    90.415  False              False   
290691  2024     24       20    91.165  False              False   
290692  2024     24       21    90.684  False              False   
290693  2024     24       22   113.677  False              False   
290694  2024     24       23   127.299  False              False   

        circuit_name_Barcelona  circuit_name_Budapest  \
0                        False                  False   
1                        False                  False   
2           

In [111]:
# define features (X) and target (y)
X_class = df.drop(columns=['pit'])
y_class = df['pit']

# split the data into training and testing sets
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

# initialize the random forest classifier model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# train the model
rf_classifier.fit(X_train_class, y_train_class)

# predict on the test set
y_pred_class = rf_classifier.predict(X_test_class)

# evaluate the model
print(classification_report(y_test_class, y_pred_class))

# feature importance
feature_importances_class = rf_classifier.feature_importances_
print(f'Feature Importances: {feature_importances_class}')

# print predicted vs actual race strategy
for true, pred in zip(y_test_class, y_pred_class):
    print(f'True: {true}, Predicted: {pred}')


              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     55379
         1.0       0.76      0.33      0.46      2760

    accuracy                           0.96     58139
   macro avg       0.86      0.66      0.72     58139
weighted avg       0.96      0.96      0.96     58139

Feature Importances: [4.06782079e-02 5.08411197e-02 2.19032973e-01 3.29532212e-01
 3.77344225e-03 6.32277287e-03 5.60763488e-03 1.26943978e-03
 3.63387509e-03 1.55236643e-03 2.20803877e-03 1.23799717e-03
 7.18007380e-03 3.47031278e-03 2.14195589e-03 8.60271144e-03
 4.58757758e-03 1.81721564e-03 6.89930359e-03 3.40661627e-03
 5.91838490e-03 4.40722600e-03 4.01325623e-03 8.97048387e-04
 2.49401628e-03 4.67499914e-03 1.56876281e-03 3.93428683e-03
 1.00968111e-03 2.08315613e-03 4.38169727e-03 6.27582127e-03
 2.86083636e-03 5.97162634e-03 2.70698780e-03 4.29520029e-03
 2.92192059e-03 4.10325036e-06 1.76989966e-03 2.04273402e-06
 5.61082961e-06 3.20303000e-06 3.55743364e

In [112]:
# train on the full dataset but predict on one specific race

# filter for the specific race you're interested in
year = 2020 
round_num = 1 

# train on the full dataset
X_full = df.drop(columns=['pit'])
y_full = df['pit']

# split the dataset into training and testing sets
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

# initialize the random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# train the model on the entire dataset
rf_classifier.fit(X_train_full, y_train_full)

# filter the data for the specific race for predictions
df_single_race = df[(df['year'] == year) & (df['round'] == round_num)]

# define features (X) and target (y) for the single race prediction
X_single_race = df_single_race.drop(columns=['pit'])
y_single_race = df_single_race['pit']

# predict on the filtered race data
y_pred_single_race = rf_classifier.predict(X_single_race)

# model performance
print(f"Classification report for Race {year} Round {round_num}:")
print(classification_report(y_single_race, y_pred_single_race))


# print predicted vs actual for the single race
for true, pred in zip(y_single_race, y_pred_single_race):
    print(f'True: {true}, Predicted: {pred}')


Classification report for Race 2020 Round 1:
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      2691
         1.0       0.99      0.80      0.88       115

    accuracy                           0.99      2806
   macro avg       0.99      0.90      0.94      2806
weighted avg       0.99      0.99      0.99      2806

True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: True, Predicted: 1.0
True: False, Predicted: 1.0
True: False, Predicted: 0.0
True: False, Predicted: 0.0
True: False, 

In [113]:
driver = 'HAM'
race_year = 2020 
race_round = 1 

# filter for the specific race and driver
df_single_race_driver = df_single_race[  
    (df_single_race[f'session_type_Race'] == 1) &
    (df_single_race[f'driver_short_{driver}'] == 1) & 
    (df_single_race['year'] == race_year) & 
    (df_single_race['round'] == race_round)
]

# define features (X) and target (y) for the prediction
X_single_race_driver = df_single_race_driver.drop(columns=['pit'])
y_single_race_driver = df_single_race_driver['pit']

# predict on the filtered race data
y_pred_single_race_driver = rf_classifier.predict(X_single_race_driver)

# add the predicted pit values to the dataframe
df_single_race_driver['predicted_pit'] = y_pred_single_race_driver

# get the laps where the driver actually pitted
actual_pit_laps = df_single_race_driver[df_single_race_driver['pit'] == 1][['lap_num', 'pit']]

# get the laps where the driver was predicted to pit
predicted_pit_laps = df_single_race_driver[df_single_race_driver['predicted_pit'] == 1][['lap_num', 'predicted_pit']]

# print actual and predicted pit laps
print(f"Driver {driver} - Actual Pit Laps in Race {race_year} Round {race_round}:")
print(actual_pit_laps)

print(f"\nDriver {driver} - Predicted Pit Laps in Race {race_year} Round {race_round}:")
print(predicted_pit_laps)


Driver HAM - Actual Pit Laps in Race 2020 Round 1:
Empty DataFrame
Columns: [lap_num, pit]
Index: []

Driver HAM - Predicted Pit Laps in Race 2020 Round 1:
       lap_num  predicted_pit
53238       52            1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single_race_driver['predicted_pit'] = y_pred_single_race_driver


In [114]:
# check unique values of the pit column
print(df_single_race['pit'].unique())

# filter for specific race and driver
df_single_race_driver = df_single_race[  
    (df_single_race[f'session_type_Race'] == 1) & 
    (df_single_race['year'] == race_year) & 
    (df_single_race['round'] == race_round)
]

print(df_single_race_driver[['lap_num', 'pit']].head())

# define features and target
X_single_race_driver = df_single_race_driver.drop(columns=['pit'])
y_single_race_driver = df_single_race_driver['pit']

# predict on filtered data
y_pred_single_race_driver = rf_classifier.predict(X_single_race_driver)

# add predicted pit values
df_single_race_driver['predicted_pit'] = y_pred_single_race_driver

# get actual and predicted pit laps
actual_pit_laps = df_single_race_driver[df_single_race_driver['pit'] == 1][['lap_num', 'pit']]
predicted_pit_laps = df_single_race_driver[df_single_race_driver['predicted_pit'] == 1][['lap_num', 'predicted_pit']]

print(f"Driver {driver} - Actual Pit Laps in Race {race_year} Round {race_round}:")
print(actual_pit_laps)

print(f"\nDriver {driver} - Predicted Pit Laps in Race {race_year} Round {race_round}:")
print(predicted_pit_laps)


[False, True]
Categories (2, bool): [False, True]
       lap_num    pit
52675        1  False
52676        2  False
52677        3  False
52678        4  False
52679        5  False
Driver HAM - Actual Pit Laps in Race 2020 Round 1:
Empty DataFrame
Columns: [lap_num, pit]
Index: []

Driver HAM - Predicted Pit Laps in Race 2020 Round 1:
       lap_num  predicted_pit
52726       52            1.0
52762       22            1.0
52792       52            1.0
52828       22            1.0
52858       52            1.0
52928       22            1.0
52958       52            1.0
52990       22            1.0
53020       52            1.0
53096       52            1.0
53142       22            1.0
53167       47            1.0
53172       52            1.0
53238       52            1.0
53274       22            1.0
53340       22            1.0
53366       48            1.0
53370       52            1.0
53406       22            1.0
53431       47            1.0
53436       52            1.0
53

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single_race_driver['predicted_pit'] = y_pred_single_race_driver
