In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))
from DB.models import init_db, Circuit, Season, RacingWeekend, Driver, Session, SessionResult, Lap

In [None]:
def create_dataframe():
	# initialize db connection and session
	db_engine, db_session = init_db()

	# query data from the database
	query = db_session.query(
		RacingWeekend.year,
		RacingWeekend.round,
		Circuit.circuit_name,
		Driver.driver_name,
		Driver.driver_short,
		Lap.lap_num,
		Lap.lap_time,
		Lap.tyre,
		Lap.pit,
		Session.session_type
	).join(RacingWeekend.circuit) \
	 .join(RacingWeekend.sessions) \
	 .join(Session.laps) \
	 .join(Lap.driver) \
	 .join(RacingWeekend.season) \
	 .all()

	# convert result to list of dicts
	data = []
	for row in query:
		data.append({
			'year': row.year,
			'round': row.round,
			'circuit_name': row.circuit_name,
			'driver_name': row.driver_name,
			'driver_short': row.driver_short,
			'lap_num': row.lap_num,
			'lap_time': row.lap_time,
			'tyre': row.tyre,
			'pit': row.pit,
			'session_type': row.session_type
		})

	# create dataframe
	df = pd.DataFrame(data)
	
	# encode 'tyre' and 'pit' as categories
	df['tyre'] = df['tyre'].astype('category')

	# one-hot encode categorical variables
	df = pd.get_dummies(df, columns=['circuit_name', 'driver_name', 'driver_short', 'session_type'], drop_first=True)

	return df

# create the dataframe and print it
df = create_dataframe()


        year  round circuit_name      driver_name driver_short  lap_num  \
0       2019      1    Melbourne     Pierre Gasly          GAS        2   
1       2019      1    Melbourne     Pierre Gasly          GAS        3   
2       2019      1    Melbourne     Pierre Gasly          GAS        4   
3       2019      1    Melbourne     Pierre Gasly          GAS        5   
4       2019      1    Melbourne     Pierre Gasly          GAS        6   
...      ...    ...          ...              ...          ...      ...   
290690  2024     24   Yas Island  Valtteri Bottas          BOT       19   
290691  2024     24   Yas Island  Valtteri Bottas          BOT       20   
290692  2024     24   Yas Island  Valtteri Bottas          BOT       21   
290693  2024     24   Yas Island  Valtteri Bottas          BOT       22   
290694  2024     24   Yas Island  Valtteri Bottas          BOT       23   

        lap_time  tyre    pit session_type  
0        135.190     1   True   Practice 1  
1        

In [3]:
# split data into training (2019, 2020, 2021) and testing (2022)
train_data = df[df['year'].isin([2019, 2020, 2021])]
test_data = df[df['year'] == 2022]

# define features (X) and targets (y) for training
X_train = train_data.drop(columns=['pit', 'tyre', 'lap_time'])
y_train = train_data[['pit', 'tyre']]

# define features (X) and targets (y) for testing
X_test = test_data.drop(columns=['pit', 'tyre', 'lap_time'])
y_test = test_data[['pit', 'tyre']]

# train the multioutput random forest model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
multi_output_rf = MultiOutputClassifier(rf_classifier)
multi_output_rf.fit(X_train, y_train)

# predict on the entire 2022 season
y_pred_test = multi_output_rf.predict(X_test)

# add predictions to the test data
test_data['predicted_pit'] = y_pred_test[:, 0]
test_data['predicted_tyre'] = y_pred_test[:, 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_pit'] = y_pred_test[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_tyre'] = y_pred_test[:, 1]


In [4]:
# accuracy for both pit and tyre
accuracy_pit = accuracy_score(y_test['pit'], y_pred_test[:, 0])
accuracy_tyre = accuracy_score(y_test['tyre'], y_pred_test[:, 1])
print(f"Accuracy (Pit): {accuracy_pit:.2f}")
print(f"Accuracy (Tyre): {accuracy_tyre:.2f}")

# confusion matrix for both pit and tyre
conf_matrix_pit = confusion_matrix(y_test['pit'], y_pred_test[:, 0])
conf_matrix_tyre = confusion_matrix(y_test['tyre'], y_pred_test[:, 1])

print("\nConfusion Matrix (Pit):")
print(conf_matrix_pit)

print("\nConfusion Matrix (Tyre):")
print(conf_matrix_tyre)

# classification report for both pit and tyre
class_report_pit = classification_report(y_test['pit'], y_pred_test[:, 0])
class_report_tyre = classification_report(y_test['tyre'], y_pred_test[:, 1])

print("\nClassification Report (Pit):")
print(class_report_pit)

print("\nClassification Report (Tyre):")
print(class_report_tyre)


Accuracy (Pit): 0.95
Accuracy (Tyre): 0.49

Confusion Matrix (Pit):
[[46969   510]
 [ 2188   315]]

Confusion Matrix (Tyre):
[[    0   300   449   127     0     0]
 [  107 11086  3837  2380   300    43]
 [  291  4324  6932  4650   187    18]
 [   14  1645  3064  6396    26     0]
 [    0  1405   690   610   289    19]
 [    0   476   200   117     0     0]]

Classification Report (Pit):
              precision    recall  f1-score   support

       False       0.96      0.99      0.97     47479
        True       0.38      0.13      0.19      2503

    accuracy                           0.95     49982
   macro avg       0.67      0.56      0.58     49982
weighted avg       0.93      0.95      0.93     49982


Classification Report (Tyre):
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       876
           1       0.58      0.62      0.60     17753
           2       0.46      0.42      0.44     16402
           3       0.45      0.57  

In [5]:
# specify driver and race to analyze
driver = 'BOT'
race_year = 2022
race_round = 6

# filter test data for the specific race and driver
df_race_driver = test_data[(
	test_data['session_type_Race'] == 1) &  # race sessions only
	(test_data[f'driver_short_{driver}'] == 1) &  # driver filter (VER)
	(test_data['year'] == race_year) &  # year filter
	(test_data['round'] == race_round)  # round filter
]

# get the laps where the driver actually pitted
actual_pit_laps = df_race_driver[df_race_driver['pit'] == 1][['lap_num', 'pit', 'tyre']]

# get the laps where the model predicted a pit stop
predicted_pit_laps = df_race_driver[df_race_driver['predicted_pit'] == 1][['lap_num', 'predicted_pit', 'predicted_tyre']]

# define tyre mapping
tyre_mapping = {
	1: 'SOFT',
	2: 'MEDIUM',
	3: 'HARD',
	4: 'INTERMEDIATE',
	5: 'WET',
}

# function to decode tyre value from integer
def decode_tyres(value):
	return tyre_mapping.get(value, None)  # return None if no matching tyre value

# apply the decoding function to actual and predicted pit laps
actual_pit_laps['decoded_tyre'] = actual_pit_laps['tyre'].apply(decode_tyres)
predicted_pit_laps['decoded_tyre'] = predicted_pit_laps['predicted_tyre'].apply(decode_tyres)

# print actual and predicted pit stops
print(f"Driver {driver} - Actual Pit Stops in Race {race_year} Round {race_round}:")
print(actual_pit_laps[['lap_num', 'pit', 'decoded_tyre']])

print(f"\nDriver {driver} - Predicted Pit Stops in Race {race_year} Round {race_round}:")
print(predicted_pit_laps[['lap_num', 'predicted_pit', 'decoded_tyre']])


Driver BOT - Actual Pit Stops in Race 2022 Round 6:
        lap_num   pit decoded_tyre
155055       15  True       MEDIUM
155075       35  True       MEDIUM

Driver BOT - Predicted Pit Stops in Race 2022 Round 6:
        lap_num  predicted_pit decoded_tyre
155064       24            1.0       MEDIUM


In [None]:
# Define the race and driver to analyze
driver = 'BOT'
race_year = 2022
race_round = 6

# Filter training data: all races up to the selected round and practice/qualifying of the selected round
train_data = df[
	(df['year'] < race_year) |  # all years before the selected year
	((df['year'] == race_year) & (
		(df['round'] < race_round) |  # rounds before the selected round in the same year
		((df['round'] == race_round) & (df['session_type_Race'] == 0))  # practice/qualifying of the selected round
	))
]

# Filter testing data: race session of the selected round
test_data = df[
	(df['year'] == race_year) & 
	(df['round'] == race_round) & 
	(df['session_type_Race'] == 1)  # race session only
]

# Define features (X) and targets (y) for training
X_train = train_data.drop(columns=['pit', 'tyre', 'lap_time'])
y_train = train_data[['pit', 'tyre']]

# Define features (X) for testing
X_test = test_data.drop(columns=['pit', 'tyre', 'lap_time'])

# Train the multioutput random forest model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
multi_output_rf = MultiOutputClassifier(rf_classifier)
multi_output_rf.fit(X_train, y_train)

# Predict on the race session of the selected round
y_pred_test = multi_output_rf.predict(X_test)

# Add predictions to the test data
test_data['predicted_pit'] = y_pred_test[:, 0]
test_data['predicted_tyre'] = y_pred_test[:, 1]

# Filter test data for the specific race and driver
df_race_driver = test_data[
	(test_data[f'driver_short_{driver}'] == 1)  # filter by driver
]

# Get the laps where the driver actually pitted
actual_pit_laps = df_race_driver[df_race_driver['pit'] == 1][['lap_num', 'pit', 'tyre']]

# Get the laps where the model predicted a pit stop
predicted_pit_laps = df_race_driver[df_race_driver['predicted_pit'] == 1][['lap_num', 'predicted_pit', 'predicted_tyre']]

# Define tyre mapping
tyre_mapping = {
	1: 'SOFT',
	2: 'MEDIUM',
	3: 'HARD',
	4: 'INTERMEDIATE',
	5: 'WET',
}

# Function to decode tyre value from integer
def decode_tyres(value):
	return tyre_mapping.get(value, None)  # return None if no matching tyre value

# Apply the decoding function to actual and predicted pit laps
actual_pit_laps['decoded_tyre'] = actual_pit_laps['tyre'].apply(decode_tyres)
predicted_pit_laps['decoded_tyre'] = predicted_pit_laps['predicted_tyre'].apply(decode_tyres)

# Print actual and predicted pit stops
print(f"Driver {driver} - Actual Pit Stops in Race {race_year} Round {race_round}:")
print(actual_pit_laps[['lap_num', 'pit', 'decoded_tyre']])

print(f"\nDriver {driver} - Predicted Pit Stops in Race {race_year} Round {race_round}:")
print(predicted_pit_laps[['lap_num', 'predicted_pit', 'decoded_tyre']])
