In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))
from DB.models import init_db, Circuit, Season, RacingWeekend, Driver, Session, SessionResult, Lap

In [2]:
def create_dataframe():
    # initialize db connection and session
    db_engine, db_session = init_db()

    # query data from the database
    query = db_session.query(
        RacingWeekend.year,
        RacingWeekend.round,
        Circuit.circuit_name,
        Driver.driver_name,
        Driver.driver_short,
        Lap.lap_num,
        Lap.lap_time,
        Lap.tyre,
        Lap.pit,
        Session.session_type
    ).join(RacingWeekend.circuit) \
     .join(RacingWeekend.sessions) \
     .join(Session.laps) \
     .join(Lap.driver) \
     .join(RacingWeekend.season) \
     .all()

    # convert result to list of dicts
    data = []
    for row in query:
        data.append({
            'year': row.year,
            'round': row.round,
            'circuit_name': row.circuit_name,
            'driver_name': row.driver_name,
            'driver_short': row.driver_short,
            'lap_num': row.lap_num,
            'lap_time': row.lap_time,
            'tyre': row.tyre,
            'pit': row.pit,
            'session_type': row.session_type
        })

    # create dataframe
    df = pd.DataFrame(data)

    # encode 'tyre' and 'pit' as categories
    df['tyre'] = df['tyre'].astype('category')

    # one-hot encode categorical variables
    df = pd.get_dummies(df, columns=['circuit_name', 'driver_name', 'driver_short', 'session_type'], drop_first=True)

    return df

# create the dataframe and print it
df = create_dataframe()
print(df.columns.tolist())


['year', 'round', 'lap_num', 'lap_time', 'tyre', 'pit', 'circuit_name_Baku', 'circuit_name_Barcelona', 'circuit_name_Budapest', 'circuit_name_Hockenheim', 'circuit_name_Imola', 'circuit_name_Istanbul', 'circuit_name_Jeddah', 'circuit_name_Las Vegas', 'circuit_name_Le Castellet', 'circuit_name_Lusail', 'circuit_name_Marina Bay', 'circuit_name_Melbourne', 'circuit_name_Mexico City', 'circuit_name_Miami', 'circuit_name_Monaco', 'circuit_name_Monte Carlo', 'circuit_name_Montréal', 'circuit_name_Monza', 'circuit_name_Mugello', 'circuit_name_Nürburgring', 'circuit_name_Portimão', 'circuit_name_Sakhir', 'circuit_name_Shanghai', 'circuit_name_Silverstone', 'circuit_name_Singapore', 'circuit_name_Sochi', 'circuit_name_Spa-Francorchamps', 'circuit_name_Spielberg', 'circuit_name_Suzuka', 'circuit_name_São Paulo', 'circuit_name_Yas Island', 'circuit_name_Zandvoort', 'driver_name_Alexander Albon', 'driver_name_Andrea Kimi Antonelli', 'driver_name_Antonio Giovinazzi', 'driver_name_Arthur Leclerc', '

In [3]:
# Define the race and driver to analyze
race_year = 2022
race_round = 10

# Filter training data: all races up to the selected round and practice/qualifying of the selected round
train_data = df[
    (df['year'] < race_year) |  # all years before the selected year
    ((df['year'] == race_year) & (
        (df['round'] < race_round) |  # rounds before the selected round in the same year
        ((df['round'] == race_round) & (df['session_type_Race'] == 0))  # practice/qualifying of the selected round
    ))
]

# Filter testing data: race session of the selected round
test_data = df[
    (df['year'] == race_year) & 
    (df['round'] == race_round) & 
    (df['session_type_Race'] == 1)  # race session only
]

# Define features (X) and targets (y) for training
X_train = train_data.drop(columns=['pit', 'tyre', 'lap_time'])
y_train = train_data[['pit', 'tyre']]

# Define features (X) and targets (y) for testing
X_test = test_data.drop(columns=['pit', 'tyre', 'lap_time'])
y_test = test_data[['pit', 'tyre']]

# Train the multioutput random forest model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
multi_output_rf = MultiOutputClassifier(rf_classifier)
multi_output_rf.fit(X_train, y_train)

# Predict on the race session of the selected round
y_pred_test = multi_output_rf.predict(X_test)

# Add predictions to the test data
test_data['predicted_pit'] = y_pred_test[:, 0]
test_data['predicted_tyre'] = y_pred_test[:, 1]

# Calculate accuracy for pit and tyre predictions
accuracy_pit = accuracy_score(y_test['pit'], y_pred_test[:, 0])
accuracy_tyre = accuracy_score(y_test['tyre'], y_pred_test[:, 1])

print(f"Accuracy (Pit): {accuracy_pit:.2f}")
print(f"Accuracy (Tyre): {accuracy_tyre:.2f}")

# Confusion matrix for pit and tyre predictions
conf_matrix_pit = confusion_matrix(y_test['pit'], y_pred_test[:, 0])
conf_matrix_tyre = confusion_matrix(y_test['tyre'], y_pred_test[:, 1])

print("\nConfusion Matrix (Pit):")
print(conf_matrix_pit)

print("\nConfusion Matrix (Tyre):")
print(conf_matrix_tyre)

# Classification report for pit and tyre predictions
class_report_pit = classification_report(y_test['pit'], y_pred_test[:, 0])
class_report_tyre = classification_report(y_test['tyre'], y_pred_test[:, 1])

print("\nClassification Report (Pit):")
print(class_report_pit)

print("\nClassification Report (Tyre):")
print(class_report_tyre)


Accuracy (Pit): 0.97
Accuracy (Tyre): 0.49

Confusion Matrix (Pit):
[[747   0]
 [ 26   0]]

Confusion Matrix (Tyre):
[[  4  79 115]
 [  0 241 155]
 [  0  47 132]]

Classification Report (Pit):
              precision    recall  f1-score   support

       False       0.97      1.00      0.98       747
        True       0.00      0.00      0.00        26

    accuracy                           0.97       773
   macro avg       0.48      0.50      0.49       773
weighted avg       0.93      0.97      0.95       773


Classification Report (Tyre):
              precision    recall  f1-score   support

           1       1.00      0.02      0.04       198
           2       0.66      0.61      0.63       396
           3       0.33      0.74      0.45       179

    accuracy                           0.49       773
   macro avg       0.66      0.46      0.38       773
weighted avg       0.67      0.49      0.44       773



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_pit'] = y_pred_test[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_tyre'] = y_pred_test[:, 1]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
driver = 'NOR'

# Define tyre mapping
tyre_mapping = {
    1: 'SOFT',
    2: 'MEDIUM',
    3: 'HARD',
    4: 'INTERMEDIATE',
    5: 'WET',
}

# Function to decode tyre value from integer
def decode_tyres(value):
    return tyre_mapping.get(value, None)  # return None if no matching tyre value

# Filter test data for the specific race and driver
df_race_driver = test_data[
    (test_data[f'driver_short_{driver}'] == 1)  # filter by driver
]

# Get the laps where the driver actually pitted
actual_pit_laps = df_race_driver[df_race_driver['pit'] == 1][['lap_num', 'pit', 'tyre']]

# Get the laps where the model predicted a pit stop
predicted_pit_laps = df_race_driver[df_race_driver['predicted_pit'] == 1][['lap_num', 'predicted_pit', 'predicted_tyre']]

# Apply the decoding function to actual and predicted pit laps
actual_pit_laps['decoded_tyre'] = actual_pit_laps['tyre'].apply(decode_tyres)
predicted_pit_laps['decoded_tyre'] = predicted_pit_laps['predicted_tyre'].apply(decode_tyres)

# Print actual and predicted pit stops
print(f"Driver {driver} - Actual Pit Stops in Race {race_year} Round {race_round}:")
print(actual_pit_laps[['lap_num', 'pit', 'decoded_tyre']])

print(f"\nDriver {driver} - Predicted Pit Stops in Race {race_year} Round {race_round}:")
print(predicted_pit_laps[['lap_num', 'predicted_pit', 'decoded_tyre']])


Driver NOR - Actual Pit Stops in Race 2022 Round 10:
        lap_num   pit decoded_tyre
165394       35  True         HARD

Driver NOR - Predicted Pit Stops in Race 2022 Round 10:
Empty DataFrame
Columns: [lap_num, predicted_pit, decoded_tyre]
Index: []


In [5]:
# Define tyre mapping
tyre_mapping = {
    1: 'SOFT',
    2: 'MEDIUM',
    3: 'HARD',
    4: 'INTERMEDIATE',
    5: 'WET',
}

# Function to decode tyre value from integer
def decode_tyres(value):
    return tyre_mapping.get(value, None)  # return None if no matching tyre value

# Function to extract driver name based on driver_short columns
def get_driver_name(row):
    for col in test_data.columns:
        if col.startswith('driver_short_') and row[col] == 1:
            return col.replace('driver_short_', '')  # Extract driver name
    return None

# Add driver names to the test data
test_data['driver_name'] = test_data.apply(get_driver_name, axis=1)

# Get the laps where any driver actually pitted
actual_pit_laps = test_data[test_data['pit'] == 1][['lap_num', 'driver_name', 'pit', 'tyre']]

# Get the laps where the model predicted a pit stop
predicted_pit_laps = test_data[test_data['predicted_pit'] == 1][['lap_num', 'driver_name', 'predicted_pit', 'predicted_tyre']]

# Apply the decoding function to actual and predicted pit laps
actual_pit_laps['decoded_tyre'] = actual_pit_laps['tyre'].apply(decode_tyres)
predicted_pit_laps['decoded_tyre'] = predicted_pit_laps['predicted_tyre'].apply(decode_tyres)

# Print actual pit stops for all drivers
print(f"Actual Pit Stops in Race {race_year} Round {race_round}:")
print(actual_pit_laps[['lap_num', 'driver_name', 'decoded_tyre']])

# Print predicted pit stops for all drivers
print(f"\nPredicted Pit Stops in Race {race_year} Round {race_round}:")
print(predicted_pit_laps[['lap_num', 'driver_name', 'decoded_tyre']])


Actual Pit Stops in Race 2022 Round 10:
        lap_num driver_name decoded_tyre
164912       13         VER       MEDIUM
164923       24         VER         HARD
164939       40         VER         SOFT
164966       17         GAS         HARD
164980        6         PER       MEDIUM
165014       40         PER         SOFT
165057       34         ALO         HARD
165063       40         ALO         SOFT
165099       26         LEC         HARD
165137       16         STR         HARD
165162       41         STR       MEDIUM
165195       23         MAG       MEDIUM
165241       19         TSU         HARD
165261       39         TSU         SOFT
165294       21         RIC         HARD
165306       33         RIC         SOFT
165346       23         OCO         HARD
165394       35         NOR         HARD
165442       34         HAM         HARD
165476       20         SCH         HARD
165496       40         SCH         SOFT
165513        7         VET       MEDIUM
165546       40  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['driver_name'] = test_data.apply(get_driver_name, axis=1)
