In [1]:
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
import sys
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))
from DB.models import init_db, Circuit, Season, RacingWeekend, Driver, Session, SessionResult, Lap, Team, DriverTeamSession, TeamCircuitStats, PitStop
from utils import setup_race_data

import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from itertools import product

# Initialize database connection
global db_session
engine, db_session = init_db()

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None) 


## 1) Query race into df and pre process

In [None]:
def get_race_df(year, circuit):
	race_session = (db_session.query(Session)
				.join(RacingWeekend, Session.weekend_id == RacingWeekend.racing_weekend_id)
				.join(Circuit, RacingWeekend.circuit_id == Circuit.circuit_id)
				.filter(
					RacingWeekend.year == year,
					Circuit.circuit_name == circuit,
					Session.session_type == "Race"
				)
				.first())
	
	laps = race_session.laps

	session_results = (
		db_session.query(SessionResult.position, Driver.driver_num)
		.join(Session, Session.session_id == SessionResult.session_id)
		.join(Driver, Driver.driver_id == SessionResult.driver_id)
		.filter(SessionResult.session_id == race_session.session_id)
		.all()
	)

	# Convert session results to a dictionary
	starting_positions = {driver_num: position for position, driver_num in session_results}

	# Convert to DataFrame
	laps_data = []
	for lap in laps:
		# Add a row for Sector 1
		laps_data.append({
			"lap_num": lap.lap_num,
			"sector": 1,
			"stint_num": lap.stint_num,
			"stint_lap": lap.stint_lap,
			"position": lap.position,
			"driver_name": lap.driver.driver_name,
			"driver_number": lap.driver.driver_num,
			"sector_time": lap.s1_time,
			"tyre_type": lap.tyre_type,
			"tyre_laps": lap.tyre_laps,
			"pit": lap.pit,
			"track_status": lap.track_status
		})

		# Add a row for Sector 2
		laps_data.append({
			"lap_num": lap.lap_num,
			"sector": 2,
			"stint_num": lap.stint_num,
			"stint_lap": lap.stint_lap,
			"position": lap.position,
			"driver_name": lap.driver.driver_name,
			"driver_number": lap.driver.driver_num,
			"sector_time": lap.s2_time,
			"tyre_type": lap.tyre_type,
			"tyre_laps": lap.tyre_laps,
			"pit": lap.pit,
			"track_status": lap.track_status
		})

		# Add a row for Sector 3
		laps_data.append({
			"lap_num": lap.lap_num,
			"sector": 3,
			"stint_num": lap.stint_num,
			"stint_lap": lap.stint_lap,
			"position": lap.position,
			"driver_name": lap.driver.driver_name,
			"driver_number": lap.driver.driver_num,
			"sector_time": lap.s3_time,
			"tyre_type": lap.tyre_type,
			"tyre_laps": lap.tyre_laps,
			"pit": lap.pit,
			"track_status": lap.track_status
		})

	# Create a DataFrame from the list of dictionaries
	df = pd.DataFrame(laps_data)
	
	df["starting_position"] = None  # Initialize column with None
	for driver_num, grid_pos in starting_positions.items():
		# Find the first occurrence of the driver
		first_row_index = df[df["driver_number"] == driver_num].index[0]
		df.at[first_row_index, "starting_position"] = grid_pos

	df = df.sort_values(["lap_num", "sector", "position"]).reset_index(drop=True)

	return df


def add_race_data(df):
	# Calculate cumulative race time for each driver
	df["cumulative_time"] = df.groupby("driver_name")["sector_time"].cumsum()
 
	# Calculate rolling pace (average lap time over the last 5 laps)
	df["pace"] = (
		df.groupby(["driver_name", "sector"])["sector_time"]
		.rolling(window=5, min_periods=1)
		.mean()
		.reset_index(level=[0, 1], drop=True)
	)

	# Get car ahead"s cumulative time (car immediately ahead in position for each lap)
	df["front_cumulative_time"] = df.groupby(["lap_num", "sector"])["cumulative_time"].shift(1)
	# This gap is calculated only for drivers who are not in the lead position (position > 1)
	df["gap"] = df["cumulative_time"] - df["front_cumulative_time"]
	df["gap"] = df["gap"].fillna(0)  # Leader has no car ahead, so gap is 0

	# Calculate tyre difference (compared to car immediately ahead in THIS Sector)
	df["front_tyre"] = df.groupby(["lap_num", "sector"])["tyre_type"].shift(1)
	df["tyre_diff"] = df["front_tyre"] - df["tyre_type"]
	df["tyre_diff"] = df["tyre_diff"].fillna(0)  # Leader has no car ahead

	# Calculate tyre age difference (compared to car immediately ahead in THIS Sector)
	df["front_laps"] = df.groupby(["lap_num", "sector"])["stint_lap"].shift(1)
	df["stint_laps_diff"] = df["front_laps"] - df["stint_lap"]
	df["stint_laps_diff"] = df["stint_laps_diff"].fillna(0)  # Leader has no car ahead

	# Calculate DRS availability (within 1s of car ahead IN THIS Sector)
	df["drs_available"] = (
		(df["gap"] <= 1) &
		(df["position"] > 1) &
		(df["lap_num"] > 1)
	)

	# Create target variable for overtaking model (done where its 1 if the driver got overtaken (improved accuracy))
	df["next_position"] = df.groupby("driver_name")["position"].shift(1) 
	df["overtaken"] = ((df["next_position"] < df["position"]) | 
					  (df["next_position"].isna()))
	

	# Cleanup and final sorting
	df = df.drop(columns=["front_cumulative_time", "front_tyre", "next_position"])
	# df = df.sort_values(["lap_num", "sector", "position"]).reset_index(drop=True)

	try:
		new_order = [
			"lap_num", "sector", "stint_num", "stint_lap", "position", "driver_name",
			"driver_number", "sector_time", "gap", "cumulative_time", "tyre_type", "tyre_laps", 
			"pit", "drs_available", "overtaken", "tyre_diff", "front_laps", "stint_laps_diff", "track_status", "pace", "starting_position"
		]

		df = df[new_order]
	except:
		print("dont care")
	return df


df = get_race_df(2023, "Sakhir")
df = add_race_data(df)

# df[df["sector_time"].isna()]
# df[df["driver_name"]=="Logan Sargeant"]
df.head(5)

Unnamed: 0,lap_num,sector,stint_num,stint_lap,position,driver_name,driver_number,sector_time,gap,cumulative_time,tyre_type,tyre_laps,pit,drs_available,overtaken,tyre_diff,front_laps,stint_laps_diff,track_status,pace,starting_position
0,1,1,1,1,1,Max Verstappen,33,,0.0,,1,4,False,False,True,0.0,,0.0,12,,1
1,1,1,1,1,2,Charles Leclerc,16,,0.0,,1,1,False,False,True,0.0,1.0,0.0,12,,19
2,1,1,1,1,3,Sergio Perez,11,,0.0,,1,4,False,False,True,0.0,1.0,0.0,12,,2
3,1,1,1,1,4,Carlos Sainz,55,,0.0,,1,4,False,False,True,0.0,1.0,0.0,12,,4
4,1,1,1,1,5,Lewis Hamilton,44,,0.0,,1,4,False,False,True,0.0,1.0,0.0,12,,5


### Now create overtaking model

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_sample_weight

# Define features and target
features = [
	"gap",
	"pace",
	"tyre_diff",
	"stint_laps_diff",
	"drs_available",
	"cumulative_time",
	"sector_time",
	"pit"
]

X = df[features]
y = df["overtaken"]

from imblearn.over_sampling import SMOTE



# Train the model on the resampled data

# Handle missing values
X = X.fillna(X.mean())

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


# Base model
gbc = GradientBoostingClassifier(
	n_estimators=200,
	learning_rate=0.05,
	max_depth=3,
	
	subsample=0.8,
	random_state=42
)



# # Calibrate for better probabilities
model = CalibratedClassifierCV(gbc, method="sigmoid", cv=3)

# # Train on all data
model.fit(X_resampled, y_resampled)

# from lightgbm import LGBMClassifier

# lgbm = LGBMClassifier(
# 	n_estimators=100,
# 	learning_rate=0.1,
# 	max_depth=3,
# 	subsample=0.8,
# 	random_state=42
# )
# model = CalibratedClassifierCV(lgbm, method="isotonic", cv=3)
# model.fit(X, y, sample_weight=sample_weights)


feature_means = X.mean()

def predict_overtake(new_race_df):
	# Select the same features used during training
	X_new = new_race_df[features]
	
	# Handle missing values (if any)
	X_new = X_new.fillna(feature_means)
	
	# Make predictions using the trained model
	new_race_df["predicted_overtake"] = model.predict(X_new)
	
	return new_race_df

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Load the 2023 race data


# Predict overtakes using the rule-based function
new_race_df = predict_overtake(df)

# Calculate accuracy
actual_overtakes = new_race_df["overtaken"]
predicted_overtakes = new_race_df["predicted_overtake"]
accuracy = accuracy_score(actual_overtakes, predicted_overtakes)
print(f"Accuracy: {accuracy:.3f}")

# Generate classification report
print("\nClassification Report:")
print(classification_report(
	actual_overtakes,
	predicted_overtakes,
	target_names=["No Overtake", "Overtaken"]
))

Accuracy: 0.997

Classification Report:
              precision    recall  f1-score   support

 No Overtake       1.00      1.00      1.00      3068
   Overtaken       0.96      0.93      0.94        97

    accuracy                           1.00      3165
   macro avg       0.98      0.96      0.97      3165
weighted avg       1.00      1.00      1.00      3165



Accuracy: 0.997

Classification Report:
			  precision    recall  f1-score   support

 No Overtake       1.00      1.00      1.00      3068
	Overtaken       0.96      0.93      0.94        97

	accuracy                           1.00      3165
	macro avg       0.98      0.96      0.97      3165
weighted avg       1.00      1.00      1.00      3165

In [5]:
race_data = setup_race_data(df)




In [12]:
import pandas as pd

def race_sim(precomputed_data, given_driver=None, simulated_strategy=None):
	"""
	simulated_strategy
	"""
	# Extract precomputed data
	driver_tyre_coefficients = precomputed_data["driver_tyre_coefficients"]
	driver_strategies = precomputed_data["driver_strategies"]
	max_laps = precomputed_data["max_laps"]
	drivers = precomputed_data["drivers"]
	driver_names = precomputed_data["driver_names"]
	initial_positions = precomputed_data["initial_positions"]
	base_sector_times = precomputed_data["base_sector_times"]
	fuel_corrections = precomputed_data["fuel_corrections"]

	drivers_data = []

	if given_driver and simulated_strategy:
		driver_strategies[given_driver] = simulated_strategy

	for driver in drivers:
		drivers_data.append({
			"driver_number": driver,
			"driver_name": driver_names[driver],
			"pit_schedule": driver_strategies[driver],
			"tyre_type": driver_strategies[driver][1],
			"lap_num": 1,
			"sector": 0,
			"sector_time": 0,
			"stint_lap": 1,
			"cumulative_time": 0.0,
			"gap": 0,
			"pit": False,
			"position": initial_positions[driver],
			"consecutive_laps_within_2s": 0,
			"base_sector_times": base_sector_times[driver],
			"pace": 0,  # Initialize pace as 0
			"tyre_diff": 0,  # Initialize tyre difference as 0
			"stint_laps_diff": 0,  # Initialize stint laps difference as 0
			"drs_available": False,  # Initialize DRS availability as False
		})
		
	drivers_df = pd.DataFrame(drivers_data)

	simulated_data = []
	for lap in range(2, max_laps + 1):

		drivers_df["lap_num"] += 1
		drivers_df["stint_lap"] += 1

		for sector in range(1, 4):
			drivers_df["sector"] = sector
			for index, row in drivers_df.iterrows():

				# Handle pit stops at the start of a lap (sector 1)
				if sector == 1 and lap in row["pit_schedule"]:
					drivers_df.at[index, "pit"] = True  # Mark pit stop
					drivers_df.at[index, "cumulative_time"] += 20  # Add pit stop penalty
					drivers_df.at[index, "stint_lap"] = 1  # Reset stint lap
					drivers_df.at[index, "tyre_type"] = row["pit_schedule"][lap]  # Change tyre

				else:
					drivers_df.at[index, "pit"] = False

				# Calculate tyre degradation coefficients
				a, b, c = driver_tyre_coefficients[row["driver_number"]][sector][drivers_df.at[index, "tyre_type"]]
				sector_time = (
					row["base_sector_times"][sector]  # Base sector time
					+ (a * drivers_df.at[index, "stint_lap"]**2 + b * drivers_df.at[index, "stint_lap"] + c)  # Tyre degradation
					+ fuel_corrections[lap]  # Fuel effect
				)

				# Update sector time and cumulative time
				drivers_df.at[index, "sector_time"] = sector_time
				drivers_df.at[index, "cumulative_time"] += sector_time

				# drivers_df = drivers_df.sort_values(by="cumulative_time", ascending=True)

				# Reassign positions based on the sorted order
				# drivers_df["position"] = range(1, len(drivers_df) + 1)

				ahead_pos = row["position"] - 1

				if ahead_pos > 0:
					ahead_time = drivers_df.loc[drivers_df["position"] == ahead_pos, "cumulative_time"].values[0]

					gap = ahead_time - row["cumulative_time"]
					drivers_df.at[index, "gap"] = gap
					
				else:
					gap = 0
					drivers_df.at[index, "gap"] = 0

				# calc overtakes
				if gap < 0:
					if drivers_df.at[ahead_index, "cumulative_time"] > drivers_df.at[index, "cumulative_time"]:
						drivers_df.at[ahead_index, "cumulative_time"], drivers_df.at[index, "cumulative_time"] = (
							drivers_df.at[index, "cumulative_time"],
							drivers_df.at[ahead_index, "cumulative_time"],
						)

				elif gap < 0.8 and ahead_pos > 0:
					if ahead_pos > 0:
						ahead_row = drivers_df.loc[drivers_df["position"] == ahead_pos].iloc[0]
						   
						df["pace"] = (
							df.groupby(["driver_name", "sector"])["sector_time"]
							.rolling(window=5, min_periods=1)
							.mean()
							.reset_index(level=[0, 1], drop=True)
						)
						
						# Tyre difference
						drivers_df.at[index, "tyre_diff"] = ahead_row["tyre_type"] - row["tyre_type"]

						# Stint laps difference
						drivers_df.at[index, "stint_laps_diff"] = ahead_row["stint_lap"] - row["stint_lap"]

						# DRS availability
						drivers_df.at[index, "drs_available"] = gap <= 1.0

					else:
						drivers_df.at[index, "pace"] = (
							drivers_df.loc[
								(drivers_df["driver_number"] == row["driver_number"]) &
								(drivers_df["lap_num"] >= lap - 5),
								"sector_time"
							].mean()
						)

						drivers_df.at[index, "tyre_diff"] = 0

						# Stint laps difference
						drivers_df.at[index, "stint_laps_diff"] = 0

						# DRS availability
						drivers_df.at[index, "drs_available"] = 0

					
					drivers_df = predict_overtake(drivers_df)

					ahead_index = drivers_df[drivers_df["position"] == ahead_pos].index[0]
					# Check if an overtake is predicted
					if drivers_df.at[index, "predicted_overtake"]:
						# Swap positions between the current driver and the driver ahead
						current_position = drivers_df.at[index, "position"]

						# Swap positions
						drivers_df.at[index, "position"], drivers_df.at[ahead_index, "position"] = (
							drivers_df.at[ahead_index, "position"],
							drivers_df.at[index, "position"],
						)

						drivers_df.at[index, "cumulative_time"], drivers_df.at[ahead_index, "cumulative_time"] = (
							drivers_df.at[ahead_index, "cumulative_time"],
							drivers_df.at[index, "cumulative_time"],
						)

					else:
						# if the driver behind is faster overall, then keep them behind
						if drivers_df.at[ahead_index, "cumulative_time"] > drivers_df.at[index, "cumulative_time"]:
							drivers_df.at[ahead_index, "cumulative_time"], drivers_df.at[index, "cumulative_time"] = (
								drivers_df.at[index, "cumulative_time"],
								drivers_df.at[ahead_index, "cumulative_time"],
							)


	return drivers_df
import cProfile
import pstats
if __name__ == "__main__":

	# Wrap the race_sim function with cProfile
	profiler = cProfile.Profile()
	profiler.enable()

	# Run the simulation
	sim_df = race_sim(race_data, 16, {1: 1, 15: 1, 35: 3})

	# Disable the profiler
	profiler.disable()

	# Print profiling results
	stats = pstats.Stats(profiler).sort_stats('cumulative')
	stats.print_stats(50)  # Show the top 20 most time-consuming functions

	stats.dump_stats("race_sim_profile.prof")

  drivers_df.at[index, "sector_time"] = sector_time
  drivers_df.at[index, "gap"] = gap


         4530273 function calls (4461734 primitive calls) in 5.627 seconds

   Ordered by: cumulative time
   List reduced from 904 to 50 due to restriction <50>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.214    0.214    5.630    5.630 /tmp/ipykernel_100396/884401343.py:3(race_sim)
6438/3246    0.030    0.000    1.525    0.000 /home/ben/Individual_Project/env/lib/python3.10/site-packages/pandas/core/indexing.py:1176(__getitem__)
     3192    0.019    0.000    1.429    0.000 /home/ben/Individual_Project/env/lib/python3.10/site-packages/pandas/core/indexing.py:1365(_getitem_tuple)
     3192    0.036    0.000    1.395    0.000 /home/ben/Individual_Project/env/lib/python3.10/site-packages/pandas/core/indexing.py:1032(_getitem_lowerdim)
     6410    0.028    0.000    1.304    0.000 /home/ben/Individual_Project/env/lib/python3.10/site-packages/pandas/core/indexing.py:1397(_getitem_axis)
     3218    0.014    0.000    0.952    0.000 /home/ben/Indiv

In [10]:
%pip install snakeviz

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting snakeviz
  Downloading snakeviz-2.2.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.5/183.5 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: snakeviz
Successfully installed snakeviz-2.2.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
def get_accuracy(year, circuit, sim_df):
	# Fetch the actual race session from the database
	race_session = (
		db_session.query(Session)
		.join(RacingWeekend, Session.weekend_id == RacingWeekend.racing_weekend_id)
		.join(Circuit, RacingWeekend.circuit_id == Circuit.circuit_id)
		.filter(
			RacingWeekend.year == year,
			Circuit.circuit_name == circuit,
			Session.session_type == "Race"
		)
		.first()
	)

	if not race_session:
		raise ValueError(f"No race session found for {year} at {circuit}")

	# Fetch the actual race laps and results
	laps = race_session.laps
	session_results = (
		db_session.query(SessionResult.position, Driver.driver_num)
		.join(Session, Session.session_id == SessionResult.session_id)
		.join(Driver, Driver.driver_id == SessionResult.driver_id)
		.filter(SessionResult.session_id == race_session.session_id)
		.all()
	)

	# Convert session_results to a dictionary for easy lookup
	actual_results = {driver_num: position for position, driver_num in session_results}

	# Extract simulated results from sim_df
	sim_results = sim_df.groupby("driver_number").last()["position"].to_dict()

	# Ensure both results have the same drivers
	common_drivers = set(actual_results.keys()).intersection(sim_results.keys())
	if not common_drivers:
		raise ValueError("No common drivers found between actual and simulated results")

	# Filter results to only include common drivers
	actual_positions = [actual_results[driver] for driver in common_drivers]
	sim_positions = [sim_results[driver] for driver in common_drivers]

	# Calculate accuracy metrics
	position_accuracy = sum(1 for a, s in zip(actual_positions, sim_positions) if a == s) / len(common_drivers)
	top_3_accuracy = sum(1 for a, s in zip(actual_positions, sim_positions) if (a <= 3 and s <= 3)) / len(common_drivers)
	mean_error = sum(abs(a - s) for a, s in zip(actual_positions, sim_positions)) / len(common_drivers)
	total_error = sum(abs(a - s) for a, s in zip(actual_positions, sim_positions))

	# Return accuracy metrics
	return {
		"position_accuracy": position_accuracy,
		"top_3_accuracy": top_3_accuracy,
		"mean_error": mean_error,
		"total_error": total_error,


	}

get_accuracy(2023, "Sakhir", drivers_df)

{'position_accuracy': 1.0,
 'top_3_accuracy': 0.15,
 'mean_error': 0.0,
 'total_error': 0}