In [133]:
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
import sys
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))
from DB.models import init_db, Circuit, Season, RacingWeekend, Driver, Session, SessionResult, Lap, Team, DriverTeamSession, TeamCircuitStats, PitStop
from utils import correct_fuel_effect, extract_driver_strategies, calculate_base_sector_times, get_tyre_deg_per_driver

import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from itertools import product

# Initialize database connection
global db_session
engine, db_session = init_db()

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None) 


## 1) Query race into df and pre process

In [None]:
def get_race_session(year, circuit):
	race_session = (db_session.query(Session)
				.join(RacingWeekend, Session.weekend_id == RacingWeekend.racing_weekend_id)
				.join(Circuit, RacingWeekend.circuit_id == Circuit.circuit_id)
				.filter(
					RacingWeekend.year == year,
					Circuit.circuit_name == circuit,
					Session.session_type == "Race"
				)
				.first())
	
	return race_session

def get_race_df(race_session_db):

	laps = race_session_db.laps

	# Convert to DataFrame
	laps_data = []
	for lap in laps:
		# Add a row for Sector 1
		laps_data.append({
			"lap_num": lap.lap_num,
			"sector": 1,
			"stint_num": lap.stint_num,
			"stint_lap": lap.stint_lap,
			"position": lap.position,
			"driver_name": lap.driver.driver_name,
			"driver_number": lap.driver.driver_num,
			"sector_time": lap.s1_time,
			"tyre_type": lap.tyre_type,
			"tyre_laps": lap.tyre_laps,
			"pit": lap.pit,
			"track_status": lap.track_status
		})

		# Add a row for Sector 2
		laps_data.append({
			"lap_num": lap.lap_num,
			"sector": 2,
			"stint_num": lap.stint_num,
			"stint_lap": lap.stint_lap,
			"position": lap.position,
			"driver_name": lap.driver.driver_name,
			"driver_number": lap.driver.driver_num,
			"sector_time": lap.s2_time,
			"tyre_type": lap.tyre_type,
			"tyre_laps": lap.tyre_laps,
			"pit": lap.pit,
			"track_status": lap.track_status
		})

		# Add a row for Sector 3
		laps_data.append({
			"lap_num": lap.lap_num,
			"sector": 3,
			"stint_num": lap.stint_num,
			"stint_lap": lap.stint_lap,
			"position": lap.position,
			"driver_name": lap.driver.driver_name,
			"driver_number": lap.driver.driver_num,
			"sector_time": lap.s3_time,
			"tyre_type": lap.tyre_type,
			"tyre_laps": lap.tyre_laps,
			"pit": lap.pit,
			"track_status": lap.track_status
		})

	# Create a DataFrame from the list of dictionaries
	df = pd.DataFrame(laps_data)
	

	df = df.sort_values(["lap_num", "sector", "position"]).reset_index(drop=True)

	return df


def add_race_data(df):
	# Calculate cumulative race time for each driver
	df["cumulative_time"] = df.groupby("driver_name")["sector_time"].cumsum()
 
	# Calculate rolling pace (average lap time over the last 5 laps)
	df["pace"] = (
		df.groupby(["driver_name", "sector"])["sector_time"]
		.rolling(window=5, min_periods=1)
		.mean()
		.reset_index(level=[0, 1], drop=True)
	)

	# Get car ahead"s cumulative time (car immediately ahead in position for each lap)
	df["front_cumulative_time"] = df.groupby(["lap_num", "sector"])["cumulative_time"].shift(1)
	# This gap is calculated only for drivers who are not in the lead position (position > 1)
	df["gap"] = df["cumulative_time"] - df["front_cumulative_time"]
	df["gap"] = df["gap"].fillna(0)  # Leader has no car ahead, so gap is 0

	# Calculate tyre difference (compared to car immediately ahead in THIS Sector)
	df["front_tyre"] = df.groupby(["lap_num", "sector"])["tyre_type"].shift(1)
	df["tyre_diff"] = df["front_tyre"] - df["tyre_type"]
	df["tyre_diff"] = df["tyre_diff"].fillna(0)  # Leader has no car ahead

	# Calculate tyre age difference (compared to car immediately ahead in THIS Sector)
	df["front_laps"] = df.groupby(["lap_num", "sector"])["stint_lap"].shift(1)
	df["stint_laps_diff"] = df["front_laps"] - df["stint_lap"]
	df["stint_laps_diff"] = df["stint_laps_diff"].fillna(0)  # Leader has no car ahead

	# Calculate DRS availability (within 1s of car ahead IN THIS Sector)
	df["drs_available"] = (
		(df["gap"] <= 1) &
		(df["position"] > 1) &
		(df["lap_num"] > 1)
	)

	# Create target variable for overtaking model (done where its 1 if the driver got overtaken (improved accuracy))
	df["next_position"] = df.groupby("driver_name")["position"].shift(1) 
	df["overtaken"] = ((df["next_position"] < df["position"]) | 
					  (df["next_position"].isna()))
	

	# Cleanup and final sorting
	df = df.drop(columns=["front_cumulative_time", "front_tyre", "next_position"])
	# df = df.sort_values(["lap_num", "sector", "position"]).reset_index(drop=True)

	try:
		new_order = [
			"lap_num", "sector", "stint_num", "stint_lap", "position", "driver_name",
			"driver_number", "sector_time", "gap", "cumulative_time", "tyre_type", "tyre_laps", 
			"pit", "drs_available", "overtaken", "tyre_diff", "front_laps", "stint_laps_diff", "track_status", "pace"
		]

		df = df[new_order]
	except:
		print("dont care")
	return df

race_session_db = get_race_session(2022, "Sakhir")

df = get_race_df(race_session_db)
df = add_race_data(df)


# df

In [None]:
def setup_race_data(race_df, race_session_db):
	driver_strategies = extract_driver_strategies(race_df)
	
	# Extract tyre degradation curves
	driver_tyre_coefficients = get_tyre_deg_per_driver(race_df)
	
	# Precompute driver strategies

	# Correct fuel effects in the race data
	max_laps = race_df["lap_num"].max()
	race_df = correct_fuel_effect(race_df)

	drivers = race_df["driver_number"].unique()

	base_sector_times = calculate_base_sector_times(race_df)

	driver_names = {
		driver: race_df[race_df["driver_number"] == driver]["driver_name"].iloc[0]
		for driver in drivers
	}

	# Precompute fuel corrections
	max_fuel_kg = 110
	fuel_effect_per_kg = 0.03
	fuel_corrections = {
		lap: (max_fuel_kg - (lap - 1) * (max_fuel_kg / max_laps)) * fuel_effect_per_kg
		for lap in range(1, max_laps + 1)
	}


	# 
	session_results = (
		db_session.query(SessionResult.grid_pos, Driver.driver_num, SessionResult.end_status)
		.join(Session, Session.session_id == SessionResult.session_id)
		.join(Driver, Driver.driver_id == SessionResult.driver_id)
		.filter(SessionResult.session_id == race_session_db.session_id)
		.all()
	)
	# Convert session results to a dictionary
	starting_positions = {driver_num: grid_pos for grid_pos, driver_num, _ in session_results}

	# Initialize an empty dictionary to store retirements by lap
	retirements_by_lap = {}

	# Iterate through session results to determine retirements
	for driver_id, driver_num, end_status in session_results:
		# Check if the driver retired (end_status is not "Finished" or "+1 Lap")
		if end_status and not (end_status.startswith("Finished") or end_status.startswith("+")):
			# Find the maximum lap number for the driver (last recorded lap)
			lap_retired = df[df["driver_number"] == driver_num]["lap_num"].max()
			
			# Add the driver to the list of retirees for the corresponding lap
			if lap_retired not in retirements_by_lap:
				retirements_by_lap[lap_retired] = []
			retirements_by_lap[lap_retired].append(driver_num)

	# Output the retirements dictionary
	print(retirements_by_lap)

	# Get initial positions (starting grid positions) for each driver

	df["starting_position"] = None  # Initialize column with None
	for driver_num, grid_pos in starting_positions.items():
		# Find the first occurrence of the driver
		first_row_index = df[df["driver_number"] == driver_num].index[0]
		df.at[first_row_index, "starting_position"] = grid_pos

		
	
	# Get safety car laps
	filtered_df = race_df[race_df["track_status"] != 1]
	
	# Group by lap_num and convert each group to a list of dictionaries
	slow_laps = {}
	for lap_num, group in filtered_df.groupby("lap_num"):
		slow_laps[lap_num] = group.to_dict(orient="records")
	

	return {
		"driver_tyre_coefficients": driver_tyre_coefficients,
		"driver_strategies": driver_strategies,
		"race_df": race_df,
		"max_laps": max_laps,
		"drivers": drivers,
		"driver_names": driver_names,
		"initial_positions": starting_positions,
		"base_sector_times": base_sector_times,
		"fuel_corrections": fuel_corrections,
		"slow_laps": slow_laps,
		"retirement_laps": retirements_by_lap
	}

### Now create overtaking model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_sample_weight

# Define features and target
features = [
	"gap",
	"pace",
	"tyre_diff",
	"stint_laps_diff",
	"drs_available",
	"cumulative_time",
	"sector_time",
	"pit"
]

X = df[features]
y = df["overtaken"]

from imblearn.over_sampling import SMOTE



# Train the model on the resampled data

# Handle missing values
X = X.fillna(X.mean())

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


# Base model
gbc = GradientBoostingClassifier(
	n_estimators=200,
	learning_rate=0.05,
	max_depth=3,
	subsample=0.8,
	random_state=42
)



# Calibrate for better probabilities
model = CalibratedClassifierCV(gbc, method="sigmoid", cv=3)

# Train on all data
model.fit(X_resampled, y_resampled)


feature_means = X.mean()

def predict_overtake(new_race_df):
	# Select the same features used during training
	X_new = new_race_df[features]
	
	# Handle missing values (if any)
	X_new = X_new.fillna(feature_means)
	
	# Make predictions using the trained model
	new_race_df["predicted_overtake"] = model.predict(X_new)
	
	return new_race_df

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Load the 2023 race data


# Predict overtakes using the rule-based function
new_race_df = predict_overtake(df)

# Calculate accuracy
actual_overtakes = new_race_df["overtaken"]
predicted_overtakes = new_race_df["predicted_overtake"]
accuracy = accuracy_score(actual_overtakes, predicted_overtakes)
print(f"Accuracy: {accuracy:.3f}")

# Generate classification report
print("\nClassification Report:")
print(classification_report(
	actual_overtakes,
	predicted_overtakes,
	target_names=["No Overtake", "Overtaken"]
))

Accuracy: 0.993

Classification Report:
              precision    recall  f1-score   support

 No Overtake       1.00      1.00      1.00      3248
   Overtaken       0.90      0.90      0.90       127

    accuracy                           0.99      3375
   macro avg       0.95      0.95      0.95      3375
weighted avg       0.99      0.99      0.99      3375



In [138]:
race_data = setup_race_data(df, race_session_db)

{np.int64(57): [11], np.int64(54): [1], np.int64(45): [10]}


In [None]:
import pandas as pd

def race_sim(precomputed_data, given_driver=None, simulated_strategy=None):
	"""
	simulated_strategy
	"""
	# Extract precomputed data
	driver_tyre_coefficients = precomputed_data["driver_tyre_coefficients"]
	driver_strategies = precomputed_data["driver_strategies"]
	max_laps = precomputed_data["max_laps"]
	drivers = precomputed_data["drivers"]
	driver_names = precomputed_data["driver_names"]
	initial_positions = precomputed_data["initial_positions"]
	base_sector_times = precomputed_data["base_sector_times"]
	fuel_corrections = precomputed_data["fuel_corrections"]
	slow_laps = precomputed_data["slow_laps"]
	retirement_laps = precomputed_data["retirement_laps"]

	drivers_data = []

	if given_driver and simulated_strategy:
		driver_strategies[given_driver] = simulated_strategy

	for driver in drivers:
		drivers_data.append({
			"driver_number": driver,
			"driver_name": driver_names[driver],
			"pit_schedule": driver_strategies[driver],
			"tyre_type": driver_strategies[driver][1],
			"lap_num": 1,
			"sector": 0,
			"sector_time": 0.0,
			"stint_lap": 1,
			"cumulative_time": 0.0,
			"gap": 0.0,
			"pit": False,
			"position": initial_positions[driver],
			"starting_pos": initial_positions[driver],
			"base_sector_times": base_sector_times[driver],
			"pace": 0,  # Initialize pace as 0
			"tyre_diff": 0,  # Initialize tyre difference as 0
			"stint_laps_diff": 0,  # Initialize stint laps difference as 0
			"drs_available": False,  # Initialize DRS availability as False
			"retired": False,
		})
		
	drivers_df = pd.DataFrame(drivers_data)
	
	simulated_data = []
	num_overtakes = 0
	for lap in range(1, max_laps + 1):
		if lap in slow_laps:
			slow = True
		else:
			slow = False

		if lap in retirement_laps:
			drivers_df = drivers_df.sort_values(by="position", ascending=True)
			for index, row in drivers_df.iterrows():
				if row["driver_number"] in retirement_laps[lap]:  # Check if the driver retired on this lap
					# move everyone behind up 1
					
					drivers_df.loc[drivers_df["position"] > row["position"], "position"] -= 1

					drivers_df.at[index, "retired"] = True  # Mark the driver as retired
					drivers_df.at[index, "position"] = 999

			drivers_df = drivers_df.sort_values(by="position", ascending=True)
			
	
		drivers_df["lap_num"] += 1
		drivers_df["stint_lap"] += 1

		for sector in range(1, 4):
			# if sector == 2:
			# 	break
			drivers_df["sector"] = sector
			
			for index, row in drivers_df.iterrows():
				if row["retired"]:
					continue
	
				# Handle pit stops at the start of a lap (sector 1)
				if sector == 1 and lap in row["pit_schedule"]:
					drivers_df.at[index, "pit"] = True  # Mark pit stop
					drivers_df.at[index, "cumulative_time"] += 20  # Add pit stop penalty
					drivers_df.at[index, "stint_lap"] = 1  # Reset stint lap
					drivers_df.at[index, "tyre_type"] = row["pit_schedule"][lap]  # Change tyre


					# TODO - fix reorder when pit
					# current_time = drivers_df.at[index, "cumulative_time"]
					# for pos in range(row["position"], 21):
					# 	if drivers_df.loc[drivers_df["position"] == pos].iloc[0]["cumulative_time"] > current_time:
					# 		print(pos)


					# drivers_df = drivers_df[~drivers_df["retired"]].sort_values(by="cumulative_time", ascending=True)
					# drivers_df["position"] = range(1, len(drivers_df) + 1)
		
				else:
					drivers_df.at[index, "pit"] = False


				a, b, c = driver_tyre_coefficients[row["driver_number"]][sector][row["tyre_type"]]
				sector_time = (
					row["base_sector_times"][sector]  # Base sector time for specific driver
					+ (a * drivers_df.at[index, "stint_lap"]**2 + b * drivers_df.at[index, "stint_lap"] + c)  # Tyre degradation
					+ fuel_corrections[lap]  # Fuel effect
				)

				# Update sector time and cumulative time
				drivers_df.at[index, "sector_time"] = sector_time
				# if slow:
				# 	sector_time *= 1.5
					
				drivers_df.at[index, "cumulative_time"] += sector_time

				# drivers_df = drivers_df.sort_values(by="cumulative_time", ascending=True)

				# Reassign positions based on the sorted order
				# drivers_df["position"] = range(1, len(drivers_df) + 1)

			# drivers_df = drivers_df.sort_values(by="cumulative_time", ascending=True)
			# drivers_df["position"] = range(1, len(drivers_df) + 1)

			for index, row in drivers_df.iterrows():
				if row["retired"]:
					continue
				ahead_pos = row["position"] - 1

				if ahead_pos > 0:
					ahead_index = drivers_df[drivers_df["position"] == ahead_pos].index[0]

					ahead_pos_time = drivers_df.at[ahead_index, "cumulative_time"]
				
					current_pos_time = row["cumulative_time"]
					if ahead_pos_time > current_pos_time:
						drivers_df.at[ahead_index, "cumulative_time"] = current_pos_time - 0.5

						ahead_pos_time = current_pos_time - 0.5
						
					gap = current_pos_time - ahead_pos_time


					drivers_df.at[index, "gap"] = gap

					
				else:

					gap = 0
					drivers_df.at[index, "gap"] = 0

					

				if gap < 1 and ahead_pos > 0:
					# if ahead_pos > 0:
					ahead_row = drivers_df.loc[drivers_df["position"] == ahead_pos].iloc[0]
						
					drivers_df["pace"] = (
						drivers_df.groupby(["driver_name", "sector"])["sector_time"]
						.rolling(window=5, min_periods=1)
						.mean()
						.reset_index(level=[0, 1], drop=True)
					)
					
					# Tyre difference
					drivers_df.at[index, "tyre_diff"] = ahead_row["tyre_type"] - row["tyre_type"]

					# Stint laps difference
					drivers_df.at[index, "stint_laps_diff"] = ahead_row["stint_lap"] - row["stint_lap"]

					# DRS availability
					drivers_df.at[index, "drs_available"] = True

					
					drivers_df = predict_overtake(drivers_df)

					ahead_index = drivers_df[drivers_df["position"] == ahead_pos].index[0]
					# Check if an overtake is predicted
					if drivers_df.at[index, "predicted_overtake"]:
						num_overtakes += 1
						# Swap positions between the current driver and the driver ahead
						current_position = drivers_df.at[index, "position"]

						# Swap positions
						drivers_df.at[index, "position"], drivers_df.at[ahead_index, "position"] = drivers_df.at[ahead_index, "position"], drivers_df.at[index, "position"],

						drivers_df.at[index, "cumulative_time"], drivers_df.at[ahead_index, "cumulative_time"] = drivers_df.at[ahead_index, "cumulative_time"], drivers_df.at[index, "cumulative_time"]
						

	
	drivers_df.drop(columns=["base_sector_times"], inplace=True)
	print(num_overtakes)
	return drivers_df

# print(race_data)
sim_df = race_sim(race_data)


sim_df = sim_df.sort_values(by="position", ascending=True)

# sim_df = sim_df.sort_values(by="cumulative_time", ascending=True)
# sim_df["position"] = range(1, len(sim_df) + 1)

# Reset the index (optional, for cleaner output)
sim_df = sim_df.reset_index(drop=True)
sim_df

5
7
9
10
14
16
7
8
9
10
11
12
14
16
14
16
17
16
17
19
20
19
20
20
14
20
12
13
14
16
17
18
20
20
3
10
11
12
13
14
15
16
17
18
19
20
14
15
16
17
18
19
20
19
20
20
20
11
12
13
14
16
17
18
19
20
7
8
9
10
11
12
13
14
16
17
18
19
20
9
10
11
12
13
14
15
16
17
18
19
20
2
3
4
7
9
10
11
12
13
14
15
16
17
18
19
20
7
13
14
15
16
17
18
19
20
17
18
20
15
16
17
18
19
20
9
10
11
12
13
14
15
16
17
18
19
20
14
15
17
18
19
20
18
19
20
8
9
10
11
12
13
14
15
16
17
18
19
20
16
18
19
20
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
19
20
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
15
16
17
18
19
20
7
8
9
10
11
12
13
14
15
16
17
18
19
20
8
9
10
11
12
13
14
15
16
17
18
19
20
4
7
8
9
10
11
12
13
14
15
16
17
18
19
20
7
8
9
10
11
12
13
14
15
16
17
18
19
20
20
15
16
17
18
19
20
19
20
12
13
14
15
16
17
18
19
20
11
12
13
14
15
16
17
18
19
20
10
11
12
13
14
15
16
17
18
19
20
18
19
20
9
10
11
12
13
14
15
16
17
18
19
20
14
15
16
17
18
19
20
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
5
7
8
9
10
11
12
13
14
15
16
1

IndexError: single positional indexer is out-of-bounds

In [149]:
import pandas as pd
import numpy as np

def compare_simulation_with_actual(sim_df, race_df):
	"""
	Compares the simulated race results with the actual race data and calculates the Mean Absolute Error (MAE).
	
	Parameters:
		sim_df (pd.DataFrame): The simulated race DataFrame containing lap-by-lap and sector-by-sector data.
		race_df (pd.DataFrame): The actual race DataFrame containing lap-by-lap and sector-by-sector data.
	
	Returns:
		pd.DataFrame: A DataFrame comparing each driver's simulated and actual cumulative times.
		float: The total Mean Absolute Error (MAE) across all drivers.
	"""
	# Initialize a list to store comparison results
	comparison_results = []
	
	# Get unique drivers from the simulated DataFrame
	drivers = sim_df["driver_number"].unique()
	
	# Calculate cumulative times for each driver in both simulated and actual data
	for driver in drivers:
		# Simulated cumulative time for the driver
		sim_cumulative_time = sim_df[sim_df["driver_number"] == driver]["cumulative_time"].max()
		
		# Actual cumulative time for the driver
		actual_cumulative_time = race_df[race_df["driver_number"] == driver]["sector_time"].sum()

		# if actual_cumulative_time < 5000:
		# 	continue
		# Calculate the absolute error for the driver
		absolute_error = abs(sim_cumulative_time - actual_cumulative_time)
		
		# Store the results for the driver
		comparison_results.append({
			"driver_number": driver,
			"simulated_cumulative_time": sim_cumulative_time,
			"actual_cumulative_time": actual_cumulative_time,
			"absolute_error": absolute_error
		})
	
	# Convert the results to a DataFrame
	comparison_df = pd.DataFrame(comparison_results)
	
	# Calculate the total Mean Absolute Error (MAE)
	total_mae = comparison_df["absolute_error"].mean()
	
	return comparison_df, total_mae


# Example usage
# Assuming `sim_df` is the output of your simulation and `df` is the actual race DataFrame
comparison_df, total_mae = compare_simulation_with_actual(sim_df, df)



# Print the total MAE
print("\nTotal Mean Absolute Error (MAE):", total_mae)
comparison_df


Total Mean Absolute Error (MAE): 95.07084189624742


Unnamed: 0,driver_number,simulated_cumulative_time,actual_cumulative_time,absolute_error
0,31,5746.072552,5836.644,90.571448
1,63,5746.312439,5829.718,83.405561
2,18,5745.750769,5861.397,115.646231
3,27,5744.750044,5879.751,135.000956
4,77,5745.250044,5833.181,87.930956
5,55,5745.840846,5825.318,79.477154
6,14,5762.564182,5839.862,77.297818
7,23,5762.015226,5870.755,108.739774
8,3,5761.886768,5870.792,108.905232
9,24,5761.54179,5837.604,76.06221


	driver_number	simulated_cumulative_time	actual_cumulative_time	absolute_error
0	16	5778.326647	5820.618	42.291353
1	55	5786.386771	5825.318	38.931229
2	44	5803.477681	5828.807	25.329319
3	63	5850.364421	5829.718	20.646421
4	6	5851.746338	5877.044	25.297662
5	20	5851.847248	5833.711	18.136248
6	18	5857.635768	5861.397	3.761232
7	27	5858.135768	5879.751	21.615232
8	14	5857.690179	5839.862	17.828179
9	77	5858.190179	5833.181	25.009179
10	47	5875.457842	5849.868	25.589842
11	31	5924.478333	5836.644	87.834333
12	22	5955.200365	5836.620	118.580365
13	23	6003.485113	5870.755	132.730113
14	3	6006.011511	5870.792	135.219511
15	24	6028.952961	5837.604	191.348961
16	4	6060.327838	5872.684	187.643838
17	10	4579.656906	4398.915	180.741906
18	1	5399.092729	5564.174	165.081271
19	11	5706.843566	5730.537	23.693434

In [141]:
def get_accuracy(year, circuit, sim_df):
	# Fetch the actual race session from the database
	race_session = (
		db_session.query(Session)
		.join(RacingWeekend, Session.weekend_id == RacingWeekend.racing_weekend_id)
		.join(Circuit, RacingWeekend.circuit_id == Circuit.circuit_id)
		.filter(
			RacingWeekend.year == year,
			Circuit.circuit_name == circuit,
			Session.session_type == "Race"
		)
		.first()
	)

	if not race_session:
		raise ValueError(f"No race session found for {year} at {circuit}")

	# Fetch the actual race laps and results
	laps = race_session.laps
	session_results = (
		db_session.query(SessionResult.position, Driver.driver_num)
		.join(Session, Session.session_id == SessionResult.session_id)
		.join(Driver, Driver.driver_id == SessionResult.driver_id)
		.filter(SessionResult.session_id == race_session.session_id)
		.all()
	)

	# Convert session_results to a dictionary for easy lookup
	actual_results = {driver_num: position for position, driver_num in session_results}

	# Extract simulated results from sim_df
	sim_df = sim_df[~sim_df["retired"]]
	sim_results = sim_df.groupby("driver_number").last()["position"].to_dict()

	# Ensure both results have the same drivers
	common_drivers = set(actual_results.keys()).intersection(sim_results.keys())
	if not common_drivers:
		raise ValueError("No common drivers found between actual and simulated results")

	# Filter results to only include common drivers
	actual_positions = [actual_results[driver] for driver in common_drivers]
	sim_positions = [sim_results[driver] for driver in common_drivers]

	# Calculate accuracy metrics
	position_accuracy = sum(1 for a, s in zip(actual_positions, sim_positions) if a == s) / len(common_drivers)
	top_3_accuracy = sum(1 for a, s in zip(actual_positions, sim_positions) if (a <= 3 and s <= 3)) / 3
	mean_error = sum(abs(a - s) for a, s in zip(actual_positions, sim_positions)) / len(common_drivers)
	total_error = sum(abs(a - s) for a, s in zip(actual_positions, sim_positions))

	# Return accuracy metrics
	return {
		"position_accuracy": position_accuracy,
		"top_3_accuracy": top_3_accuracy,
		"mean_error": mean_error,
		"total_error": total_error,
	}

get_accuracy(2022, "Sakhir", sim_df)

{'position_accuracy': 0.35294117647058826,
 'top_3_accuracy': 1.0,
 'mean_error': 2.9411764705882355,
 'total_error': 50}

{'position_accuracy': 0.35294117647058826,
 'top_3_accuracy': 1.0,
 'mean_error': 2.823529411764706,
 'total_error': 48}

In [None]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical

def optimize_strategy_up_to_3_stops(precomputed_data, driver_number=14, n_calls=100):

	max_laps = precomputed_data["max_laps"]

	# Define the search space
	space = [
		Categorical([1, 2, 3], name='start_tyre'),       # Starting tyre (1=Hard, 2=Medium, 3=Soft)
		Integer(2, max_laps - 1, name='pit1_lap'),      # First pit stop lap (optional)
		Categorical([1, 2, 3], name='pit1_tyre'),        # First pit stop tyre (optional)
		Integer(2, max_laps - 1, name='pit2_lap'),      # Second pit stop lap (optional)
		Categorical([1, 2, 3], name='pit2_tyre'),        # Second pit stop tyre (optional)
		Integer(2, max_laps - 1, name='pit3_lap'),      # Third pit stop lap (optional)
		Categorical([1, 2, 3], name='pit3_tyre'),        # Third pit stop tyre (optional)
	]

	def objective(params):
		start_tyre, pit1_lap, pit1_tyre, pit2_lap, pit2_tyre, pit3_lap, pit3_tyre = params

		# Construct the strategy dictionary
		strategy = {1: start_tyre}  # Always include the starting tyre

		# Add pit stops only if their laps are valid and in ascending order
		pit_laps = sorted(set([pit1_lap, pit2_lap, pit3_lap]))
		tyres = [pit1_tyre, pit2_tyre, pit3_tyre]

		# Ensure pit laps are within bounds and in ascending order
		valid_pit_laps = []
		valid_tyres = []
		for lap, tyre in zip(pit_laps, tyres):
			if 2 <= lap < max_laps:  # Only include valid pit laps
				valid_pit_laps.append(lap)
				valid_tyres.append(tyre)

		# Add valid pit stops to the strategy
		for lap, tyre in zip(valid_pit_laps, valid_tyres):
			strategy[lap] = tyre

		# Penalize strategies with fewer than 2 distinct tyre types
		tyre_types_used = set(strategy.values())
		if len(tyre_types_used) < 2:
			return 20.0  # Penalize with worst position

		# Run the simulation
		try:
			drivers_df = race_sim(precomputed_data, given_driver=driver_number, simulated_strategy=strategy)
			final_position = drivers_df[drivers_df['driver_number'] == driver_number]['position'].iloc[-1]
			return final_position  # Minimize finishing position
		except Exception as e:
			print(f"Error during simulation: {e}")
			return 20.0  # Return worst position in case of errors

	# Perform Bayesian optimization
	result = gp_minimize(
		objective,
		space,
		n_calls=n_calls,
		random_state=42,
		verbose=True
	)

	# Extract best parameters and construct the optimal strategy
	best_params = result.x
	best_strategy = {1: best_params[0]}  # Always include the starting tyre

	# Add valid pit stops to the strategy
	pit_laps = sorted(set([best_params[1], best_params[3], best_params[5]]))
	tyres = [best_params[2], best_params[4], best_params[6]]

	for lap, tyre in zip(pit_laps, tyres):
		if 2 <= lap < precomputed_data["max_laps"]:
			best_strategy[lap] = tyre

	best_position = result.fun

	print(f"Best Strategy: {best_strategy}")
	print(f"Best Finishing Position: {best_position}")

	return best_strategy, best_position

# Example usage
# best_strategy, best_position = optimize_strategy_up_to_3_stops(race_data, driver_number=10, n_calls=100)
# print(f"Best Strategy: {best_strategy}")
# print(f"Best Finishing Position: {best_position}")

In [None]:
from bayes_opt import BayesianOptimization
import numpy as np

def bayesian_strategy_optimization(race_data, given_driver, max_iterations=50):
	"""
	Optimize the pit strategy for a given driver using Bayesian Optimization.
	
	Args:
		race_data (dict): Precomputed race data.
		given_driver (int): The driver number to optimize the strategy for.
		max_iterations (int): Maximum number of iterations for optimization.
		
	Returns:
		tuple: Best strategy (dict) and best finishing position (int).
	"""
	# Extract precomputed data
	max_laps = race_data["max_laps"]
	initial_strategy = race_data["driver_strategies"][given_driver]  # Use the driver's actual race strategy

	# Define the objective function for Bayesian Optimization
	def objective_function(starting_tyre, pit1_lap, pit1_tyre, pit2_lap, pit2_tyre, pit3_lap, pit3_tyre):
		# Ensure pit laps are unique and within valid range
		pit_laps = sorted(set([int(pit1_lap), int(pit2_lap), int(pit3_lap)]))
		tyres = [int(pit1_tyre), int(pit2_tyre), int(pit3_tyre)]
		
		# Filter out invalid pit laps (e.g., out of bounds or overlapping)
		valid_pit_laps = []
		valid_tyres = []
		for lap, tyre in zip(pit_laps, tyres):
			if 2 <= lap < max_laps:  # Only include valid pit laps
				valid_pit_laps.append(lap)
				valid_tyres.append(tyre)

		# Construct the strategy dictionary
		strategy = {1: int(starting_tyre)}  # Starting tyre
		for lap, tyre in zip(valid_pit_laps, valid_tyres):
			strategy[lap] = tyre

		# Penalize strategies with fewer than 2 distinct tyre types
		tyre_types_used = set(strategy.values())
		if len(tyre_types_used) < 2:
			return -20.0  # Penalize with worst position (negative because we're maximizing)

		# Evaluate the strategy using the race simulation
		try:
			sim_df = race_sim(race_data, given_driver=given_driver, simulated_strategy=strategy)
			final_position = sim_df[sim_df["driver_number"] == given_driver]["position"].iloc[-1]
			return -final_position  # Negative because BayesianOptimization maximizes by default
		except Exception as e:
			print(f"Error during simulation: {e}")
			return -20.0  # Return worst position in case of errors

	# Set up the parameter bounds for Bayesian Optimization
	pbounds = {
		"starting_tyre": (1, 3),  # Starting tyre (1=Hard, 2=Medium, 3=Soft)
		"pit1_lap": (2, max_laps - 1),  # First pit stop lap (optional)
		"pit1_tyre": (1, 3),  # First pit stop tyre (optional)
		"pit2_lap": (2, max_laps - 1),  # Second pit stop lap (optional)
		"pit2_tyre": (1, 3),  # Second pit stop tyre (optional)
		"pit3_lap": (2, max_laps - 1),  # Third pit stop lap (optional)
		"pit3_tyre": (1, 3),  # Third pit stop tyre (optional)
	}

	# Initialize the Bayesian Optimizer
	optimizer = BayesianOptimization(
		f=objective_function,
		pbounds=pbounds,
		verbose=2,
		random_state=42
	)

	# Perform the optimization
	optimizer.maximize(init_points=5, n_iter=max_iterations)

	# Extract the best strategy from the optimizer
	best_params = optimizer.max["params"]
	best_starting_tyre = int(best_params["starting_tyre"])
	pit_laps = sorted(set([int(best_params["pit1_lap"]), int(best_params["pit2_lap"]), int(best_params["pit3_lap"])]))
	tyres = [int(best_params["pit1_tyre"]), int(best_params["pit2_tyre"]), int(best_params["pit3_tyre"])]

	# Construct the best strategy dictionary
	best_strategy = {1: best_starting_tyre}
	for lap, tyre in zip(pit_laps, tyres):
		if 2 <= lap < max_laps:  # Only include valid pit laps
			best_strategy[lap] = tyre

	# Evaluate the best strategy to get the finishing position
	sim_df = race_sim(race_data, given_driver=given_driver, simulated_strategy=best_strategy)
	best_position = sim_df[sim_df["driver_number"] == given_driver]["position"].iloc[-1]

	print(f"Best Strategy: {best_strategy}")
	print(f"Best Finishing Position: {best_position}")

	return best_strategy, best_position

# Example Usage
# best_strategy, best_position = bayesian_strategy_optimization(
# 	race_data=race_data,
# 	given_driver=10,
# 	max_iterations=20
# )


ModuleNotFoundError: No module named 'bayes_opt'