In [1]:
import numpy as np
import pandas as pd

In [2]:
circuits = pd.read_csv('circuits.csv')
constructor_results = pd.read_csv('constructor_results.csv')
constructor_standings = pd.read_csv('constructor_standings.csv')
constructors = pd.read_csv('constructors.csv')
driver_standings = pd.read_csv('driver_standings.csv')
drivers = pd.read_csv('drivers.csv')
lap_times = pd.read_csv('lap_times.csv')
pit_stops = pd.read_csv('pit_stops.csv')
qualifying = pd.read_csv('qualifying.csv')
races = pd.read_csv('races.csv')
results = pd.read_csv('results.csv')
seasons = pd.read_csv('seasons.csv')
sprint_results = pd.read_csv('sprint_results.csv')
status = pd.read_csv('status.csv')

In [3]:
# Merge lap times with race info  
df = lap_times.merge(races[['raceId', 'circuitId', 'year', 'name']], on='raceId', how='left')

# Merge with circuit info  
df = df.merge(circuits[['circuitId', 'circuitRef', 'location', 'country']], on='circuitId', how='left')

# Merge with driver info  
df = df.merge(drivers[['driverId', 'driverRef', 'nationality']], on='driverId', how='left')

# Merge with constructor info (via results table)  
df = df.merge(results[['raceId', 'driverId', 'constructorId']], on=['raceId', 'driverId'], how='left')  
df = df.merge(constructors[['constructorId', 'constructorRef']], on='constructorId', how='left')

# Merge with pit stop info  
df = df.merge(pit_stops[['raceId', 'driverId', 'lap', 'duration']], on=['raceId', 'driverId', 'lap'], how='left')

# Rename columns for clarity  
df.rename(columns={'name': 'race_name', 'circuitRef': 'circuit_name', 'constructorRef': 'constructor_name'}, inplace=True)

In [4]:
df

Unnamed: 0,raceId,driverId,lap,position,time,milliseconds,circuitId,year,race_name,circuit_name,location,country,driverRef,nationality,constructorId,constructor_name,duration
0,841,20,1,1,1:38.109,98109,1,2011,Australian Grand Prix,albert_park,Melbourne,Australia,vettel,German,9,red_bull,
1,841,20,2,1,1:33.006,93006,1,2011,Australian Grand Prix,albert_park,Melbourne,Australia,vettel,German,9,red_bull,
2,841,20,3,1,1:32.713,92713,1,2011,Australian Grand Prix,albert_park,Melbourne,Australia,vettel,German,9,red_bull,
3,841,20,4,1,1:32.803,92803,1,2011,Australian Grand Prix,albert_park,Melbourne,Australia,vettel,German,9,red_bull,
4,841,20,5,1,1:32.342,92342,1,2011,Australian Grand Prix,albert_park,Melbourne,Australia,vettel,German,9,red_bull,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589076,1144,857,54,10,1:27.731,87731,24,2024,Abu Dhabi Grand Prix,yas_marina,Abu Dhabi,UAE,piastri,Australian,1,mclaren,
589077,1144,857,55,10,1:27.781,87781,24,2024,Abu Dhabi Grand Prix,yas_marina,Abu Dhabi,UAE,piastri,Australian,1,mclaren,
589078,1144,857,56,10,1:27.816,87816,24,2024,Abu Dhabi Grand Prix,yas_marina,Abu Dhabi,UAE,piastri,Australian,1,mclaren,
589079,1144,857,57,10,1:28.554,88554,24,2024,Abu Dhabi Grand Prix,yas_marina,Abu Dhabi,UAE,piastri,Australian,1,mclaren,


In [5]:
df['prev_lap_time'] = df.groupby(['raceId', 'driverId'])['milliseconds'].shift(1)
df['rolling_avg_lap_time'] = df.groupby(['raceId', 'driverId'])['milliseconds'].rolling(3, min_periods=1).mean().reset_index(level=[0,1], drop=True)


In [6]:
# Average lap time of constructor at the track
constructor_avg_lap = df.groupby(['circuitId', 'constructorId'])['milliseconds'].mean().reset_index()
constructor_avg_lap.rename(columns={'milliseconds': 'constructor_avg_lap_time'}, inplace=True)

df = df.merge(constructor_avg_lap, on=['circuitId', 'constructorId'], how='left')


In [7]:
df['pit_stop_delay'] = df['duration'].fillna(0)  # Fill NaNs with 0 for drivers who didn't pit
df['lap_progression'] = df['lap'] / df.groupby('raceId')['lap'].transform('max')  # Normalize lap number


In [8]:
df

Unnamed: 0,raceId,driverId,lap,position,time,milliseconds,circuitId,year,race_name,circuit_name,...,driverRef,nationality,constructorId,constructor_name,duration,prev_lap_time,rolling_avg_lap_time,constructor_avg_lap_time,pit_stop_delay,lap_progression
0,841,20,1,1,1:38.109,98109,1,2011,Australian Grand Prix,albert_park,...,vettel,German,9,red_bull,,,98109.000000,99218.570541,0,0.017241
1,841,20,2,1,1:33.006,93006,1,2011,Australian Grand Prix,albert_park,...,vettel,German,9,red_bull,,98109.0,95557.500000,99218.570541,0,0.034483
2,841,20,3,1,1:32.713,92713,1,2011,Australian Grand Prix,albert_park,...,vettel,German,9,red_bull,,93006.0,94609.333333,99218.570541,0,0.051724
3,841,20,4,1,1:32.803,92803,1,2011,Australian Grand Prix,albert_park,...,vettel,German,9,red_bull,,92713.0,92840.666667,99218.570541,0,0.068966
4,841,20,5,1,1:32.342,92342,1,2011,Australian Grand Prix,albert_park,...,vettel,German,9,red_bull,,92803.0,92619.333333,99218.570541,0,0.086207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589076,1144,857,54,10,1:27.731,87731,24,2024,Abu Dhabi Grand Prix,yas_marina,...,piastri,Australian,1,mclaren,,88168.0,88130.000000,103840.093373,0,0.931034
589077,1144,857,55,10,1:27.781,87781,24,2024,Abu Dhabi Grand Prix,yas_marina,...,piastri,Australian,1,mclaren,,87731.0,87893.333333,103840.093373,0,0.948276
589078,1144,857,56,10,1:27.816,87816,24,2024,Abu Dhabi Grand Prix,yas_marina,...,piastri,Australian,1,mclaren,,87781.0,87776.000000,103840.093373,0,0.965517
589079,1144,857,57,10,1:28.554,88554,24,2024,Abu Dhabi Grand Prix,yas_marina,...,piastri,Australian,1,mclaren,,87816.0,88050.333333,103840.093373,0,0.982759


In [9]:
from sklearn.preprocessing import LabelEncoder

for col in ['driverId', 'constructorId', 'circuitId']:
    df[col] = LabelEncoder().fit_transform(df[col])


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features (X) and Target (y)
features = ['driverId', 'constructorId', 'circuitId', 'lap', 'lap_progression',
            'prev_lap_time', 'rolling_avg_lap_time', 'constructor_avg_lap_time',
            'pit_stop_delay']
target = 'milliseconds'  # Lap time in milliseconds

# Drop NaN rows that might have been introduced in feature engineering
df = df.dropna(subset=features + [target])

X = df[features]
y = df[target]

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
     ------------------------------------- 124.9/124.9 MB 22.6 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4
Note: you may need to restart the kernel to use updated packages.


In [15]:
# Convert pit_stop_delay to numeric (if it's stored as a string)
X_train['pit_stop_delay'] = pd.to_numeric(X_train['pit_stop_delay'], errors='coerce')
X_test['pit_stop_delay'] = pd.to_numeric(X_test['pit_stop_delay'], errors='coerce')

# Check if any column is still an object
print(X_train.dtypes)


driverId                      int64
constructorId                 int64
circuitId                     int64
lap                           int64
lap_progression             float64
prev_lap_time               float64
rolling_avg_lap_time        float64
constructor_avg_lap_time    float64
pit_stop_delay              float64
dtype: object


In [16]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

# XGBoost Model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions
y_pred = xgb_model.predict(X_test)

# Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"Mean Absolute Error: {mae:.2f} ms")
print(f"Root Mean Squared Error: {rmse:.2f} ms")


Mean Absolute Error: 2547.10 ms
Root Mean Squared Error: 18035.32 ms


In [33]:
import numpy as np

# Example custom input
custom_input = {
    'driverId': 277,  # Replace with a real driverId
    'constructorId': 24,  # Replace with a real constructorId
    'circuitId': 14,  # Replace with a real circuitId
    'lap': 25,  # Lap number
    'lap_progression': 25 / 60,  # Normalize using max laps in race
    'prev_lap_time': 92000,  # Last lap time in milliseconds (92 seconds)
    'rolling_avg_lap_time': 93000,  # Avg lap time in the race so far
    'constructor_avg_lap_time': 92500,  # Avg constructor lap time for this circuit
    'pit_stop_delay': 0  # Set to actual pit stop delay if applicable
}


In [34]:
import pandas as pd

# Convert dictionary to DataFrame
custom_input_df = pd.DataFrame([custom_input])




In [35]:
# Convert milliseconds to mm:ss.sss format
def format_lap_time(ms):
    minutes = int(ms // 60000)  # Get minutes
    seconds = (ms % 60000) / 1000  # Get remaining seconds
    return f"{minutes}:{seconds:06.3f}"  # Format as mm:ss.sss

# Predict lap time
predicted_lap_time = xgb_model.predict(custom_input_df)[0]

# Convert to standard format
formatted_lap_time = format_lap_time(predicted_lap_time)

print(f"Predicted Lap Time: {formatted_lap_time}")



Predicted Lap Time: 1:33.449
