In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Load the four source tables; all paths are relative to the working directory.
drivers_df, qualifying_df, races_df, circuits_df = map(
    pd.read_csv,
    (
        'data/drivers.csv',
        'data/qualifying.csv',
        'data/races.csv',
        'data/circuits.csv',
    ),
)


In [8]:
# Print the column index of each raw table, in the order the tables were loaded.
for raw_table in (drivers_df, qualifying_df, races_df, circuits_df):
    print(raw_table.columns)



Index(['driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob',
       'nationality', 'url'],
      dtype='object')
Index(['qualifyId', 'raceId', 'driverId', 'constructorId', 'number',
       'position', 'q1', 'q2', 'q3'],
      dtype='object')
Index(['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url',
       'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
       'quali_date', 'quali_time', 'sprint_date', 'sprint_time'],
      dtype='object')
Index(['circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng',
       'alt', 'url'],
      dtype='object')


In [9]:
# Attach each driver's code, forename and surname to the qualifying rows.
# A left join on 'driverId' keeps every qualifying row, matched or not.
driver_identity = drivers_df[['driverId', 'code', 'forename', 'surname']]
qualifying_df = qualifying_df.merge(
    driver_identity,
    how='left',
    on='driverId',
    suffixes=('', '_driver'),  # only applied when a non-key column name exists on both sides
)

qualifying_df.head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3,code,forename,surname
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,HAM,Lewis,Hamilton
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,KUB,Robert,Kubica
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,KOV,Heikki,Kovalainen
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,MAS,Felipe,Massa
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,HEI,Nick,Heidfeld


In [10]:

# Attach the race name and year to the qualifying rows.
# Only 'raceId', 'name' and 'year' are taken from races_df; a left join on
# 'raceId' keeps every qualifying row.
race_identity = races_df[['raceId', 'name', 'year']]
qualifying_df = qualifying_df.merge(race_identity, how='left', on='raceId')

qualifying_df.head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3,code,forename,surname,name,year
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,HAM,Lewis,Hamilton,Australian Grand Prix,2008.0
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,KUB,Robert,Kubica,Australian Grand Prix,2008.0
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,KOV,Heikki,Kovalainen,Australian Grand Prix,2008.0
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,MAS,Felipe,Massa,Australian Grand Prix,2008.0
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,HEI,Nick,Heidfeld,Australian Grand Prix,2008.0


In [None]:

# Copy 'year' into a new 'race_year' column; 'year' itself is left in place by this cell.
race_year_values = qualifying_df['year']
qualifying_df['race_year'] = race_year_values

qualifying_df.head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3,code,forename,surname,name,year,race_year
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,HAM,Lewis,Hamilton,Australian Grand Prix,2008.0,2008.0
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,KUB,Robert,Kubica,Australian Grand Prix,2008.0,2008.0
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,KOV,Heikki,Kovalainen,Australian Grand Prix,2008.0,2008.0
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,MAS,Felipe,Massa,Australian Grand Prix,2008.0,2008.0
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,HEI,Nick,Heidfeld,Australian Grand Prix,2008.0,2008.0


In [None]:
# Drop the original 'year' column; 'race_year' holds the same values.
# errors='ignore' keeps this cell re-runnable: on a second run 'year' is already
# gone, and a plain drop would raise KeyError.
qualifying_df = qualifying_df.drop(columns=['year'], errors='ignore')

# Keep only rows whose race_year is 2021-2024. .copy() makes the result an
# independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
seasons_of_interest = [2021, 2022, 2023, 2024]
qualifying_filtered = qualifying_df[qualifying_df['race_year'].isin(seasons_of_interest)].copy()

qualifying_filtered.head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3,code,forename,surname,name,race_year
8694,8735,1052,830,9,33,1,1:30.499,1:30.318,1:28.997,VER,Max,Verstappen,Bahrain Grand Prix,2021.0
8695,8736,1052,1,131,44,2,1:30.617,1:30.085,1:29.385,HAM,Lewis,Hamilton,Bahrain Grand Prix,2021.0
8696,8737,1052,822,131,77,3,1:31.200,1:30.186,1:29.586,BOT,Valtteri,Bottas,Bahrain Grand Prix,2021.0
8697,8738,1052,844,6,16,4,1:30.691,1:30.010,1:29.678,LEC,Charles,Leclerc,Bahrain Grand Prix,2021.0
8698,8739,1052,842,213,10,5,1:30.848,1:30.513,1:29.809,GAS,Pierre,Gasly,Bahrain Grand Prix,2021.0


In [None]:
# Give the merged driver and race columns self-describing names.
readable_names = {
    'code': 'driver_code',
    'forename': 'driver_name',
    'surname': 'driver_surname',
    'name': 'race_name',
}
qualifying_filtered = qualifying_filtered.rename(columns=readable_names)

qualifying_filtered.head()


Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3,driver_code,driver_name,driver_surname,race_name,race_year
8694,8735,1052,830,9,33,1,1:30.499,1:30.318,1:28.997,VER,Max,Verstappen,Bahrain Grand Prix,2021.0
8695,8736,1052,1,131,44,2,1:30.617,1:30.085,1:29.385,HAM,Lewis,Hamilton,Bahrain Grand Prix,2021.0
8696,8737,1052,822,131,77,3,1:31.200,1:30.186,1:29.586,BOT,Valtteri,Bottas,Bahrain Grand Prix,2021.0
8697,8738,1052,844,6,16,4,1:30.691,1:30.010,1:29.678,LEC,Charles,Leclerc,Bahrain Grand Prix,2021.0
8698,8739,1052,842,213,10,5,1:30.848,1:30.513,1:29.809,GAS,Pierre,Gasly,Bahrain Grand Prix,2021.0


In [None]:


# Create a dictionary for track features (Example)
# Hand-written lookup keyed by circuit name. Each value is a dict with four keys:
#   'altitude'         - integer altitude (units are not stated here; presumably metres - TODO confirm)
#   'track_speed'      - one of 'low' / 'medium' / 'high'
#   'track_corners'    - one of 'fast' / 'tight'
#   'elevation_change' - one of 'low' / 'medium' / 'high'
# Further down in this cell the keys are matched against the 'name' column of circuits_df,
# so a key must equal that name exactly for its features to be picked up.
# Only 'track_speed', 'track_corners' and 'elevation_change' are copied from here; 'altitude'
# is not read by that code (the altitude used there comes from the 'alt' column of circuits_df).
# 'Las Vegas Strip Street Circuit' and 'Las Vegas Street Circuit' are separate keys with
# different altitudes (642 vs 639).
all_track_features = {
    'Albert Park Grand Prix Circuit': {'altitude': 10, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Sepang International Circuit': {'altitude': 18, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'medium'},
    'Bahrain International Circuit': {'altitude': 7, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Circuit de Barcelona-Catalunya': {'altitude': 109, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Istanbul Park': {'altitude': 130, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Circuit de Monaco': {'altitude': 7, 'track_speed': 'low', 'track_corners': 'tight', 'elevation_change': 'high'},
    'Circuit Gilles Villeneuve': {'altitude': 13, 'track_speed': 'high', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Circuit de Nevers Magny-Cours': {'altitude': 228, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Silverstone Circuit': {'altitude': 153, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Hockenheimring': {'altitude': 103, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Hungaroring': {'altitude': 264, 'track_speed': 'low', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Valencia Street Circuit': {'altitude': 4, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Circuit de Spa-Francorchamps': {'altitude': 401, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Autodromo Nazionale di Monza': {'altitude': 162, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Marina Bay Street Circuit': {'altitude': 18, 'track_speed': 'low', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Fuji Speedway': {'altitude': 583, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Shanghai International Circuit': {'altitude': 5, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Autódromo José Carlos Pace': {'altitude': 785, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Indianapolis Motor Speedway': {'altitude': 223, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Nürburgring': {'altitude': 578, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Autodromo Enzo e Dino Ferrari': {'altitude': 37, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Suzuka Circuit': {'altitude': 45, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Las Vegas Strip Street Circuit': {'altitude': 642, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Yas Marina Circuit': {'altitude': 3, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Autódromo Juan y Oscar Gálvez': {'altitude': 8, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Circuito de Jerez': {'altitude': 37, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Autódromo do Estoril': {'altitude': 130, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Okayama International Circuit': {'altitude': 266, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Adelaide Street Circuit': {'altitude': 58, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Kyalami': {'altitude': 1460, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Donington Park': {'altitude': 88, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Autódromo Hermanos Rodríguez': {'altitude': 2227, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Phoenix street circuit': {'altitude': 345, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Circuit Paul Ricard': {'altitude': 432, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Korean International Circuit': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Autódromo Internacional Nelson Piquet': {'altitude': 1126, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Detroit Street Circuit': {'altitude': 177, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Brands Hatch': {'altitude': 145, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Circuit Park Zandvoort': {'altitude': 6, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Zolder': {'altitude': 36, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Dijon-Prenois': {'altitude': 484, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Fair Park': {'altitude': 139, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Long Beach': {'altitude': 12, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Las Vegas Street Circuit': {'altitude': 639, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Jarama': {'altitude': 609, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Watkins Glen': {'altitude': 485, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Scandinavian Raceway': {'altitude': 153, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Mosport International Raceway': {'altitude': 332, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Montjuïc': {'altitude': 79, 'track_speed': 'medium', 'track_corners': 'tight', 'elevation_change': 'low'},
    'Nivelles-Baulers': {'altitude': 139, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Charade Circuit': {'altitude': 790, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Circuit Mont-Tremblant': {'altitude': 214, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Rouen-Les-Essarts': {'altitude': 81, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Le Mans': {'altitude': 67, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Reims-Gueux': {'altitude': 88, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Prince George Circuit': {'altitude': 15, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Zeltweg': {'altitude': 676, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Aintree': {'altitude': 20, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Circuito da Boavista': {'altitude': 28, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    # NOTE(review): the entries from 'Riverside' through 'Baku City Circuit', and
    # 'Miami International Autodrome' at the end, all carry altitude 0 with
    # medium / fast / low. These look like placeholder values - confirm before relying on them.
    'Riverside': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'AVUS': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Monsanto': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Sebring': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Ain-Diab': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Pescara': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Bremgarten': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Pedralbes': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Buddh International Circuit': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Circuit of the Americas': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Red Bull Ring': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Sochi Autodrom': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Baku City Circuit': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Autódromo Internacional do Algarve': {'altitude': 108, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Autodromo Internazionale del Mugello': {'altitude': 255, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'high'},
    'Jeddah Corniche Circuit': {'altitude': 15, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Losail International Circuit': {'altitude': 12, 'track_speed': 'high', 'track_corners': 'fast', 'elevation_change': 'low'},
    'Miami International Autodrome': {'altitude': 0, 'track_speed': 'medium', 'track_corners': 'fast', 'elevation_change': 'low'}
}


# Build one record per circuit from circuits_df, keyed by circuitRef.
# Each record holds the circuit name, altitude ('alt'), city (the 'location'
# column), country and coordinates.
track_names = circuits_df['circuitRef'].unique()  # kept so any cell that reads this name still works

# One de-duplicated pass replaces filtering the whole frame once per circuitRef.
# drop_duplicates keeps the first row per circuitRef, which is the same row the
# per-ref filter followed by .values[0] selected.
tracks = (
    circuits_df
    .drop_duplicates(subset='circuitRef')
    .set_index('circuitRef')[['name', 'alt', 'location', 'country', 'lat', 'lng']]
    .rename(columns={'location': 'city'})
    .to_dict(orient='index')
)


# Attach the hand-written categorical features. all_track_features is keyed by
# circuit name, so a direct lookup replaces scanning every entry for every track.
# Circuits without an entry are left untouched and appear as NaN in tracks_df.
feature_keys = ('track_speed', 'track_corners', 'elevation_change')
for track_record in tracks.values():
    extra_features = all_track_features.get(track_record['name'])
    if extra_features is not None:
        track_record.update({key: extra_features[key] for key in feature_keys})


tracks_df = pd.DataFrame.from_dict(tracks, orient='index')              # <- This is the new Track DataFrame
tracks_df.index.name = 'track_id'
tracks_df.head()


Unnamed: 0_level_0,name,alt,city,country,lat,lng,track_speed,track_corners,elevation_change
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
albert_park,Albert Park Grand Prix Circuit,10,Melbourne,Australia,-37.8497,144.968,medium,fast,low
sepang,Sepang International Circuit,18,Kuala Lumpur,Malaysia,2.76083,101.738,medium,tight,medium
bahrain,Bahrain International Circuit,7,Sakhir,Bahrain,26.0325,50.5106,medium,tight,low
catalunya,Circuit de Barcelona-Catalunya,109,Montmeló,Spain,41.57,2.26111,medium,fast,low
istanbul,Istanbul Park,130,Istanbul,Turkey,40.9517,29.405,medium,fast,high


In [41]:
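# NOTE: if this is re-enabled, check the column names against the frames built above:
# tracks_df has 'alt' (not 'altitude') and the qualifying frames have 'q1', 'q2', 'q3'
# (not 'q1_time', 'q2_time', 'q3_time').
# categorical_columns = ['track_speed', 'track_corners', 'elevation_change']
# numerical_columns = ['altitude', 'q1_time', 'q2_time', 'q3_time']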

In [42]:
# def lap_time_to_seconds(lap_time):
#     try:
#         # Handle null or missing data
#         if lap_time == '\\N' or pd.isnull(lap_time):
#             return None  # or a default value like 0 or a specific marker for missing data
        
#         # Handle times in the format "mm:ss"
#         minutes, seconds = lap_time.split(':')
#         return int(minutes) * 60 + float(seconds)
    
#     except ValueError:
#         # Handle cases where lap_time might not be in the expected format (e.g., '1.234' seconds)
#         try:
#             return float(lap_time)  # If it's already a float or something like '1.234'
#         except ValueError:
#             return None  # or handle it as appropriate

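# # Apply the function to convert times if column names are correct
# # NOTE: `quali_data` is not defined in the cells above, and the qualifying frames there
# # use the columns 'q1', 'q2', 'q3' - adjust both before re-enabling this loop.
# for col in ['q1_time', 'q2_time', 'q3_time']:  # Adjust column names if necessary
#     quali_data[col] = quali_data[col].apply(lap_time_to_seconds)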

# # Check the updated dataset
# print(quali_data.head())



In [43]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OrdinalEncoder(), categorical_columns),
#         ('num', Pipeline(steps=[
#             ('imputer', SimpleImputer(strategy='mean')),
#             ('scaler', StandardScaler())
#         ]), numerical_columns)
#     ])


In [44]:
# model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),  # Apply preprocessing
#     ('model', RandomForestRegressor())  # RandomForest as an example model
# ])