Installing the fastf1 api


In [1]:
!pip install fastf1 




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\aadha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Now to set up a cache to avoid making repeated calls to fastf1.

In [3]:
import fastf1
fastf1.Cache.enable_cache('../data_raw/f1_cache')


In [4]:
# Cell 3: Load Singapore 2023 Qualifying Session

session = fastf1.get_session(2023, 'Singapore', 'Q')  # 'Q' = full qualifying (Q1+Q2+Q3)
session.load()  # pulls data; cached automatically if already downloaded

laps_2023_sg = session.laps  # FastF1 provides laps as a DataFrame

laps_2023_sg.head()


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate
0,0 days 00:18:13.606000,SAI,55,NaT,1.0,1.0,0 days 00:16:01.998000,NaT,NaT,0 days 00:00:51.737000,...,True,Ferrari,0 days 00:16:01.998000,2023-09-16 13:01:02.637,1,,False,,False,False
1,0 days 00:19:46.571000,SAI,55,0 days 00:01:32.965000,2.0,1.0,NaT,NaT,0 days 00:00:27.296000,0 days 00:00:39.411000,...,True,Ferrari,0 days 00:18:13.606000,2023-09-16 13:03:14.245,1,,False,,False,True
2,0 days 00:22:28.473000,SAI,55,NaT,3.0,1.0,NaT,NaT,0 days 00:00:45.025000,0 days 00:01:08.058000,...,True,Ferrari,0 days 00:19:46.571000,2023-09-16 13:04:47.210,1,,False,,False,False
3,0 days 00:24:49.134000,SAI,55,0 days 00:02:20.661000,4.0,1.0,NaT,NaT,0 days 00:00:47.129000,0 days 00:00:58.322000,...,True,Ferrari,0 days 00:22:28.473000,2023-09-16 13:07:29.112,1,,False,,False,True
4,0 days 00:26:21.473000,SAI,55,0 days 00:01:32.339000,5.0,1.0,NaT,NaT,0 days 00:00:27.074000,0 days 00:00:39.093000,...,True,Ferrari,0 days 00:24:49.134000,2023-09-16 13:09:49.773,1,,False,,False,True


In [5]:
import pandas as pd

years = list(range(2018, 2025))  # 2018–2024
all_sg_laps = []

for y in years:
    try:
        print(f"Loading Singapore {y} Qualifying...")
        session = fastf1.get_session(y, 'Singapore', 'Q')
        session.load()
        
        laps = session.laps.copy()
        laps['year'] = y
        all_sg_laps.append(laps)
        
        print(f"✔ Success: {y}")
    except Exception as e:
        print(f"✘ Failed {y}: {e}")

# Combine into one DataFrame
df_sg = pd.concat(all_sg_laps, ignore_index=True)

df_sg.head()


Loading Singapore 2018 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

✔ Success: 2018
Loading Singapore 2019 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

✔ Success: 2019
Loading Singapore 2020 Qualifying...


core           INFO 	Loading data for Hungarian Grand Prix - Qualifying [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

✔ Success: 2020
Loading Singapore 2021 Qualifying...


core           INFO 	Loading data for Hungarian Grand Prix - Qualifying [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

✔ Success: 2021
Loading Singapore 2022 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

✔ Success: 2022
Loading Singapore 2023 Qualifying...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '63', '16', '4', '44', '20', '14', '31', '27', '40', '1', '10', '11', '23', '22', '77', '81', '2', '24', '18']


✔ Success: 2023
Loading Singapore 2024 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

✔ Success: 2024


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,year
0,0 days 00:22:24.178000,HAM,44,NaT,1.0,1.0,0 days 00:20:09.764000,NaT,NaT,0 days 00:00:53.146000,...,Mercedes,0 days 00:20:09.764000,2018-09-15 13:05:09.815,1,,False,,False,False,2018
1,0 days 00:24:03.895000,HAM,44,0 days 00:01:39.717000,2.0,1.0,NaT,NaT,0 days 00:00:27.045000,0 days 00:00:38.321000,...,Mercedes,0 days 00:22:24.178000,2018-09-15 13:07:24.229,1,,False,,False,True,2018
2,0 days 00:26:43.809000,HAM,44,NaT,3.0,1.0,NaT,NaT,0 days 00:00:44.518000,0 days 00:01:07.300000,...,Mercedes,0 days 00:24:03.895000,2018-09-15 13:09:03.946,1,,False,,False,False,2018
3,0 days 00:28:23.212000,HAM,44,0 days 00:01:39.403000,4.0,1.0,NaT,NaT,0 days 00:00:27.076000,0 days 00:00:38.278000,...,Mercedes,0 days 00:26:43.809000,2018-09-15 13:11:43.860,1,,False,,False,True,2018
4,0 days 00:31:07.494000,HAM,44,NaT,5.0,1.0,NaT,NaT,0 days 00:00:37.070000,0 days 00:01:19.316000,...,Mercedes,0 days 00:28:23.212000,2018-09-15 13:13:23.263,1,,False,,False,False,2018


In [6]:
df_sg.head()


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,year
0,0 days 00:22:24.178000,HAM,44,NaT,1.0,1.0,0 days 00:20:09.764000,NaT,NaT,0 days 00:00:53.146000,...,Mercedes,0 days 00:20:09.764000,2018-09-15 13:05:09.815,1,,False,,False,False,2018
1,0 days 00:24:03.895000,HAM,44,0 days 00:01:39.717000,2.0,1.0,NaT,NaT,0 days 00:00:27.045000,0 days 00:00:38.321000,...,Mercedes,0 days 00:22:24.178000,2018-09-15 13:07:24.229,1,,False,,False,True,2018
2,0 days 00:26:43.809000,HAM,44,NaT,3.0,1.0,NaT,NaT,0 days 00:00:44.518000,0 days 00:01:07.300000,...,Mercedes,0 days 00:24:03.895000,2018-09-15 13:09:03.946,1,,False,,False,False,2018
3,0 days 00:28:23.212000,HAM,44,0 days 00:01:39.403000,4.0,1.0,NaT,NaT,0 days 00:00:27.076000,0 days 00:00:38.278000,...,Mercedes,0 days 00:26:43.809000,2018-09-15 13:11:43.860,1,,False,,False,True,2018
4,0 days 00:31:07.494000,HAM,44,NaT,5.0,1.0,NaT,NaT,0 days 00:00:37.070000,0 days 00:01:19.316000,...,Mercedes,0 days 00:28:23.212000,2018-09-15 13:13:23.263,1,,False,,False,False,2018


In [7]:
df_sg['Driver'].unique()



array(['HAM', 'VER', 'VET', 'BOT', 'RAI', 'RIC', 'PER', 'GRO', 'OCO',
       'HUL', 'ALO', 'SAI', 'LEC', 'ERI', 'GAS', 'MAG', 'HAR', 'VAN',
       'SIR', 'STR', 'ALB', 'NOR', 'GIO', 'KVY', 'RUS', 'KUB', 'LAT',
       'TSU', 'MAZ', 'MSC', 'ZHO', 'LAW', 'PIA', 'SAR', 'COL'],
      dtype=object)

In [8]:
df_sg['year'].value_counts()



year
2022    339
2020    324
2023    297
2019    278
2024    276
2018    269
2021    253
Name: count, dtype: int64

In [9]:
session.event


RoundNumber                                                         18
Country                                                      Singapore
Location                                                    Marina Bay
OfficialEventName    FORMULA 1 SINGAPORE AIRLINES SINGAPORE GRAND P...
EventDate                                          2024-09-22 00:00:00
EventName                                         Singapore Grand Prix
EventFormat                                               conventional
Session1                                                    Practice 1
Session1Date                                 2024-09-20 17:30:00+08:00
Session1DateUtc                                    2024-09-20 09:30:00
Session2                                                    Practice 2
Session2Date                                 2024-09-20 21:00:00+08:00
Session2DateUtc                                    2024-09-20 13:00:00
Session3                                                    Practice 3
Sessio

In [10]:
session = fastf1.get_session(2020, 'Singapore', 'Q')
session.load()
session.event



core           INFO 	Loading data for Hungarian Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '77', '18', '11', '5', '16', '33', '4', '55', '10', '3', '63', '23', '31', '6', '20', '26', '8', '99', '7']


RoundNumber                                             3
Country                                           Hungary
Location                                         Budapest
OfficialEventName    FORMULA 1 ARAMCO MAGYAR NAGYDÍJ 2020
EventDate                             2020-07-19 00:00:00
EventName                            Hungarian Grand Prix
EventFormat                                  conventional
Session1                                       Practice 1
Session1Date                    2020-07-17 11:00:00+02:00
Session1DateUtc                       2020-07-17 09:00:00
Session2                                       Practice 2
Session2Date                    2020-07-17 15:00:00+02:00
Session2DateUtc                       2020-07-17 13:00:00
Session3                                       Practice 3
Session3Date                    2020-07-18 12:00:00+02:00
Session3DateUtc                       2020-07-18 10:00:00
Session4                                       Qualifying
Session4Date  

In [11]:
# Cell: Clean Singapore qualifying laps (2018–2024)

# 1. Remove in-laps and out-laps (laps that start or end in the pits)
df_clean = df_sg[
    df_sg['PitOutTime'].isna() &  # not an out-lap
    df_sg['PitInTime'].isna()     # not an in-lap
].copy()

# 2. Remove deleted laps (track limits, etc.)
if 'Deleted' in df_clean.columns:
    df_clean = df_clean[df_clean['Deleted'] == False]

# 3. Keep only laps with a valid lap time
df_clean = df_clean[df_clean['LapTime'].notna()]

# 4. Keep only laps with all three sector times present
df_clean = df_clean[
    df_clean['Sector1Time'].notna() &
    df_clean['Sector2Time'].notna() &
    df_clean['Sector3Time'].notna()
]

# 5. Save cleaned data to CSV so we can reuse it later
df_clean.to_csv('../data_cleaned/singapore_quali_clean_2018_2024.csv', index=False)

# Quick sanity checks
print("Clean shape:", df_clean.shape)
print("Years:\n", df_clean['year'].value_counts())
print("Drivers:", df_clean['Driver'].nunique())
df_clean.head()


Clean shape: (797, 32)
Years:
 year
2022    179
2020    134
2023    104
2019    103
2018     98
2021     92
2024     87
Name: count, dtype: int64
Drivers: 35


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,year
1,0 days 00:24:03.895000,HAM,44,0 days 00:01:39.717000,2.0,1.0,NaT,NaT,0 days 00:00:27.045000,0 days 00:00:38.321000,...,Mercedes,0 days 00:22:24.178000,2018-09-15 13:07:24.229,1,,False,,False,True,2018
3,0 days 00:28:23.212000,HAM,44,0 days 00:01:39.403000,4.0,1.0,NaT,NaT,0 days 00:00:27.076000,0 days 00:00:38.278000,...,Mercedes,0 days 00:26:43.809000,2018-09-15 13:11:43.860,1,,False,,False,True,2018
7,0 days 00:45:18.757000,HAM,44,0 days 00:01:37.344000,8.0,2.0,NaT,NaT,0 days 00:00:26.634000,0 days 00:00:37.478000,...,Mercedes,0 days 00:43:41.413000,2018-09-15 13:28:41.464,1,,False,,False,True,2018
10,0 days 00:56:03.959000,HAM,44,0 days 00:01:38.889000,11.0,3.0,NaT,NaT,0 days 00:00:27.288000,0 days 00:00:37.353000,...,Mercedes,0 days 00:54:25.070000,2018-09-15 13:39:25.121,1,,False,,False,True,2018
13,0 days 01:07:30.810000,HAM,44,0 days 00:01:36.015000,14.0,4.0,NaT,NaT,0 days 00:00:26.263000,0 days 00:00:36.967000,...,Mercedes,0 days 01:05:54.795000,2018-09-15 13:50:54.846,1,,False,,False,True,2018


In [15]:
import pandas as pd

# Only years where Singapore GP actually happened
years = [2018, 2019, 2022, 2023, 2024]
all_sg_laps = []

for y in years:
    try:
        print(f"Loading Singapore {y} Qualifying...")
        session = fastf1.get_session(y, 'Singapore', 'Q')
        session.load()

        # Just to sanity check in the log:
        print("  Event loaded:", session.event.EventName)

        laps = session.laps.copy()
        laps['year'] = y
        all_sg_laps.append(laps)

        print(f"✔ Success: {y}")
    except Exception as e:
        print(f"✘ Failed {y}: {e}")

df_sg = pd.concat(all_sg_laps, ignore_index=True)

# Check years present
df_sg['year'].value_counts()


Loading Singapore 2018 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '5', '77', '7', '3', '11', '8', '31', '27', '14', '55', '16', '9', '10', '20', '28', '2', '35', '18']


  Event loaded: Singapore Grand Prix
✔ Success: 2018
Loading Singapore 2019 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '44', '5', '33', '77', '23', '55', '27', '4', '11', '99', '10', '7', '20', '26', '18', '8', '63', '88', '3']


  Event loaded: Singapore Grand Prix
✔ Success: 2019
Loading Singapore 2022 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '11', '44', '55', '14', '4', '10', '1', '20', '22', '63', '18', '47', '5', '24', '77', '3', '31', '23', '6']


  Event loaded: Singapore Grand Prix
✔ Success: 2022
Loading Singapore 2023 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '63', '16', '4', '44', '20', '14', '31', '27', '40', '1', '10', '11', '23', '22', '77', '81', '2', '24', '18']


  Event loaded: Singapore Grand Prix
✔ Success: 2023
Loading Singapore 2024 Qualifying...


core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '44', '63', '81', '27', '14', '22', '16', '55', '23', '43', '11', '20', '31', '3', '18', '10', '77', '24']


  Event loaded: Singapore Grand Prix
✔ Success: 2024


year
2022    339
2023    297
2019    278
2024    276
2018    269
Name: count, dtype: int64

In [16]:
# 1. Remove in-laps and out-laps
df_clean = df_sg[
    df_sg['PitOutTime'].isna() &
    df_sg['PitInTime'].isna()
].copy()

# 2. Remove deleted laps
if 'Deleted' in df_clean.columns:
    df_clean = df_clean[df_clean['Deleted'] == False]

# 3. Valid lap times only
df_clean = df_clean[df_clean['LapTime'].notna()]

# 4. Complete sector times only
df_clean = df_clean[
    df_clean['Sector1Time'].notna() &
    df_clean['Sector2Time'].notna() &
    df_clean['Sector3Time'].notna()
]

df_clean.to_csv('../data_cleaned/singapore_quali_clean_2018_2024.csv', index=False)

print("Clean shape:", df_clean.shape)
print("Years:\n", df_clean['year'].value_counts())
print("Drivers:", df_clean['Driver'].nunique())
df_clean.head()


Clean shape: (571, 32)
Years:
 year
2022    179
2023    104
2019    103
2018     98
2024     87
Name: count, dtype: int64
Drivers: 34


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,year
1,0 days 00:24:03.895000,HAM,44,0 days 00:01:39.717000,2.0,1.0,NaT,NaT,0 days 00:00:27.045000,0 days 00:00:38.321000,...,Mercedes,0 days 00:22:24.178000,2018-09-15 13:07:24.229,1,,False,,False,True,2018
3,0 days 00:28:23.212000,HAM,44,0 days 00:01:39.403000,4.0,1.0,NaT,NaT,0 days 00:00:27.076000,0 days 00:00:38.278000,...,Mercedes,0 days 00:26:43.809000,2018-09-15 13:11:43.860,1,,False,,False,True,2018
7,0 days 00:45:18.757000,HAM,44,0 days 00:01:37.344000,8.0,2.0,NaT,NaT,0 days 00:00:26.634000,0 days 00:00:37.478000,...,Mercedes,0 days 00:43:41.413000,2018-09-15 13:28:41.464,1,,False,,False,True,2018
10,0 days 00:56:03.959000,HAM,44,0 days 00:01:38.889000,11.0,3.0,NaT,NaT,0 days 00:00:27.288000,0 days 00:00:37.353000,...,Mercedes,0 days 00:54:25.070000,2018-09-15 13:39:25.121,1,,False,,False,True,2018
13,0 days 01:07:30.810000,HAM,44,0 days 00:01:36.015000,14.0,4.0,NaT,NaT,0 days 00:00:26.263000,0 days 00:00:36.967000,...,Mercedes,0 days 01:05:54.795000,2018-09-15 13:50:54.846,1,,False,,False,True,2018


In [17]:
# Cell: Build feature table for Singapore (fastest lap per driver/year)

import numpy as np

# 1. Take the fastest lap for each driver in each year
fastlaps = (
    df_clean
    .sort_values(['year', 'Driver', 'LapTime'])
    .groupby(['year', 'Driver'])
    .head(1)                 # keep fastest lap per (year, driver)
    .reset_index(drop=True)
)

print("Fastlaps shape:", fastlaps.shape)

# 2. Convert time columns (Timedelta) -> seconds (float)
time_cols = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']

for col in time_cols:
    fastlaps[col + '_s'] = fastlaps[col].dt.total_seconds()

# 3. Clean TyreLife (sometimes float/NaN)
fastlaps['TyreLife'] = fastlaps['TyreLife'].fillna(0).astype(int)

# 4. Encode categorical features as integers (for XGBoost)
for col in ['Driver', 'Team', 'Compound']:
    fastlaps[col + '_id'] = fastlaps[col].astype('category').cat.codes

# 5. Create target: grid position = rank of fastest lap time within each year
fastlaps['final_grid_pos'] = (
    fastlaps
    .groupby('year')['LapTime_s']
    .rank(method='first')    # 1 = fastest lap in that year, 2 = 2nd fastest, ...
    .astype(int)
)

# 6. Choose the feature columns we want for the first vanilla model
feature_cols = [
    'year',
    'Sector1Time_s', 'Sector2Time_s', 'Sector3Time_s',
    'TyreLife',
    'Driver_id', 'Team_id', 'Compound_id'
]

df_feat = fastlaps[feature_cols + ['final_grid_pos']].copy()

# 7. Save to features folder
df_feat.to_csv('../data_features/sg_features_v1.csv', index=False)

print("Feature table shape:", df_feat.shape)
df_feat.head()


Fastlaps shape: (100, 32)
Feature table shape: (100, 9)


Unnamed: 0,year,Sector1Time_s,Sector2Time_s,Sector3Time_s,TyreLife,Driver_id,Team_id,Compound_id,final_grid_pos
0,2018,26.84,38.065,33.736,2,1,8,0,11
1,2018,26.468,37.284,32.95,2,2,9,0,4
2,2018,27.077,38.141,34.148,2,4,14,0,14
3,2018,27.253,38.28,34.081,2,5,15,0,15
4,2018,26.772,37.737,33.811,1,7,6,0,8


In [18]:
# Cell 8: Train / Eval / Test split by year

feature_cols = [
    'year',
    'Sector1Time_s', 'Sector2Time_s', 'Sector3Time_s',
    'TyreLife',
    'Driver_id', 'Team_id', 'Compound_id'
]

target_col = 'final_grid_pos'

# Train on earlier years
train = df_feat[df_feat['year'].isin([2018, 2019, 2022])].copy()
# Use 2023 to check model while developing
eval_  = df_feat[df_feat['year'] == 2023].copy()
# Keep 2024 as "final exam"
test  = df_feat[df_feat['year'] == 2024].copy()

X_train, y_train = train[feature_cols], train[target_col]
X_eval,  y_eval  = eval_[feature_cols],  eval_[target_col]
X_test,  y_test  = test[feature_cols],  test[target_col]

print("Train size:", X_train.shape)
print("Eval size:",  X_eval.shape)
print("Test size:",  X_test.shape)


Train size: (60, 8)
Eval size: (20, 8)
Test size: (20, 8)


In [19]:
# Cell 9: Install and train vanilla XGBoost

!pip install xgboost scikit-learn

from xgboost import XGBRegressor

# simple vanilla-ish model; CPU-friendly
model = XGBRegressor(
    tree_method="hist",
    random_state=42
)

model.fit(X_train, y_train)

print("Done training.")



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\aadha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting xgboost
  Downloading xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 9.4 MB/s eta 0:00:08
    --------------------------------------- 1.3/72.0 MB 16.1 MB/s eta 0:00:05
   - -------------------------------------- 2.7/72.0 MB 21.4 MB/s eta 0:00:04
   -- ------------------------------------- 3.8/72.0 MB 22.2 MB/s eta 0:00:04
   --- ------------------------------------ 5.7/72.0 MB 24.3 MB/s eta 0:00:03
   --- ------------------------------------ 7.2/72.0 MB 27.1 MB/s e

In [None]:
# Cell 10: Evaluate model on train / eval / test

import numpy as np
from sklearn.metrics import mean_absolute_error

def evaluate_split(name, X, y_true):
    y_pred = model.predict(X)
    
    mae = mean_absolute_error(y_true, y_pred)
    abs_err = np.abs(y_pred - y_true)

    within1 = (abs_err <= 1).mean()
    within2 = (abs_err <= 2).mean()
    within3 = (abs_err <= 3).mean()

    print(f"=== {name} ===")
    print(f"MAE (grid positions): {mae:.2f}")
    print(f"Within ±1 position: {within1*100:.1f}%")
    print(f"Within ±2 positions: {within2*100:.1f}%")
    print(f"Within ±3 positions: {within3*100:.1f}%")
    print()

evaluate_split("TRAIN", X_train, y_train)
evaluate_split("EVAL",  X_eval,  y_eval)
evaluate_split("TEST",  X_test,  y_test)


=== TRAIN ===
MAE (grid positions): 0.00
Within ±1 position: 100.0%
Within ±2 positions: 100.0%
Within ±3 positions: 100.0%

=== EVAL ===
MAE (grid positions): 4.14
Within ±1 position: 25.0%
Within ±2 positions: 30.0%
Within ±3 positions: 40.0%

=== TEST ===
MAE (grid positions): 4.79
Within ±1 position: 5.0%
Within ±2 positions: 20.0%
Within ±3 positions: 35.0%



: 