# Learning How to USE FASTF1 API

In [1]:
import pandas as pd

def diagnose_dataframes(files):
    """
    Diagnose column names in multiple CSV files
    
    Args:
        files (list): List of CSV file paths
    """
    for file in files:
        print(f"\nColumns in {file}:")
        df = pd.read_csv(file)
        print(df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())

# Paths to your CSV files
qualifying_files = ['australian_gp_qualifying_2023.csv', 'australian_gp_qualifying_2024.csv']
race_files = ['australian_gp_race_2023.csv', 'australian_gp_race_2024.csv']

# Run diagnosis
print("Diagnosing Qualifying Files:")
diagnose_dataframes(qualifying_files)

print("\n\nDiagnosing Race Files:")
diagnose_dataframes(race_files)

Diagnosing Qualifying Files:

Columns in australian_gp_qualifying_2023.csv:
['DriverNumber', 'BroadcastName', 'Abbreviation', 'DriverId', 'TeamName', 'TeamColor', 'TeamId', 'FirstName', 'LastName', 'FullName', 'HeadshotUrl', 'CountryCode', 'Position', 'ClassifiedPosition', 'GridPosition', 'Q1', 'Q2', 'Q3', 'Time', 'Status', 'Points', 'Year']

First few rows:
   DriverNumber BroadcastName Abbreviation        DriverId         TeamName  \
0             1  M VERSTAPPEN          VER  max_verstappen  Red Bull Racing   
1            63     G RUSSELL          RUS         russell         Mercedes   
2            44    L HAMILTON          HAM        hamilton         Mercedes   
3            14      F ALONSO          ALO          alonso     Aston Martin   
4            55       C SAINZ          SAI           sainz          Ferrari   

  TeamColor        TeamId FirstName    LastName         FullName  ...  \
0    3671C6      red_bull       Max  Verstappen   Max Verstappen  ...   
1    6CD3BF      m

In [2]:
import pandas as pd
import numpy as np

def load_data(files):
    """
    Load and concatenate data from multiple CSV files
    
    Args:
        files (list): List of CSV file paths
    
    Returns:
        pd.DataFrame: Combined dataframe
    """
    dataframes = [pd.read_csv(file) for file in files]
    return pd.concat(dataframes, ignore_index=True)

def main():
    # Paths to your CSV files
    qualifying_files = ['australian_gp_qualifying_2023.csv', 'australian_gp_qualifying_2024.csv']
    race_files = ['australian_gp_race_2023.csv', 'australian_gp_race_2024.csv']
    
    # Load qualifying and race data
    qualifying_data = load_data(qualifying_files)
    race_data = load_data(race_files)
    
    # Step 1: Inspect the data
    print("Qualifying Data Columns:")
    print(qualifying_data.columns.tolist())
    
    print("\nRace Data Columns:")
    print(race_data.columns.tolist())
    
    # Step 2: Check for common columns for merging
    print("\nCommon Columns for Merging:")
    common_columns = list(set(qualifying_data.columns) & set(race_data.columns))
    print(common_columns)
    
    # Step 3: Display basic information about the datasets
    print("\nQualifying Data Info:")
    print(qualifying_data.info())
    
    print("\nRace Data Info:")
    print(race_data.info())
    
    # Step 4: Check for missing values
    print("\nMissing Values in Qualifying Data:")
    print(qualifying_data.isnull().sum())
    
    print("\nMissing Values in Race Data:")
    print(race_data.isnull().sum())

if __name__ == "__main__":
    main()

Qualifying Data Columns:
['DriverNumber', 'BroadcastName', 'Abbreviation', 'DriverId', 'TeamName', 'TeamColor', 'TeamId', 'FirstName', 'LastName', 'FullName', 'HeadshotUrl', 'CountryCode', 'Position', 'ClassifiedPosition', 'GridPosition', 'Q1', 'Q2', 'Q3', 'Time', 'Status', 'Points', 'Year']

Race Data Columns:
['DriverNumber', 'BroadcastName', 'Abbreviation', 'DriverId', 'TeamName', 'TeamColor', 'TeamId', 'FirstName', 'LastName', 'FullName', 'HeadshotUrl', 'CountryCode', 'Position', 'ClassifiedPosition', 'GridPosition', 'Q1', 'Q2', 'Q3', 'Time', 'Status', 'Points', 'Year']

Common Columns for Merging:
['FirstName', 'Q3', 'HeadshotUrl', 'Position', 'Time', 'TeamColor', 'TeamId', 'Q2', 'TeamName', 'FullName', 'Q1', 'GridPosition', 'Year', 'DriverId', 'LastName', 'Status', 'Points', 'Abbreviation', 'BroadcastName', 'CountryCode', 'DriverNumber', 'ClassifiedPosition']

Qualifying Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 22 column

In [7]:
import pandas as pd

def process_qualifying_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Create a new DataFrame with required columns
    result = pd.DataFrame()
    result['DriverName'] = df['BroadcastName']
    result['DriverNumber'] = df['DriverNumber']
    result['GridPosition'] = df['Position']
    
    # Determine qualifying time based on position
    def get_quali_time(row):
        if row['Position'] <= 10:
            return row['Q3']
        elif row['Position'] <= 15:
            return row['Q2']
        else:
            return row['Q1']
    
    result['QualifyTime'] = df.apply(get_quali_time, axis=1)
    
    # Convert time strings to timedelta
    result['QualifyTime'] = pd.to_timedelta(result['QualifyTime'])
    
    # Sort by GridPosition
    result = result.sort_values('GridPosition')
    
    return result

# Process data for both years
quali_2024 = process_qualifying_data('australian_gp_qualifying_2024.csv')
quali_2023 = process_qualifying_data('australian_gp_qualifying_2023.csv')

# Display the results
print("2024 Qualifying Results:")
print(quali_2024)
print("\n2023 Qualifying Results:")
print(quali_2023)


2024 Qualifying Results:
      DriverName  DriverNumber  GridPosition            QualifyTime
0   M VERSTAPPEN             1           1.0 0 days 00:01:15.915000
1        C SAINZ            55           2.0 0 days 00:01:16.185000
2        S PEREZ            11           3.0 0 days 00:01:16.274000
3       L NORRIS             4           4.0 0 days 00:01:16.315000
4      C LECLERC            16           5.0 0 days 00:01:16.435000
5      O PIASTRI            81           6.0 0 days 00:01:16.572000
6      G RUSSELL            63           7.0 0 days 00:01:16.724000
7      Y TSUNODA            22           8.0 0 days 00:01:16.788000
8       L STROLL            18           9.0 0 days 00:01:17.072000
9       F ALONSO            14          10.0 0 days 00:01:17.552000
10    L HAMILTON            44          11.0 0 days 00:01:16.960000
11       A ALBON            23          12.0 0 days 00:01:17.167000
12      V BOTTAS            77          13.0 0 days 00:01:17.340000
13   K MAGNUSSEN       

In [8]:
import pandas as pd

def process_race_results(file_path):
    """
    Process race results data to extract required columns.
    
    Args:
        file_path (str): Path to the race results CSV file.
    
    Returns:
        pd.DataFrame: Processed DataFrame with DriverName, DriverNumber, Points, TeamName, and RacePosition.
    """
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Create a new DataFrame with required columns
    result = pd.DataFrame()
    result['DriverName'] = df['BroadcastName']
    result['DriverNumber'] = df['DriverNumber']
    result['Points'] = df['Points']
    result['TeamName'] = df['TeamName']
    result['RacePosition'] = df['Position']  # Final position in the race
    
    # Sort by RacePosition (which is already in order based on the dataset)
    result = result.sort_values('RacePosition')
    
    return result

# Process data for both years
race_2024 = process_race_results('australian_gp_race_2024.csv')
race_2023 = process_race_results('australian_gp_race_2023.csv')

# Display the results
print("2024 Race Results:")
print(race_2024)
print("\n2023 Race Results:")
print(race_2023)


2024 Race Results:
      DriverName  DriverNumber  Points         TeamName  RacePosition
0        C SAINZ            55    25.0          Ferrari           1.0
1      C LECLERC            16    19.0          Ferrari           2.0
2       L NORRIS             4    15.0          McLaren           3.0
3      O PIASTRI            81    12.0          McLaren           4.0
4        S PEREZ            11    10.0  Red Bull Racing           5.0
5       L STROLL            18     8.0     Aston Martin           6.0
6      Y TSUNODA            22     6.0               RB           7.0
7       F ALONSO            14     4.0     Aston Martin           8.0
8   N HULKENBERG            27     2.0     Haas F1 Team           9.0
9    K MAGNUSSEN            20     1.0     Haas F1 Team          10.0
10       A ALBON            23     0.0         Williams          11.0
11   D RICCIARDO             3     0.0               RB          12.0
12       P GASLY            10     0.0           Alpine          13.0
1

In [11]:
import pandas as pd

def process_qualifying_data(file_path):
    df = pd.read_csv(file_path)
    
    # Convert time strings to timedelta
    for q in ['Q1', 'Q2', 'Q3']:
        df[q] = pd.to_timedelta(df[q])
    
    result = pd.DataFrame()
    result['DriverName'] = df['BroadcastName']
    result['DriverNumber'] = df['DriverNumber']
    result['GridPosition'] = df['Position'].astype(int)
    
    # Determine qualifying time based on position
    def get_quali_time(row):
        if row['Position'] <= 10:
            return row['Q3']
        elif row['Position'] <= 15:
            return row['Q2']
        else:
            return row['Q1']
    
    result['QualifyingTime'] = df.apply(get_quali_time, axis=1)
    
    # Calculate qualifying performance
    def calculate_qualifying_performance(row):
        if row['Position'] <= 10:
            return (row['Q1'] + row['Q2'] + row['Q3'])/3
        elif row['Position'] <= 15:
            return (row['Q1'] + row['Q2'])/2
        else:
            return row['Q1']
    
    result['qualifying_performance'] = df.apply(calculate_qualifying_performance, axis=1)
    result = result.sort_values('GridPosition')
    return result

def process_race_results(file_path):
    df = pd.read_csv(file_path)
    
    result = pd.DataFrame()
    result['DriverName'] = df['BroadcastName']
    result['DriverNumber'] = df['DriverNumber']
    result['Points'] = df['Points']
    result['TeamName'] = df['TeamName']
    result['RacePosition'] = df['Position'].astype(int)
    result['GridPosition'] = df['GridPosition'].astype(int)
    
    # Calculate performance metrics
    result['race_performance_score'] = result['Points'] / (result['RacePosition'] + 1)
    result['grid_to_finish_delta'] = result['GridPosition'] - result['RacePosition']
    
    return result.sort_values('RacePosition')

# Process all data
quali_2024 = process_qualifying_data('australian_gp_qualifying_2024.csv')
quali_2023 = process_qualifying_data('australian_gp_qualifying_2023.csv')
race_2024 = process_race_results('australian_gp_race_2024.csv')
race_2023 = process_race_results('australian_gp_race_2023.csv')

# Display sample output
print("2024 Qualifying Data with Performance Metrics:")
print(quali_2024.head(3))
print("\n2023 Race Results with Performance Metrics:")
print(race_2023.head(3))


2024 Qualifying Data with Performance Metrics:
     DriverName  DriverNumber  GridPosition         QualifyingTime  \
0  M VERSTAPPEN             1             1 0 days 00:01:15.915000   
1       C SAINZ            55             2 0 days 00:01:16.185000   
2       S PEREZ            11             3 0 days 00:01:16.274000   

     qualifying_performance  
0 0 days 00:01:16.373666666  
1 0 days 00:01:16.368333333  
2    0 days 00:01:16.570000  

2023 Race Results with Performance Metrics:
     DriverName  DriverNumber  Points         TeamName  RacePosition  \
0  M VERSTAPPEN             1    25.0  Red Bull Racing             1   
1    L HAMILTON            44    18.0         Mercedes             2   
2      F ALONSO            14    15.0     Aston Martin             3   

   GridPosition  race_performance_score  grid_to_finish_delta  
0             1                   12.50                     0  
1             3                    6.00                     1  
2             4           

In [21]:
##Final data cleaning steps
import pandas as pd
from sklearn.preprocessing import StandardScaler

def process_qualifying_data(file_path):
    df = pd.read_csv(file_path)
    
    # Convert time columns to timedelta
    for q in ['Q1', 'Q2', 'Q3']:
        df[q] = pd.to_timedelta(df[q])
    
    result = pd.DataFrame()
    result['DriverName'] = df['BroadcastName']
    result['DriverNumber'] = df['DriverNumber'].astype(int)
    result['Year'] = df['Year'].astype(int)
    result['GridPosition'] = df['Position'].astype(int)
    
    # Qualifying time logic
    def get_quali_time(row):
        if row['Position'] <= 10: return row['Q3']
        elif row['Position'] <= 15: return row['Q2']
        return row['Q1']
    
    result['QualifyingTime'] = df.apply(get_quali_time, axis=1)
    
    # Performance calculation
    def calculate_performance(row):
        if row['Position'] <= 10: return (row['Q1'] + row['Q2'] + row['Q3'])/3
        elif row['Position'] <= 15: return (row['Q1'] + row['Q2'])/2
        return row['Q1']
    
    result['qualifying_performance'] = df.apply(calculate_performance, axis=1)
    
    # Convert to seconds for scaling
    result['QualifyingTime_sec'] = result['QualifyingTime'].dt.total_seconds()
    result['qualifying_performance_sec'] = result['qualifying_performance'].dt.total_seconds()
    
    return result.sort_values('GridPosition')

def process_race_results(file_path):
    df = pd.read_csv(file_path)
    
    result = pd.DataFrame()
    result['DriverName'] = df['BroadcastName']
    result['DriverNumber'] = df['DriverNumber'].astype(int)
    result['Year'] = df['Year'].astype(int)
    result['RacePosition'] = df['Position'].astype(int)
    result['Points'] = df['Points'].astype(float)
    result['TeamName'] = df['TeamName']
    result['GridPosition'] = df['GridPosition'].astype(int)
    
    # Performance metrics
    result['race_performance_score'] = result['Points'] / (result['RacePosition'] + 1)
    result['grid_to_finish_delta'] = result['GridPosition'] - result['RacePosition']
    
    return result.sort_values('RacePosition')

# Process and merge all data
quali_data = pd.concat([
    process_qualifying_data('australian_gp_qualifying_2023.csv'),
    process_qualifying_data('australian_gp_qualifying_2024.csv')
])

race_data = pd.concat([
    process_race_results('australian_gp_race_2023.csv'),
    process_race_results('australian_gp_race_2024.csv')
])

# Merge datasets
df = pd.merge(
    quali_data,
    race_data,
    on=['DriverName', 'DriverNumber', 'Year'],
    suffixes=('_qual', '_race')
)

# Normalization
scaler = StandardScaler()
numeric_features = [
    'qualifying_performance_sec',
    'race_performance_score',
    'grid_to_finish_delta'
]

df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Final columns cleanup
final_columns = [
    'DriverName', 'DriverNumber', 'Year', 
    'QualifyingTime', 'GridPosition_qual', 'RacePosition',
    'TeamName', 'Points', 
    'qualifying_performance_sec', 'race_performance_score',
    'grid_to_finish_delta'
]

df = df[final_columns].rename(columns={
    'GridPosition_qual': 'GridPosition',
    'qualifying_performance_sec': 'NormalizedQualifyingPerf',
    'race_performance_score': 'NormalizedRacePerf',
    'grid_to_finish_delta': 'NormalizedPositionDelta'
})

print("Final Cleaned Dataset:")
print(df.head(3))


Final Cleaned Dataset:
     DriverName  DriverNumber  Year         QualifyingTime  GridPosition  \
0  M VERSTAPPEN             1  2023 0 days 00:01:16.732000             1   
1     G RUSSELL            63  2023 0 days 00:01:16.968000             2   
2    L HAMILTON            44  2023 0 days 00:01:17.104000             3   

   RacePosition         TeamName  Points  NormalizedQualifyingPerf  \
0             1  Red Bull Racing    25.0                 -0.803647   
1            18         Mercedes     0.0                 -0.293815   
2             2         Mercedes    18.0                 -0.183166   

   NormalizedRacePerf  NormalizedPositionDelta  
0            3.666542                 0.000000  
1           -0.498913                -2.389910  
2            1.500506                 0.149369  


In [18]:
df.head(1)

Unnamed: 0,DriverName,DriverNumber,Year,QualifyingTime,GridPosition,RacePosition,TeamName,Points,NormalizedQualifyingPerf,NormalizedRacePerf,NormalizedPositionDelta
0,M VERSTAPPEN,1,2023,0 days 00:01:16.732000,1,1,Red Bull Racing,25.0,-0.803647,3.666542,0.0


In [22]:
df['QualifyingTime'] = df['QualifyingTime'].apply(
    lambda x: str(x).split()[-1] if pd.notnull(x) else x
)

In [24]:
df.head(5)

Unnamed: 0,DriverName,DriverNumber,Year,QualifyingTime,GridPosition,RacePosition,TeamName,Points,NormalizedQualifyingPerf,NormalizedRacePerf,NormalizedPositionDelta
0,M VERSTAPPEN,1,2023,00:01:16.732000,1,1,Red Bull Racing,25.0,-0.803647,3.666542,0.0
1,G RUSSELL,63,2023,00:01:16.968000,2,18,Mercedes,0.0,-0.293815,-0.498913,-2.38991
2,L HAMILTON,44,2023,00:01:17.104000,3,2,Mercedes,18.0,-0.183166,1.500506,0.149369
3,F ALONSO,14,2023,00:01:17.139000,4,3,Aston Martin,15.0,-0.230813,0.750724,0.149369
4,C SAINZ,55,2023,00:01:17.270000,5,12,Ferrari,0.0,-0.075693,-0.498913,-1.045586


# Working Australian Grand Prix

In [59]:
import fastf1
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Enable FastF1 caching
fastf1.Cache.enable_cache('f1_cache')

# Load FastF1 2024 Australian GP race session
session_2024 = fastf1.get_session(2024, 3, "R")
session_2024.load()

# Extract lap times for 2024 historical data
laps_2024 = session_2024.laps[["Driver", "LapTime"]].copy()
laps_2024.dropna(subset=["LapTime"], inplace=True)
laps_2024["LapTime (s)"] = laps_2024["LapTime"].dt.total_seconds()

# Because there may be multiple laps per driver, aggregate by taking the best (minimum) lap time
laps_2024_best = laps_2024.groupby("Driver", as_index=False)["LapTime (s)"].min()

# -----------------------
# 2025 Qualifying Data (input)
# -----------------------
qualifying_2025 = pd.DataFrame({
    "Driver": [
        "Lando Norris", "Oscar Piastri", "Max Verstappen", 
        "George Russell", "Yuki Tsunoda", "Alexander Albon",
        "Charles Leclerc", "Lewis Hamilton", "Pierre Gasly", "Carlos Sainz"
    ],
    "QualifyingTime (s)": [75.096, 75.180, 75.481, 75.546, 75.670,
                           75.737, 75.755, 75.973, 75.980, 76.062]
})

# Map full names to FastF1 3-letter driver codes
driver_mapping = {
    "Lando Norris": "NOR",
    "Oscar Piastri": "PIA",
    "Max Verstappen": "VER",
    "George Russell": "RUS",
    "Yuki Tsunoda": "TSU",
    "Alexander Albon": "ALB",
    "Charles Leclerc": "LEC",
    "Lewis Hamilton": "HAM",
    "Pierre Gasly": "GAS",
    "Carlos Sainz": "SAI",
    "Lance Stroll": "STR",
    "Fernando Alonso": "ALO"
}
qualifying_2025["DriverCode"] = qualifying_2025["Driver"].map(driver_mapping)

# Merge the 2025 qualifying data with the 2024 historical race lap times via driver codes
merged_data = qualifying_2025.merge(laps_2024_best, left_on="DriverCode", right_on="Driver")

# Use only "QualifyingTime (s)" as the feature and "LapTime (s)" as the target (training on 2024 data)
X_train = merged_data[["QualifyingTime (s)"]]
y_train = merged_data["LapTime (s)"]

# Train the model on 2024 data
model = GradientBoostingRegressor(random_state=42,learning_rate=0.1, 
    n_estimators=100, 
    min_samples_split=5,  # Minimum samples required to split an internal node
    min_samples_leaf=3,  # Minimum samples required to be at a leaf node
)
model.fit(X_train, y_train)

# Evaluate on training data (for demonstration)
train_predictions = model.predict(X_train)
mae = mean_absolute_error(y_train, train_predictions)
print(f"Mean Absolute Error on training data: {mae:.6f} seconds")

# Predict lap times for 2025 using qualifying times as the input
qualifying_2025["PredictedLapTime"] = model.predict(qualifying_2025[["QualifyingTime (s)"]])
# Rank the predicted lap times to assign predicted finishing positions (lower lap time equals higher finishing position)
qualifying_2025["PredictedPosition"] = qualifying_2025["PredictedLapTime"].rank().astype(int)

# Display final prediction results
print("\n 🏁🏆 Predicted 2025 Australian Grand Prix Results 🏁🏆 :")
print(qualifying_2025[["Driver", "QualifyingTime (s)", "PredictedLapTime", "PredictedPosition"]].sort_values("PredictedPosition"))

core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 19 drivers: ['55', '16', '4', '81', '11', '18', '22', '14', '27', '20', '23', '3', '10', '77', '24', '31', '63', '44', '1']


Mean Absolute Error on training data: 0.789658 seconds

 🏁🏆 Predicted 2025 Australian Grand Prix Results 🏁🏆 :
            Driver  QualifyingTime (s)  PredictedLapTime  PredictedPosition
6  Charles Leclerc              75.755         80.304100                  1
3   George Russell              75.546         80.724895                  2
0     Lando Norris              75.096         81.008664                  4
1    Oscar Piastri              75.180         81.008664                  4
2   Max Verstappen              75.481         81.008664                  4
7   Lewis Hamilton              75.973         81.116983                  7
8     Pierre Gasly              75.980         81.116983                  7
9     Carlos Sainz              76.062         81.116983                  7
4     Yuki Tsunoda              75.670         81.118532                  9
5  Alexander Albon              75.737         81.118532                  9


# Chinese Grand Prix

### Moving from Australian Grand Prix, I would like to continue predicting the winner for Chinese Grand Prix, continuing from the same process. 
### This time to get the 2025 Chinese qualifying time from the FastF1 API itself. Let us use the same drivers as these drivers where present in 2024 as well which will keep the context clean

In [61]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# 2025 Qualifying Data
qualifying_2025 = pd.DataFrame({
    "Driver": [
        "Lando Norris", "Oscar Piastri", "Max Verstappen", 
        "George Russell", "Yuki Tsunoda", "Alexander Albon",
        "Charles Leclerc", "Lewis Hamilton", "Pierre Gasly", "Carlos Sainz"
    ],
    "QualifyingTime (s)": [
        90.793, 90.641, 90.817, 90.723, 91.638, 91.706,
        91.021, 90.927, 91.992, 91.840
    ]
})

# Map full names to FastF1 3-letter codes
driver_mapping = {
    "Lando Norris": "NOR", "Oscar Piastri": "PIA", "Max Verstappen": "VER",
    "George Russell": "RUS", "Yuki Tsunoda": "TSU", "Alexander Albon": "ALB",
    "Charles Leclerc": "LEC", "Lewis Hamilton": "HAM", "Pierre Gasly": "GAS",
    "Carlos Sainz": "SAI"
}

qualifying_2025["DriverCode"] = qualifying_2025["Driver"].map(driver_mapping)

fastf1.Cache.enable_cache('f1_cache')

# Load 2024 Chinese GP race session
session_2024 = fastf1.get_session(2024, 'China', 'R')
session_2024.load()

# Extract and process lap times
laps_2024 = session_2024.laps[["Driver", "LapTime"]].copy()
laps_2024.dropna(subset=["LapTime"], inplace=True)
laps_2024["LapTime (s)"] = laps_2024["LapTime"].dt.total_seconds()

# Get best lap times for each driver
laps_2024_best = laps_2024.groupby("Driver", as_index=False)["LapTime (s)"].min()

# Merge 2025 Qualifying Data with 2024 Race Data
merged_data = qualifying_2025.merge(laps_2024_best, left_on="DriverCode", right_on="Driver")

# Prepare features and target
X = merged_data[["QualifyingTime (s)"]]
y = merged_data["LapTime (s)"]

# Train the model
model = GradientBoostingRegressor(
    learning_rate=0.1, 
    n_estimators=100, 
    max_depth=3,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=39
)
model.fit(X, y)

## Assuming the model has already been trained
y_pred = model.predict(X)

# Calculate Mean Squared Error (MAE)
mse = mean_absolute_error(y, y_pred)
print(f"Mean Squared Error: {mse:.6f}")

# Rest of the code for predictions and results display
qualifying_2025["PredictedLapTime"] = model.predict(qualifying_2025[["QualifyingTime (s)"]])
qualifying_2025["PredictedPosition"] = qualifying_2025["PredictedLapTime"].rank().astype(int)

# Display results with MSE
print("\nPredicted 2025 Chinese Grand Prix Results:")
print(qualifying_2025[["Driver", "QualifyingTime (s)", "PredictedLapTime", "PredictedPosition"]].sort_values("PredictedPosition"))

core           INFO 	Loading data for Chinese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']


Mean Squared Error: 0.505979

Predicted 2025 Chinese Grand Prix Results:
            Driver  QualifyingTime (s)  PredictedLapTime  PredictedPosition
2   Max Verstappen              90.817         98.656544                  1
0     Lando Norris              90.793         99.545326                  3
1    Oscar Piastri              90.641         99.545326                  3
3   George Russell              90.723         99.545326                  3
6  Charles Leclerc              91.021         99.930218                  5
5  Alexander Albon              91.706         99.950404                  7
8     Pierre Gasly              91.992         99.950404                  7
9     Carlos Sainz              91.840         99.950404                  7
7   Lewis Hamilton              90.927        100.281231                  9
4     Yuki Tsunoda              91.638        101.216818                 10


## Moving From this approach, Lets take Sector times from year 2024 as input and now take all the drivers and there laptimes from 2025 lineup and make the predictions

In [1]:
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

fastf1.Cache.enable_cache("f1_cache")
session_2024 = fastf1.get_session(2024, "China", "R")
session_2024.load()

# Extract lap and sector times
laps_2024 = session_2024.laps[["Driver", "LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]].copy()
laps_2024.dropna(inplace=True)

# Convert times to seconds
for col in ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]:
    laps_2024[f"{col} (s)"] = laps_2024[col].dt.total_seconds()

# Group by driver to get average sector times per driver
sector_times_2024 = laps_2024.groupby("Driver")[["Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]].mean().reset_index()

# 2025 Qualifying Data Chinese GP
qualifying_2025 = pd.DataFrame({
    "Driver": ["Oscar Piastri", "George Russell", "Lando Norris", "Max Verstappen", "Lewis Hamilton",
               "Charles Leclerc", "Isack Hadjar", "Andrea Kimi Antonelli", "Yuki Tsunoda", "Alexander Albon",
               "Esteban Ocon", "Nico Hülkenberg", "Fernando Alonso", "Lance Stroll", "Carlos Sainz Jr.",
               "Pierre Gasly", "Oliver Bearman", "Jack Doohan", "Gabriel Bortoleto", "Liam Lawson"],
    "QualifyingTime (s)": [90.641, 90.723, 90.793, 90.817, 90.927,
                           91.021, 91.079, 91.103, 91.638, 91.706,
                           91.625, 91.632, 91.688, 91.773, 91.840,
                           91.992, 92.018, 92.092, 92.141, 92.174]
})

# Map full names to FastF1 3-letter codes
driver_mapping = {
    "Oscar Piastri": "PIA", "George Russell": "RUS", "Lando Norris": "NOR", "Max Verstappen": "VER",
    "Lewis Hamilton": "HAM", "Charles Leclerc": "LEC", "Isack Hadjar": "HAD", "Andrea Kimi Antonelli": "ANT",
    "Yuki Tsunoda": "TSU", "Alexander Albon": "ALB", "Esteban Ocon": "OCO", "Nico Hülkenberg": "HUL",
    "Fernando Alonso": "ALO", "Lance Stroll": "STR", "Carlos Sainz Jr.": "SAI", "Pierre Gasly": "GAS",
    "Oliver Bearman": "BEA", "Jack Doohan": "DOO", "Gabriel Bortoleto": "BOR", "Liam Lawson": "LAW"
}

qualifying_2025["DriverCode"] = qualifying_2025["Driver"].map(driver_mapping)

# Merge qualifying data with sector times
merged_data = qualifying_2025.merge(sector_times_2024, left_on="DriverCode", right_on="Driver", how="left")

# Define feature set (Qualifying + Sector Times)
X = merged_data[["QualifyingTime (s)", "Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]].fillna(0)
y = laps_2024.groupby("Driver")["LapTime (s)"].mean().reset_index()["LapTime (s)"]

# Train Gradient Boosting Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=38)
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=38)
model.fit(X_train, y_train)

# Predict race times using 2025 qualifying and sector data
predicted_race_times = model.predict(X)
qualifying_2025["PredictedRaceTime (s)"] = predicted_race_times

# Rank drivers by predicted race time
qualifying_2025 = qualifying_2025.sort_values(by="PredictedRaceTime (s)")

# Print final predictions
print("\n🏁 Predicted 2025 Chinese GP Winner with New Drivers and Sector Times 🏁\n")
print(qualifying_2025[["Driver", "PredictedRaceTime (s)"]])

# Evaluate Model
y_pred = model.predict(X_test)
print(f"\n🔍 Model Error (MAE): {mean_absolute_error(y_test, y_pred):.2f} seconds")

core           INFO 	Loading data for Chinese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']



🏁 Predicted 2025 Chinese GP Winner with New Drivers and Sector Times 🏁

                   Driver  PredictedRaceTime (s)
2            Lando Norris             103.911242
16         Oliver Bearman             105.768026
10           Esteban Ocon             105.937552
6            Isack Hadjar             106.014323
8            Yuki Tsunoda             107.250668
1          George Russell             107.378067
5         Charles Leclerc             107.433970
14       Carlos Sainz Jr.             107.987549
13           Lance Stroll             108.029351
11        Nico Hülkenberg             108.380663
7   Andrea Kimi Antonelli             108.485977
19            Liam Lawson             108.539843
18      Gabriel Bortoleto             108.539843
4          Lewis Hamilton             109.072694
9         Alexander Albon             109.209534
0           Oscar Piastri             109.248207
3          Max Verstappen             109.305052
15           Pierre Gasly             109.554

### Let us Remove the new drivers and see based on last years data how the old drivers will perform

In [5]:
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Enable cache and load session data
fastf1.Cache.enable_cache("f1_cache")
session_2024 = fastf1.get_session(2024, "China", "R")
session_2024.load()

# Extract lap and sector times
laps_2024 = session_2024.laps[["Driver", "LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]].copy()
laps_2024.dropna(inplace=True)

# Convert times to seconds
for col in ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]:
    laps_2024[f"{col} (s)"] = laps_2024[col].dt.total_seconds()

# Group by driver to get average sector times and lap times per driver
sector_times_2024 = laps_2024.groupby("Driver")[["Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]].mean().reset_index()
lap_times_2024 = laps_2024.groupby("Driver")["LapTime (s)"].mean().reset_index()

# Merge sector times and lap times into one DataFrame (driver_stats)
driver_stats_2024 = pd.merge(sector_times_2024, lap_times_2024, on="Driver")

# 2025 Qualifying Data for the Chinese GP
qualifying_2025 = pd.DataFrame({
    "Driver": ["Oscar Piastri", "George Russell", "Lando Norris", "Max Verstappen", "Lewis Hamilton",
               "Charles Leclerc", "Yuki Tsunoda", "Alexander Albon",
               "Esteban Ocon", "Nico Hülkenberg", "Fernando Alonso", "Lance Stroll", "Carlos Sainz Jr.",
               "Pierre Gasly"],
    "QualifyingTime (s)": [90.641, 90.723, 90.793, 90.817, 90.927,
                           91.021, 91.638, 91.706,
                           91.625, 91.632, 91.688, 91.773, 91.840,
                           91.992]
})

# Map full names to FastF1 3-letter codes
driver_mapping = {
    "Oscar Piastri": "PIA", "George Russell": "RUS", "Lando Norris": "NOR", "Max Verstappen": "VER",
    "Lewis Hamilton": "HAM", "Charles Leclerc": "LEC",
    "Yuki Tsunoda": "TSU", "Alexander Albon": "ALB", "Esteban Ocon": "OCO", "Nico Hülkenberg": "HUL",
    "Fernando Alonso": "ALO", "Lance Stroll": "STR", "Carlos Sainz Jr.": "SAI", "Pierre Gasly": "GAS"
}
qualifying_2025["DriverCode"] = qualifying_2025["Driver"].map(driver_mapping)

# Merge qualifying data with 2024 driver stats using the driver code (which is the same as the 2024 "Driver" column)
merged_data = qualifying_2025.merge(driver_stats_2024, left_on="DriverCode", right_on="Driver", how="left")

# Define feature set (Qualifying time + sector times) and target (LapTime)
X = merged_data[["QualifyingTime (s)", "Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]].fillna(0)
y = merged_data["LapTime (s)"]

# Split data for training and testing (note: with only 14 rows, your test set is small)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)

# Train Gradient Boosting Model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=38)
model.fit(X_train, y_train)

# Predict race times for these drivers
predicted_race_times = model.predict(X)
qualifying_2025["PredictedRaceTime (s)"] = predicted_race_times

# Rank drivers by predicted race time
qualifying_2025 = qualifying_2025.sort_values(by="PredictedRaceTime (s)")

# Print final predictions
print("\n🏁 Predicted 2025 Chinese GP Winner with New Drivers and Sector Times 🏁\n")
print(qualifying_2025[["Driver", "PredictedRaceTime (s)"]])

# Evaluate the model on the test set
y_pred = model.predict(X_test)
print(f"\n🔍 Model Error (MAE): {mean_absolute_error(y_test, y_pred):.2f} seconds")

core           INFO 	Loading data for Chinese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']



🏁 Predicted 2025 Chinese GP Winner with New Drivers and Sector Times 🏁

              Driver  PredictedRaceTime (s)
3     Max Verstappen             105.306837
11      Lance Stroll             105.766335
5    Charles Leclerc             106.012008
12  Carlos Sainz Jr.             107.084729
10   Fernando Alonso             107.377528
2       Lando Norris             107.498633
1     George Russell             108.029422
0      Oscar Piastri             108.290875
9    Nico Hülkenberg             108.612781
4     Lewis Hamilton             109.073399
8       Esteban Ocon             109.208493
7    Alexander Albon             109.248974
13      Pierre Gasly             109.306665
6       Yuki Tsunoda             111.063511

🔍 Model Error (MAE): 0.13 seconds
