In [1]:
#All the importa that we need
from dotenv import load_dotenv
import os
import sys


import fastf1
import pandas as pd
from fastf1 import get_session
from fastf1.events import get_event_schedule
import pandas as pd
from tqdm import tqdm


import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import joblib


import requests

In [2]:
#enable fastf1 cache
fastf1.Cache.enable_cache('cache')

In [3]:
load_dotenv()
api_key = os.getenv("OPENWEATHERMAP_API_KEY")
print(f"Using OpenWeatherMap API Key: {api_key}")

Using OpenWeatherMap API Key: 9ec8a731f83e782cd0f14a587417715b


In [4]:
race = get_session(2023, 'Australian Grand Prix', 'R')
race.load()
print(race.results[['FullName', 'Position', 'TeamName']])

core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '14', '18', '11', '4', '27', '81', '24', '22', '77', '55', '10', '31', '21', '2', '20', '63', '23', '16']


           FullName  Position         TeamName
1    Max Verstappen       1.0  Red Bull Racing
44   Lewis Hamilton       2.0         Mercedes
14  Fernando Alonso       3.0     Aston Martin
18     Lance Stroll       4.0     Aston Martin
11     Sergio Perez       5.0  Red Bull Racing
4      Lando Norris       6.0          McLaren
27  Nico Hulkenberg       7.0     Haas F1 Team
81    Oscar Piastri       8.0          McLaren
24      Guanyu Zhou       9.0       Alfa Romeo
22     Yuki Tsunoda      10.0       AlphaTauri
77  Valtteri Bottas      11.0       Alfa Romeo
55     Carlos Sainz      12.0          Ferrari
10     Pierre Gasly      13.0           Alpine
31     Esteban Ocon      14.0           Alpine
21    Nyck De Vries      15.0       AlphaTauri
2    Logan Sargeant      16.0         Williams
20  Kevin Magnussen      17.0     Haas F1 Team
63   George Russell      18.0         Mercedes
23  Alexander Albon      19.0         Williams
16  Charles Leclerc      20.0          Ferrari


In [5]:
# Function to get weather data from OpenWeatherMap API
def get_weather(lat, lon, target_dt, api_key):
    url = f"http://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={api_key}&units=metric"
    try:
        res = requests.get(url)
        res.raise_for_status()
        forecasts = res.json()["list"]
        best_match = min(forecasts, key=lambda f: abs(pd.to_datetime(f["dt_txt"]) - pd.to_datetime(target_dt)))
        return {
            "Temperature": best_match["main"]["temp"],
            "RainProbability": best_match["pop"],
            "Humidity": best_match["main"]["humidity"]
        }
    except Exception as e:
        print("Weather error:", e)
        return {"Temperature": None, "RainProbability": None, "Humidity": None}


In [6]:
import json

with open("f1-locations.json", "r") as f:
    track_locations = json.load(f)

# Convert list to name → (lat, lon) dict
track_coords = {
    entry["name"]: (entry["lat"], entry["lon"])
    for entry in track_locations
}
print("Track coordinates loaded:", track_coords.keys())

Track coordinates loaded: dict_keys(['Circuit of the Americas', 'Baku City Circuit', 'Circuit de Barcelona-Catalunya', 'Hungaroring', 'Autódromo do Estoril', 'Hockenheimring', 'Autodromo Enzo e Dino Ferrari', 'Intercity Istanbul Park', 'Autódromo Internacional Nelson Piquet', 'Jeddah Corniche Circuit', 'Las Vegas Street Circuit', 'Circuit Paul Ricard', 'Losail International Circuit', 'Circuito de Madring', 'Circuit de Nevers Magny-Cours', 'Albert Park Circuit', 'Autódromo Hermanos Rodríguez', 'Miami International Autodrome', 'Circuit de Monaco', 'Circuit Gilles-Villeneuve', 'Autodromo Nazionale Monza', 'Nürburgring', 'Autódromo Internacional do Algarve', 'Bahrain International Circuit', 'Autódromo José Carlos Pace - Interlagos', 'Autodromo Internazionale del Mugello', 'Sepang International Circuit', 'Shanghai International Circuit', 'Silverstone Circuit', 'Marina Bay Street Circuit', 'Sochi Autodrom', 'Circuit de Spa-Francorchamps', 'Red Bull Ring', 'Suzuka International Racing Course'

In [7]:
track_aliases = {
    "Bahrain Grand Prix": "Bahrain International Circuit",
    "Saudi Arabian Grand Prix": "Jeddah Corniche Circuit",
    "Australian Grand Prix": "Albert Park Circuit",
    "Emilia Romagna Grand Prix": "Autodromo Enzo e Dino Ferrari",
    "Miami Grand Prix": "Miami International Autodrome",
    "Spanish Grand Prix": "Circuit de Barcelona-Catalunya",
    "Monaco Grand Prix": "Circuit de Monaco",
    "Canadian Grand Prix": "Circuit Gilles-Villeneuve",
    "British Grand Prix": "Silverstone Circuit",
    "Hungarian Grand Prix": "Hungaroring",
    "Belgian Grand Prix": "Circuit de Spa-Francorchamps",
    "Dutch Grand Prix": "Circuit Zandvoort",
    "Italian Grand Prix": "Autodromo Nazionale Monza",
    "Singapore Grand Prix": "Marina Bay Street Circuit",
    "Japanese Grand Prix": "Suzuka International Racing Course",
    "United States Grand Prix": "Circuit of the Americas",
    "Mexico City Grand Prix": "Autódromo Hermanos Rodríguez",
    "São Paulo Grand Prix": "Autódromo José Carlos Pace - Interlagos",
    "Abu Dhabi Grand Prix": "Yas Marina Circuit",
    "Las Vegas Grand Prix": "Las Vegas Street Circuit",
    "French Grand Prix": "Circuit Paul Ricard"
}


In [15]:
# Load event schedule for the specified years


years = [2023]
output = []

for year in years:
    schedule = get_event_schedule(year)
    for _, row in tqdm(schedule.iterrows(), total=len(schedule)):
        try:
            race_session = get_session(year, row["RoundNumber"], "R")
            race_session.load()
            quali_session = get_session(year, row["RoundNumber"], "Q")
            quali_session.load()

            race_date = race_session.date
            track_name = row["EventName"]
            resolved_name = track_aliases.get(track_name, track_name)  # fallback to default name

            latlon = track_coords.get(resolved_name)
            if not latlon:
                print(f"⚠️ Missing lat/lon for: {track_name} (resolved: {resolved_name})")
                continue

            lat, lon = latlon


            weather = get_weather(lat, lon, race_date, api_key=api_key)

            results = race_session.results
            laps = race_session.laps
            if laps.empty:
                continue

            for drv in results["Abbreviation"]:
                drv_laps = laps.pick_driver(drv)
                if drv_laps.empty:
                    continue

                clean_air_pace = drv_laps["LapTime"].dt.total_seconds().median()
                qual_time = quali_session.results.loc[quali_session.results["Abbreviation"] == drv, "Q3"].values
                if len(qual_time) == 0:
                    qual_time = quali_session.results.loc[quali_session.results["Abbreviation"] == drv, "Q2"].values
                if len(qual_time) == 0:
                    qual_time = quali_session.results.loc[quali_session.results["Abbreviation"] == drv, "Q1"].values
                if len(qual_time) == 0:
                    continue

                output.append({
                    "Year": year,
                    "Track": row["EventName"],
                    "Driver": drv,
                    "Team": race_session.results.loc[race_session.results["Abbreviation"] == drv, "TeamName"].values[0],
                    "QualifyingTime": pd.to_timedelta(qual_time[0]).total_seconds(),
                    "CleanAirRacePace": clean_air_pace,
                    **weather,
                    "FinalPosition": race_session.results.loc[race_session.results["Abbreviation"] == drv, "Position"].values[0]
                })
        except Exception as e:
            print(f"Skipping {row['EventName']} {year}: {e}")


  0%|          | 0/23 [00:00<?, ?it/s]core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Skipping Pre-Season Testing 2023: Cannot get testing event by round number!


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

⚠️ Missing lat/lon for: Azerbaijan Grand Prix (resolved: Azerbaijan Grand Prix)


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

⚠️ Missing lat/lon for: Austrian Grand Prix (resolved: Austrian Grand Prix)


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

⚠️ Missing lat/lon for: Qatar Grand Prix (resolved: Qatar Grand Prix)


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

Skipping Abu Dhabi Grand Prix 2023: The data you are trying to access has not been loaded yet. See `Session.load`





In [16]:
print(f"Collected {len(output)} records for {year} season.")

Collected 359 records for 2023 season.


In [17]:
print(f"✅ Total records collected: {len(output)}")
print("📊 Records per race:")
pd.DataFrame(output).groupby(['Year', 'Track']).size()


✅ Total records collected: 359
📊 Records per race:


Year  Track                   
2023  Australian Grand Prix       20
      Bahrain Grand Prix          20
      Belgian Grand Prix          20
      British Grand Prix          20
      Canadian Grand Prix         20
      Dutch Grand Prix            20
      Hungarian Grand Prix        20
      Italian Grand Prix          20
      Japanese Grand Prix         20
      Las Vegas Grand Prix        20
      Mexico City Grand Prix      20
      Miami Grand Prix            20
      Monaco Grand Prix           20
      Saudi Arabian Grand Prix    20
      Singapore Grand Prix        19
      Spanish Grand Prix          20
      São Paulo Grand Prix        20
      United States Grand Prix    20
dtype: int64

In [None]:
# Convert output to DataFrame and save to CSV
df = pd.DataFrame(output)
df.to_csv("data/training_data_2024.csv", index=False)
df.head()


Unnamed: 0,Year,Track,Driver,Team,QualifyingTime,CleanAirRacePace,Temperature,RainProbability,Humidity,FinalPosition
0,2023,Bahrain Grand Prix,VER,Red Bull Racing,89.708,97.651,39.94,0.0,30,1.0
1,2023,Bahrain Grand Prix,PER,Red Bull Racing,89.846,97.659,39.94,0.0,30,2.0
2,2023,Bahrain Grand Prix,ALO,Aston Martin,90.336,98.127,39.94,0.0,30,3.0
3,2023,Bahrain Grand Prix,SAI,Ferrari,90.154,98.287,39.94,0.0,30,4.0
4,2023,Bahrain Grand Prix,HAM,Mercedes,90.384,98.367,39.94,0.0,30,5.0
