In [3]:
import fastf1 as ff1
import pandas as pd
import numpy as np
from fastf1.ergast import Ergast
import logging
import os

In [13]:
# --- Configuration ---
START_YEAR = 2020
END_YEAR = 2022
CACHE_PATH = 'fastf1_cache'

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Enable caching
try:
    if not os.path.exists(CACHE_PATH):
        os.makedirs(CACHE_PATH)
        logging.info(f"Created cache directory: {CACHE_PATH}")

    ff1.Cache.enable_cache(CACHE_PATH)
    logging.info(f"FastF1 cache enabled at: {CACHE_PATH}")
    ff1.Cache.offline_mode(True)
    
except Exception as e:
    logging.error(f"Error enabling FastF1 cache: {e}")
    
all_seasons_race_stats = [] # List to hold race-by-race stats dicts
all_seasons_standings = [] # List to hold final standings DataFrames

ergast = Ergast()

INFO: FastF1 cache enabled at: fastf1_cache


In [None]:
logging.info(f"Starting data collection from {START_YEAR} to {END_YEAR}...")

for year in range(START_YEAR, END_YEAR + 1):
    logging.info(f"\n===== Processing Season: {year} =====")
    try:
        schedule = ff1.get_event_schedule(year, include_testing=False)
        race_events = schedule[schedule['Session5'].str.lower() == 'race']
        logging.info(f"Found {len(race_events)} race events for {year}.")

    except Exception as e:
        logging.error(f"Could not fetch schedule for {year}: {e}")
        continue

    # --- Collect Race-by-Race Data ---
    for round_num, event in race_events.iterrows():
        logging.info(f"  Processing Round {event['RoundNumber']} - {event['EventName']}...")
        try:
            # Load only necessary data for results and basic lap times
            session = ff1.get_session(year, event['RoundNumber'], 'R')
            session.load(weather=False, messages=False, telemetry=False)

            if session.results is None or session.results.empty:
                logging.warning(f"    --> No results found for {session.event['EventName']} Race. Skipping.")
                continue

            for idx, result in session.results.iterrows():
                driver_abbr = result["Abbreviation"]
                avg_lap_time_sec = np.nan

                # Calculate average lap time from completed laps
                try:
                    driver_laps = session.laps.pick_driver(driver_abbr)
                    if not driver_laps.empty:
                        if 'LapTime' in driver_laps.columns:
                            valid_laps = pd.to_timedelta(driver_laps['LapTime'], errors='coerce').dropna()
                            if not valid_laps.empty:
                                avg_lap_time = valid_laps.mean()
                                avg_lap_time_sec = avg_lap_time.total_seconds()
                        else:
                            logging.debug(f"      No 'LapTime' column for {driver_abbr} in {event['EventName']}")

                except Exception as lap_err:
                     logging.warning(f"      Could not calculate AvgLapTime for {driver_abbr}: {lap_err}")


                grid_pos = pd.to_numeric(result["GridPosition"], errors='coerce')
                finish_pos = pd.to_numeric(result["Position"], errors='coerce')

                positions_gained = np.nan
                if pd.notna(grid_pos) and pd.notna(finish_pos) and grid_pos > 0:
                    positions_gained = grid_pos - finish_pos

                all_seasons_race_stats.append({
                    "Season": year,
                    "Round": event['RoundNumber'],
                    "EventName": event['EventName'],
                    "Driver": driver_abbr,
                    "DriverId": result["DriverId"],
                    "TeamId": result["TeamId"],
                    "GridPosition": grid_pos,
                    "FinishPosition": finish_pos,
                    "PositionsGained": positions_gained,
                    "PointsGained": result["Points"],
                    "AvgLapTimeSec": avg_lap_time_sec,
                    "Status": result["Status"],
                })
            logging.info(f"    -> Processed {len(session.results)} drivers for the race.")

        except ff1.ErgastError as ergast_err:
             logging.error(f"    --> Ergast/API Error processing {event.get('EventName', 'Unknown Event')} Race ({year}): {ergast_err}. Check API status or data availability.")
        except Exception as e:
            logging.error(f"    --> UNEXPECTED ERROR processing {event.get('EventName', 'Unknown Event')} Race ({year}): {e}")
        finally:
            if 'session' in locals():
                del session


    # --- Collect Final Standings for the Season ---
    logging.info(f"Attempting to fetch final driver standings for {year}...")
    try:
        standings_data = ergast.get_driver_standings(season=year)

        if standings_data and not standings_data.content.empty:
            standings_df = standings_data.content[0]
            standings_df['Season'] = year
            all_seasons_standings.append(standings_df)
            logging.info(f"  -> Successfully fetched final standings for {year}.")
        else:
             logging.warning(f"  -> No final standings data returned for {year}.")

    except Exception as e:
        logging.error(f"  -> ERROR fetching final standings for {year}: {e}")

INFO: Starting data collection from 2020 to 2022...
INFO: 
===== Processing Season: 2020 =====
INFO: Found 17 race events for 2020.
INFO:   Processing Round 1 - Austrian Grand Prix...
core           INFO 	Loading data for Austrian Grand Prix - Race [v3.5.3]
INFO: Loading data for Austrian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
INFO: Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO: Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
INFO: Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO: Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO: Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO: Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
INFO: Us

   Season  Round            EventName Driver  DriverId    TeamId  \
0    2020      1  Austrian Grand Prix    BOT    bottas  mercedes   
1    2020      1  Austrian Grand Prix    LEC   leclerc   ferrari   
2    2020      1  Austrian Grand Prix    NOR    norris   mclaren   
3    2020      1  Austrian Grand Prix    HAM  hamilton  mercedes   
4    2020      1  Austrian Grand Prix    SAI     sainz   mclaren   

   GridPosition  FinishPosition  PositionsGained  PointsGained  AvgLapTimeSec  \
0           1.0             1.0              0.0          25.0      77.281985   
1           7.0             2.0              5.0          18.0      77.359772   
2           3.0             3.0              0.0          16.0      77.422787   
3           5.0             4.0              1.0          12.0      77.407696   
4           8.0             5.0              3.0          10.0      77.453196   

     Status  
0  Finished  
1  Finished  
2  Finished  
3  Finished  
4  Finished  
<class 'pandas.core.

In [16]:
for year in range(START_YEAR, END_YEAR + 1):
    logging.info(f"\n===== Processing Season: {year} =====")
    try:
        schedule = ff1.get_event_schedule(year, include_testing=False)
        race_events = schedule[schedule['Session5'].str.lower() == 'race']
        logging.info(f"Found {len(race_events)} race events for {year}.")

    except Exception as e:
        logging.error(f"Could not fetch schedule for {year}: {e}")
        continue
    
    # --- Collect Final Standings for the Season ---
    logging.info(f"Attempting to fetch final driver standings for {year}...")
    try:
        standings_data = ergast.get_driver_standings(season=year)

        if standings_data and len(standings_data.content) != 0:
            standings_df = standings_data.content[0]
            standings_df['Season'] = year
            standings_df.rename(columns={
                'driverId': 'DriverId'
            }, inplace=True)
            all_seasons_standings.append(standings_df)
            logging.info(f"  -> Successfully fetched final standings for {year}.")
        else:
             logging.warning(f"  -> No final standings data returned for {year}.")

    except Exception as e:
        logging.error(f"  -> ERROR fetching final standings for {year}: {e}")

INFO: 
===== Processing Season: 2020 =====
INFO: Found 17 race events for 2020.
INFO: Attempting to fetch final driver standings for 2020...
INFO:   -> Successfully fetched final standings for 2020.
INFO: 
===== Processing Season: 2021 =====
INFO: Found 22 race events for 2021.
INFO: Attempting to fetch final driver standings for 2021...
INFO:   -> Successfully fetched final standings for 2021.
INFO: 
===== Processing Season: 2022 =====
INFO: Found 22 race events for 2022.
INFO: Attempting to fetch final driver standings for 2022...
INFO:   -> Successfully fetched final standings for 2022.


In [17]:
# --- Combine Collected Data ---
logging.info("\n===== Data Collection Complete =====")

# Combine race stats
if all_seasons_race_stats:
    race_df = pd.DataFrame(all_seasons_race_stats)
    logging.info(f"Total race records collected: {len(race_df)}")
    logging.info("Sample of combined race data:")
    print(race_df.head())
    logging.info("Race data types and missing values:")
    race_df.info()
else:
    logging.warning("No race data was collected.")
    race_df = pd.DataFrame()

# Combine standings stats
if all_seasons_standings:
    standings_df = pd.concat(all_seasons_standings, ignore_index=True)
    logging.info(f"\nTotal final standings records collected: {len(standings_df)}")
    logging.info("Sample of combined final standings data:")
    print(standings_df[['Season', 'position', 'points', 'wins', 'driverCode', 'constructorNames']].head())
    logging.info("Final standings data types and missing values:")
    standings_df.info()
else:
    logging.warning("No final standings data was collected.")
    standings_df = pd.DataFrame()

os.makedirs('csv', exist_ok=True)

# --- Optional: Save the collected data ---
if not race_df.empty:
    race_df.to_csv('csv/f1_race_data_raw.csv', index=False)
    logging.info("\nRaw race data saved to f1_race_data_raw.csv")
if not standings_df.empty:
    standings_df.to_csv('csv/f1_standings_data_raw.csv', index=False)
    logging.info("Raw standings data saved to f1_standings_data_raw.csv")

INFO: 
===== Data Collection Complete =====
INFO: 
Total final standings records collected: 198
INFO: Sample of combined final standings data:
INFO: Final standings data types and missing values:
INFO: Raw standings data saved to f1_standings_data_raw.csv


   Season  position  points  wins driverCode constructorNames
0    2020         1   347.0    11        HAM       [Mercedes]
1    2020         2   223.0     2        BOT       [Mercedes]
2    2020         3   214.0     2        VER       [Red Bull]
3    2020         4   125.0     1        PER   [Racing Point]
4    2020         5   119.0     0        RIC        [Renault]
<class 'fastf1.ergast.interface.ErgastResultFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   position                  198 non-null    int64         
 1   positionText              198 non-null    object        
 2   points                    198 non-null    float64       
 3   wins                      198 non-null    int64         
 4   DriverId                  198 non-null    object        
 5   driverNumber              198 non-null    int64         
 6   driverCo