In [8]:
import fastf1 as ff1
import pandas as pd
import numpy as np
from fastf1.ergast import Ergast
import logging
import os
from datetime import datetime

In [9]:
# --- Configuration ---
START_YEAR = 2025
END_YEAR = 2025
DATE = datetime.today().timestamp()
CACHE_PATH = 'fastf1_cache'

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Enable caching
try:
    if not os.path.exists(CACHE_PATH):
        os.makedirs(CACHE_PATH)
        logging.info(f"Created cache directory: {CACHE_PATH}")

    ff1.Cache.enable_cache(CACHE_PATH)
    logging.info(f"FastF1 cache enabled at: {CACHE_PATH}")
    ff1.Cache.offline_mode(False)
    
except Exception as e:
    logging.error(f"Error enabling FastF1 cache: {e}")
    
all_seasons_race_stats = [] # List to hold race-by-race stats dicts
all_seasons_standings = [] # List to hold final standings DataFrames

ergast = Ergast()

INFO: FastF1 cache enabled at: fastf1_cache


In [10]:
logging.info(f"Starting data collection from {START_YEAR} to {END_YEAR}...")

for year in range(START_YEAR, END_YEAR + 1):
    logging.info(f"\n===== Processing Season: {year} =====")
    try:
        schedule = ff1.get_event_schedule(year, include_testing=False)
        race_events = schedule[schedule['Session5'].str.lower() == 'race']
        logging.info(f"Found {len(race_events)} race events for {year}.")

    except Exception as e:
        logging.error(f"Could not fetch schedule for {year}: {e}")
        continue

    # --- Collect Race-by-Race Data ---
    for round_num, event in race_events.iterrows():
        race_date = pd.to_datetime(event['Session5Date']).timestamp()
        if race_date > DATE:
            logging.info(f"Skipping Round {event['RoundNumber']} - {event['EventName']} as it's in the future")
            continue
        
        logging.info(f"  Processing Round {event['RoundNumber']} - {event['EventName']}...")
        try:
            session = ff1.get_session(year, event['RoundNumber'], 'R')
            session.load(weather=False, messages=False, telemetry=False)

            if session.results is None or session.results.empty:
                logging.warning(f"    --> No results found for {session.event['EventName']} Race. Skipping.")
                continue

            for idx, result in session.results.iterrows():
                driver_abbr = result["Abbreviation"]
                avg_lap_time_sec = np.nan

                # Calculate average lap time from completed laps
                try:
                    driver_laps = session.laps.pick_driver(driver_abbr)
                    if not driver_laps.empty:
                        if 'LapTime' in driver_laps.columns:
                            valid_laps = pd.to_timedelta(driver_laps['LapTime'], errors='coerce').dropna()
                            if not valid_laps.empty:
                                avg_lap_time = valid_laps.mean()
                                avg_lap_time_sec = avg_lap_time.total_seconds()
                        else:
                            logging.debug(f"      No 'LapTime' column for {driver_abbr} in {event['EventName']}")

                except Exception as lap_err:
                     logging.warning(f"      Could not calculate AvgLapTime for {driver_abbr}: {lap_err}")


                grid_pos = pd.to_numeric(result["GridPosition"], errors='coerce')
                finish_pos = pd.to_numeric(result["Position"], errors='coerce')

                positions_gained = np.nan
                if pd.notna(grid_pos) and pd.notna(finish_pos) and grid_pos > 0:
                    positions_gained = grid_pos - finish_pos

                all_seasons_race_stats.append({
                    "Season": year,
                    "Round": event['RoundNumber'],
                    "EventName": event['EventName'],
                    "Driver": driver_abbr,
                    "DriverId": result["DriverId"],
                    "TeamId": result["TeamId"],
                    "GridPosition": grid_pos,
                    "FinishPosition": finish_pos,
                    "PositionsGained": positions_gained,
                    "PointsGained": result["Points"],
                    "AvgLapTimeSec": avg_lap_time_sec,
                    "Status": result["Status"],
                })
            logging.info(f"    -> Processed {len(session.results)} drivers for the race.")

        except ff1.ErgastError as ergast_err:
             logging.error(f"    --> Ergast/API Error processing {event.get('EventName', 'Unknown Event')} Race ({year}): {ergast_err}. Check API status or data availability.")
        except Exception as e:
            logging.error(f"    --> UNEXPECTED ERROR processing {event.get('EventName', 'Unknown Event')} Race ({year}): {e}")
        finally:
            if 'session' in locals():
                del session


    # --- Collect Final Standings for the Season ---
    logging.info(f"Attempting to fetch final driver standings for {year}...")
    try:
        standings_data = ergast.get_driver_standings(season=year)

        if standings_data and len(standings_data.content) != 0:
            standings_df = standings_data.content[0]
            standings_df['Season'] = year
            all_seasons_standings.append(standings_df)
            logging.info(f"  -> Successfully fetched final standings for {year}.")
        else:
             logging.warning(f"  -> No final standings data returned for {year}.")

    except Exception as e:
        logging.error(f"  -> ERROR fetching final standings for {year}: {e}")

INFO: Starting data collection from 2025 to 2025...
INFO: 
===== Processing Season: 2025 =====
INFO: Found 24 race events for 2025.
INFO:   Processing Round 1 - Australian Grand Prix...
core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
INFO: Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
INFO: Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO: Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
INFO: Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO: Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO: Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO: Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
IN

In [11]:
for year in range(START_YEAR, END_YEAR + 1):
    logging.info(f"\n===== Processing Season: {year} =====")
    try:
        schedule = ff1.get_event_schedule(year, include_testing=False)
        race_events = schedule[schedule['Session5'].str.lower() == 'race']
        logging.info(f"Found {len(race_events)} race events for {year}.")

    except Exception as e:
        logging.error(f"Could not fetch schedule for {year}: {e}")
        continue
    
    # --- Collect Final Standings for the Season ---
    logging.info(f"Attempting to fetch final driver standings for {year}...")
    try:
        standings_data = ergast.get_driver_standings(season=year)

        if standings_data and len(standings_data.content) != 0:
            standings_df = standings_data.content[0]
            standings_df['Season'] = year
            standings_df.rename(columns={
                'driverId': 'DriverId'
            }, inplace=True)
            all_seasons_standings.append(standings_df)
            logging.info(f"  -> Successfully fetched final standings for {year}.")
        else:
             logging.warning(f"  -> No final standings data returned for {year}.")

    except Exception as e:
        logging.error(f"  -> ERROR fetching final standings for {year}: {e}")

INFO: 
===== Processing Season: 2025 =====
INFO: Found 24 race events for 2025.
INFO: Attempting to fetch final driver standings for 2025...
INFO:   -> Successfully fetched final standings for 2025.


In [12]:
# --- Combine Collected Data ---
logging.info("\n===== Data Collection Complete =====")

# Combine race stats
if all_seasons_race_stats:
    race_df = pd.DataFrame(all_seasons_race_stats)
    logging.info(f"Total race records collected: {len(race_df)}")
    logging.info("Sample of combined race data:")
    print(race_df.head())
    logging.info("Race data types and missing values:")
    race_df.info()
else:
    logging.warning("No race data was collected.")
    race_df = pd.DataFrame()

# Combine standings stats
if all_seasons_standings:
    standings_df = pd.concat(all_seasons_standings, ignore_index=True)
    logging.info(f"\nTotal final standings records collected: {len(standings_df)}")
    logging.info("Sample of combined final standings data:")
    print(standings_df[['Season', 'position', 'points', 'wins', 'driverCode', 'constructorNames']].head())
    logging.info("Final standings data types and missing values:")
    standings_df.info()
else:
    logging.warning("No final standings data was collected.")
    standings_df = pd.DataFrame()

os.makedirs('csv', exist_ok=True)

# --- Optional: Save the collected data ---
if not race_df.empty:
    race_df.to_csv('csv/f1_race_data_raw_2025.csv', index=False)
    logging.info("\nRaw race data saved to f1_race_data_raw.csv")
if not standings_df.empty:
    standings_df.to_csv('csv/f1_standings_data_raw_2025.csv', index=False)
    logging.info("Raw standings data saved to f1_standings_data_raw.csv")

INFO: 
===== Data Collection Complete =====
INFO: Total race records collected: 120
INFO: Sample of combined race data:
INFO: Race data types and missing values:
INFO: 
Total final standings records collected: 40
INFO: Sample of combined final standings data:
INFO: Final standings data types and missing values:
INFO: 
Raw race data saved to f1_race_data_raw.csv
INFO: Raw standings data saved to f1_standings_data_raw.csv


   Season  Round              EventName Driver        DriverId    TeamId  \
0    2025      1  Australian Grand Prix    NOR          norris   mclaren   
1    2025      1  Australian Grand Prix    VER  max_verstappen  red_bull   
2    2025      1  Australian Grand Prix    RUS         russell  mercedes   
3    2025      1  Australian Grand Prix    ANT       antonelli  mercedes   
4    2025      1  Australian Grand Prix    ALB           albon  williams   

   GridPosition  FinishPosition  PositionsGained  PointsGained  AvgLapTimeSec  \
0           1.0             1.0              0.0          25.0     103.428301   
1           3.0             2.0              1.0          18.0     103.341150   
2           4.0             3.0              1.0          15.0     103.686339   
3          16.0             4.0             12.0          12.0     104.579370   
4           6.0             5.0              1.0          10.0     104.672388   

     Status  
0  Finished  
1  Finished  
2  Finished  
