# FastF1 Data Exploration

Testing FastF1 library and exploring historical data availability across different years (2005-2024).

In [2]:
import fastf1
import pandas as pd
import os
from pathlib import Path

print(f"FastF1 version: {fastf1.__version__}")

FastF1 version: 3.6.1


In [3]:
# Enable FastF1 cache
cache_dir = Path('../data/cache')
cache_dir.mkdir(parents=True, exist_ok=True)

fastf1.Cache.enable_cache(str(cache_dir))
print(f"✓ Cache enabled at: {cache_dir.absolute()}")

✓ Cache enabled at: /Users/ritwik/Projects/Vantage/notebooks/../data/cache


## Test 1: Load Recent Race (2024)

In [4]:
# Load 2024 Bahrain GP
print("Loading 2024 Bahrain Grand Prix...")
session_2024 = fastf1.get_session(2024, 'Bahrain', 'R')
session_2024.load()

print("\n✓ 2024 session loaded successfully")
print(f"Event: {session_2024.event['EventName']}")
print(f"Date: {session_2024.event['EventDate']}")
print(f"Circuit: {session_2024.event['Location']}")

Loading 2024 Bahrain Grand Prix...


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']



✓ 2024 session loaded successfully
Event: Bahrain Grand Prix
Date: 2024-03-02 00:00:00
Circuit: Sakhir


In [5]:
# Inspect 2024 results
results_2024 = session_2024.results
print(f"\nResults shape: {results_2024.shape}")
print(f"\nAvailable columns: {list(results_2024.columns)}")
print("\nFirst few results:")
results_2024[['Position', 'GridPosition', 'DriverNumber', 'FullName', 'TeamName', 'Points', 'Status']].head(10)


Results shape: (20, 22)

Available columns: ['DriverNumber', 'BroadcastName', 'Abbreviation', 'DriverId', 'TeamName', 'TeamColor', 'TeamId', 'FirstName', 'LastName', 'FullName', 'HeadshotUrl', 'CountryCode', 'Position', 'ClassifiedPosition', 'GridPosition', 'Q1', 'Q2', 'Q3', 'Time', 'Status', 'Points', 'Laps']

First few results:


Unnamed: 0,Position,GridPosition,DriverNumber,FullName,TeamName,Points,Status
1,1.0,1.0,1,Max Verstappen,Red Bull Racing,26.0,Finished
11,2.0,5.0,11,Sergio Perez,Red Bull Racing,18.0,Finished
55,3.0,4.0,55,Carlos Sainz,Ferrari,15.0,Finished
16,4.0,2.0,16,Charles Leclerc,Ferrari,12.0,Finished
63,5.0,3.0,63,George Russell,Mercedes,10.0,Finished
4,6.0,7.0,4,Lando Norris,McLaren,8.0,Finished
44,7.0,9.0,44,Lewis Hamilton,Mercedes,6.0,Finished
81,8.0,8.0,81,Oscar Piastri,McLaren,4.0,Finished
14,9.0,6.0,14,Fernando Alonso,Aston Martin,2.0,Finished
18,10.0,12.0,18,Lance Stroll,Aston Martin,1.0,Finished


## Test 2: Load Mid-Era Race (2014 - Hybrid Era Start)

In [6]:
# Load 2014 Bahrain GP
print("Loading 2014 Bahrain Grand Prix...")
session_2014 = fastf1.get_session(2014, 'Bahrain', 'R')
session_2014.load()

print("\n✓ 2014 session loaded successfully")
print(f"Event: {session_2014.event['EventName']}")
print(f"Date: {session_2014.event['EventDate']}")

Loading 2014 Bahrain Grand Prix...


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
core           INFO 	Finished loading data for 22 drivers: ['44', '6', '11', '3', '27', '1', '19', '77', '14', '7', '26', '8', '4', '13', '10', '17', '22', '20', '21', '9', '25', '99']



✓ 2014 session loaded successfully
Event: Bahrain Grand Prix
Date: 2014-04-06 15:00:00


In [7]:
# Inspect 2014 results
results_2014 = session_2014.results
print(f"\nResults shape: {results_2014.shape}")
print("\nFirst few results:")
results_2014[['Position', 'GridPosition', 'DriverNumber', 'FullName', 'TeamName', 'Points', 'Status']].head(10)


Results shape: (22, 22)

First few results:


Unnamed: 0_level_0,Position,GridPosition,DriverNumber,FullName,TeamName,Points,Status
DriverNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
44,1.0,2.0,44,Lewis Hamilton,Mercedes,25.0,Finished
6,2.0,1.0,6,Nico Rosberg,Mercedes,18.0,Finished
11,3.0,4.0,11,Sergio Pérez,Force India,15.0,Finished
3,4.0,13.0,3,Daniel Ricciardo,Red Bull,12.0,Finished
27,5.0,11.0,27,Nico Hülkenberg,Force India,10.0,Finished
1,6.0,10.0,1,Sebastian Vettel,Red Bull,8.0,Finished
19,7.0,7.0,19,Felipe Massa,Williams,6.0,Finished
77,8.0,3.0,77,Valtteri Bottas,Williams,4.0,Finished
14,9.0,9.0,14,Fernando Alonso,Ferrari,2.0,Finished
7,10.0,5.0,7,Kimi Räikkönen,Ferrari,1.0,Finished


## Test 3: Load Earliest Race (2005 - V10 Era)

In [8]:
# Try loading 2005 data
print("Attempting to load 2005 Bahrain Grand Prix...")
try:
    session_2005 = fastf1.get_session(2005, 'Bahrain', 'R')
    session_2005.load()
    
    print("\n✓ 2005 session loaded successfully")
    print(f"Event: {session_2005.event['EventName']}")
    
    results_2005 = session_2005.results
    print(f"\nResults shape: {results_2005.shape}")
    print("\nFirst few results:")
    print(results_2005[['Position', 'GridPosition', 'DriverNumber', 'FullName', 'TeamName', 'Points', 'Status']].head(10))
    
except Exception as e:
    print(f"\n✗ 2005 data not available: {e}")
    print("\nNote: FastF1 may have limited data for years before 2018")

Attempting to load 2005 Bahrain Grand Prix...


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
core           INFO 	Finished loading data for 20 drivers: ['5', '16', '9', '17', '10', '7', '12', '14', '2', '18', '11', '20', '21', '3', '4', '8', '1', '6', '19', '15']



✓ 2005 session loaded successfully
Event: Bahrain Grand Prix

Results shape: (20, 22)

First few results:
              Position  GridPosition DriverNumber            FullName  \
DriverNumber                                                            
5                  1.0           1.0            5     Fernando Alonso   
16                 2.0           3.0           16        Jarno Trulli   
9                  3.0           9.0            9      Kimi Räikkönen   
17                 4.0           6.0           17     Ralf Schumacher   
10                 5.0           8.0           10    Pedro de la Rosa   
7                  6.0           5.0            7         Mark Webber   
12                 7.0          12.0           12        Felipe Massa   
14                 8.0          14.0           14     David Coulthard   
2                  9.0          20.0            2  Rubens Barrichello   
18                10.0          16.0           18      Tiago Monteiro   

              Te

## Test 4: Check Data Availability Across Years

In [9]:
# Test data availability for different years
test_years = [2005, 2010, 2014, 2018, 2020, 2022, 2024]
availability = []

for year in test_years:
    try:
        session = fastf1.get_session(year, 1, 'R')  # First race of season
        session.load()
        results = session.results
        
        availability.append({
            'Year': year,
            'Available': '✓',
            'Columns': len(results.columns),
            'Drivers': len(results),
            'Has_GridPosition': 'GridPosition' in results.columns,
            'Has_Position': 'Position' in results.columns
        })
    except Exception as e:
        availability.append({
            'Year': year,
            'Available': '✗',
            'Columns': 0,
            'Drivers': 0,
            'Has_GridPosition': False,
            'Has_Position': False
        })

df_availability = pd.DataFrame(availability)
print("\nData Availability by Year:")
print(df_availability)

core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


core           INFO 	Finished loading data for 20 drivers: ['6', '2', '5', '14', '7', '10', '15', '9', '16', '12', '3', '17', '11', '4', '19', '18', '20', '1', '8', '21']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
core           INFO 	Finished loading data for 24 drivers: ['8', '7', '2', '5', '4', '3', '1', '6', '15', '9', '11', '14', '17', '10', '19', '16', '18', '22', '21', '24', '12', '23', '25', '20']
core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
core           INFO 	Finished loading data for 22 drivers: ['6', '20', '22', '14', '77', '27', '7', '25', '26', '11', '99', '21', '4', '17', '8', '13', '9', '1', '44', '19', '10', '3']
core           INFO 	Loading data for Australi


Data Availability by Year:
   Year Available  Columns  Drivers  Has_GridPosition  Has_Position
0  2005         ✓       22       20              True          True
1  2010         ✓       22       24              True          True
2  2014         ✓       22       22              True          True
3  2018         ✓       22       20              True          True
4  2020         ✓       22       20              True          True
5  2022         ✓       22       20              True          True
6  2024         ✓       22       20              True          True


## Test 5: Explore 2024 Season Schedule

In [10]:
# Get 2024 season schedule
schedule_2024 = fastf1.get_event_schedule(2024)
print(f"2024 Season: {len(schedule_2024)} events")
print("\nEvent schedule:")
print(schedule_2024[['RoundNumber', 'EventName', 'Location', 'EventDate', 'EventFormat']])

2024 Season: 25 events

Event schedule:
    RoundNumber                  EventName           Location  EventDate  \
0             0         Pre-Season Testing             Sakhir 2024-02-23   
1             1         Bahrain Grand Prix             Sakhir 2024-03-02   
2             2   Saudi Arabian Grand Prix             Jeddah 2024-03-09   
3             3      Australian Grand Prix          Melbourne 2024-03-24   
4             4        Japanese Grand Prix             Suzuka 2024-04-07   
5             5         Chinese Grand Prix           Shanghai 2024-04-21   
6             6           Miami Grand Prix              Miami 2024-05-05   
7             7  Emilia Romagna Grand Prix              Imola 2024-05-19   
8             8          Monaco Grand Prix             Monaco 2024-05-26   
9             9        Canadian Grand Prix           Montréal 2024-06-09   
10           10         Spanish Grand Prix          Barcelona 2024-06-23   
11           11        Austrian Grand Prix      

## Summary & Next Steps

In [11]:
print("="*60)
print("FastF1 Exploration Summary")
print("="*60)
print("\nKey Findings:")
print("✓ FastF1 cache configured and working")
print("✓ Recent data (2018-2024) fully available")
print("✓ Essential columns present: GridPosition, Position, Status")
print("\nData Availability:")
print("- Full data: 2018-2024 (~7 years)")
print("- Limited data: 2005-2017 (may vary by source)")
print("\nNext Steps:")
print("1. Design data collection strategy")
print("2. Handle missing data for older years")
print("3. Create robust collection pipeline")
print("4. Implement progress tracking for 20 years of data")
print("="*60)

FastF1 Exploration Summary

Key Findings:
✓ FastF1 cache configured and working
✓ Recent data (2018-2024) fully available
✓ Essential columns present: GridPosition, Position, Status

Data Availability:
- Full data: 2018-2024 (~7 years)
- Limited data: 2005-2017 (may vary by source)

Next Steps:
1. Design data collection strategy
2. Handle missing data for older years
3. Create robust collection pipeline
4. Implement progress tracking for 20 years of data
