# FastF1 Data Exploration

Testing the FastF1 library and exploring how much historical data is available across seasons from 2005 to 2024.

In [None]:
import fastf1
import pandas as pd
import os
from pathlib import Path

# Show which FastF1 release produced the outputs in this notebook.
fastf1_version = fastf1.__version__
print(f"FastF1 version: {fastf1_version}")

In [None]:
# Set up FastF1's on-disk cache. The directory is created on first use and
# the path is relative to the notebook's working directory.
cache_dir = Path('../data/cache')
cache_dir.mkdir(exist_ok=True, parents=True)
fastf1.Cache.enable_cache(str(cache_dir))

# Report the absolute location so it is clear where the cache files live.
print(f"✓ Cache enabled at: {cache_dir.absolute()}")

## Test 1: Load Recent Race (2024)

In [None]:
# Fetch the race session ('R') of the 2024 Bahrain Grand Prix, then print a
# few event fields to confirm that the load worked.
print("Loading 2024 Bahrain Grand Prix...")
session_2024 = fastf1.get_session(2024, 'Bahrain', 'R')
session_2024.load()
print("\n✓ 2024 session loaded successfully")

# (printed label, key in the session's event record)
for label, field in (
    ("Event", 'EventName'),
    ("Date", 'EventDate'),
    ("Circuit", 'Location'),
):
    print(f"{label}: {session_2024.event[field]}")

In [None]:
# Look at the classification table of the 2024 race: its dimensions, every
# column it offers, and a preview of the columns of interest.
results_2024 = session_2024.results
overview_columns = [
    'Position',
    'GridPosition',
    'DriverNumber',
    'FullName',
    'TeamName',
    'Points',
    'Status',
]

print(f"\nResults shape: {results_2024.shape}")
print(f"\nAvailable columns: {[*results_2024.columns]}")
print("\nFirst few results:")
# Left as a bare expression so the notebook renders it as the cell output.
results_2024[overview_columns].head(10)

## Test 2: Load Mid-Era Race (2014 - Hybrid Era Start)

In [None]:
# Repeat the load for the race session ('R') of the 2014 Bahrain Grand Prix
# and print basic event fields as a sanity check.
print("Loading 2014 Bahrain Grand Prix...")
session_2014 = fastf1.get_session(2014, 'Bahrain', 'R')
session_2014.load()
print("\n✓ 2014 session loaded successfully")

event_2014 = session_2014.event
print(f"Event: {event_2014['EventName']}")
print(f"Date: {event_2014['EventDate']}")

In [None]:
# Look at the classification table of the 2014 race: its dimensions and a
# preview of the columns of interest.
results_2014 = session_2014.results
overview_columns = [
    'Position',
    'GridPosition',
    'DriverNumber',
    'FullName',
    'TeamName',
    'Points',
    'Status',
]

print(f"\nResults shape: {results_2014.shape}")
print("\nFirst few results:")
# Left as a bare expression so the notebook renders it as the cell output.
results_2014[overview_columns].head(10)

## Test 3: Load Earliest Race (2005 - V10 Era)

In [None]:
# Try loading 2005 data.
#
# Only the fetch/load calls are guarded: the "not available" message should
# describe a failed load, not an unrelated error raised while inspecting the
# result afterwards.
print("Attempting to load 2005 Bahrain Grand Prix...")
try:
    session_2005 = fastf1.get_session(2005, 'Bahrain', 'R')
    session_2005.load()
except Exception as e:
    # Broad on purpose: this is an exploratory, best-effort cell and the
    # concrete exception types raised for missing seasons are not known here.
    print(f"\n✗ 2005 data not available: {e}")
    print("\nNote: FastF1 may have limited data for years before 2018")
else:
    print("\n✓ 2005 session loaded successfully")
    print(f"Event: {session_2005.event['EventName']}")

    results_2005 = session_2005.results
    print(f"\nResults shape: {results_2005.shape}")

    # Older seasons may not provide every column; show the ones that exist
    # and name the ones that do not instead of failing on a KeyError.
    wanted_columns = ['Position', 'GridPosition', 'DriverNumber', 'FullName',
                      'TeamName', 'Points', 'Status']
    present_columns = [c for c in wanted_columns if c in results_2005.columns]
    missing_columns = [c for c in wanted_columns if c not in results_2005.columns]
    if missing_columns:
        print(f"\nColumns missing from 2005 results: {missing_columns}")

    print("\nFirst few results:")
    print(results_2005[present_columns].head(10))

## Test 4: Check Data Availability Across Years

In [None]:
# Test data availability for different years.
#
# For each sampled season the first race is loaded and one summary row is
# recorded. A row starts out describing an unavailable season and is only
# upgraded when the load succeeds and actually returns result rows. When a
# load fails, the reason is kept in the 'Error' column instead of being
# discarded.
test_years = [2005, 2010, 2014, 2018, 2020, 2022, 2024]
availability = []

for year in test_years:
    row = {
        'Year': year,
        'Available': '✗',
        'Columns': 0,
        'Drivers': 0,
        'Has_GridPosition': False,
        'Has_Position': False,
        'Error': '',
    }
    try:
        session = fastf1.get_session(year, 1, 'R')  # First race of season
        session.load()
        results = session.results
    except Exception as e:
        # Broad on purpose (best-effort survey), but keep the cause visible.
        row['Error'] = f"{type(e).__name__}: {e}"
    else:
        row['Columns'] = len(results.columns)
        row['Drivers'] = len(results)
        row['Has_GridPosition'] = 'GridPosition' in results.columns
        row['Has_Position'] = 'Position' in results.columns
        if len(results) > 0:
            row['Available'] = '✓'
        else:
            # A load that completes without any result rows is not usable
            # data, so it must not be reported as available.
            row['Error'] = 'session loaded but results are empty'
    availability.append(row)

df_availability = pd.DataFrame(availability)
print("\nData Availability by Year:")
print(df_availability)

## Test 5: Explore 2024 Season Schedule

In [None]:
# Fetch the 2024 calendar and print a compact view of it.
# NOTE(review): the event count may include pre-season testing entries,
# depending on FastF1's schedule defaults — confirm before relying on it.
schedule_2024 = fastf1.get_event_schedule(2024)
schedule_columns = [
    'RoundNumber',
    'EventName',
    'Location',
    'EventDate',
    'EventFormat',
]

print(f"2024 Season: {len(schedule_2024)} events")
print("\nEvent schedule:")
print(schedule_2024[schedule_columns])

## Summary & Next Steps

In [None]:
# Print the hand-written conclusions of this exploration. The text is
# static; it is not computed from the cells above.
banner = "=" * 60
summary_lines = (
    banner,
    "FastF1 Exploration Summary",
    banner,
    "\nKey Findings:",
    "✓ FastF1 cache configured and working",
    "✓ Recent data (2018-2024) fully available",
    "✓ Essential columns present: GridPosition, Position, Status",
    "\nData Availability:",
    "- Full data: 2018-2024 (~7 years)",
    "- Limited data: 2005-2017 (may vary by source)",
    "\nNext Steps:",
    "1. Design data collection strategy",
    "2. Handle missing data for older years",
    "3. Create robust collection pipeline",
    "4. Implement progress tracking for 20 years of data",
    banner,
)
for line in summary_lines:
    print(line)