# Full Dataset Collection (2018-2024)

Collect all race data across 7 years. This will take 30-60 minutes on first run.

In [1]:
import sys
sys.path.append('../src')

from data_collection import F1DataCollector
import time

print("F1 Data Collection - 2018 to 2024")
print("="*60)

2025-11-14 18:30:37,710 - INFO - FastF1 cache enabled at: /Users/ritwik/Projects/Vantage/notebooks/../data/cache


F1 Data Collection - 2018 to 2024


## Initialize Collector

In [2]:
collector = F1DataCollector(start_year=2018, end_year=2024)

print("\nThis collection will:")
print("- Collect data from 2018-2024 (7 years)")
print("- Process ~150-160 races")
print("- Save intermediate results after each year")
print("- Take 30-60 minutes on first run")
print("- Be much faster on subsequent runs (cached)")
print("\nStarting collection...\n")

2025-11-14 18:30:39,290 - INFO - Initialized collector for years 2018-2024



This collection will:
- Collect data from 2018-2024 (7 years)
- Process ~150-160 races
- Save intermediate results after each year
- Take 30-60 minutes on first run
- Be much faster on subsequent runs (cached)

Starting collection...



## Run Full Collection

**Important Notes:**
- Watch for any error messages
- Intermediate files are saved after each year
- If interrupted, you can restart and it will use cached data
- Monitor memory usage - this creates a large dataset

In [3]:
start_time = time.time()

# THE BIG RUN
race_data = collector.collect_race_data(save_intermediate=True)

elapsed_time = time.time() - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)

print(f"\n\nCollection completed in {minutes}m {seconds}s")

2025-11-14 18:30:41,628 - INFO - 
2025-11-14 18:30:41,629 - INFO - Starting full data collection: 2018-2024

2025-11-14 18:30:41,630 - INFO - 
2025-11-14 18:30:41,631 - INFO - Processing 2018 season
2025-11-14 18:30:41,631 - INFO - Starting collection for 2018 season
2018 Season:   0%|          | 0/21 [00:00<?, ?it/s]core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
2025-11-14 18:30:42,054 - INFO - Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
2025-11-14 18:30:42,056 - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-11-14 18:30:42,057 - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-11-14 18:30:42,777 - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
2025-11-14 18:30:42,778 - INFO - Using cached data for lap_count
req            



Collection completed in 9m 27s


## Quick Data Check

In [4]:
if race_data is not None:
    print("\nQuick validation:")
    print(f"Total records: {len(race_data):,}")
    print(f"Shape: {race_data.shape}")
    print(f"Years: {sorted(race_data['year'].unique())}")
    print(f"Races: {race_data['race_name'].nunique()}")
    print(f"Drivers: {race_data['DriverNumber'].nunique()}")
    print(f"Circuits: {race_data['circuit'].nunique()}")
    print(f"\nMemory usage: {race_data.memory_usage(deep=True).sum() / (1024*1024):.2f} MB")
else:
    print("\n ERROR: No data collected!")


Quick validation:
Total records: 780
Shape: (780, 27)
Years: [np.int64(2018), np.int64(2019)]
Races: 21
Drivers: 26
Circuits: 21

Memory usage: 0.75 MB


## Save Complete Dataset

In [5]:
if race_data is not None:
    print("Saving complete dataset...\n")
    collector.save_data(race_data, 'race_data', create_backup=True)
    print("\n SUCCESS! Dataset saved.")
else:
    print("ERROR: Cannot save - no data collected")

2025-11-14 18:43:24,214 - INFO - Saved 780 records:
2025-11-14 18:43:24,214 - INFO -   CSV: /Users/ritwik/Projects/Vantage/notebooks/../data/raw/race_data.csv (156.65 KB)
2025-11-14 18:43:24,214 - INFO -   Pickle: /Users/ritwik/Projects/Vantage/notebooks/../data/raw/race_data.pkl (197.23 KB)
2025-11-14 18:43:24,217 - INFO - Backup created in /Users/ritwik/Projects/Vantage/notebooks/../data/raw/backup


Saving complete dataset...


 SUCCESS! Dataset saved.


## Collection Summary

In [6]:
if race_data is not None:
    print("="*70)
    print("COLLECTION COMPLETE")
    print("="*70)
    print(f"\nTime elapsed: {minutes}m {seconds}s")
    print(f"Total records: {len(race_data):,}")
    print(f"\nFiles saved:")
    print("  - data/raw/race_data.csv")
    print("  - data/raw/race_data.pkl")
    print("  - data/raw/backup/race_data.csv (backup)")
    print("  - data/raw/backup/race_data.pkl (backup)")
    print(f"\nIntermediate files (by year):")
    for year in sorted(race_data['year'].unique()):
        print(f"  - data/raw/races_{year}.csv")
    print("\nNext steps:")
    print("  1. Run data verification notebook")
    print("  2. Check data quality")
    print("  3. Begin exploratory analysis")
    print("="*70)
else:
    print("="*70)
    print("COLLECTION FAILED")
    print("="*70)
    print("Check error messages above for details.")
    print("="*70)

COLLECTION COMPLETE

Time elapsed: 9m 27s
Total records: 780

Files saved:
  - data/raw/race_data.csv
  - data/raw/race_data.pkl
  - data/raw/backup/race_data.csv (backup)
  - data/raw/backup/race_data.pkl (backup)

Intermediate files (by year):
  - data/raw/races_2018.csv
  - data/raw/races_2019.csv

Next steps:
  1. Run data verification notebook
  2. Check data quality
  3. Begin exploratory analysis
