# Full Dataset Collection (2018-2024)

Collect all race data across 7 years (2018-2024 inclusive). This will take 30-60 minutes on the first run; subsequent runs are much faster because results are cached.

In [None]:
import sys
sys.path.append('../src')

from data_collection import F1DataCollector
import time

# Start-up banner: title line followed by a 60-character rule.
print("F1 Data Collection - 2018 to 2024", "=" * 60, sep="\n")

## Initialize Collector

In [None]:
# Create the collector for the 2018 through 2024 seasons; the later cells
# reuse this `collector` object to run the collection and save the result.
collector = F1DataCollector(start_year=2018, end_year=2024)

# Announce what the upcoming (long-running) step is expected to do.
# All lines are emitted by one call, separated by newlines.
print(
    "\nThis collection will:",
    "- Collect data from 2018-2024 (7 years)",
    "- Process ~150-160 races",
    "- Save intermediate results after each year",
    "- Take 30-60 minutes on first run",
    "- Be much faster on subsequent runs (cached)",
    "\nStarting collection...\n",
    sep="\n",
)

## Run Full Collection

**Important Notes:**
- Watch for any error messages
- Intermediate files are saved after each year
- If interrupted, you can restart and it will use cached data
- Monitor memory usage - this creates a large dataset

In [None]:
# Clear any result left over from a previous execution of this cell so that,
# if the collection below raises, the later `race_data is not None` checks
# report a failure instead of silently validating and saving stale data.
race_data = None

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed (or go negative) if the system clock is adjusted during the run.
start_time = time.perf_counter()

# THE BIG RUN
# NOTE(review): save_intermediate=True is expected to write per-year files as
# the collection progresses - confirm against F1DataCollector.
race_data = collector.collect_race_data(save_intermediate=True)

elapsed_time = time.perf_counter() - start_time

# Split the whole elapsed seconds into minutes and leftover seconds.
# `minutes` and `seconds` are read again by the summary cell.
minutes, seconds = divmod(int(elapsed_time), 60)

print(f"\n\nCollection completed in {minutes}m {seconds}s")

## Quick Data Check

In [None]:
# Sanity-check the collected data: report its size, the distinct values of
# the key columns, and its in-memory footprint. Nothing is modified here.
if race_data is None:
    # The collection step produced no result at all.
    print("\n ERROR: No data collected!")
else:
    print("\nQuick validation:")
    print(f"Total records: {len(race_data):,}")
    print(f"Shape: {race_data.shape}")
    print(f"Years: {sorted(race_data['year'].unique())}")
    print(f"Races: {race_data['race_name'].nunique()}")
    print(f"Drivers: {race_data['DriverNumber'].nunique()}")
    print(f"Circuits: {race_data['circuit'].nunique()}")
    # deep=True asks for the deep memory figure; bytes are converted to MB.
    print(f"\nMemory usage: {race_data.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB")

## Save Complete Dataset

In [None]:
# Persist the full dataset through the collector, but only when the
# collection step actually produced a result.
if race_data is None:
    print("ERROR: Cannot save - no data collected")
else:
    print("Saving complete dataset...\n")
    # create_backup=True is passed through to the collector's save routine.
    collector.save_data(race_data, 'race_data', create_backup=True)
    print("\n SUCCESS! Dataset saved.")

## Collection Summary

In [None]:
# Final report for the run. On failure, print a short banner pointing at the
# earlier error output; on success, print timing, record count, the file
# locations this notebook reports, and suggested next steps.
if race_data is None:
    print(
        "=" * 70,
        "COLLECTION FAILED",
        "=" * 70,
        "Check error messages above for details.",
        "=" * 70,
        sep="\n",
    )
else:
    print("=" * 70, "COLLECTION COMPLETE", "=" * 70, sep="\n")

    # `minutes` / `seconds` come from the timing done in the collection cell.
    print(f"\nTime elapsed: {minutes}m {seconds}s")
    print(f"Total records: {len(race_data):,}")

    # NOTE(review): these paths are hard-coded text, not read back from the
    # collector - confirm they match where save_data actually writes.
    print(
        "\nFiles saved:",
        "  - data/raw/race_data.csv",
        "  - data/raw/race_data.pkl",
        "  - data/raw/backup/race_data.csv (backup)",
        "  - data/raw/backup/race_data.pkl (backup)",
        "\nIntermediate files (by year):",
        sep="\n",
    )

    # One line per distinct year present in the collected data.
    for year in sorted(race_data['year'].unique()):
        print(f"  - data/raw/races_{year}.csv")

    print(
        "\nNext steps:",
        "  1. Run data verification notebook",
        "  2. Check data quality",
        "  3. Begin exploratory analysis",
        "=" * 70,
        sep="\n",
    )