# 2024 Season Data Collection

Collects the full 2024 season as a trial run before launching the complete seven-year (2018–2024) data collection.

In [None]:
import sys
sys.path.append('../src')

from data_collection import F1DataCollector
import pandas as pd
import time

# Lift pandas' column display limit so printed DataFrames show every column
pd.set_option('display.max_columns', None)

## Initialize Collector

In [None]:
# Instantiate the collector that the collection and save cells call into
collector = F1DataCollector()

# Announce what to expect before the long-running collection cell
startup_notes = (
    "Starting 2024 season collection...",
    "This will take a few minutes on first run.",
    "Subsequent runs will be much faster due to caching.\n",
)
print(*startup_notes, sep="\n")

## Collect 2024 Season

In [None]:
# Time the collection with a monotonic, high-resolution clock so the
# measurement cannot be skewed by system clock adjustments mid-run.
start_time = time.perf_counter()

# Collect all 2024 races (called with include_sprints=False)
data_2024 = collector.collect_season_data(2024, include_sprints=False)

# Split the elapsed time into whole minutes and leftover whole seconds
elapsed_time = time.perf_counter() - start_time
minutes, seconds = divmod(int(elapsed_time), 60)

print(f"\nCollection completed in {minutes}m {seconds}s")

## Examine the Results

In [None]:
# Headline size figures for the collected frame
record_count = len(data_2024)
print(f"Total records: {record_count:,}")

frame_shape = data_2024.shape
print(f"Shape: {frame_shape}")

# deep=True also measures the contents of object (string) columns
memory_kb = data_2024.memory_usage(deep=True).sum() / 1024
print(f"Memory usage: {memory_kb:.2f} KB")

In [None]:
# Count the distinct races and drivers present in the collected season
num_races = data_2024['race_name'].nunique()
num_drivers = data_2024['DriverNumber'].nunique()

print(f"\nRaces collected: {num_races}")
print(f"Unique drivers: {num_drivers}")

# Guard the average: an empty collection has zero races and would otherwise
# raise ZeroDivisionError here.
if num_races:
    print(f"Average drivers per race: {len(data_2024) / num_races:.1f}")
else:
    print("Average drivers per race: n/a (no races collected)")

In [None]:
# One row per race, ordered by round number
print("\nRaces in 2024 season:")
race_columns = ['round', 'race_name', 'circuit', 'date']
races = (
    data_2024[race_columns]
    .drop_duplicates()
    .sort_values('round')
)
print(races.to_string(index=False))

## Data Quality Checks

In [None]:
# Null counts for the columns the analysis cells below read
key_columns = ['Position', 'GridPosition', 'FullName', 'TeamName', 'Points', 'Status']
print("Missing values in key columns:")
missing_counts = data_2024[key_columns].isna().sum()
print(missing_counts)

In [None]:
# DNF statistics
#
# A driver counts as a finisher when Status is 'Finished', 'Lapped', or a
# lap-deficit marker such as '+1 Lap'. Comparing against 'Finished' alone
# would count every lapped-but-classified car as a DNF and inflate the rate.
# NOTE(review): assumes Status follows the FastF1/Ergast vocabulary — confirm
# against what F1DataCollector actually stores.
status = data_2024['Status'].astype(str).str.strip()
is_finisher = status.isin(['Finished', 'Lapped']) | status.str.startswith('+')

total_entries = len(data_2024)
finished = int(is_finisher.sum())
dnf = total_entries - finished
# Avoid ZeroDivisionError when nothing was collected
dnf_rate = (dnf / total_entries) * 100 if total_entries else 0.0

print("\nDNF Statistics:")
print(f"Total entries: {total_entries}")
print(f"Finished: {finished}")
print(f"DNFs: {dnf}")
print(f"DNF rate: {dnf_rate:.1f}%")

In [None]:
# Most common DNF reasons
#
# Exclude classified finishers before ranking: besides 'Finished', lapped cars
# carry statuses such as '+1 Lap' or 'Lapped', which are not retirement
# reasons and would otherwise dominate this list.
# NOTE(review): assumes Status follows the FastF1/Ergast vocabulary — confirm
# against what F1DataCollector actually stores.
status = data_2024['Status'].astype(str).str.strip()
is_finisher = status.isin(['Finished', 'Lapped']) | status.str.startswith('+')

dnf_data = data_2024[~is_finisher]
print("\nTop DNF reasons:")
print(dnf_data['Status'].value_counts().head(10))

## Quick Analysis

In [None]:
# Race winners
#
# Compare numerically: Position may be stored as a string ('1'), an int, or a
# float (1.0) depending on the collector, and a plain == '1' silently matches
# nothing when the column is numeric. Unparseable values become NaN and are
# simply not selected.
finish_position = pd.to_numeric(data_2024['Position'], errors='coerce')
winners = data_2024[finish_position == 1][['race_name', 'FullName', 'TeamName']]
print("\n2024 Race Winners:")
print(winners.to_string(index=False))

In [None]:
# Tally victories per driver from the winners table
winning_drivers = winners['FullName']
win_counts = winning_drivers.value_counts()
print("\nWins by driver:", win_counts, sep="\n")

In [None]:
# Pole to win conversion
#
# Compare numerically so the filter works whether the position columns are
# stored as strings ('1'), ints, or floats (1.0); a plain == '1' silently
# matches nothing when the columns are numeric.
finish_position = pd.to_numeric(data_2024['Position'], errors='coerce')
grid_position = pd.to_numeric(data_2024['GridPosition'], errors='coerce')
pole_winners = data_2024[(finish_position == 1) & (grid_position == 1)]

# Guard against an empty collection (zero races) to avoid ZeroDivisionError
pole_win_rate = (len(pole_winners) / num_races) * 100 if num_races else 0.0

print("\nPole position to win conversion:")
print(f"Pole winners: {len(pole_winners)} / {num_races}")
print(f"Conversion rate: {pole_win_rate:.1f}%")

## Save the Data

In [None]:
# Hand the season frame to the collector's save routine under a CSV file name
output_name = 'races_2024.csv'
collector.save_data(data_2024, output_name)

print("\nData saved successfully!")

## Summary

In [None]:
# Final recap of the run; every figure comes from values computed in earlier cells
banner = "=" * 70

print(banner)
print("2024 Season Collection Summary")
print(banner)
print(f"\n✓ Collection time: {minutes}m {seconds}s")
print(f"✓ Races collected: {num_races}")
print(f"✓ Total records: {len(data_2024):,}")

# Finish rate is the complement of the DNF rate, expressed as a percentage
finish_rate = (1 - dnf_rate/100) * 100
print(f"✓ Data quality: {finish_rate:.1f}% finish rate")
print("✓ File saved: data/raw/races_2024.csv")
print("\nReady for full 2018-2024 collection!")

# Naive extrapolation: seven seasons at this run's pace, converted to minutes
estimated_minutes = elapsed_time * 7 / 60
print(f"Estimated time for 7 years: ~{estimated_minutes:.0f} minutes")
print(banner)