# 2024 Season Data Collection

This notebook collects the full 2024 season as a test run before gathering the complete 7-year (2018-2024) dataset.

In [None]:
import sys
sys.path.append('../src')

from data_collection import F1DataCollector
import pandas as pd
import time

# Notebook display settings: no cap on printed columns, at most 100 printed rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## Initialize Collector

In [None]:
# Create the collector used by the collection and save cells below.
collector = F1DataCollector()

# Tell the reader what to expect before the long-running cell starts.
for notice in (
    "Starting 2024 season collection...",
    "This will take a few minutes on first run.",
    "Subsequent runs will be much faster due to caching.\n",
):
    print(notice)

## Collect 2024 Season

In [None]:
# Time the season collection; the summary cell reports and extrapolates it.
start_time = time.time()
data_2024 = collector.collect_season_data(2024, include_sprints=False)
elapsed_time = time.time() - start_time

# Split the elapsed seconds into whole minutes and leftover whole seconds.
whole_minutes, leftover_seconds = divmod(elapsed_time, 60)
minutes = int(whole_minutes)
seconds = int(leftover_seconds)

print(f"\nCollection completed in {minutes}m {seconds}s")

## Examine the Results

In [None]:
# Headline size figures for the collected frame.
record_count = len(data_2024)
frame_shape = data_2024.shape
bytes_used = data_2024.memory_usage(deep=True).sum()

print(f"Total records: {record_count:,}")
print(f"Shape: {frame_shape}")
print(f"Memory usage: {bytes_used / 1024:.2f} KB")

In [None]:
# Count distinct races and drivers; num_races is reused by later cells.
num_races = data_2024['race_name'].nunique()
num_drivers = data_2024['DriverNumber'].nunique()

print(f"\nRaces collected: {num_races}")
print(f"Unique drivers: {num_drivers}")
# Guard the division: an empty collection (zero races) would otherwise raise
# ZeroDivisionError and stop the notebook at this cell.
if num_races:
    print(f"Average drivers per race: {len(data_2024) / num_races:.1f}")
else:
    print("Average drivers per race: n/a (no races collected)")

In [None]:
# One line per race, ordered by round number.
print("\nRaces in 2024 season:")
race_columns = ['round', 'race_name', 'circuit', 'date']
unique_races = data_2024[race_columns].drop_duplicates()
races = unique_races.sort_values(by='round')
print(races.to_string(index=False))

## Data Quality Checks

In [None]:
# Count missing entries in each of the core result columns.
key_columns = ['Position', 'GridPosition', 'FullName', 'TeamName', 'Points', 'Status']
print("Missing values in key columns:")
missing_per_column = data_2024[key_columns].isna().sum()
print(missing_per_column)

In [None]:
# Inspect the dtypes and distinct values of the two position columns; the
# winner and pole comparisons further down depend on how they are stored.
print("\nData types:")
print(f"Position: {data_2024['Position'].dtype}")
print(f"GridPosition: {data_2024['GridPosition'].dtype}")

print("\nUnique Position values (first 25):")
# Drop missing values before sorting, exactly as for GridPosition below: NaN
# does not order against numbers, so sorted() would give an unreliable result.
print(sorted(data_2024['Position'].dropna().unique())[:25])

print("\nUnique GridPosition values (first 25):")
print(sorted(data_2024['GridPosition'].dropna().unique())[:25])

In [None]:
# Preview round 1: the first five result rows with position, driver and team.
preview_columns = ['Position', 'GridPosition', 'FullName', 'TeamName']
print("\nFirst race winner (all columns):")
is_round_one = data_2024['round'] == 1
first_race = data_2024[is_round_one]
print(first_race[preview_columns].head(5))

In [None]:
# DNF statistics for the season.
# A car counts as a finisher when it was classified: Status 'Finished', a
# lapped classification such as '+1 Lap', or 'Lapped'. Comparing against
# 'Finished' alone counts every lapped finisher as a DNF and inflates the rate.
# NOTE(review): assumes FastF1/Ergast-style Status values -- confirm against
# what F1DataCollector actually stores.
status = data_2024['Status'].astype(str)
classified = (status == 'Finished') | (status == 'Lapped') | status.str.startswith('+')

total_entries = len(data_2024)
finished = classified.sum()
dnf = total_entries - finished
# Report 0.0% rather than NaN when nothing was collected.
dnf_rate = (dnf / total_entries) * 100 if total_entries else 0.0

print("DNF Statistics:")
print(f"Total entries: {total_entries}")
print(f"Finished: {finished}")
print(f"DNFs: {dnf}")
print(f"DNF rate: {dnf_rate:.1f}%")

In [None]:
# List the most common reasons for not finishing.
# Lapped classifications ('+1 Lap', 'Lapped', ...) are finishes, not
# retirements, so they are excluded along with 'Finished'.
# NOTE(review): assumes FastF1/Ergast-style Status values -- confirm against
# what F1DataCollector actually stores.
status = data_2024['Status'].astype(str)
is_classified = (status == 'Finished') | (status == 'Lapped') | status.str.startswith('+')

dnf_data = data_2024[~is_classified]
if len(dnf_data) > 0:
    print("\nTop DNF reasons:")
    print(dnf_data['Status'].value_counts().head(10))

## Quick Analysis

In [None]:
# Compare three ways of picking out race winners (Position == 1), because the
# dtype of 'Position' depends on how the collector stored it.
print("Testing winner detection methods:\n")

# Coerce once up front: missing or non-numeric positions become NaN instead of
# making astype(float) / astype(int) raise and abort the cell before `winners`
# is defined.
position_numeric = pd.to_numeric(data_2024['Position'], errors='coerce')

# Method 1: String comparison (only matches when positions are stored as '1';
# float positions render as '1.0')
winners_str = data_2024[data_2024['Position'].astype(str) == '1']
print(f"Method 1 (string '1'): {len(winners_str)} winners")

# Method 2: Float comparison (NaN never equals 1.0, so it needs no special case)
winners_float = data_2024[position_numeric == 1.0]
print(f"Method 2 (float 1.0): {len(winners_float)} winners")

# Method 3: Int comparison. NaN cannot be cast to int, so map it first to a
# sentinel that can never be a winning position.
winners_int = data_2024[position_numeric.fillna(-1).astype(int) == 1]
print(f"Method 3 (int 1): {len(winners_int)} winners")

# Use the one that works
winners = winners_float if len(winners_float) > 0 else winners_str
print(f"\nUsing method with {len(winners)} winners\n")

In [None]:
# Table of each race's winner with team and starting grid position.
winner_columns = ['race_name', 'FullName', 'TeamName', 'GridPosition']
print("2024 Race Winners:")
winners_table = winners[winner_columns].to_string(index=False)
print(winners_table)

In [None]:
# Tally victories per driver name.
win_counts = winners['FullName'].value_counts()
print("\nWins by driver:", win_counts, sep="\n")

In [None]:
# Work out which winners started from pole, trying two comparison styles
# because the stored dtype of 'GridPosition' is not known up front.
print("Testing pole position detection:\n")

# Show the raw grid values and dtype for the winning rows
print("GridPosition values for winners:")
winner_grid = winners['GridPosition']
print(winner_grid.unique())
print(f"Type: {winner_grid.dtype}")

# Comparison 1: grid position rendered as text
started_first_str = winner_grid.astype(str) == '1'
pole_str = winners[started_first_str]
print(f"\nPole winners (string comparison): {len(pole_str)}")

# Comparison 2: grid position as a float
started_first_float = winner_grid.astype(float) == 1.0
pole_float = winners[started_first_float]
print(f"Pole winners (float comparison): {len(pole_float)}")

# Prefer the float result; fall back to the string result when it is empty
pole_winners = pole_float if len(pole_float) else pole_str

In [None]:
# Share of races won from pole position.
# Guard the division so an empty collection reports 0.0% instead of raising
# ZeroDivisionError; pole_win_rate stays a float for the summary cell.
pole_win_rate = (len(pole_winners) / num_races) * 100 if num_races else 0.0

print("\nPole position to win conversion:")
print(f"Pole winners: {len(pole_winners)} / {num_races}")
print(f"Conversion rate: {pole_win_rate:.1f}%")

# Only list the races when at least one was won from pole.
if len(pole_winners) > 0:
    print("\nPole winners:")
    print(pole_winners[['race_name', 'FullName']].to_string(index=False))

## Save the Data

In [None]:
# Hand the season frame to the collector for saving, then confirm on stdout.
output_name = 'races_2024.csv'
collector.save_data(data_2024, output_name)
print("Data saved successfully!")

## Summary

In [None]:
# Final recap of the run: timing, volume, headline statistics, and a rough
# extrapolation of this run's duration to seven seasons.
banner = "=" * 70
estimated_minutes = elapsed_time * 7 / 60
summary_lines = [
    banner,
    "2024 Season Collection Summary",
    banner,
    f"\n✓ Collection time: {minutes}m {seconds}s",
    f"✓ Races collected: {num_races}",
    f"✓ Total records: {len(data_2024):,}",
    f"✓ Winners found: {len(winners)}",
    f"✓ Pole-to-win rate: {pole_win_rate:.1f}%",
    f"✓ DNF rate: {dnf_rate:.1f}%",
    "✓ File saved: data/raw/races_2024.csv",
    "\nReady for full 2018-2024 collection!",
    f"Estimated time for 7 years: ~{estimated_minutes:.0f} minutes",
    banner,
]
print("\n".join(summary_lines))