# 2024 Season Data Collection

Full season collection test before running the complete 7-year data gathering.

In [2]:
import sys
sys.path.append('../src')

from data_collection import F1DataCollector
import pandas as pd
import time

pd.set_option('display.max_columns', None)

## Initialize Collector

In [3]:
# Create collector instance
collector = F1DataCollector()

print("Starting 2024 season collection...")
print("This will take a few minutes on first run.")
print("Subsequent runs will be much faster due to caching.\n")

2025-11-14 17:55:05,273 - INFO - Initialized collector for years 2018-2024


Starting 2024 season collection...
This will take a few minutes on first run.
Subsequent runs will be much faster due to caching.



## Collect 2024 Season

In [4]:
# Track collection time
start_time = time.time()

# Collect all 2024 races
data_2024 = collector.collect_season_data(2024, include_sprints=False)

# Calculate elapsed time
elapsed_time = time.time() - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)

print(f"\nCollection completed in {minutes}m {seconds}s")

2025-11-14 17:55:11,373 - INFO - Starting collection for 2024 season
2024 Season:   0%|          | 0/24 [00:00<?, ?it/s]core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
2025-11-14 17:55:11,737 - INFO - Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
2025-11-14 17:55:11,738 - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-11-14 17:55:11,739 - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-11-14 17:55:11,753 - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
2025-11-14 17:55:11,754 - INFO - Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
2025-11-14 17:55:11,755 - INFO - Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
2025-11-14 17:55:11,7


Collection completed in 6m 48s


## Examine the Results

In [5]:
# Basic info
print(f"Total records: {len(data_2024):,}")
print(f"Shape: {data_2024.shape}")
print(f"Memory usage: {data_2024.memory_usage(deep=True).sum() / 1024:.2f} KB")

Total records: 479
Shape: (479, 27)
Memory usage: 526.46 KB


In [25]:
# Check number of races
num_races = data_2024['race_name'].nunique()
num_drivers = data_2024['DriverNumber'].nunique()

print(f"\nRaces collected: {num_races}")
print(f"Unique drivers: {num_drivers}")
print(f"Average drivers per race: {len(data_2024) / num_races:.1f}")


Races collected: 24
Unique drivers: 25
Average drivers per race: 20.0


In [26]:
# List all races
print("\nRaces in 2024 season:")
races = data_2024[['round', 'race_name', 'circuit', 'date']].drop_duplicates().sort_values('round')
print(races.to_string(index=False))


Races in 2024 season:
 round                 race_name           circuit       date
     1        Bahrain Grand Prix            Sakhir 2024-03-02
     2  Saudi Arabian Grand Prix            Jeddah 2024-03-09
     3     Australian Grand Prix         Melbourne 2024-03-24
     4       Japanese Grand Prix            Suzuka 2024-04-07
     5        Chinese Grand Prix          Shanghai 2024-04-21
     6          Miami Grand Prix             Miami 2024-05-05
     7 Emilia Romagna Grand Prix             Imola 2024-05-19
     8         Monaco Grand Prix            Monaco 2024-05-26
     9       Canadian Grand Prix          Montréal 2024-06-09
    10        Spanish Grand Prix         Barcelona 2024-06-23
    11       Austrian Grand Prix         Spielberg 2024-06-30
    12        British Grand Prix       Silverstone 2024-07-07
    13      Hungarian Grand Prix          Budapest 2024-07-21
    14        Belgian Grand Prix Spa-Francorchamps 2024-07-28
    15          Dutch Grand Prix         Zandvo

## Data Quality Checks

In [27]:
# Check for missing values
key_columns = ['Position', 'GridPosition', 'FullName', 'TeamName', 'Points', 'Status']
print("Missing values in key columns:")
print(data_2024[key_columns].isnull().sum())

Missing values in key columns:
Position        0
GridPosition    0
FullName        0
TeamName        0
Points          0
Status          0
dtype: int64


In [28]:
# DNF statistics
total_entries = len(data_2024)
finished = (data_2024['Status'] == 'Finished').sum()
dnf = total_entries - finished
dnf_rate = (dnf / total_entries) * 100

print(f"\nDNF Statistics:")
print(f"Total entries: {total_entries}")
print(f"Finished: {finished}")
print(f"DNFs: {dnf}")
print(f"DNF rate: {dnf_rate:.1f}%")


DNF Statistics:
Total entries: 479
Finished: 287
DNFs: 192
DNF rate: 40.1%


In [10]:
# Most common DNF reasons
dnf_data = data_2024[data_2024['Status'] != 'Finished']
print("\nTop DNF reasons:")
print(dnf_data['Status'].value_counts().head(10))


Top DNF reasons:
Status
Lapped           138
Retired           49
Did not start      3
Disqualified       2
Name: count, dtype: int64


## Quick Analysis

In [30]:
# Debug: Check Position column data type and sample values
print("Position column data type:", data_2024['Position'].dtype)
print("\nUnique Position values:")
print(sorted(data_2024['Position'].unique()))
print("\nSample of first place finishes:")
print(data_2024[data_2024['Position'] == 1][['race_name', 'Position', 'FullName', 'TeamName']].head())

Position column data type: float64

Unique Position values:
[np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0)]

Sample of first place finishes:
                   race_name  Position        FullName         TeamName
0         Bahrain Grand Prix       1.0  Max Verstappen  Red Bull Racing
20  Saudi Arabian Grand Prix       1.0  Max Verstappen  Red Bull Racing
40     Australian Grand Prix       1.0    Carlos Sainz          Ferrari
59       Japanese Grand Prix       1.0  Max Verstappen  Red Bull Racing
79        Chinese Grand Prix       1.0  Max Verstappen  Red Bull Racing


In [31]:
# Race winners
winners = data_2024[data_2024['Position'] == 1][['race_name', 'FullName', 'TeamName']]
print("\n2024 Race Winners:")
print(winners.to_string(index=False))


2024 Race Winners:
                race_name        FullName        TeamName
       Bahrain Grand Prix  Max Verstappen Red Bull Racing
 Saudi Arabian Grand Prix  Max Verstappen Red Bull Racing
    Australian Grand Prix    Carlos Sainz         Ferrari
      Japanese Grand Prix  Max Verstappen Red Bull Racing
       Chinese Grand Prix  Max Verstappen Red Bull Racing
         Miami Grand Prix    Lando Norris         McLaren
Emilia Romagna Grand Prix  Max Verstappen Red Bull Racing
        Monaco Grand Prix Charles Leclerc         Ferrari
      Canadian Grand Prix  Max Verstappen Red Bull Racing
       Spanish Grand Prix  Max Verstappen Red Bull Racing
      Austrian Grand Prix  George Russell        Mercedes
       British Grand Prix  Lewis Hamilton        Mercedes
     Hungarian Grand Prix   Oscar Piastri         McLaren
       Belgian Grand Prix  Lewis Hamilton        Mercedes
         Dutch Grand Prix    Lando Norris         McLaren
       Italian Grand Prix Charles Leclerc         Fe

In [32]:
# Driver wins count
win_counts = winners['FullName'].value_counts()
print("\nWins by driver:")
print(win_counts)


Wins by driver:
FullName
Max Verstappen     9
Lando Norris       4
Charles Leclerc    3
Carlos Sainz       2
George Russell     2
Lewis Hamilton     2
Oscar Piastri      2
Name: count, dtype: int64


In [33]:
# Pole to win conversion
pole_winners = data_2024[
    (data_2024['Position'] == 1) & (data_2024['GridPosition'] == 1)
]
pole_win_rate = (len(pole_winners) / num_races) * 100

print(f"\nPole position to win conversion:")
print(f"Pole winners: {len(pole_winners)} / {num_races}")
print(f"Conversion rate: {pole_win_rate:.1f}%")


Pole position to win conversion:
Pole winners: 11 / 24
Conversion rate: 45.8%


## Save the Data

In [34]:
# Save to CSV
collector.save_data(data_2024, 'races_2024.csv')

print("\nData saved successfully!")

2025-11-14 18:12:19,648 - INFO - Saved 479 records to /Users/ritwik/Projects/Vantage/notebooks/../data/raw/races_2024.csv
2025-11-14 18:12:19,649 - INFO - File size: 153.52 KB
2025-11-14 18:12:19,649 - INFO - File size: 153.52 KB



Data saved successfully!


## Summary

In [35]:
print("=" * 70)
print("2024 Season Collection Summary")
print("=" * 70)
print(f"\n✓ Collection time: {minutes}m {seconds}s")
print(f"✓ Races collected: {num_races}")
print(f"✓ Total records: {len(data_2024):,}")
print(f"✓ Data quality: {(1 - dnf_rate/100) * 100:.1f}% finish rate")
print(f"✓ File saved: data/raw/races_2024.csv")
print(f"\nReady for full 2018-2024 collection!")
print(f"Estimated time for 7 years: ~{(elapsed_time * 7 / 60):.0f} minutes")
print("=" * 70)

2024 Season Collection Summary

✓ Collection time: 6m 48s
✓ Races collected: 24
✓ Total records: 479
✓ Data quality: 59.9% finish rate
✓ File saved: data/raw/races_2024.csv

Ready for full 2018-2024 collection!
Estimated time for 7 years: ~48 minutes
