# Test Data Collection

Testing the F1DataCollector class with single race collection.

In [1]:
import sys
sys.path.append('../src')

from data_collection import F1DataCollector
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

2025-11-14 18:29:42,677 - INFO - FastF1 cache enabled at: /Users/ritwik/Projects/Vantage/notebooks/../data/cache


## Test 1: Single Race Collection (2024 Monaco GP)

In [2]:
# Initialize collector
collector = F1DataCollector()

print("Testing single race collection...")
print("Race: 2024 Monaco Grand Prix")

2025-11-14 18:29:44,041 - INFO - Initialized collector for years 2018-2024


Testing single race collection...
Race: 2024 Monaco Grand Prix


In [3]:
# Collect Monaco GP (Round 8 in 2024)
monaco_results = collector.collect_single_race(2024, 8)

print(f"\nCollected {len(monaco_results)} driver results")

core           INFO 	Loading data for Monaco Grand Prix - Race [v3.6.1]
2025-11-14 18:29:45,770 - INFO - Loading data for Monaco Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
2025-11-14 18:29:45,771 - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-11-14 18:29:45,772 - INFO - Using cached data for driver_info
2025-11-14 18:29:45,770 - INFO - Loading data for Monaco Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
2025-11-14 18:29:45,771 - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-11-14 18:29:45,772 - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-11-14 18:29:45,789 - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
2025-11-14 18:29:45,791 - INFO - Using cached data for lap_count
req         


Collected 20 driver results


## Examine the Data

In [4]:
# Check shape and columns
print(f"DataFrame shape: {monaco_results.shape}")
print(f"\nColumns ({len(monaco_results.columns)}):")
print(monaco_results.columns.tolist())

DataFrame shape: (20, 27)

Columns (27):
['DriverNumber', 'BroadcastName', 'Abbreviation', 'DriverId', 'TeamName', 'TeamColor', 'TeamId', 'FirstName', 'LastName', 'FullName', 'HeadshotUrl', 'CountryCode', 'Position', 'ClassifiedPosition', 'GridPosition', 'Q1', 'Q2', 'Q3', 'Time', 'Status', 'Points', 'Laps', 'year', 'round', 'race_name', 'circuit', 'date']


In [5]:
# Display key columns
key_cols = ['Position', 'GridPosition', 'DriverNumber', 'FullName', 
            'TeamName', 'Points', 'Status', 'year', 'race_name', 'circuit']

print("\nRace Results:")
print(monaco_results[key_cols])


Race Results:
    Position  GridPosition DriverNumber          FullName         TeamName  \
16       1.0           1.0           16   Charles Leclerc          Ferrari   
81       2.0           2.0           81     Oscar Piastri          McLaren   
55       3.0           3.0           55      Carlos Sainz          Ferrari   
4        4.0           4.0            4      Lando Norris          McLaren   
63       5.0           5.0           63    George Russell         Mercedes   
1        6.0           6.0            1    Max Verstappen  Red Bull Racing   
44       7.0           7.0           44    Lewis Hamilton         Mercedes   
22       8.0           8.0           22      Yuki Tsunoda               RB   
23       9.0           9.0           23   Alexander Albon         Williams   
10      10.0          10.0           10      Pierre Gasly           Alpine   
14      11.0          14.0           14   Fernando Alonso     Aston Martin   
3       12.0          12.0            3  Daniel R

In [6]:
# Check data types
print("\nData types:")
print(monaco_results[key_cols].dtypes)


Data types:
Position        float64
GridPosition    float64
DriverNumber     object
FullName         object
TeamName         object
Points          float64
Status           object
year              int64
race_name        object
circuit          object
dtype: object


In [7]:
# Look for missing values
print("\nMissing values:")
print(monaco_results[key_cols].isnull().sum())


Missing values:
Position        0
GridPosition    0
DriverNumber    0
FullName        0
TeamName        0
Points          0
Status          0
year            0
race_name       0
circuit         0
dtype: int64


## Analyze Monaco Results

In [8]:
# Check position changes
monaco_results['position_change'] = (
    monaco_results['GridPosition'].astype(float) - 
    monaco_results['Position'].astype(float)
)

print("Position changes at Monaco:")
print(monaco_results[['FullName', 'GridPosition', 'Position', 'position_change']].sort_values('Position'))

Position changes at Monaco:
            FullName  GridPosition  Position  position_change
16   Charles Leclerc           1.0       1.0              0.0
81     Oscar Piastri           2.0       2.0              0.0
55      Carlos Sainz           3.0       3.0              0.0
4       Lando Norris           4.0       4.0              0.0
63    George Russell           5.0       5.0              0.0
1     Max Verstappen           6.0       6.0              0.0
44    Lewis Hamilton           7.0       7.0              0.0
22      Yuki Tsunoda           8.0       8.0              0.0
23   Alexander Albon           9.0       9.0              0.0
10      Pierre Gasly          10.0      10.0              0.0
14   Fernando Alonso          14.0      11.0              3.0
3   Daniel Ricciardo          12.0      12.0              0.0
77   Valtteri Bottas          17.0      13.0              4.0
18      Lance Stroll          13.0      14.0             -1.0
2     Logan Sargeant          15.0      15

In [9]:
# DNF analysis
dnf_mask = monaco_results['Status'] != 'Finished'
dnf_count = dnf_mask.sum()
dnf_rate = (dnf_count / len(monaco_results)) * 100

print(f"\nDNF Statistics:")
print(f"Total DNFs: {dnf_count}")
print(f"DNF Rate: {dnf_rate:.1f}%")

if dnf_count > 0:
    print("\nDNF Details:")
    print(monaco_results[dnf_mask][['FullName', 'TeamName', 'Status']])


DNF Statistics:
Total DNFs: 13
DNF Rate: 65.0%

DNF Details:
            FullName         TeamName   Status
22      Yuki Tsunoda               RB   Lapped
23   Alexander Albon         Williams   Lapped
10      Pierre Gasly           Alpine   Lapped
14   Fernando Alonso     Aston Martin   Lapped
3   Daniel Ricciardo               RB   Lapped
77   Valtteri Bottas      Kick Sauber   Lapped
18      Lance Stroll     Aston Martin   Lapped
2     Logan Sargeant         Williams   Lapped
24       Guanyu Zhou      Kick Sauber   Lapped
31      Esteban Ocon           Alpine  Retired
11      Sergio Perez  Red Bull Racing  Retired
27   Nico Hulkenberg     Haas F1 Team  Retired
20   Kevin Magnussen     Haas F1 Team  Retired


In [10]:
# Quick stats
print("\nQuick Statistics:")
print(f"Winner: {monaco_results[monaco_results['Position'] == 1]['FullName'].values[0]}")
print(f"Pole position: {monaco_results[monaco_results['GridPosition'] == 1]['FullName'].values[0]}")
print(f"Average position change: {monaco_results['position_change'].mean():.2f}")
print(f"Max position gained: {monaco_results['position_change'].max():.0f}")
print(f"Max position lost: {monaco_results['position_change'].min():.0f}")


Quick Statistics:
Winner: Charles Leclerc
Pole position: Charles Leclerc
Average position change: 0.00
Max position gained: 4
Max position lost: -6


## Test 2: Try Another Race (2024 Bahrain GP)

In [12]:
# Collect Bahrain GP
bahrain_results = collector.collect_single_race(2024, 1)

print(f"\nBahrain GP: {len(bahrain_results)} results collected")
print("\nTop 10 finishers:")
print(bahrain_results[key_cols].head(10))

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
2025-11-14 18:13:36,121 - INFO - Loading data for Bahrain Grand Prix - Race [v3.6.1]
2025-11-14 18:13:36,121 - INFO - Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
2025-11-14 18:13:36,122 - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-11-14 18:13:36,122 - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_info
2025-11-14 18:13:36,122 - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-11-14 18:13:36,122 - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-11-14 18:13:36,136 - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
2025-11-14 18:13:36,137 - INFO - Using cached data for lap_count
req      


Bahrain GP: 20 results collected

Top 10 finishers:
    Position  GridPosition DriverNumber         FullName         TeamName  \
1        1.0           1.0            1   Max Verstappen  Red Bull Racing   
11       2.0           5.0           11     Sergio Perez  Red Bull Racing   
55       3.0           4.0           55     Carlos Sainz          Ferrari   
16       4.0           2.0           16  Charles Leclerc          Ferrari   
63       5.0           3.0           63   George Russell         Mercedes   
4        6.0           7.0            4     Lando Norris          McLaren   
44       7.0           9.0           44   Lewis Hamilton         Mercedes   
81       8.0           8.0           81    Oscar Piastri          McLaren   
14       9.0           6.0           14  Fernando Alonso     Aston Martin   
18      10.0          12.0           18     Lance Stroll     Aston Martin   

    Points    Status  year           race_name circuit  
1     26.0  Finished  2024  Bahrain Grand 

## Summary

In [13]:
print("=" * 60)
print("Data Collection Test Summary")
print("=" * 60)
print("\n✓ Single race collection working")
print("✓ All essential columns present:")
print("  - GridPosition, Position, DriverNumber")
print("  - FullName, TeamName, Points, Status")
print("  - year, race_name, circuit, date")
print("\n✓ Data quality looks good")
print("✓ DNF tracking working")
print("✓ Metadata correctly added")
print("\nReady for full season collection!")
print("=" * 60)

Data Collection Test Summary

✓ Single race collection working
✓ All essential columns present:
  - GridPosition, Position, DriverNumber
  - FullName, TeamName, Points, Status
  - year, race_name, circuit, date

✓ Data quality looks good
✓ DNF tracking working
✓ Metadata correctly added

Ready for full season collection!
