# Data Verification and Quality Assessment

Comprehensive verification of the collected 2018-2024 F1 dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Display options: show every column and up to 100 rows when a DataFrame
# is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Plot styling: matplotlib's 'default' style with seaborn's 'husl' palette.
plt.style.use('default')
sns.set_palette('husl')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

## Load Dataset

In [None]:
# Location of the raw race results, relative to the notebook's directory.
data_path = Path('../data/raw/race_data.csv')

# Fail fast: every later cell needs `df`, so a missing file must stop the
# run here instead of surfacing as a NameError further down.
if not data_path.exists():
    raise FileNotFoundError(
        f"Data file not found at {data_path}. "
        "Please run the data collection notebook first."
    )

df = pd.read_csv(data_path)
print(f" Loaded: {data_path}")
print(f"File size: {data_path.stat().st_size / 1024:.2f} KB")

## Basic Information

In [None]:
# Report the dataset's dimensions and its in-memory footprint.
print("Dataset Shape:")
n_rows, n_cols = df.shape
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")

# deep=True also measures the contents of object (string) columns.
memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"\nMemory Usage: {memory_mb:.2f} MB")

In [None]:
# Print pandas' summary of the frame's columns (names, dtypes, non-null counts).
print("\nDataFrame Info:")
df.info()

In [None]:
# Preview the first 10 rows; as the cell's last expression the frame is
# rendered as the cell output.
print("\nFirst few rows:")
df.head(10)

## Unique Value Counts

In [None]:
# Cardinality of the main identifying columns.
print("Unique Value Counts:")

year_values = df['year']
print(f"\nYears: {year_values.nunique()}")
print(f"Unique years: {sorted(year_values.unique())}")

# (label, column) pairs; the first label carries the blank line that
# separates this group from the year lines above.
for label, column in (
    ("\nRaces", 'race_name'),
    ("Circuits", 'circuit'),
    ("Drivers", 'DriverNumber'),
):
    print(f"{label}: {df[column].nunique()}")

# TeamName is optional in the dataset, so fall back to a placeholder.
if 'TeamName' in df.columns:
    team_count = df['TeamName'].nunique()
else:
    team_count = 'N/A'
print(f"Teams: {team_count}")

In [None]:
# Count the distinct race names recorded in each season, ordered by year.
# `races_by_year` is reused by later cells.
races_by_year = (
    df.groupby('year')['race_name']
    .nunique()
    .sort_index()
)

print("\nRaces per year:")
for season in races_by_year.index:
    print(f"  {season}: {races_by_year.loc[season]} races")

In [None]:
# Rows per season, together with the average number of rows per race.
# Relies on `races_by_year` computed in the previous cell.
records_by_year = df.groupby('year').size().sort_index()

print("\nRecords per year:")
for season, n_records in records_by_year.items():
    n_races = races_by_year[season]
    print(f"  {season}: {n_records} records ({n_records / n_races:.1f} avg per race)")

## Missing Values Analysis

In [None]:
# Per-column count and percentage of missing values, worst columns first.
print("Missing Values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing': missing,
    'Percentage': missing_pct
}).sort_values('Missing', ascending=False)

# Only show the table when there is something in it; otherwise pandas would
# print an "Empty DataFrame" repr right before the all-clear message.
columns_with_missing = missing_df[missing_df['Missing'] > 0]
if columns_with_missing.empty:
    print("\n No missing values found!")
else:
    print(columns_with_missing)

In [None]:
# Check critical columns: the fields the rest of the analysis depends on.
critical_cols = ['Position', 'GridPosition', 'DriverNumber', 'year', 'round']
print("\nMissing values in critical columns:")
for col in critical_cols:
    if col not in df.columns:
        # A column that is absent altogether is worse than a sparse one, so
        # report it instead of skipping it silently.
        print(f"  {col}: COLUMN NOT FOUND")
        print("    WARNING: Critical column is absent from the dataset!")
        continue

    missing_count = df[col].isnull().sum()
    print(f"  {col}: {missing_count}")
    if missing_count > 0:
        print("    WARNING: Critical column has missing data!")

## Data Quality Checks

In [None]:
# Check Position values
print("Position Analysis:")
print(f"\nUnique Position values: {df['Position'].nunique()}")
print(f"Position range: {df['Position'].min()} to {df['Position'].max()}")
print(f"Data type: {df['Position'].dtype}")

# Convert to numeric for analysis. errors='coerce' turns unparseable values
# into NaN without raising, so count how many values were lost that way
# rather than reporting success unconditionally.
df['Position_num'] = pd.to_numeric(df['Position'], errors='coerce')
coerced_count = int(df['Position_num'].isna().sum() - df['Position'].isna().sum())
if coerced_count == 0:
    print("\nNumeric conversion successful")
else:
    print(f"\nWARNING: {coerced_count} non-numeric Position values were coerced to NaN")
print(f"Min position: {df['Position_num'].min()}")
print(f"Max position: {df['Position_num'].max()}")

In [None]:
# Inspect the starting-grid column and derive a numeric copy of it.
print("GridPosition Analysis:")
grid_raw = df['GridPosition']
print(f"\nUnique GridPosition values: {grid_raw.nunique()}")
print(f"Data type: {grid_raw.dtype}")

# Values that cannot be parsed as numbers become NaN instead of raising.
grid_numeric = pd.to_numeric(grid_raw, errors='coerce')
df['GridPosition_num'] = grid_numeric
print(f"\nMin grid: {grid_numeric.min()}")
print(f"Max grid: {grid_numeric.max()}")

# Grid position 0 is treated as a pit lane start.
pit_starts = grid_numeric.eq(0).sum()
print(f"\nPit lane starts: {pit_starts}")

In [None]:
# DNF Analysis
#
# A driver counts as a finisher when classified at the flag, not only when
# the status is literally 'Finished'.
# NOTE(review): assumes Ergast/FastF1-style Status values, where lapped but
# classified cars are recorded as '+1 Lap', '+2 Laps', ... (or 'Lapped').
# Confirm against the data collection notebook.
print("DNF Analysis:")

status_text = df['Status'].fillna('').astype(str)
classified = (
    status_text.eq('Finished')
    | status_text.eq('Lapped')
    | status_text.str.startswith('+')
)

total_entries = len(df)
finished = int(classified.sum())
dnf = total_entries - finished
# Guard the empty-dataset case so the rate is 0.0 rather than NaN.
dnf_rate = (dnf / total_entries) * 100 if total_entries else 0.0

print(f"\nTotal entries: {total_entries:,}")
print(f"Finished: {finished:,} ({100-dnf_rate:.1f}%)")
print(f"DNFs: {dnf:,} ({dnf_rate:.1f}%)")

In [None]:
# Top DNF reasons
#
# Exclude every classified finisher, not just rows whose status is literally
# 'Finished'; otherwise '+1 Lap' and similar would be listed as DNF reasons.
# NOTE(review): assumes Ergast/FastF1-style Status values ('+N Lap(s)' or
# 'Lapped' for lapped finishers). Confirm against the collected data.
status_text = df['Status'].fillna('').astype(str)
classified = (
    status_text.eq('Finished')
    | status_text.eq('Lapped')
    | status_text.str.startswith('+')
)

dnf_data = df[~classified]
if not dnf_data.empty:
    print("\nTop 10 DNF reasons:")
    print(dnf_data['Status'].value_counts().head(10))

## Data Anomalies

In [None]:
# Number of result rows per race; a race is identified by season, round and name.
drivers_per_race = df.groupby(['year', 'round', 'race_name']).size()

print("Races with unusual driver counts:")
# Flag every race whose entry count falls outside the inclusive 19-22 range.
within_expected = drivers_per_race.between(19, 22)
unusual = drivers_per_race[~within_expected]
if unusual.empty:
    print("None found - all races have 19-22 drivers")
else:
    print(unusual)

In [None]:
# An entry is identified by season, round and car number.
key_cols = ['year', 'round', 'DriverNumber']

# Counts only the repeats: the first occurrence of each key is not flagged.
duplicates = df.duplicated(subset=key_cols).sum()
print(f"\nDuplicate entries: {duplicates}")

if duplicates > 0:
    print("\nWARNING: Found duplicate entries!")
    # keep=False selects every row of a duplicated group, the first included.
    in_duplicate_group = df.duplicated(subset=key_cols, keep=False)
    dup_rows = df[in_duplicate_group]
    report_cols = ['year', 'round', 'race_name', 'DriverNumber', 'FullName']
    print(dup_rows[report_cols].sort_values(['year', 'round']))

## Visualizations

In [None]:
# 2x2 overview figure: position histograms on the top row, per-year and
# per-circuit bar charts on the bottom row. Uses the *_num columns and
# `races_by_year` created by earlier cells.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_grid, ax_finish), (ax_races, ax_circuits) = axes

# (axis, numeric column, label used for x-axis and title, extra hist kwargs)
histogram_specs = [
    (ax_grid, 'GridPosition_num', 'Grid Position', {}),
    (ax_finish, 'Position_num', 'Finish Position', {'color': 'orange'}),
]
for ax, column, label, extra_kwargs in histogram_specs:
    ax.hist(df[column].dropna(), bins=20, edgecolor='black', **extra_kwargs)
    ax.set_title(f'{label} Distribution', fontweight='bold')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

# Races per year
races_by_year.plot.bar(ax=ax_races, color='green', edgecolor='black')
ax_races.set_title('Races per Year', fontweight='bold')
ax_races.set_xlabel('Year')
ax_races.set_ylabel('Number of Races')
ax_races.grid(alpha=0.3, axis='y')

# Records per circuit (15 circuits with the most rows)
circuit_counts = df['circuit'].value_counts().head(15)
circuit_counts.plot.barh(ax=ax_circuits, color='purple', edgecolor='black')
ax_circuits.set_title('Top 15 Circuits by Records', fontweight='bold')
ax_circuits.set_xlabel('Number of Records')
ax_circuits.set_ylabel('Circuit')
ax_circuits.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Summary statistics for every numeric column; the describe() result is the
# cell's last expression, so the notebook renders it as the output.
print("Numeric Columns Summary:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols].describe()

## Notable Issues and Observations

In [None]:
# Collect notable findings and print the final data quality report.
# Relies on `dnf_rate` and `duplicates` computed by earlier cells.
issues = []

# Check 2020 for fewer races (COVID)
races_2020 = df[df['year'] == 2020]['race_name'].nunique()
if races_2020 < 17:
    issues.append(f"2020 has only {races_2020} races (COVID impact)")

# Check for missing critical data
for col in ('Position', 'GridPosition'):
    n_missing = df[col].isnull().sum()
    if n_missing > 0:
        issues.append(f"{n_missing} missing {col} values")

# Check DNF rate
if dnf_rate > 25:
    issues.append(f"High DNF rate: {dnf_rate:.1f}%")

# Duplicates are a data defect too; list them so the verdict below cannot
# claim "no major issues" while the summary shows a non-zero count.
if duplicates > 0:
    issues.append(f"{duplicates} duplicate entries")

# A race is one (year, round) pair. Counting distinct race names would
# collapse the same Grand Prix held in different seasons into one race.
total_races = df.groupby(['year', 'round']).ngroups

print("\n" + "="*70)
print("DATA QUALITY REPORT")
print("="*70)

if issues:
    print("\nIssues found:")
    for i, issue in enumerate(issues, 1):
        print(f"  {i}. {issue}")
else:
    print("\n No major issues found!")

print("\nDataset Summary:")
print(f"  Years: {df['year'].min()} - {df['year'].max()}")
print(f"  Total records: {len(df):,}")
print(f"  Total races: {total_races}")
print(f"  DNF rate: {dnf_rate:.1f}%")
print(f"  Duplicates: {duplicates}")
print("\nReady for data cleaning and analysis!")
print("="*70)