# Data Cleaning and Processing

Clean and standardize the collected F1 race data for analysis.

In [None]:
import sys
# Add ../src (resolved against the current working directory) to the import
# path so the project-local module imported below can be found.
sys.path.append('../src')

from data_processing import DataProcessor
import pandas as pd
from pathlib import Path

# None removes the column limit, so wide DataFrames are displayed in full.
pd.set_option('display.max_columns', None)

## Load Raw Data

In [None]:
# Load the raw race data written by the data collection notebook.
raw_data_path = Path('../data/raw/race_data.csv')

# Fail fast when the input is missing: every later cell reads df_raw, so
# merely printing a message here would only resurface as a NameError in the
# next cell, far from the real cause.
if not raw_data_path.exists():
    raise FileNotFoundError(
        f"{raw_data_path} not found. Run the data collection notebook first."
    )

df_raw = pd.read_csv(raw_data_path)
print(f"Loaded: {raw_data_path}")
print(f"Shape: {df_raw.shape}")
print("\nFirst few rows:")
display(df_raw.head())

## Before Cleaning - Snapshot

In [None]:
# Record what the raw data looks like so it can be compared after cleaning.
print("BEFORE CLEANING:")
print("=" * 60)

row_count, column_count = df_raw.shape
print(f"Records: {row_count:,}")
print(f"Columns: {column_count}")

raw_circuits = df_raw['circuit']
print(f"\nUnique circuits: {raw_circuits.nunique()}")

# TeamName is optional in the raw data; report 'N/A' when it is absent.
if 'TeamName' in df_raw.columns:
    team_summary = df_raw['TeamName'].nunique()
else:
    team_summary = 'N/A'
print(f"Unique teams: {team_summary}")

# The ten most frequent circuit spellings, as found in the raw file.
print("\nSample circuit names:")
print(raw_circuits.value_counts().head(10))

## Initialize Data Processor

In [None]:
# Processor configuration: DNF rows are kept (they are flagged in a later
# step) and circuit/team names are standardized.
processor_settings = {
    'remove_dnfs': False,
    'standardize_names': True,
}
processor = DataProcessor(**processor_settings)

print("DataProcessor initialized")
print("Settings:")
# Read the values back from the processor itself rather than echoing the dict,
# so the printout reflects what the instance actually holds.
for setting_name in processor_settings:
    print(f"  {setting_name}: {getattr(processor, setting_name)}")

## Run Data Cleaning

In [None]:
# Clean the data
# The raw frame is passed to the processor and the result is kept as df_clean.
# NOTE(review): later cells compare df_raw against the processed result, so
# this presumably leaves df_raw unmodified - confirm in DataProcessor.clean_data.
print("Starting data cleaning...\n")
df_clean = processor.clean_data(df_raw)
print("\nCleaning complete!")

## Handle DNFs

In [None]:
# Add DNF flags
# Runs on the cleaned frame; the result (df_processed) is what every later
# cell inspects and what is saved to disk.
# NOTE(review): with remove_dnfs=False this is expected to keep DNF rows and
# only flag them - confirm in DataProcessor.handle_dnfs.
print("Processing DNF data...\n")
df_processed = processor.handle_dnfs(df_clean)
print("\nDNF processing complete!")

## After Cleaning - Comparison

In [None]:
# Side-by-side comparison of the raw and processed frames.
divider = "=" * 60
print("\n" + divider)
print("CLEANING SUMMARY")
print(divider)

print("\nBefore -> After:")
print(f"  Records: {len(df_raw):,} -> {len(df_processed):,}")

circuits_before = df_raw['circuit'].nunique()
circuits_after = df_processed['circuit'].nunique()
print(f"  Circuits: {circuits_before} -> {circuits_after}")

# Team counts are only reported when the raw data carries a TeamName column.
if 'TeamName' in df_raw.columns:
    teams_before = df_raw['TeamName'].nunique()
    teams_after = df_processed['TeamName'].nunique()
    print(f"  Teams: {teams_before} -> {teams_after}")

# Statistics collected by the processor; also read by the final summary cell.
stats = processor.get_cleaning_summary()
print("\nDetailed Statistics:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

In [None]:
# Show the columns present after processing that the raw data did not have.
new_cols = set(df_processed.columns) - set(df_raw.columns)
if new_cols:
    # A set has no defined order; sort (by string form, so mixed label types
    # cannot raise) to keep the listing stable from run to run.
    print(f"\nNew columns added: {sorted(new_cols, key=str)}")
    print("\nSample of new data:")
    display(df_processed[['year', 'round', 'FullName', 'Position', 'is_dnf',
                          'completed_race', 'position_change']].head(15))

## Verify Cleaning Results

In [None]:
# Spot-check a single race (2024, round 1) to confirm the cleaning worked.
print("Verification - Sample race (2024 Round 1):")

in_2024 = df_processed['year'] == 2024
in_round_1 = df_processed['round'] == 1
sample_race = df_processed[in_2024 & in_round_1]

print(f"\nDrivers in race: {len(sample_race)}")
print("\nSample results:")

# Columns shown for the spot check: identity, grid vs. finish, and DNF status.
result_columns = ['FullName', 'TeamName', 'GridPosition', 'Position',
                  'position_change', 'is_dnf', 'Status']
display(sample_race[result_columns].head(10))

In [None]:
# Inspect the dtypes of the main result columns after processing.
dtype_check_columns = ['Position', 'GridPosition', 'Points', 'is_dnf']
print("\nData types after cleaning:")
print(df_processed[dtype_check_columns].dtypes)

In [None]:
# Confirm that cleaning did not drop any race. A race is identified by its
# (year, round) pair; only the number of distinct races is compared.
race_key = ['year', 'round']

print("\nData integrity check:")
races_before = df_raw.groupby(race_key).ngroups
print(f"Original unique races: {races_before}")
races_after = df_processed.groupby(race_key).ngroups
print(f"Cleaned unique races: {races_after}")

if races_before != races_after:
    print(" ⚠ Some races removed - check cleaning logs")
else:
    print(" ✓ All races retained")

## Save Cleaned Data

In [None]:
# Save the processed data in two formats and report the resulting file sizes.
from config import PROCESSED_DATA_DIR

# Make sure the output directory exists; on a fresh checkout it may not, and
# to_csv / to_pickle do not create missing parent directories.
# NOTE(review): assumes PROCESSED_DATA_DIR is a pathlib.Path, as its use with
# `/` and `.stat()` below suggests - confirm in config.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV format
csv_path = PROCESSED_DATA_DIR / 'processed_race_data.csv'
df_processed.to_csv(csv_path, index=False)
csv_size = csv_path.stat().st_size / 1024  # bytes -> KB

# Pickle format for faster loading
pkl_path = PROCESSED_DATA_DIR / 'processed_race_data.pkl'
df_processed.to_pickle(pkl_path)
pkl_size = pkl_path.stat().st_size / 1024  # bytes -> KB

print("\nSaved cleaned data:")
print(f"  CSV: {csv_path} ({csv_size:.2f} KB)")
print(f"  Pickle: {pkl_path} ({pkl_size:.2f} KB)")

## Final Summary

In [None]:
# Closing report: record counts, the processor's statistics, saved files and
# suggested follow-up work. Relies on stats, csv_path and pkl_path from the
# earlier cells.
banner = "=" * 70
print(banner)
print("DATA CLEANING COMPLETE")
print(banner)

records_before = len(df_raw)
records_after = len(df_processed)
print("\nBefore -> After:")
print(f"  Total records: {records_before:,} -> {records_after:,}")
print(f"  Records removed: {records_before - records_after}")

# Missing statistics fall back to 0 rather than raising.
print("\nData quality improvements:")
print("  ✓ Position/GridPosition converted to numeric")
print(f"  ✓ Circuit names standardized ({stats.get('circuits_before', 0)} -> {stats.get('circuits_after', 0)})")
print(f"  ✓ Team names standardized ({stats.get('teams_before', 0)} -> {stats.get('teams_after', 0)})")
print(f"  ✓ DNF flags added ({stats.get('dnf_count', 0)} DNFs flagged)")
print("  ✓ Derived columns added (position_change, is_dnf, etc.)")
print(f"  ✓ {stats.get('errors_removed', 0)} erroneous records removed")

print("\nFiles saved:")
for saved_path in (csv_path, pkl_path):
    print(f"  - {saved_path}")

print("\nNext steps:")
for next_step in (
    "  1. Begin exploratory data analysis",
    "  2. Create visualizations",
    "  3. Start feature engineering",
):
    print(next_step)

print(banner)