Purpose: Download all F1 data (FP1, FP2, FP3, Q, Sprint, Race) from 2021-2025 in batches, respecting API limits.

In [None]:
"""
F1 DATA DOWNLOADER - REVERSE CHRONOLOGICAL ORDER
=================================================
Downloads all F1 session data starting from the LATEST races (Dec 2025)
working backwards to 2021. This ensures you get the most recent data first.

API CAPACITY ESTIMATE:
- Per session: ~11 API calls (3 for loading + 8 for data)
- Per race weekend: ~55 calls (5 sessions: FP1, FP2, FP3, Q, R)
- API Limit: 500 calls/hour (this script stops itself at MAX_API_CALLS = 400)
- EXPECTED: ~7-8 race weekends per run (400 calls / ~55 calls per weekend)

USAGE:
1. Run this script: python 1_download_data.py
2. Wait for API limit message
3. Wait 1 hour, run again - it resumes automatically
4. Repeat until all data is downloaded

OUTPUT:
- f1_data/2025_races.csv
- f1_data/2025_qualifying.csv
- f1_data/2025_practice.csv
- f1_data/2025_sprint.csv (only when sprint sessions were downloaded)
- ... (same for 2024-2021)
"""

import fastf1 as ff1
import pandas as pd
import os
import time
import warnings
from datetime import datetime

# Silence every Python warning so the console only shows this script's own
# progress messages.
warnings.filterwarnings('ignore')
# Limit FastF1's own logging to the 'ERROR' level.
ff1.set_log_level('ERROR')
# Turn on FastF1's cache, using the 'fastf1_cache' directory.
# NOTE(review): presumably resolved relative to the working directory and
# expected to exist already - confirm against the FastF1 version in use.
ff1.Cache.enable_cache('fastf1_cache')

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Seasons to download, in processing order (newest first).
YEARS = [2025, 2024, 2023, 2022, 2021]
# Maps a session identifier to the category that selects the output CSV.
SESSION_TYPES = {
    'FP1': 'practice',
    'FP2': 'practice', 
    'FP3': 'practice',
    'Q': 'qualifying',
    'SQ': 'qualifying',  # Sprint Qualifying
    'S': 'sprint',       # Sprint Race
    'R': 'race'
}

# Directory that receives the per-year CSV files; created at import time.
OUTPUT_DIR = 'f1_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Self-imposed budget of *estimated* API calls per run.
MAX_API_CALLS = 400  # Conservative limit (actual is 500/hour)
# Module-level counters; the download functions update them via `global`.
api_call_count = 0
session_count = 0
race_weekend_count = 0

# ==============================================================================
# HELPER FUNCTIONS
# ==============================================================================

def check_api_limit():
    """Tell the caller whether the per-run API budget is used up.

    Compares the module-level ``api_call_count`` estimate with
    ``MAX_API_CALLS``. When the budget is exhausted, a banner with resume
    instructions is printed.

    Returns:
        bool: True when downloading should stop, False otherwise.
    """
    # Guard clause: still within budget, nothing to report.
    if api_call_count < MAX_API_CALLS:
        return False

    rule = '=' * 80
    print(f"\n{rule}")
    print(f"‚ö†Ô∏è  API LIMIT REACHED: {api_call_count} calls")
    print(f"{rule}")
    print(f"‚úÖ Downloaded {race_weekend_count} race weekends this session")
    print(f"‚úÖ Progress saved to: {OUTPUT_DIR}/")
    print(f"üí§ Wait 1 hour, then run this script again to continue")
    print(f"‚ÑπÔ∏è  Already downloaded sessions will be skipped automatically")
    print(f"{rule}\n")
    return True

def session_already_downloaded(year, event_name, session_type):
    """Check whether laps for one session are already stored on disk.

    Args:
        year: Season of the session.
        event_name: Event name exactly as it was written to the CSV.
        session_type: Session identifier; must be a key of ``SESSION_TYPES``.

    Returns:
        bool: True if the category CSV for ``year`` exists and contains at
        least one row matching the year, event and session type. False if the
        session type is unknown, the file is missing, or it cannot be read.
    """
    # Category -> file stem. The race file is plural ("races"), the rest are
    # not.
    file_stems = {
        'practice': 'practice',
        'qualifying': 'qualifying',
        'sprint': 'sprint',
        'race': 'races',
    }
    stem = file_stems.get(SESSION_TYPES.get(session_type, 'other'))
    if stem is None:
        return False

    file_path = os.path.join(OUTPUT_DIR, f'{year}_{stem}.csv')
    if not os.path.exists(file_path):
        return False

    try:
        # Only the three key columns are needed; skipping the rest keeps the
        # check cheap on large lap files.
        df = pd.read_csv(file_path, usecols=['Year', 'Event', 'SessionType'])
    except (OSError, ValueError, KeyError):
        # Unreadable, empty, malformed or missing the key columns: treat as
        # "not downloaded" so the session is fetched again. pandas parser
        # errors derive from ValueError. KeyboardInterrupt is not swallowed.
        return False

    found = (
        (df['Year'] == year)
        & (df['Event'] == event_name)
        & (df['SessionType'] == session_type)
    ).any()
    return bool(found)

def append_to_csv(data, year, category):
    """Append lap rows to the CSV for ``year`` and ``category``.

    Args:
        data: DataFrame of laps to store.
        year: Season, used as the file-name prefix.
        category: File category. ``'race'`` (the value in ``SESSION_TYPES``)
            is stored as ``'<year>_races.csv'``; any other value is used as
            the file stem unchanged.

    The rows are always written in the column order of the file header. If
    ``data`` has columns the file lacks, the file is first rewritten with
    those columns added (empty for the existing rows), so appended values
    never land under the wrong header.
    """
    # session_already_downloaded() and the training script read
    # "<year>_races.csv", so the singular category must map to the plural.
    stem = 'races' if category == 'race' else category
    file_path = os.path.join(OUTPUT_DIR, f'{year}_{stem}.csv')

    # New (or zero-byte) file: write rows together with a header.
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        data.to_csv(file_path, index=False)
        return

    existing_cols = list(pd.read_csv(file_path, nrows=0).columns)
    extra_cols = [col for col in data.columns if col not in existing_cols]
    if extra_cols:
        # Widen the stored table. Reading everything as text keeps the
        # existing cell contents untouched.
        stored = pd.read_csv(file_path, dtype=str, keep_default_na=False)
        for col in extra_cols:
            stored[col] = ''
        stored.to_csv(file_path, index=False)
        existing_cols = list(stored.columns)

    # reindex() orders the columns like the header and leaves columns that
    # this session lacks empty.
    data.reindex(columns=existing_cols).to_csv(
        file_path, mode='a', header=False, index=False
    )

def event_fully_downloaded(year, event_name):
    """Return True when FP1, FP2, FP3, Q and R are all stored for the event.

    Stops at the first missing session.

    NOTE(review): an event that does not run all five of these sessions can
    never count as complete here - confirm this is acceptable for sprint
    weekends.
    """
    return all(
        session_already_downloaded(year, event_name, code)
        for code in ('FP1', 'FP2', 'FP3', 'Q', 'R')
    )

# ==============================================================================
# DOWNLOAD FUNCTION
# ==============================================================================

def download_session(year, event_name, session_type):
    """
    Download one session's laps, enrich them and append them to the year CSV.

    Laps are filtered (deleted and inaccurate laps removed), tagged with
    ``Year`` / ``Event`` / ``SessionType``, given session-wide weather
    summary columns and, when available, the driver's result columns. They
    are then passed to ``append_to_csv``.

    Args:
        year: Season of the session.
        event_name: Event name as listed in the schedule.
        session_type: Session identifier; a key of ``SESSION_TYPES``.

    Returns:
        tuple: ``(success, lap_count, api_calls_used)``. ``api_calls_used``
        is an estimate (3 for the session lookup + 8 for the data load), not
        a measured request count. Any failure yields ``(False, 0, ...)``.
    """
    global api_call_count, session_count

    calls_before = api_call_count

    try:
        # Look up the session.
        session = ff1.get_session(year, event_name, session_type)
        api_call_count += 3  # Estimate

        # Load laps and weather only.
        session.load(laps=True, telemetry=False, weather=True, messages=False)
        api_call_count += 8  # Estimate

        if session.laps.empty:
            return False, 0, api_call_count - calls_before

        laps = session.laps.copy()

        # Drop only laps explicitly flagged as deleted. `!= True` (not
        # `== False`) keeps laps whose flag is missing.
        # NOTE(review): with messages=False the 'Deleted' column is likely
        # unpopulated (None); `== False` would then discard every lap -
        # confirm against the FastF1 version in use.
        if 'Deleted' in laps.columns:
            laps = laps[laps['Deleted'] != True]  # noqa: E712
        if 'IsAccurate' in laps.columns:
            laps = laps[laps['IsAccurate'] == True]  # noqa: E712

        # Add metadata; session_already_downloaded() matches on these.
        laps['Year'] = year
        laps['Event'] = event_name
        laps['SessionType'] = session_type

        # Weather is best-effort: one session-wide summary value per column,
        # repeated on every lap.
        try:
            if session.weather_data is not None and not session.weather_data.empty:
                weather = session.weather_data
                laps['AirTemp'] = weather['AirTemp'].mean()
                laps['TrackTemp'] = weather['TrackTemp'].mean()
                laps['Humidity'] = weather['Humidity'].mean()
                laps['Pressure'] = weather['Pressure'].mean()
                laps['Rainfall'] = weather['Rainfall'].max()
                laps['WindSpeed'] = weather['WindSpeed'].mean()
                laps['WindDirection'] = weather['WindDirection'].mean()
        except Exception:
            # Missing weather must not cost us the laps. Exception (not a
            # bare except) lets Ctrl-C through.
            pass

        # Results are best-effort too: final and grid position per driver.
        try:
            if session.results is not None and not session.results.empty:
                results = session.results[['Abbreviation', 'Position', 'GridPosition', 'Status']].copy()
                results.rename(columns={
                    'Abbreviation': 'Driver',
                    'Position': 'FinalPosition',
                    'GridPosition': 'StartPosition'
                }, inplace=True)

                laps = laps.merge(results, on='Driver', how='left', suffixes=('', '_result'))
        except Exception:
            pass

        # Save to the appropriate CSV.
        category = SESSION_TYPES.get(session_type, 'other')
        if category != 'other':
            # Race laps belong in "<year>_races.csv" (plural): that is the
            # file session_already_downloaded() checks when resuming.
            file_category = 'races' if category == 'race' else category
            append_to_csv(laps, year, file_category)

        session_count += 1
        return True, len(laps), api_call_count - calls_before

    except Exception:
        # Unknown session, network error, etc.: report failure and let the
        # caller move on to the next session.
        return False, 0, api_call_count - calls_before

# ==============================================================================
# MAIN DOWNLOAD LOOP
# ==============================================================================

def download_all_data():
    """Download every missing session for all ``YEARS``, newest first.

    Walks each season's schedule in reverse order. Events whose FP1, FP2,
    FP3, Q and R sessions are already stored are skipped; for the others each
    missing session is downloaded via ``download_session``. The function
    returns early, without the final summary, as soon as
    ``check_api_limit`` reports that the budget is exhausted.

    Side effects:
        Updates the module-level ``api_call_count`` and
        ``race_weekend_count`` and prints progress to stdout.
    """
    global api_call_count, race_weekend_count
    
    print("="*80)
    print("F1 DATA DOWNLOADER - REVERSE CHRONOLOGICAL ORDER")
    print("="*80)
    print(f"üìÖ Years: {YEARS[0]} ‚Üí {YEARS[-1]} (newest first)")
    print(f"üèÅ Sessions per race: FP1, FP2, FP3, Q, Sprint (if any), R")
    print(f"üìä API Limit: {MAX_API_CALLS} calls/hour")
    print(f"")
    print(f"üí° CAPACITY ESTIMATE:")
    print(f"   - Per session: ~11 API calls")
    print(f"   - Per race weekend: ~55 calls (5 sessions)")
    print(f"   - Expected download: ~7-8 race weekends per hour")
    print("="*80)
    print()
    
    total_sessions = 0
    skipped_sessions = 0
    failed_sessions = 0
    
    for year in YEARS:
        print(f"\n{'='*80}")
        print(f"üìÖ YEAR {year}")
        print(f"{'='*80}")
        
        try:
            # Get event schedule
            schedule = ff1.get_event_schedule(year)
            api_call_count += 1
            
            # Reverse the schedule so it is processed last row first
            schedule = schedule.iloc[::-1].reset_index(drop=True)
            
            print(f"Found {len(schedule)} events (downloading newest first)")
            
            for _, event in schedule.iterrows():
                event_name = event['EventName']
                
                # Skip events that are already complete
                if event_fully_downloaded(year, event_name):
                    print(f"\nüèÅ {event_name}")
                    print(f"   ‚è≠Ô∏è  Complete: All sessions already downloaded")
                    continue
                
                # Check API limit before starting a new race weekend
                if check_api_limit():
                    return
                
                print(f"\nüèÅ {event_name}")
                
                # Set once the first session of this event succeeds, so the
                # weekend is counted exactly once.
                weekend_counted = False
                
                # Try each session type
                for session_type in ['FP1', 'FP2', 'FP3', 'Q', 'SQ', 'S', 'R']:
                    
                    # Check if already downloaded
                    if session_already_downloaded(year, event_name, session_type):
                        skipped_sessions += 1
                        print(f"   ‚è≠Ô∏è  {session_type:3s}: Already downloaded")
                        continue
                    
                    # Check API limit again
                    if check_api_limit():
                        return
                    
                    # Download session
                    success, lap_count, calls_used = download_session(year, event_name, session_type)
                    
                    if success:
                        total_sessions += 1
                        # Count the weekend right away, not after the loop:
                        # an early return on the API limit then still
                        # reports the weekend in progress.
                        if not weekend_counted:
                            race_weekend_count += 1
                            weekend_counted = True
                        print(f"   ‚úÖ {session_type:3s}: {lap_count:4d} laps ({calls_used} calls) | Total: {api_call_count}/{MAX_API_CALLS}")
                    else:
                        failed_sessions += 1
                        if session_type in ['FP1', 'FP2', 'FP3', 'Q', 'R']:  # Only show important sessions
                            print(f"   ‚ö†Ô∏è  {session_type:3s}: Not available")
                    
                    # Small delay to be nice to the API
                    time.sleep(0.5)
                    
        except Exception as e:
            # A failure here abandons the rest of this year's schedule and
            # moves on to the next year.
            print(f"   ‚ùå Year {year} error: {str(e)[:80]}")
            continue
    
    # Summary
    print(f"\n{'='*80}")
    print("‚úÖ DOWNLOAD COMPLETE!")
    print(f"{'='*80}")
    print(f"‚úÖ Downloaded: {total_sessions} sessions ({race_weekend_count} race weekends)")
    print(f"‚è≠Ô∏è  Skipped: {skipped_sessions} sessions (already had)")
    print(f"‚ö†Ô∏è  Failed: {failed_sessions} sessions (not available)")
    print(f"üìä API Calls Used: {api_call_count}/{MAX_API_CALLS}")
    print(f"üíæ Data saved to: {OUTPUT_DIR}/")
    print(f"{'='*80}\n")

# ==============================================================================
# SHOW CURRENT PROGRESS
# ==============================================================================

def show_current_progress(years=(2025, 2024, 2023, 2022, 2021), races_per_season=23):
    """Print a summary of the CSV files downloaded so far.

    Args:
        years: Seasons to report on, in display order. Defaults to the five
            seasons this script downloads.
        races_per_season: Rough number of race weekends per season, used only
            for the "remaining" estimate (23 x 5 seasons = 115 by default).

    For every season and category file found in ``OUTPUT_DIR`` the number of
    rows and distinct events is printed, followed by overall totals and an
    estimate of the remaining download time at about 8 weekends per hour.
    Files that cannot be parsed are reported and skipped.
    """
    print("\nüìä Current Download Status:")
    print("="*80)
    
    total_laps = 0
    total_events = 0
    
    for year in years:
        year_events = set()
        
        for category in ['practice', 'qualifying', 'sprint', 'races']:
            file_path = os.path.join(OUTPUT_DIR, f'{year}_{category}.csv')
            if not os.path.exists(file_path):
                continue
            
            try:
                df = pd.read_csv(file_path)
            except (OSError, ValueError) as exc:
                # An empty or half-written file (e.g. after an interrupted
                # run) must not stop the report. pandas parser errors derive
                # from ValueError.
                print(f"   Could not read {file_path}: {str(exc)[:80]}")
                continue
            
            laps = len(df)
            # Ignore missing event names so the per-file count and the
            # per-year set agree.
            event_names = df['Event'].dropna().unique() if 'Event' in df.columns else []
            events = len(event_names)
            total_laps += laps
            year_events.update(event_names)
            print(f"   ‚úÖ {year}_{category:12s}.csv: {laps:6d} laps, {events:2d} events")
        
        if year_events:
            total_events += len(year_events)
            print(f"      ‚Üí {year} total: {len(year_events)} unique events")
    
    print(f"\nüìà Overall Progress:")
    print(f"   Total laps: {total_laps:,}")
    print(f"   Total race weekends: {total_events}")
    
    # Estimate remaining: ~races_per_season weekends for each reported year
    estimated_total = races_per_season * len(years)
    remaining = estimated_total - total_events
    
    if remaining > 0:
        hours_needed = (remaining / 8)  # ~8 races per hour
        print(f"\n‚è±Ô∏è  Estimated remaining:")
        print(f"   Race weekends: ~{remaining}")
        print(f"   Time needed: ~{hours_needed:.1f} more hours of downloading")
    
    print("="*80)

# ==============================================================================
# RUN
# ==============================================================================

# Script entry point: report what is on disk, download what is missing, then
# report again together with timing and API-usage statistics.
if __name__ == "__main__":
    print("\nüöÄ Starting F1 data download (newest races first)...")
    print("‚ÑπÔ∏è  This respects API rate limits - may need multiple runs\n")
    
    # Show what we already have
    show_current_progress()
    
    print("\n‚è≥ Starting download...\n")
    
    # Wall-clock time of the download phase only.
    start_time = time.time()
    download_all_data()
    elapsed = time.time() - start_time
    
    # Show final status
    show_current_progress()
    
    print(f"\n‚è±Ô∏è  Session duration: {elapsed/60:.1f} minutes")
    print(f"üìä API calls used: {api_call_count}")
    print(f"üèÅ Race weekends downloaded: {race_weekend_count}")
    
    if api_call_count > 0:
        # max(1, ...) avoids dividing by zero when no weekend was downloaded.
        print(f"üìà Average: {api_call_count/max(1, race_weekend_count):.1f} API calls per race weekend")
    
    print("\n‚úÖ Ready for next step: python 2_train_model.py\n")


Purpose: Load downloaded data, aggregate features, train model on 2021-2024, save model.

In [None]:
"""
F1 RACE POSITION PREDICTOR - TRAINING
======================================
Trains a machine learning model to predict race finishing positions
using 2021-2024 data.

FEATURES:
- Qualifying position
- Practice pace (FP1, FP2, FP3 average lap times)
- Weather conditions
- Driver/Team historical performance
- Track characteristics

USAGE:
python 2_train_model.py

OUTPUT:
- f1_trained_model.pkl (saved model)
- training_report.txt (performance metrics)
"""

import pandas as pd
import numpy as np
import os
import warnings
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

# Silence every Python warning for the whole training run.
warnings.filterwarnings('ignore')

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Directory holding the per-year CSV files read by load_all_data().
DATA_DIR = 'f1_data'
# Seasons whose data is loaded for training.
TRAINING_YEARS = [2021, 2022, 2023, 2024]
# Pickle file that receives the trained model and its metadata.
OUTPUT_MODEL = 'f1_trained_model.pkl'
# Plain-text training report.
OUTPUT_REPORT = 'training_report.txt'

# ==============================================================================
# LOAD DATA
# ==============================================================================

def load_all_data(years):
    """Read the race, qualifying, practice and sprint CSVs for ``years``.

    Missing files are skipped silently. Each category's files are
    concatenated into one DataFrame; a category with no files yields an empty
    DataFrame.

    Args:
        years: Iterable of seasons to load.

    Returns:
        tuple: ``(races_df, quali_df, practice_df, sprint_df)``.
    """
    rule = "=" * 80
    print(rule)
    print("LOADING DATA")
    print(rule)

    # (file suffix, label used in the log line), in load order.
    sources = (
        ('races', 'Races'),
        ('qualifying', 'Qualifying'),
        ('practice', 'Practice'),
        ('sprint', 'Sprint'),
    )
    collected = {suffix: [] for suffix, _ in sources}

    for year in years:
        for suffix, label in sources:
            csv_path = os.path.join(DATA_DIR, f'{year}_{suffix}.csv')
            if not os.path.exists(csv_path):
                continue
            frame = pd.read_csv(csv_path)
            collected[suffix].append(frame)
            print(f"‚úÖ {year} {label}: {len(frame):,} laps")

    def _combine(frames):
        # pd.concat() rejects an empty list, so fall back to an empty frame.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    races_df = _combine(collected['races'])
    quali_df = _combine(collected['qualifying'])
    practice_df = _combine(collected['practice'])
    sprint_df = _combine(collected['sprint'])

    print(f"\n‚úÖ Total loaded:")
    print(f"   Races: {len(races_df):,} laps")
    print(f"   Qualifying: {len(quali_df):,} laps")
    print(f"   Practice: {len(practice_df):,} laps")
    print(f"   Sprint: {len(sprint_df):,} laps")

    return races_df, quali_df, practice_df, sprint_df

# ==============================================================================
# FEATURE ENGINEERING
# ==============================================================================

def aggregate_race_data(races_df):
    """Aggregate lap-level race data to one row per (Year, Event, Driver).

    Laps with a pit-in or pit-out time, with an unparseable lap time, or with
    a lap time outside (0, 200) seconds are discarded before aggregating.

    Args:
        races_df: Lap-level race data with at least ``Year``, ``Event``,
            ``Driver`` and ``LapTime`` columns. It is not modified.

    Returns:
        DataFrame with the group keys plus, for every column present in the
        input: median ``LapTime_sec``, mean speed-trap values, weather values
        and the first ``FinalPosition`` / ``StartPosition`` / ``Team``.
    """
    print(f"\n{'='*80}")
    print("AGGREGATING RACE DATA")
    print("="*80)

    # Work on a copy so the helper column is not added to the caller's frame.
    races_clean = races_df.copy()
    races_clean['LapTime_sec'] = pd.to_timedelta(
        races_clean['LapTime'], errors='coerce'
    ).dt.total_seconds()

    # Keep only laps without a pit-out / pit-in timestamp.
    for pit_col in ('PitOutTime', 'PitInTime'):
        if pit_col in races_clean.columns:
            races_clean = races_clean[races_clean[pit_col].isna()]

    # Remove unparseable lap times and extreme outliers.
    lap_sec = races_clean['LapTime_sec']
    races_clean = races_clean[lap_sec.notna() & (lap_sec > 0) & (lap_sec < 200)]

    print(f"Valid racing laps: {len(races_clean):,}")

    agg_dict = {
        'LapTime_sec': 'median',
        'SpeedI1': 'mean',
        'SpeedI2': 'mean',
        'SpeedFL': 'mean',
        'SpeedST': 'mean',
        'AirTemp': 'first',
        'TrackTemp': 'first',
        'Humidity': 'first',
        'Pressure': 'first',
        'Rainfall': 'max',
        'WindSpeed': 'mean',
        'FinalPosition': 'first',
        'StartPosition': 'first',
        'Team': 'first'
    }
    # Aggregate only what this data set actually has.
    agg_dict = {k: v for k, v in agg_dict.items() if k in races_clean.columns}

    race_aggregated = races_clean.groupby(['Year', 'Event', 'Driver']).agg(agg_dict).reset_index()

    print(f"Aggregated: {len(race_aggregated):,} driver-race records")

    return race_aggregated

def aggregate_qualifying(quali_df):
    """Get the best qualifying lap per driver per event.

    Args:
        quali_df: Lap-level qualifying data with ``Year``, ``Event``,
            ``Driver`` and ``LapTime`` columns. May be empty. It is not
            modified.

    Returns:
        DataFrame with one row per (Year, Event, Driver) holding
        ``Quali_BestLapTime`` (fastest lap in seconds), ``QualiPosition``
        (rank of that lap within the event; ties share the lower rank) and,
        when the speed columns exist, ``Quali_AvgSpeed`` (mean ``SpeedI1``)
        and ``Quali_MaxSpeed`` (mean ``SpeedFL`` - a mean despite the name).
        An empty DataFrame is returned for empty input.

    NOTE(review): rows are not filtered by ``SessionType``. If the input
    mixes more than one qualifying-type session per event, the best lap is
    taken across all of them - confirm that is intended.
    """
    print(f"\n{'='*80}")
    print("PROCESSING QUALIFYING DATA")
    print("="*80)

    # Same empty-input contract as aggregate_practice / process_sprint.
    if quali_df.empty:
        print("No qualifying data available")
        return pd.DataFrame()

    # Work on a copy so the helper column is not added to the caller's frame.
    laps = quali_df.copy()
    laps['LapTime_sec'] = pd.to_timedelta(laps['LapTime'], errors='coerce').dt.total_seconds()

    # Drop only laps explicitly marked deleted; a missing flag is kept.
    if 'Deleted' in laps.columns:
        laps = laps[laps['Deleted'] != True]  # noqa: E712

    laps = laps[laps['LapTime_sec'].notna() & (laps['LapTime_sec'] > 0)]

    # Aggregate only the columns this data set actually has.
    agg_spec = {'LapTime_sec': 'min', 'SpeedI1': 'mean', 'SpeedFL': 'mean'}
    agg_spec = {col: how for col, how in agg_spec.items() if col in laps.columns}

    quali_agg = laps.groupby(['Year', 'Event', 'Driver']).agg(agg_spec).reset_index()

    # Qualifying position is derived from lap times, not from official results.
    quali_agg['QualiPosition'] = quali_agg.groupby(['Year', 'Event'])['LapTime_sec'].rank(method='min')

    quali_agg = quali_agg.rename(columns={
        'LapTime_sec': 'Quali_BestLapTime',
        'SpeedI1': 'Quali_AvgSpeed',
        'SpeedFL': 'Quali_MaxSpeed'
    })

    print(f"Qualifying records: {len(quali_agg):,}")

    return quali_agg

def aggregate_practice(practice_df):
    """Get the typical practice pace per driver per event.

    Laps from all practice sessions of an event are pooled. Laps that are
    marked deleted, have an unparseable time, or fall outside (0, 200)
    seconds are discarded.

    Args:
        practice_df: Lap-level practice data with ``Year``, ``Event``,
            ``Driver`` and ``LapTime`` columns. May be empty. It is not
            modified.

    Returns:
        DataFrame with one row per (Year, Event, Driver) holding
        ``Practice_MedianLapTime`` and, when the speed columns exist,
        ``Practice_AvgSpeed`` (mean ``SpeedI1``) and ``Practice_MaxSpeed``
        (mean ``SpeedST`` - a mean despite the name). An empty DataFrame is
        returned for empty input.
    """
    print(f"\n{'='*80}")
    print("PROCESSING PRACTICE DATA")
    print("="*80)

    if practice_df.empty:
        print("No practice data available")
        return pd.DataFrame()

    # Work on a copy so the helper column is not added to the caller's frame.
    laps = practice_df.copy()
    laps['LapTime_sec'] = pd.to_timedelta(laps['LapTime'], errors='coerce').dt.total_seconds()

    # Drop only laps explicitly marked deleted; a missing flag is kept.
    if 'Deleted' in laps.columns:
        laps = laps[laps['Deleted'] != True]  # noqa: E712

    lap_sec = laps['LapTime_sec']
    laps = laps[lap_sec.notna() & (lap_sec > 0) & (lap_sec < 200)]

    # Aggregate only the columns this data set actually has.
    agg_spec = {'LapTime_sec': 'median', 'SpeedI1': 'mean', 'SpeedST': 'mean'}
    agg_spec = {col: how for col, how in agg_spec.items() if col in laps.columns}

    practice_agg = laps.groupby(['Year', 'Event', 'Driver']).agg(agg_spec).reset_index()

    practice_agg = practice_agg.rename(columns={
        'LapTime_sec': 'Practice_MedianLapTime',
        'SpeedI1': 'Practice_AvgSpeed',
        'SpeedST': 'Practice_MaxSpeed'
    })

    print(f"Practice records: {len(practice_agg):,}")

    return practice_agg

def process_sprint(sprint_df):
    """Reduce sprint laps to one finishing position per driver and event.

    Args:
        sprint_df: Lap-level sprint data carrying ``Year``, ``Event``,
            ``Driver`` and ``FinalPosition``. May be empty.

    Returns:
        DataFrame with columns ``Year``, ``Event``, ``Driver`` and
        ``SprintPosition`` (first non-null ``FinalPosition`` of the group),
        or an empty DataFrame when there is no sprint data.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print("PROCESSING SPRINT DATA")
    print(rule)

    if sprint_df.empty:
        print("No sprint data available")
        return pd.DataFrame()

    # The final position is repeated on every lap of a driver, so one value
    # per group is enough.
    per_driver = sprint_df.groupby(['Year', 'Event', 'Driver'])['FinalPosition'].first()
    sprint_results = per_driver.reset_index().rename(
        columns={'FinalPosition': 'SprintPosition'}
    )

    print(f"Sprint records: {len(sprint_results):,}")

    return sprint_results

def merge_all_features(race_agg, quali_agg, practice_agg, sprint_agg):
    """Left-join qualifying, practice and sprint features onto race records.

    Args:
        race_agg: One row per (Year, Event, Driver) from the race data.
        quali_agg: Qualifying features, or an empty DataFrame.
        practice_agg: Practice features, or an empty DataFrame.
        sprint_agg: Sprint positions, or an empty DataFrame.

    Returns:
        DataFrame with the rows of ``race_agg`` plus the merged feature
        columns. ``StartPosition`` always exists in the result. It is
        replaced by ``SprintPosition`` where one is available, and remaining
        gaps are filled from ``QualiPosition`` when qualifying data was
        merged.

    NOTE(review): where a sprint result exists it *overrides* the recorded
    ``StartPosition``, it does not merely fill gaps - confirm this is the
    intended grid source for every season in the data.
    """
    print(f"\n{'='*80}")
    print("MERGING FEATURES")
    print("="*80)

    key = ['Year', 'Event', 'Driver']

    # Start with race data
    data = race_agg.copy()

    # Race results may have been unavailable at download time. Make sure the
    # column exists so the fills below cannot raise KeyError.
    if 'StartPosition' not in data.columns:
        data['StartPosition'] = float('nan')

    # Merge qualifying
    if not quali_agg.empty:
        data = data.merge(quali_agg, on=key, how='left')
        print(f"‚úÖ Merged qualifying data")

    # Merge practice
    if not practice_agg.empty:
        data = data.merge(practice_agg, on=key, how='left')
        print(f"‚úÖ Merged practice data")

    # Merge sprint
    if not sprint_agg.empty:
        data = data.merge(sprint_agg, on=key, how='left')
        print(f"‚úÖ Merged sprint data")

        # Use sprint position as starting position if available
        data['StartPosition'] = data['SprintPosition'].fillna(data['StartPosition'])

    # Fall back to the qualifying position. The column only exists when
    # qualifying data was actually merged.
    if 'QualiPosition' in data.columns:
        data['StartPosition'] = data['StartPosition'].fillna(data['QualiPosition'])

    print(f"\nFinal dataset: {len(data):,} records")

    return data

def engineer_features(data):
    """Derive model features and label-encode the categorical columns.

    Added columns (each only when its inputs exist): ``Race_AvgSpeed``,
    ``TempDiff``, ``IsWet``, ``PaceDelta``. Always added:
    ``Driver_AvgPosition`` and ``Team_AvgPosition`` (mean ``FinalPosition``
    of up to five preceding rows within the group), plus ``<col>_encoded``
    for each of Driver / Team / Event that is present.

    Args:
        data: Merged driver-race records; needs ``Driver``, ``Team``,
            ``Year``, ``Event`` and ``FinalPosition``.

    Returns:
        tuple: ``(data, encoders)`` - the frame sorted by Driver, Year, Event
        with the new columns, and a dict mapping column name to its fitted
        ``LabelEncoder``.

    NOTE(review): "preceding" follows the sort order Driver, Year, Event.
    Event is sorted as text, so within a season the order is alphabetical,
    not chronological, and the team window walks rows in driver-sorted
    order. Verify that these rolling means do not look into the future.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print("FEATURE ENGINEERING")
    print(rule)

    def _has(*names):
        return all(name in data.columns for name in names)

    # Mean over whichever speed-trap columns exist.
    speed_cols = [c for c in ('SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST') if c in data.columns]
    if speed_cols:
        data['Race_AvgSpeed'] = data[speed_cols].mean(axis=1)

    # Weather-derived features.
    if _has('TrackTemp', 'AirTemp'):
        data['TempDiff'] = data['TrackTemp'] - data['AirTemp']
    if _has('Rainfall'):
        data['IsWet'] = (data['Rainfall'] > 0).astype(int)

    # Gap between practice median pace and best qualifying lap.
    if _has('Practice_MedianLapTime', 'Quali_BestLapTime'):
        data['PaceDelta'] = data['Practice_MedianLapTime'] - data['Quali_BestLapTime']

    data = data.sort_values(['Driver', 'Year', 'Event'])

    def _mean_of_previous_five(positions):
        # shift(1) excludes the current row from its own average.
        return positions.shift(1).rolling(window=5, min_periods=1).mean()

    for group_col, feature in (('Driver', 'Driver_AvgPosition'),
                               ('Team', 'Team_AvgPosition')):
        data[feature] = data.groupby(group_col)['FinalPosition'].transform(_mean_of_previous_five)

    # Integer codes for the categorical columns. The fitted encoders are
    # returned so the same mapping can be reused later.
    encoders = {}
    for col in ('Driver', 'Team', 'Event'):
        if col not in data.columns:
            continue
        encoder = LabelEncoder()
        data[f'{col}_encoded'] = encoder.fit_transform(data[col].astype(str))
        encoders[col] = encoder

    print(f"‚úÖ Created engineered features")
    print(f"‚úÖ Encoded categorical variables")

    return data, encoders

# ==============================================================================
# TRAINING
# ==============================================================================

def train_model(data):
    """Fit a random-forest regressor that predicts ``FinalPosition``.

    Steps: drop rows without a target, keep the candidate features present in
    ``data``, fill gaps with column medians, drop one feature of every pair
    whose absolute correlation exceeds 0.95, fit the model and report
    in-sample error and feature importances.

    Args:
        data: Feature table produced by ``engineer_features``.

    Returns:
        tuple: ``(model, available_features, importance, rmse, mae)`` where
        ``available_features`` is the ordered list of columns the model was
        fitted on and ``importance`` is a DataFrame sorted by importance.

    NOTE(review): RMSE/MAE are computed on the training rows themselves, so
    they are optimistic. ``LapTime_sec`` and ``Race_AvgSpeed`` come from the
    race being predicted - confirm they are available at prediction time.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print("TRAINING MODEL")
    print(rule)

    # Rows without a target cannot be used for fitting.
    data = data.dropna(subset=['FinalPosition'])

    # Candidate columns; the order defines the model's column order.
    feature_candidates = [
        # Starting position
        'StartPosition',
        'QualiPosition',

        # Qualifying performance
        'Quali_BestLapTime',
        'Quali_AvgSpeed',
        'Quali_MaxSpeed',

        # Practice performance
        'Practice_MedianLapTime',
        'Practice_AvgSpeed',
        'Practice_MaxSpeed',

        # Race pace
        'LapTime_sec',
        'Race_AvgSpeed',

        # Weather
        'AirTemp',
        'TrackTemp',
        'TempDiff',
        'Humidity',
        'Pressure',
        'WindSpeed',
        'IsWet',

        # Engineered
        'PaceDelta',
        'Driver_AvgPosition',
        'Team_AvgPosition',

        # Encodings
        'Driver_encoded',
        'Team_encoded',
        'Event_encoded'
    ]

    available_features = [name for name in feature_candidates if name in data.columns]
    print(f"Available features: {len(available_features)}")

    # Median imputation, column by column.
    X = data[available_features].copy()
    X = X.fillna(X.median())

    # Look only above the diagonal, so of each highly correlated pair the
    # later column is the one dropped.
    corr_matrix = X.corr().abs()
    above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(above_diagonal)
    to_drop = [col for col in upper.columns if (upper[col] > 0.95).any()]

    if to_drop:
        print(f"Dropping {len(to_drop)} highly correlated features: {to_drop}")
        X = X.drop(columns=to_drop)
        available_features = [name for name in available_features if name not in to_drop]

    print(f"Final features: {len(available_features)}")

    y = data['FinalPosition']

    print(f"\nTraining samples: {len(X):,}")
    print(f"Features: {available_features[:10]}...")

    print(f"\nü§ñ Training Random Forest...")
    model = RandomForestRegressor(
        n_estimators=300,
        max_depth=15,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
        verbose=0
    )
    model.fit(X, y)

    # In-sample evaluation.
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)

    print(f"\n‚úÖ Training complete!")
    print(f"   RMSE: {rmse:.2f} positions")
    print(f"   MAE:  {mae:.2f} positions")

    importance = pd.DataFrame({
        'Feature': available_features,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print(f"\nüìä Top 10 Most Important Features:")
    for entry in importance.head(10).itertuples(index=False):
        print(f"   {entry.Feature:30s} {entry.Importance:.4f}")

    return model, available_features, importance, rmse, mae

# ==============================================================================
# SAVE MODEL
# ==============================================================================

def save_model(model, features, encoders, importance, rmse, mae):
    """Persist the trained model bundle and write a plain-text report.

    Args:
        model: Fitted estimator.
        features: Ordered list of feature names the model was fitted on.
        encoders: Dict of fitted label encoders, keyed by column name.
        importance: DataFrame of feature importances, most important first.
        rmse: In-sample root-mean-squared error, in positions.
        mae: In-sample mean absolute error, in positions.

    Writes ``OUTPUT_MODEL`` (a pickle of a dict with the keys ``model``,
    ``features``, ``encoders``, ``importance``, ``training_rmse`` and
    ``training_mae``) and ``OUTPUT_REPORT``.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("SAVING MODEL")
    print(rule)

    model_data = {
        'model': model,
        'features': features,
        'encoders': encoders,
        'importance': importance,
        'training_rmse': rmse,
        'training_mae': mae
    }

    with open(OUTPUT_MODEL, 'wb') as model_file:
        pickle.dump(model_data, model_file)

    print(f"‚úÖ Model saved: {OUTPUT_MODEL}")

    # Report: header, headline metrics, then the 20 most important features.
    with open(OUTPUT_REPORT, 'w') as report:
        report.writelines([
            rule + "\n",
            "F1 RACE PREDICTION MODEL - TRAINING REPORT\n",
            rule + "\n\n",
            f"Training Years: 2021-2024\n",
            f"Model Type: Random Forest Regressor\n",
            f"Features Used: {len(features)}\n",
            f"Training RMSE: {rmse:.2f} positions\n",
            f"Training MAE: {mae:.2f} positions\n\n",
            rule + "\n",
            "FEATURE IMPORTANCE (Top 20)\n",
            rule + "\n\n",
            importance.head(20).to_string(index=False),
            "\n\n",
        ])

    print(f"‚úÖ Report saved: {OUTPUT_REPORT}")

# ==============================================================================
# MAIN
# ==============================================================================

def main():
    """Run the training pipeline: load, aggregate, merge, engineer, fit, save.

    Stops early with a hint when no race data is found for
    ``TRAINING_YEARS``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("F1 RACE PREDICTION - MODEL TRAINING")
    print(rule + "\n")

    races_df, quali_df, practice_df, sprint_df = load_all_data(TRAINING_YEARS)

    # Race data carries the target; without it there is nothing to train on.
    if races_df.empty:
        print("\n‚ùå No race data found! Run 1_download_data.py first")
        return

    # Arguments are evaluated left to right, so the four aggregation steps
    # run (and log) in race, qualifying, practice, sprint order.
    data = merge_all_features(
        aggregate_race_data(races_df),
        aggregate_qualifying(quali_df),
        aggregate_practice(practice_df),
        process_sprint(sprint_df),
    )

    data, encoders = engineer_features(data)

    model, features, importance, rmse, mae = train_model(data)

    save_model(model, features, encoders, importance, rmse, mae)

    print(f"\n{rule}")
    print("‚úÖ TRAINING COMPLETE!")
    print(f"{rule}")
    print(f"Next step: python 3_predict_2025.py")
    print(f"{rule}\n")

# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()


Purpose: Load trained model, predict 2025 race results, compare against actual results.

In [None]:
"""
F1 RACE PREDICTION - 2025 PREDICTIONS & EVALUATION
==================================================
Uses trained model to predict 2025 race results and compares against
actual results.

USAGE:
python 3_predict_2025.py

OUTPUT:
- 2025_predictions.csv (predicted vs actual for all races)
- evaluation_report.txt (detailed performance metrics)
"""

import pandas as pd
import numpy as np
import pickle
import os
import warnings
from sklearn.metrics import mean_squared_error, mean_absolute_error

warnings.filterwarnings('ignore')

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Directory that holds the per-year session CSVs (e.g. 2025_races.csv).
DATA_DIR = 'f1_data'
# Pickled model bundle read by load_trained_model().
MODEL_FILE = 'f1_trained_model.pkl'
# CSV of predicted vs. actual positions written by save_results().
OUTPUT_PREDICTIONS = '2025_predictions.csv'
# Plain-text evaluation report written by save_results().
OUTPUT_EVALUATION = 'evaluation_report.txt'

# ==============================================================================
# LOAD MODEL
# ==============================================================================

def load_trained_model():
    """Unpickle the model bundle stored at MODEL_FILE.

    On success a short summary is printed, using the bundle's 'features'
    and 'training_rmse' entries.

    Returns:
        The unpickled object, or None when MODEL_FILE does not exist.
    """
    for line in ("="*80, "LOADING TRAINED MODEL", "="*80):
        print(line)

    if os.path.exists(MODEL_FILE):
        # NOTE(review): unpickling can execute arbitrary code; MODEL_FILE
        # should only ever point at a file written by a trusted training run.
        with open(MODEL_FILE, 'rb') as handle:
            bundle = pickle.load(handle)

        print(f"‚úÖ Model loaded: {MODEL_FILE}")
        print(f"   Features: {len(bundle['features'])}")
        print(f"   Training RMSE: {bundle['training_rmse']:.2f}")
        return bundle

    print(f"‚ùå Model file not found: {MODEL_FILE}")
    print("Run 2_train_model.py first!")
    return None

# ==============================================================================
# LOAD 2025 DATA
# ==============================================================================

def _read_session_csv(path):
    """Return the CSV at *path* as a DataFrame, or None if it is missing or zero-byte."""
    if not os.path.exists(path):
        return None
    try:
        return pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A file with no header and no rows carries no data; treat it like a
        # missing file instead of aborting the whole run.
        return None


def load_2025_data(year=2025, data_dir=None):
    """Load the race, qualifying, practice and sprint CSVs for one season.

    Args:
        year: Season whose '<year>_<session>.csv' files are read.
            Defaults to 2025.
        data_dir: Directory containing the CSVs. Defaults to DATA_DIR.

    Returns:
        Tuple (races, qualifying, practice, sprint) of DataFrames. A session
        whose file is missing or zero-byte yields an empty DataFrame.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    print(f"\n{'='*80}")
    print(f"LOADING {year} DATA")
    print("="*80)

    # (label used in the summary line, file-name suffix), in return order.
    sessions = (
        ('Races', 'races'),
        ('Qualifying', 'qualifying'),
        ('Practice', 'practice'),
        ('Sprint', 'sprint'),
    )

    frames = []
    for label, suffix in sessions:
        frame = _read_session_csv(os.path.join(data_dir, f'{year}_{suffix}.csv'))
        if frame is None:
            frames.append(pd.DataFrame())
            continue

        summary = f"‚úÖ {label}: {len(frame):,} laps"
        # Only the race summary reports the event count, and only when the
        # column exists, so a malformed file cannot fail inside a print.
        if suffix == 'races' and 'Event' in frame.columns:
            summary += f" from {frame['Event'].nunique()} events"
        print(summary)
        frames.append(frame)

    return tuple(frames)

# ==============================================================================
# PROCESS 2025 DATA (same as training)
# ==============================================================================

# Grouping / merge keys shared by every per-session aggregate.
_GROUP_KEYS = ['Year', 'Event', 'Driver']


def _with_lap_seconds(frame):
    """Return a copy of *frame* with LapTime parsed into a float LapTime_sec column."""
    laps = frame.copy()
    laps['LapTime_sec'] = pd.to_timedelta(laps['LapTime'], errors='coerce').dt.total_seconds()
    return laps


def _valid_session_laps(frame, max_seconds=None):
    """Keep laps not flagged Deleted that have a positive lap time (optionally below max_seconds)."""
    laps = _with_lap_seconds(frame)
    if 'Deleted' in laps.columns:
        # Comparison (not negation) on purpose: rows whose Deleted value is
        # missing compare unequal to False and are dropped as well.
        laps = laps[laps['Deleted'] == False]  # noqa: E712
    laps = laps[laps['LapTime_sec'].notna()]
    laps = laps[laps['LapTime_sec'] > 0]
    if max_seconds is not None:
        laps = laps[laps['LapTime_sec'] < max_seconds]
    return laps


def _aggregate_per_driver(laps, spec):
    """Aggregate *laps* per (Year, Event, Driver), using only the spec columns that exist."""
    present = {col: how for col, how in spec.items() if col in laps.columns}
    return laps.groupby(_GROUP_KEYS).agg(present).reset_index()


def _aggregate_race_laps(races_df):
    """Aggregate race laps per driver, excluding pit in/out laps and laps outside (0, 200) s."""
    laps = _with_lap_seconds(races_df)
    for pit_col in ('PitOutTime', 'PitInTime'):
        if pit_col in laps.columns:
            laps = laps[laps[pit_col].isna()]
    laps = laps[laps['LapTime_sec'].notna()]
    laps = laps[laps['LapTime_sec'] > 0]
    laps = laps[laps['LapTime_sec'] < 200]

    return _aggregate_per_driver(laps, {
        'LapTime_sec': 'median',
        'SpeedI1': 'mean',
        'SpeedI2': 'mean',
        'SpeedFL': 'mean',
        'SpeedST': 'mean',
        'AirTemp': 'first',
        'TrackTemp': 'first',
        'Humidity': 'first',
        'Pressure': 'first',
        'Rainfall': 'max',
        'WindSpeed': 'mean',
        'FinalPosition': 'first',
        'StartPosition': 'first',
        'Team': 'first',
    })


def _qualifying_features(quali_df):
    """Build per-driver qualifying features, including a QualiPosition rank per event."""
    laps = _valid_session_laps(quali_df)
    quali = _aggregate_per_driver(laps, {
        'LapTime_sec': 'min',
        'SpeedI1': 'mean',
        'SpeedFL': 'mean',
    })
    quali['QualiPosition'] = quali.groupby(['Year', 'Event'])['LapTime_sec'].rank(method='min')
    # Feature names are kept exactly as written here originally; the trained
    # model's feature list presumably refers to them.
    return quali.rename(columns={
        'LapTime_sec': 'Quali_BestLapTime',
        'SpeedI1': 'Quali_AvgSpeed',
        'SpeedFL': 'Quali_MaxSpeed',
    })


def _practice_features(practice_df):
    """Build per-driver practice features from laps shorter than 200 s."""
    laps = _valid_session_laps(practice_df, max_seconds=200)
    practice = _aggregate_per_driver(laps, {
        'LapTime_sec': 'median',
        'SpeedI1': 'mean',
        'SpeedST': 'mean',
    })
    return practice.rename(columns={
        'LapTime_sec': 'Practice_MedianLapTime',
        'SpeedI1': 'Practice_AvgSpeed',
        'SpeedST': 'Practice_MaxSpeed',
    })


def _merge_sprint(race_agg, sprint_df):
    """Merge each driver's sprint FinalPosition in as SprintPosition."""
    sprint = sprint_df.groupby(_GROUP_KEYS).agg({'FinalPosition': 'first'}).reset_index()
    sprint = sprint.rename(columns={'FinalPosition': 'SprintPosition'})
    merged = race_agg.merge(sprint, on=_GROUP_KEYS, how='left')
    # NOTE(review): the sprint result takes precedence over the recorded race
    # StartPosition. Kept unchanged because this function is documented as
    # mirroring the training-time processing; confirm this is intended.
    merged['StartPosition'] = merged['SprintPosition'].fillna(merged['StartPosition'])
    return merged


def _add_engineered_features(race_agg):
    """Add Race_AvgSpeed, TempDiff, IsWet and PaceDelta where their inputs exist."""
    speed_cols = [c for c in ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST'] if c in race_agg.columns]
    if speed_cols:
        race_agg['Race_AvgSpeed'] = race_agg[speed_cols].mean(axis=1)

    if 'TrackTemp' in race_agg.columns and 'AirTemp' in race_agg.columns:
        race_agg['TempDiff'] = race_agg['TrackTemp'] - race_agg['AirTemp']

    if 'Rainfall' in race_agg.columns:
        race_agg['IsWet'] = (race_agg['Rainfall'] > 0).astype(int)

    if 'Practice_MedianLapTime' in race_agg.columns and 'Quali_BestLapTime' in race_agg.columns:
        race_agg['PaceDelta'] = race_agg['Practice_MedianLapTime'] - race_agg['Quali_BestLapTime']
    return race_agg


def _add_historical_averages(race_agg, data_dir, years):
    """Add Driver_AvgPosition / Team_AvgPosition from earlier seasons' race CSVs.

    Returns True when at least one historical file was found.
    """
    wanted = {'Driver', 'Team', 'FinalPosition'}
    history = []
    for season in years:
        path = os.path.join(data_dir, f'{season}_races.csv')
        if os.path.exists(path):
            # Only three columns are used below, so skip parsing the rest.
            history.append(pd.read_csv(path, usecols=lambda name: name in wanted))

    if not history:
        return False

    hist_df = pd.concat(history, ignore_index=True)
    hist_df = hist_df[hist_df['FinalPosition'].notna()]

    driver_avg = hist_df.groupby('Driver')['FinalPosition'].mean().to_dict()
    race_agg['Driver_AvgPosition'] = race_agg['Driver'].map(driver_avg)

    if 'Team' in race_agg.columns:
        team_avg = hist_df.groupby('Team')['FinalPosition'].mean().to_dict()
        race_agg['Team_AvgPosition'] = race_agg['Team'].map(team_avg)
    return True


def _encode_column(values, encoder):
    """Map *values* to the encoder's integer codes, using -1 for unseen labels.

    A fitted LabelEncoder encodes a label as its index in ``classes_``, so a
    lookup table built once replaces a ``transform`` call per row.
    """
    codes = {label: code for code, label in enumerate(encoder.classes_)}
    return values.apply(lambda value: codes.get(str(value), -1))


def process_2025_data(races_df, quali_df, practice_df, sprint_df, model_data,
                      data_dir=None, history_years=(2021, 2022, 2023, 2024)):
    """Turn lap-level session data into one feature row per driver and event.

    The steps follow the docstring's original intent of processing the data
    the same way as the training data: aggregate race laps, merge qualifying,
    practice and sprint aggregates, derive extra features, attach historical
    average finishing positions and encode the categorical columns.

    Args:
        races_df: Race laps; must provide Year, Event, Driver and LapTime.
        quali_df: Qualifying laps, or an empty DataFrame.
        practice_df: Practice laps, or an empty DataFrame.
        sprint_df: Sprint laps, or an empty DataFrame.
        model_data: Mapping with an 'encoders' entry (column name -> fitted
            encoder exposing ``classes_``).
        data_dir: Directory of the historical '<year>_races.csv' files.
            Defaults to DATA_DIR.
        history_years: Seasons used for the historical averages.

    Returns:
        DataFrame with one row per (Year, Event, Driver). The input frames are
        left unmodified.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    print(f"\n{'='*80}")
    print("PROCESSING 2025 DATA")
    print("="*80)

    race_agg = _aggregate_race_laps(races_df)
    # The aggregation drops columns absent from the input, but StartPosition
    # is read unconditionally below, so make sure it exists.
    if 'StartPosition' not in race_agg.columns:
        race_agg['StartPosition'] = float('nan')
    print(f"‚úÖ Aggregated races: {len(race_agg)} records")

    if not quali_df.empty:
        race_agg = race_agg.merge(_qualifying_features(quali_df), on=_GROUP_KEYS, how='left')
        print(f"‚úÖ Merged qualifying data")

    if not practice_df.empty:
        race_agg = race_agg.merge(_practice_features(practice_df), on=_GROUP_KEYS, how='left')
        print(f"‚úÖ Merged practice data")

    if not sprint_df.empty:
        race_agg = _merge_sprint(race_agg, sprint_df)
        print(f"‚úÖ Merged sprint data")

    # Use qualifying as start position if missing. QualiPosition only exists
    # when qualifying data was merged above.
    if 'QualiPosition' in race_agg.columns:
        race_agg['StartPosition'] = race_agg['StartPosition'].fillna(race_agg['QualiPosition'])

    race_agg = _add_engineered_features(race_agg)

    if _add_historical_averages(race_agg, data_dir, history_years):
        print(f"‚úÖ Added historical performance features")

    # Encode categoricals using the encoders saved with the model.
    encoders = model_data['encoders']
    for col in ['Driver', 'Team', 'Event']:
        if col in race_agg.columns and col in encoders:
            race_agg[f'{col}_encoded'] = _encode_column(race_agg[col], encoders[col])

    print(f"‚úÖ Encoded categorical variables")

    return race_agg

# ==============================================================================
# MAKE PREDICTIONS
# ==============================================================================

def predict_2025(data_2025, model_data):
    """Predict a finishing position for every row of *data_2025*.

    Args:
        data_2025: Feature table holding the model's feature columns plus
            Year, Event, Driver, Team, StartPosition and FinalPosition.
        model_data: Mapping with 'model' (object exposing ``predict``) and
            'features' (list of feature column names).

    Returns:
        DataFrame with the six identifying columns plus PredictedPosition
        (rounded and clipped to 1..20), Error (predicted minus actual) and
        AbsError.
    """
    print(f"\n{'='*80}")
    print("MAKING PREDICTIONS")
    print("="*80)

    estimator = model_data['model']
    feature_names = model_data['features']

    # Missing feature values are replaced by that column's median.
    design = data_2025[feature_names].copy()
    column_medians = design.median()
    design = design.fillna(column_medians)

    raw_scores = estimator.predict(design)
    rounded = np.round(raw_scores)
    positions = np.clip(rounded, 1, 20)

    identifying = ['Year', 'Event', 'Driver', 'Team', 'StartPosition', 'FinalPosition']
    results = data_2025[identifying].copy()
    results['PredictedPosition'] = positions
    signed_error = results['PredictedPosition'] - results['FinalPosition']
    results['Error'] = signed_error
    results['AbsError'] = np.abs(results['Error'])

    print(f"‚úÖ Predicted {len(results)} driver-race combinations")

    return results

# ==============================================================================
# EVALUATE
# ==============================================================================

def evaluate_predictions(results):
    """Score predicted against actual finishing positions and print a summary.

    Rows lacking either a FinalPosition or a PredictedPosition cannot be
    scored and are excluded (their count is reported).

    Args:
        results: DataFrame with Event, Driver, FinalPosition,
            PredictedPosition, Error and AbsError columns. It is not modified.

    Returns:
        Tuple (rmse, mae, race_metrics, driver_metrics):
        rmse / mae are floats over all scored rows (NaN when none exist);
        race_metrics is indexed by Event with MAE, Mean_Error and Std_Error,
        sorted by MAE; driver_metrics is indexed by Driver with MAE and Races,
        restricted to drivers with at least 3 scored races and sorted by MAE.
    """
    print(f"\n{'='*80}")
    print("EVALUATION METRICS")
    print("="*80)

    # Work on a filtered copy so NaN positions cannot break the metrics and
    # the caller's frame does not gain a helper column.
    scorable = results['FinalPosition'].notna() & results['PredictedPosition'].notna()
    scored = results[scorable].copy()
    skipped = len(results) - len(scored)
    if skipped:
        print(f"\n   Skipped {skipped} rows without a finishing position or prediction")

    if scored.empty:
        print("\n   No rows to evaluate")
        return (
            float('nan'),
            float('nan'),
            pd.DataFrame(columns=['MAE', 'Mean_Error', 'Std_Error']),
            pd.DataFrame(columns=['MAE', 'Races']),
        )

    # Overall metrics, computed directly from the two position columns.
    deltas = scored['PredictedPosition'] - scored['FinalPosition']
    rmse = float(np.sqrt(np.mean(np.square(deltas))))
    mae = float(np.mean(np.abs(deltas)))

    print(f"\nüìä Overall Performance:")
    print(f"   RMSE: {rmse:.2f} positions")
    print(f"   MAE:  {mae:.2f} positions")

    # Per-race performance
    print(f"\nüìä Per-Race Performance:")
    race_metrics = scored.groupby('Event').agg({
        'AbsError': 'mean',
        'Error': ['mean', 'std']
    }).round(2)

    race_metrics.columns = ['MAE', 'Mean_Error', 'Std_Error']
    race_metrics = race_metrics.sort_values('MAE')

    print(f"\nBest predictions (lowest error):")
    print(race_metrics.head(5).to_string())

    print(f"\nWorst predictions (highest error):")
    print(race_metrics.tail(5).to_string())

    # Per-driver performance
    driver_metrics = scored.groupby('Driver').agg({
        'AbsError': 'mean',
        'FinalPosition': 'count'
    }).round(2)
    driver_metrics.columns = ['MAE', 'Races']
    driver_metrics = driver_metrics[driver_metrics['Races'] >= 3]  # At least 3 races
    driver_metrics = driver_metrics.sort_values('MAE')

    print(f"\nüìä Per-Driver Performance (drivers with 3+ races):")
    print(f"\nMost predictable drivers:")
    print(driver_metrics.head(5).to_string())

    print(f"\nLeast predictable drivers:")
    print(driver_metrics.tail(5).to_string())

    # Prediction accuracy by position. The last bin stretches to the largest
    # observed position so grids of more than 20 cars stay in 'Back'.
    top_edge = max(20, scored['FinalPosition'].max())
    scored['PositionBucket'] = pd.cut(scored['FinalPosition'],
                                      bins=[0, 3, 10, top_edge],
                                      labels=['Podium', 'Midfield', 'Back'])

    print(f"\nüìä Accuracy by Position:")
    # observed=False keeps all three buckets in the table even when one is
    # empty, and states the behaviour explicitly instead of relying on the
    # default for categorical groupers.
    position_metrics = (
        scored.groupby('PositionBucket', observed=False)['AbsError']
        .agg(['mean', 'std'])
        .round(2)
    )
    print(position_metrics.to_string())

    return rmse, mae, race_metrics, driver_metrics

# ==============================================================================
# SAVE RESULTS
# ==============================================================================

def save_results(results, rmse, mae, race_metrics, driver_metrics, model_data):
    """Write the predictions CSV and the plain-text evaluation report.

    Args:
        results: Prediction table with Year, Event, Driver, Team,
            StartPosition, FinalPosition, PredictedPosition, Error, AbsError.
        rmse: Overall RMSE to report.
        mae: Overall MAE to report.
        race_metrics: Per-race metrics table, written via ``to_string``.
        driver_metrics: Per-driver metrics table, written via ``to_string``.
        model_data: Mapping whose 'training_rmse' entry is included in the
            report header.

    Side effects:
        Writes OUTPUT_PREDICTIONS (CSV) and OUTPUT_EVALUATION (UTF-8 text).
    """
    print(f"\n{'='*80}")
    print("SAVING RESULTS")
    print("="*80)

    # Save predictions
    results_save = results[['Year', 'Event', 'Driver', 'Team', 'StartPosition',
                             'FinalPosition', 'PredictedPosition', 'Error', 'AbsError']]
    results_save = results_save.sort_values(['Event', 'FinalPosition'])
    results_save.to_csv(OUTPUT_PREDICTIONS, index=False)

    print(f"‚úÖ Predictions saved: {OUTPUT_PREDICTIONS}")

    rule = "="*80 + "\n"

    def section(title):
        """Return the three-line banner that introduces a report section."""
        return rule + title + "\n" + rule + "\n"

    report = [
        section("F1 2025 RACE PREDICTIONS - EVALUATION REPORT"),
        f"Model Training RMSE: {model_data['training_rmse']:.2f} positions\n",
        f"2025 Test RMSE: {rmse:.2f} positions\n",
        f"2025 Test MAE: {mae:.2f} positions\n\n",
        section("PER-RACE PERFORMANCE"),
        race_metrics.to_string(),
        "\n\n",
        section("PER-DRIVER PERFORMANCE (3+ races)"),
        driver_metrics.to_string(),
        "\n\n",
        section("SAMPLE PREDICTIONS"),
    ]

    if results.empty:
        # Nothing to sample; indexing the first row would raise IndexError.
        report.append("No predictions available.\n\n")
    else:
        # Detail view for the event of the first row (row order, which is
        # not necessarily the chronologically first race).
        first_race = results['Event'].iloc[0]
        first_race_results = results[results['Event'] == first_race].sort_values('FinalPosition')
        report.append(f"Example: {first_race}\n\n")
        report.append(
            first_race_results[['Driver', 'StartPosition', 'PredictedPosition',
                                'FinalPosition', 'Error']].to_string(index=False)
        )
        report.append("\n\n")

    # Explicit encoding: event and driver names may contain non-ASCII
    # characters, and the platform default encoding varies.
    with open(OUTPUT_EVALUATION, 'w', encoding='utf-8') as f:
        f.write("".join(report))

    print(f"‚úÖ Evaluation report saved: {OUTPUT_EVALUATION}")

# ==============================================================================
# MAIN
# ==============================================================================

def main():
    """Run the prediction script end to end.

    Calls, in order: load_trained_model, load_2025_data, process_2025_data,
    predict_2025, evaluate_predictions and save_results. Returns early when
    the model could not be loaded or the 2025 race frame is empty.
    """
    rule = "=" * 80

    print("\n" + rule)
    print("F1 2025 RACE PREDICTIONS & EVALUATION")
    print(rule + "\n")

    bundle = load_trained_model()
    if bundle is None:
        return

    # (races, qualifying, practice, sprint), in that order.
    session_frames = load_2025_data()
    if session_frames[0].empty:
        print("\n‚ùå No 2025 race data found! Run 1_download_data.py first")
        return

    features_2025 = process_2025_data(*session_frames, bundle)
    predictions = predict_2025(features_2025, bundle)

    # (rmse, mae, race_metrics, driver_metrics), passed straight through.
    metrics = evaluate_predictions(predictions)
    save_results(predictions, *metrics, bundle)

    closing_lines = (
        "\n" + rule,
        "‚úÖ PREDICTION & EVALUATION COMPLETE!",
        rule,
        "üìÅ Check outputs:",
        f"   - {OUTPUT_PREDICTIONS}",
        f"   - {OUTPUT_EVALUATION}",
        rule + "\n",
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()
