In [1]:
!pip install fastf1 pandas numpy scikit-learn matplotlib seaborn requests optuna lightgbm prophet feature-engine catboost mlFlow

Collecting fastf1
  Downloading fastf1-3.5.3-py3-none-any.whl.metadata (4.6 kB)
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting feature-engine
  Downloading feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting mlFlow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting rapidfuzz (from fastf1)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting requests-cache>=1.0.0 (from fastf1)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting timple>=0.1.6 (from fastf1)
  Downloading timple-0.1.8-py3-none-any.whl.metadata (2.0 kB)
Collecting websockets<14,>=10.3 (from fastf1)
  Downloading websockets-13.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Colle

In [10]:
import os
import fastf1
import pandas as pd
import numpy as np
import requests
import optuna
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from prophet import Prophet
from tensorflow import keras
import mlflow
import shap
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.creation import CyclicalFeatures
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment tracking: point MLflow at a local ./mlruns file store and select
# the experiment that subsequent runs are recorded under.
MLFLOW_TRACKING_URI = "file:./mlruns"
MLFLOW_EXPERIMENT_NAME = "F1_Canada_Predictions"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

class WeatherAnalyzer:
    """Turn weather readings into a single multiplicative lap-time factor.

    A factor of 1.0 means "no effect"; values above 1.0 mean slower laps.
    """

    def __init__(self):
        # Base multiplier for each named condition; unknown names fall back to 1.0.
        self.weather_impact = dict(
            DRY=1.0,
            LIGHT_RAIN=1.05,
            RAIN=1.15,
            HEAVY_RAIN=1.25,
        )
        # Track temperature treated as ideal; deviations from it are penalised.
        self.track_temp_optimal = 35

    def calculate_weather_impact(self, conditions, track_temp, air_temp):
        """Return the combined weather multiplier for one observation.

        Args:
            conditions: Condition label looked up in ``self.weather_impact``;
                labels that are not in the table count as 1.0.
            track_temp: Track temperature, compared with ``track_temp_optimal``.
            air_temp: Air temperature, compared with a fixed reference of 15.

        Returns:
            The product of the condition, track-temperature, air-density and
            humidity factors.
        """
        condition_factor = self.weather_impact.get(conditions, 1.0)

        # 0.2% slower per degree away from the optimal track temperature.
        degrees_off_optimal = abs(track_temp - self.track_temp_optimal)
        track_temp_factor = 1 + (degrees_off_optimal * 0.002)

        # 0.1% per degree of air temperature below the 15-degree reference.
        air_density_factor = 1 + ((15 - air_temp) * 0.001)

        # 0.1% per degree of air temperature above the 15-degree reference.
        # NOTE(review): this term mirrors the air-density term, so the two
        # nearly cancel; no humidity reading is actually used. Confirm intent.
        humidity_factor = 1 + (0.001 * (air_temp - 15))

        combined = condition_factor * track_temp_factor
        combined = combined * air_density_factor
        combined = combined * humidity_factor
        return combined



In [11]:
class TireStrategyAnalyzer:
    def __init__(self):
        self.compound_characteristics = {
            'SOFT': {'peak_grip': 1.0, 'wear_rate': 0.12, 'optimal_temp': 90, 'durability': 0.8},
            'MEDIUM': {'peak_grip': 0.97, 'wear_rate': 0.08, 'optimal_temp': 85, 'durability': 1.0},
            'HARD': {'peak_grip': 0.94, 'wear_rate': 0.05, 'optimal_temp': 80, 'durability': 1.2},
            'INTERMEDIATE': {'peak_grip': 0.96, 'wear_rate': 0.10, 'optimal_temp': 75, 'durability': 0.9},
            'WET': {'peak_grip': 0.93, 'wear_rate': 0.07, 'optimal_temp': 65, 'durability': 1.1}
        }

    def calculate_tire_performance(self, compound, lap_number, track_temp):
        """Calculate tire performance considering Montreal's characteristics"""
        # Handle unknown compounds
        if compound not in self.compound_characteristics:
            print(f"Warning: Unknown tire compound '{compound}', using MEDIUM characteristics")
            char = self.compound_characteristics['MEDIUM']
        else:
            char = self.compound_characteristics[compound]

        # Base performance
        base_perf = char['peak_grip']

        # Wear effect (Montreal is medium on tires)
        wear = lap_number * char['wear_rate']

        # Temperature effect
        temp_delta = abs(track_temp - char['optimal_temp'])
        temp_effect = 1 - (temp_delta * 0.002)

        # Montreal-specific adjustments
        hairpin_impact = 1 - (wear * 1.2)  # Heavy braking zones
        straight_impact = 1 - (wear * 0.8)  # Long straights

        # Durability factor
        durability_factor = 1 - (wear * (1 - char['durability']))

        return base_perf * (1 - wear) * temp_effect * hairpin_impact * straight_impact * durability_factor

    def preprocess_data(self, data):
        """Preprocess data with Canadian GP specific handling"""
        try:
            processed_data = data.copy()

            # Handle time-based columns
            time_cols = [col for col in processed_data.columns
                        if any(t in col.lower() for t in ['time', 'lap', 'sector'])]

            for col in time_cols:
                if pd.api.types.is_timedelta64_dtype(processed_data[col]):
                    processed_data[f'{col}_seconds'] = processed_data[col].dt.total_seconds()
                    processed_data = processed_data.drop(col, axis=1)

            # Replace empty strings and other problematic values with NaN
            processed_data = processed_data.replace(['', 'None', 'NaN', 'nan', 'NULL'], np.nan)

            # Handle datetime columns
            if 'LapStartDate' in processed_data.columns:
                try:
                    # Convert to datetime if not already
                    processed_data['LapStartDate'] = pd.to_datetime(processed_data['LapStartDate'])
                    # Extract useful features from datetime
                    processed_data['DayOfYear'] = processed_data['LapStartDate'].dt.dayofyear
                    processed_data['MonthOfYear'] = processed_data['LapStartDate'].dt.month
                    processed_data['DayOfWeek'] = processed_data['LapStartDate'].dt.dayofweek
                    # Drop original column
                    processed_data = processed_data.drop('LapStartDate', axis=1)
                except Exception as e:
                    print(f"Warning: Error processing LapStartDate: {e}")
                    processed_data = processed_data.drop('LapStartDate', axis=1)

            # Handle categorical variables
            cat_cols = ['Driver', 'Team', 'Compound', 'TrackStatus', 'Session']
            for col in cat_cols:
                if col in processed_data.columns:
                    if col == 'Compound':
                        # Create dummy variables for tire compounds
                        compound_dummies = pd.get_dummies(processed_data[col], prefix='Compound')
                        processed_data = pd.concat([processed_data, compound_dummies], axis=1)
                        processed_data = processed_data.drop('Compound', axis=1)  # Drop original column
                    else:
                        # Fill NaN with a placeholder before encoding
                        processed_data[col] = processed_data[col].fillna('Unknown')
                        # Create both label encoding and one-hot encoding
                        processed_data[f'{col}_encoded'] = pd.Categorical(processed_data[col]).codes
                        # Create one-hot encoding for the categorical columns
                        dummies = pd.get_dummies(processed_data[col], prefix=col)
                        processed_data = pd.concat([processed_data, dummies], axis=1)
                        processed_data = processed_data.drop(col, axis=1)

            # Create session-specific features
            processed_data['IsRace'] = (processed_data['Session'] == 'Race').astype(int)
            processed_data['IsQuali'] = (processed_data['Session'] == 'Qualifying').astype(int)
            processed_data['IsSprint'] = (processed_data['Session'] == 'Sprint').astype(int)

            # Convert specific columns to numeric, handling errors
            numeric_cols = [
                'LapNumber', 'Stint', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
                'TyreLife', 'Position', 'Year', 'DayOfYear', 'MonthOfYear', 'DayOfWeek'
            ]

            for col in numeric_cols:
                if col in processed_data.columns:
                    try:
                        processed_data[col] = pd.to_numeric(processed_data[col], errors='coerce')
                    except Exception as e:
                        print(f"Warning: Error converting {col} to numeric: {e}")

            # Handle missing values for numeric columns
            numeric_cols = processed_data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                if processed_data[col].isnull().any():
                    if 'Time' in col or 'time' in col:
                        # For time columns, use median to avoid outlier effects
                        median_val = processed_data[col].median()
                        processed_data[col] = processed_data[col].fillna(median_val)
                    elif col in ['Position', 'LapNumber', 'Stint']:
                        # For these columns, use forward fill then backward fill
                        processed_data[col] = processed_data[col].ffill().bfill()
                    elif col in ['Year']:
                        # For year, use mode
                        mode_val = processed_data[col].mode()[0]
                        processed_data[col] = processed_data[col].fillna(mode_val)
                    else:
                        # For other numeric columns, use mean
                        mean_val = processed_data[col].mean()
                        processed_data[col] = processed_data[col].fillna(mean_val)

            # Drop non-essential columns that might cause issues
            cols_to_drop = ['IsPersonalBest', 'DeletedReason', 'Deleted', 'FastF1Generated', 'IsAccurate', 'DriverNumber', 'FreshTyre']
            processed_data = processed_data.drop([col for col in cols_to_drop if col in processed_data.columns], axis=1)

            # Ensure all remaining numeric columns are float64
            numeric_cols = processed_data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                processed_data[col] = processed_data[col].astype(np.float64)

            # Print debugging information
            print("\nProcessed data info:")
            print(processed_data.info())
            print("\nNumeric columns:", numeric_cols.tolist())
            print("\nMissing values after processing:")
            print(processed_data.isnull().sum()[processed_data.isnull().sum() > 0])

            # Final check for any remaining non-numeric data in numeric columns
            for col in numeric_cols:
                non_numeric = processed_data[pd.to_numeric(processed_data[col], errors='coerce').isnull()]
                if len(non_numeric) > 0:
                    print(f"\nWarning: Found non-numeric values in {col}:")
                    print(non_numeric[col].unique())

            return processed_data

        except Exception as e:
            print(f"Error in preprocessing: {e}")
            print("Data columns:", data.columns.tolist())
            if 'Compound' in data.columns:
                print("\nUnique compounds:", data['Compound'].unique())
            return None

    def _add_montreal_features(self, data):
        """Add Montreal-specific features to the dataset"""
        try:
            # Weather impact
            if 'WeatherCondition' in data.columns:
                data['WeatherImpact'] = data.apply(
                    lambda x: self.weather_analyzer.calculate_weather_impact(
                        x['WeatherCondition'],
                        x.get('TrackTemp', 25),
                        x.get('AirTemp', 20)
                    ), axis=1
                )

            # Tire performance
            if 'Compound' in data.columns:
                # Print unique compounds for debugging
                print("\nUnique tire compounds found:", data['Compound'].unique())

                data['TirePerformance'] = data.apply(
                    lambda x: self.tire_analyzer.calculate_tire_performance(
                        x['Compound'],
                        x['LapNumber'],
                        x.get('TrackTemp', 25)
                    ), axis=1
                )

            # DRS effectiveness (Montreal has long DRS zones)
            if 'DRS' in data.columns:
                data['DRSEffect'] = data['DRS'] * 0.3  # 30% performance gain potential

            # Track evolution
            data['TrackEvolution'] = data.groupby('Year')['LapNumber'].transform(
                lambda x: (x / x.max()) * 0.1 + 1  # Up to 10% improvement
            )

            # Brake wear impact (Montreal is hard on brakes)
            data['BrakeWear'] = data.apply(
                lambda x: 1 - (x['LapNumber'] / 70) * 0.15  # Up to 15% degradation
                if x['Session'] == 'Race' else 1.0,
                axis=1
            )

            return data

        except Exception as e:
            print(f"Error adding Montreal features: {e}")
            print("Data columns:", data.columns.tolist())
            print("\nSample of problematic data:")
            if 'Compound' in data.columns:
                print(data[['Compound', 'LapNumber', 'TrackTemp']].head())
            return data



In [12]:
class CanadianGPPredictor:
    def __init__(self):
        """Enable the FastF1 cache and build analyzers, scalers and lookup tables.

        Attributes set here: ``weather_analyzer``, ``tire_analyzer``,
        ``scalers``, ``models`` (all slots start as ``None``),
        ``drivers_2025``, ``team_factors``, ``TRACK_SECTORS``,
        ``SPRINT_IMPORTANCE`` and ``QUALI_IMPORTANCE``.
        """
        # Enable on-disk caching before anything else is constructed.
        self.setup_cache()

        self.weather_analyzer = WeatherAnalyzer()
        self.tire_analyzer = TireStrategyAnalyzer()
        self.scalers = dict(standard=StandardScaler(), robust=RobustScaler())

        # One empty slot per model family; train_models fills them in.
        self.models = dict.fromkeys(('lightgbm', 'xgboost', 'catboost', 'neural_net'))

        # 2025 driver lineup as (code, name, team, performance_factor).
        # NOTE(review): teams and factors are hand-entered values; verify them
        # against the official 2025 entry list before relying on them.
        roster = (
            ('VER', 'Max Verstappen', 'Red Bull Racing', 0.995),
            ('NOR', 'Lando Norris', 'McLaren', 0.996),
            ('PIA', 'Oscar Piastri', 'McLaren', 0.997),
            ('RUS', 'George Russell', 'Mercedes', 0.997),
            ('SAI', 'Carlos Sainz', 'Audi', 0.998),
            ('ALB', 'Alexander Albon', 'Williams', 1.000),
            ('LEC', 'Charles Leclerc', 'Ferrari', 0.996),
            ('OCO', 'Esteban Ocon', 'Alpine', 1.000),
            ('HAM', 'Lewis Hamilton', 'Ferrari', 0.997),
            ('STR', 'Lance Stroll', 'Aston Martin', 1.001),
            ('GAS', 'Pierre Gasly', 'Alpine', 1.000),
            ('ALO', 'Fernando Alonso', 'Aston Martin', 0.998),
            ('HUL', 'Nico Hulkenberg', 'Haas', 1.001),
        )
        self.drivers_2025 = {
            code: {'name': name, 'team': team, 'performance_factor': factor}
            for code, name, team, factor in roster
        }

        # Per-team multipliers for 2025 (the driver factors above are all
        # around 1.0 as well).
        self.team_factors = dict([
            ('Red Bull Racing', 0.995),  # Dominant team
            ('McLaren', 0.997),          # Continued improvement
            ('Mercedes', 0.998),         # Recovery
            ('Audi', 0.999),             # New team but strong resources
            ('Williams', 1.001),         # Improving
            ('Ferrari', 0.997),          # Strong development
            ('Alpine', 1.000),           # Midfield
            ('Aston Martin', 0.999),     # Stable
            ('Haas', 1.002),             # Backmarker
        ])

        # Montreal-specific constants: named features grouped by sector.
        sector_layout = (
            ('S1', ('Turn1', 'Turn2')),
            ('S2', ('Hairpin', 'Back_Straight')),
            ('S3', ('Wall_Champions', 'Final_Chicane')),
        )
        self.TRACK_SECTORS = {sector: list(features) for sector, features in sector_layout}

        # Weights for sprint and qualifying sessions.
        self.SPRINT_IMPORTANCE = 0.3
        self.QUALI_IMPORTANCE = 0.4

    def setup_cache(self):
        """Create the local FastF1 cache directory if needed and enable caching in it."""
        cache_dir = "f1_cache"
        os.makedirs(cache_dir, exist_ok=True)
        fastf1.Cache.enable_cache(cache_dir)

    def load_historical_data(self):
        """Load and combine Canadian GP laps for 2024, 2023 and 2022.

        For each year the race and qualifying sessions are loaded, followed by
        a sprint session if one can be loaded. Every lap table is tagged with
        ``Session`` and ``Year`` columns, the tables are concatenated, and the
        Montreal-specific features are added.

        Returns:
            The combined DataFrame, or ``None`` if a race/qualifying load or
            the later combination step raised (the error is printed). A failed
            sprint load is only reported and otherwise ignored.
        """
        def fetch_laps(year, identifier, label):
            # Load one session and return a tagged copy of its laps.
            session = fastf1.get_session(year, "Canada", identifier)
            session.load()
            laps = session.laps.copy()
            laps['Session'] = label
            laps['Year'] = year
            return laps

        try:
            frames = []

            for year in (2024, 2023, 2022):
                print(f"\nLoading {year} data...")

                race_laps = fetch_laps(year, "R", 'Race')
                quali_laps = fetch_laps(year, "Q", 'Qualifying')

                # Sprint is optional; when present it is placed ahead of the
                # race and qualifying laps of the same year.
                try:
                    frames.append(fetch_laps(year, "S", 'Sprint'))
                except Exception as e:
                    print(f"No sprint data for {year}: {e}")

                frames += [race_laps, quali_laps]

            combined = pd.concat(frames, ignore_index=True)
            return self._add_montreal_features(combined)

        except Exception as e:
            print(f"Error loading Canadian GP data: {e}")
            return None

    def preprocess_data(self, data):
        """Convert the combined lap table into a model-ready feature frame.

        Works on a copy of ``data``. Steps, in order: placeholder strings to
        NaN, session flags, timedelta columns to ``<name>_seconds``, calendar
        features from ``LapStartDate``, one-hot encoding of the categorical
        columns, numeric coercion, per-column imputation, dropping bookkeeping
        columns, float64 casting, and a final inf/NaN clean-up.

        Args:
            data: Lap DataFrame with string column names. ``Session`` is
                required; all other columns are optional.

        Returns:
            The processed DataFrame, or ``None`` if any step raised (the error
            is printed instead of being propagated).
        """
        try:
            processed_data = data.copy()
            print("\nStarting data preprocessing...")
            print("Initial columns:", processed_data.columns.tolist())

            # Replace empty strings and other placeholder values with NaN
            processed_data = processed_data.replace(['', 'None', 'NaN', 'nan', 'NULL'], np.nan)

            # Create session-specific features first (before dropping Session column)
            processed_data['IsRace'] = (processed_data['Session'] == 'Race').astype(int)
            processed_data['IsQuali'] = (processed_data['Session'] == 'Qualifying').astype(int)
            processed_data['IsSprint'] = (processed_data['Session'] == 'Sprint').astype(int)

            # Convert timedelta columns to float seconds. Only columns whose
            # name mentions time/lap/sector are inspected.
            time_cols = [col for col in processed_data.columns
                         if any(t in col.lower() for t in ['time', 'lap', 'sector'])]

            for col in time_cols:
                if pd.api.types.is_timedelta64_dtype(processed_data[col]):
                    processed_data[f'{col}_seconds'] = processed_data[col].dt.total_seconds()
                    processed_data = processed_data.drop(col, axis=1)

            # Expand the lap start timestamp into calendar features
            if 'LapStartDate' in processed_data.columns:
                try:
                    processed_data['LapStartDate'] = pd.to_datetime(processed_data['LapStartDate'])
                    processed_data['DayOfYear'] = processed_data['LapStartDate'].dt.dayofyear
                    processed_data['MonthOfYear'] = processed_data['LapStartDate'].dt.month
                    processed_data['DayOfWeek'] = processed_data['LapStartDate'].dt.dayofweek
                    processed_data = processed_data.drop('LapStartDate', axis=1)
                except Exception as e:
                    # Best effort: an unparseable timestamp column is discarded.
                    print(f"Warning: Error processing LapStartDate: {e}")
                    processed_data = processed_data.drop('LapStartDate', axis=1)

            # Handle categorical variables with one-hot encoding
            cat_cols = ['Driver', 'Team', 'Compound', 'TrackStatus', 'Session', 'WeatherCondition']
            for col in cat_cols:
                if col in processed_data.columns:
                    print(f"\nProcessing categorical column: {col}")
                    if col == 'WeatherCondition':
                        # Keep only the first 4 characters of each label.
                        # NOTE(review): this also shortens legitimate labels
                        # (e.g. 'LIGHT_RAIN' -> 'LIGH'); confirm it is intended.
                        processed_data[col] = processed_data[col].str[:4]
                    print(f"Unique values in {col}:", processed_data[col].unique())

                    # Fill NaN values
                    processed_data[col] = processed_data[col].fillna('Unknown')

                    # Create one-hot encoding
                    try:
                        dummies = pd.get_dummies(processed_data[col], prefix=col)
                        processed_data = pd.concat([processed_data, dummies], axis=1)
                        processed_data = processed_data.drop(col, axis=1)
                        print(f"Successfully created dummies for {col}")
                    except Exception as e:
                        print(f"Error creating dummies for {col}: {e}")
                        return None

            # Coerce the known numeric columns; unparseable entries become NaN
            numeric_cols = [
                'LapNumber', 'Stint', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
                'TyreLife', 'Position', 'Year', 'DayOfYear', 'MonthOfYear', 'DayOfWeek',
                'TirePerformance', 'TrackEvolution', 'BrakeWear', 'LapTime_seconds',
                'TrackTemp', 'AirTemp', 'Humidity', 'WindSpeed', 'WindDirection', 'CloudCover'
            ]

            for col in numeric_cols:
                if col in processed_data.columns:
                    try:
                        processed_data[col] = pd.to_numeric(processed_data[col], errors='coerce')
                    except Exception as e:
                        print(f"Warning: Error converting {col} to numeric: {e}")

            # Impute missing values, with the strategy chosen per column
            numeric_cols = processed_data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                if processed_data[col].isnull().any():
                    if 'Time' in col or 'time' in col:
                        # Median is less sensitive to outlier lap/sector times
                        median_val = processed_data[col].median()
                        processed_data[col] = processed_data[col].fillna(median_val)
                    elif col in ['Position', 'LapNumber', 'Stint']:
                        # Carry neighbouring rows forward, then backward
                        processed_data[col] = processed_data[col].ffill().bfill()
                    elif col in ['Year']:
                        mode_val = processed_data[col].mode()[0]
                        processed_data[col] = processed_data[col].fillna(mode_val)
                    else:
                        mean_val = processed_data[col].mean()
                        processed_data[col] = processed_data[col].fillna(mean_val)

            # Drop bookkeeping columns that are not used as features.
            # NOTE(review): a timedelta 'Time' column has already been renamed
            # to 'Time_seconds' above, so the 'Time' entry only matches a
            # non-timedelta 'Time' column; confirm whether 'Time_seconds'
            # should be dropped as well.
            cols_to_drop = [
                'IsPersonalBest', 'DeletedReason', 'Deleted', 'FastF1Generated',
                'IsAccurate', 'DriverNumber', 'FreshTyre', 'Time'
            ]
            processed_data = processed_data.drop([col for col in cols_to_drop if col in processed_data.columns], axis=1)

            # Ensure all remaining numeric columns are float64
            numeric_cols = processed_data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                processed_data[col] = processed_data[col].astype(np.float64)

            # Turn infinities into NaN and fill what is left with column means.
            # numeric_only=True keeps a leftover text column from raising
            # TypeError in mean(), which would make the whole method return None.
            processed_data = processed_data.replace([np.inf, -np.inf], np.nan)
            column_means = processed_data.mean(numeric_only=True)
            processed_data = processed_data.fillna(column_means)

            print("\nFinal columns after preprocessing:", processed_data.columns.tolist())
            print("\nNumeric columns:", numeric_cols.tolist())
            print("\nMissing values after processing:")
            print(processed_data.isnull().sum()[processed_data.isnull().sum() > 0])

            return processed_data

        except Exception as e:
            print(f"Error in preprocessing: {e}")
            print("Data columns:", data.columns.tolist())
            if 'Compound' in data.columns:
                print("\nUnique compounds:", data['Compound'].unique())
            return None

    def _add_montreal_features(self, data):
        """Add Montreal-specific features to the dataset"""
        try:
            # Weather impact
            if 'WeatherCondition' in data.columns:
                data['WeatherImpact'] = data.apply(
                    lambda x: self.weather_analyzer.calculate_weather_impact(
                        x['WeatherCondition'],
                        x.get('TrackTemp', 25),
                        x.get('AirTemp', 20)
                    ), axis=1
                )

            # Tire performance
            if 'Compound' in data.columns:
                print("\nUnique tire compounds found:", data['Compound'].unique())
                data['TirePerformance'] = data.apply(
                    lambda x: self.tire_analyzer.calculate_tire_performance(
                        x['Compound'],
                        x['LapNumber'],
                        x.get('TrackTemp', 25)
                    ), axis=1
                )

            # DRS effectiveness (Montreal has long DRS zones)
            if 'DRS' in data.columns:
                data['DRSEffect'] = data['DRS'] * 0.3  # 30% performance gain potential

            # Track evolution
            data['TrackEvolution'] = data.groupby('Year')['LapNumber'].transform(
                lambda x: (x / x.max()) * 0.1 + 1  # Up to 10% improvement
            )

            # Brake wear impact (Montreal is hard on brakes)
            data['BrakeWear'] = data.apply(
                lambda x: 1 - (x['LapNumber'] / 70) * 0.15  # Up to 15% degradation
                if x['Session'] == 'Race' else 1.0,
                axis=1
            )

            return data

        except Exception as e:
            print(f"Error adding Montreal features: {e}")
            print("Data columns:", data.columns.tolist())
            print("\nSample of problematic data:")
            if 'Compound' in data.columns:
                print(data[['Compound', 'LapNumber', 'TrackTemp']].head())
            return data

    def create_canadian_neural_network(self, input_shape):
        """Build and compile the dense regression network.

        Architecture: Dense 256 -> BatchNorm -> Dropout 0.4 -> Dense 128 ->
        BatchNorm -> Dropout 0.3 -> Dense 64 -> Dense 32 -> Dense 1, with ReLU
        on every hidden Dense layer and a linear single-unit output.

        Args:
            input_shape: Shape tuple of one input sample, e.g. ``(n_features,)``.

        Returns:
            A compiled ``keras.Sequential`` model (Adam with learning rate
            0.001, Huber loss, MAE metric).
        """
        model = keras.Sequential([
            # Declare the input with an explicit Input object instead of
            # passing input_shape= to the first Dense layer, which newer Keras
            # releases warn about.
            keras.Input(shape=input_shape),
            keras.layers.Dense(256, activation='relu'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.4),
            keras.layers.Dense(128, activation='relu'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dense(32, activation='relu'),
            keras.layers.Dense(1)
        ])

        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001),
            loss='huber',
            metrics=['mae']
        )

        return model

    def train_models(self, X_train, y_train, X_val, y_val):
        """Train multiple models with Canadian GP specific optimizations"""
        with mlflow.start_run():
            # Store training columns for prediction
            self.training_columns = X_train.columns.tolist()

            # Scale the data
            self.X_scaler = StandardScaler()
            self.y_scaler = StandardScaler()

            # Ensure data is finite and handle any remaining issues
            X_train = X_train.replace([np.inf, -np.inf], np.nan)
            X_val = X_val.replace([np.inf, -np.inf], np.nan)
            X_train = X_train.fillna(X_train.mean())
            X_val = X_val.fillna(X_val.mean())

            # Add interaction features
            if 'SpeedI1' in X_train.columns and 'TrackGrip' in X_train.columns:
                X_train['Speed_Track_Interaction'] = X_train['SpeedI1'] * X_train['TrackGrip']
                X_val['Speed_Track_Interaction'] = X_val['SpeedI1'] * X_val['TrackGrip']

            # Scale the data
            X_train_scaled = self.X_scaler.fit_transform(X_train)
            X_val_scaled = self.X_scaler.transform(X_val)
            y_train_scaled = self.y_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()
            y_val_scaled = self.y_scaler.transform(y_val.values.reshape(-1, 1)).ravel()

            # Convert scaled arrays back to dataframes to preserve column names
            X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
            X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_train.columns)

            # Add polynomial features after scaling
            if 'LapTime_seconds' in X_train_scaled.columns:
                X_train_scaled['LapTime_seconds_squared'] = X_train_scaled['LapTime_seconds'] ** 2
                X_train_scaled['LapTime_seconds_cubed'] = X_train_scaled['LapTime_seconds'] ** 3
                X_val_scaled['LapTime_seconds_squared'] = X_val_scaled['LapTime_seconds'] ** 2
                X_val_scaled['LapTime_seconds_cubed'] = X_val_scaled['LapTime_seconds'] ** 3

            # Train LightGBM with updated parameters
            self.models['lightgbm'] = lgb.LGBMRegressor(
                objective='regression',
                metric='rmse',
                n_estimators=2000,  # Increased number of trees
                learning_rate=0.005,  # Reduced learning rate
                num_leaves=63,  # Increased number of leaves
                min_child_samples=10,  # Reduced minimum samples per leaf
                subsample=0.8,
                colsample_bytree=0.8,
                subsample_freq=5,
                reg_alpha=0.01,  # Reduced L1 regularization
                reg_lambda=0.01,  # Reduced L2 regularization
                min_split_gain=0.0,  # Allow splits with minimal gain
                min_child_weight=1,  # Reduced minimum child weight
                random_state=42,
                verbose=-1  # Suppress warnings
            )

            # Train XGBoost with updated parameters
            self.models['xgboost'] = xgb.XGBRegressor(
                objective='reg:squarederror',
                n_estimators=2000,
                learning_rate=0.005,
                max_depth=8,
                subsample=0.8,
                colsample_bytree=0.8,
                min_child_weight=1,
                gamma=0.01,
                random_state=42
            )

            # Train CatBoost with updated parameters
            self.models['catboost'] = cb.CatBoostRegressor(
                iterations=2000,
                learning_rate=0.005,
                depth=8,
                loss_function='RMSE',
                subsample=0.8,
                rsm=0.8,
                random_seed=42,
                verbose=False
            )

            # Train Neural Network with updated architecture
            self.models['neural_net'] = self.create_canadian_neural_network(
                (X_train_scaled.shape[1],))

            # Train models
            for name, model in self.models.items():
                if model is not None:
                    print(f"\nTraining {name} model...")
                    try:
                        if name == 'neural_net':
                            model.fit(
                                X_train_scaled, y_train_scaled,
                                validation_data=(X_val_scaled, y_val_scaled),
                                epochs=100,
                                batch_size=32,
                                verbose=0,
                                callbacks=[
                                    keras.callbacks.EarlyStopping(
                                        monitor='val_loss',
                                        patience=10,
                                        restore_best_weights=True
                                    )
                                ]
                            )
                        elif name == 'lightgbm':
                            model.fit(
                                X_train_scaled,
                                y_train_scaled,
                                eval_set=[(X_val_scaled, y_val_scaled)],
                                callbacks=[
                                    lgb.early_stopping(100),  # Increased patience
                                    lgb.log_evaluation(0)
                                ]
                            )
                        elif name == 'xgboost':
                            model.fit(
                                X_train_scaled,
                                y_train_scaled,
                                eval_set=[(X_val_scaled, y_val_scaled)],
                                early_stopping_rounds=100,  # Increased patience
                                verbose=False
                            )
                        elif name == 'catboost':
                            model.fit(
                                X_train_scaled,
                                y_train_scaled,
                                eval_set=(X_val_scaled, y_val_scaled),
                                early_stopping_rounds=100,  # Increased patience
                                verbose=False
                            )

                        # Log metrics
                        train_pred = self.y_scaler.inverse_transform(
                            model.predict(X_train_scaled).reshape(-1, 1)).ravel()
                        val_pred = self.y_scaler.inverse_transform(
                            model.predict(X_val_scaled).reshape(-1, 1)).ravel()

                        train_mae = mean_absolute_error(y_train, train_pred)
                        val_mae = mean_absolute_error(y_val, val_pred)
                        train_r2 = r2_score(y_train, train_pred)
                        val_r2 = r2_score(y_val, val_pred)

                        print(f"{name} Results:")
                        print(f"Train MAE: {train_mae:.3f}, Train R2: {train_r2:.3f}")
                        print(f"Val MAE: {val_mae:.3f}, Val R2: {val_r2:.3f}")

                        mlflow.log_metrics({
                            f'{name}_train_mae': train_mae,
                            f'{name}_val_mae': val_mae,
                            f'{name}_train_r2': train_r2,
                            f'{name}_val_r2': val_r2
                        })
                    except Exception as e:
                        print(f"Error training {name} model: {e}")
                        self.models[name] = None  # Mark model as failed

    def predict_performance(self, X):
        """Make predictions using ensemble of models"""
        X_scaled = self.X_scaler.transform(X)
        predictions = {}
        weights = {
            'lightgbm': 0.3,
            'xgboost': 0.25,
            'catboost': 0.25,
            'neural_net': 0.2
        }

        # Initialize final prediction array
        final_pred = np.zeros(X.shape[0])
        weight_sum = 0

        # Get predictions from each model
        for name, model in self.models.items():
            if model is not None:
                try:
                    # Ensure predictions are 1D array
                    pred = model.predict(X_scaled)
                    if len(pred.shape) > 1:
                        pred = pred.ravel()

                    weight = weights.get(name, 0)
                    final_pred += pred * weight
                    weight_sum += weight
                except Exception as e:
                    print(f"Warning: Error getting predictions from {name} model: {e}")
                    continue

        if weight_sum > 0:
            final_pred /= weight_sum
            # Scale back predictions
            final_pred = self.y_scaler.inverse_transform(final_pred.reshape(-1, 1)).ravel()
            return final_pred
        else:
            print("Warning: No valid predictions from any model")
            return None

    def analyze_performance(self, X, feature_names):
        """Analyze feature importance and performance patterns"""
        if self.models['lightgbm'] is not None:
            # SHAP analysis
            explainer = shap.TreeExplainer(self.models['lightgbm'])
            shap_values = explainer.shap_values(X)

            plt.figure(figsize=(15, 10))
            shap.summary_plot(shap_values, X, feature_names=feature_names, show=False)
            plt.title("Feature Importance Analysis")
            plt.tight_layout()
            plt.show()

            # Feature importance
            importance = pd.DataFrame({
                'Feature': feature_names,
                'Importance': np.abs(shap_values).mean(0)
            })
            importance = importance.sort_values('Importance', ascending=False)

            print("\nTop 10 Most Important Features:")
            print(importance.head(10))

    def predict_2025_canadian_gp(self):
        """Predict the finishing order for the 2025 Canadian GP"""
        try:
            print("\nPredicting 2025 Canadian Grand Prix Results...")

            # Create a sample race conditions dataframe
            race_conditions = []

            # Montreal typical June conditions
            base_conditions = {
                'TrackTemp': 35,  # Typical June temperature
                'AirTemp': 22,
                'Humidity': 65,  # Typical Montreal humidity
                'WeatherCondition': 'DRY',  # Single value, not repeated
                'Session': 'Race',
                'Year': 2025,
                'LapNumber': 1,
                'Stint': 1,
                'SpeedI1': 0,  # Will be filled with realistic values
                'SpeedI2': 0,
                'SpeedFL': 0,
                'SpeedST': 0,
                'TyreLife': 0,
                'Position': 1,
                'TrackStatus': '1',
                'Compound': 'MEDIUM',
                'BrakeWear': 1.0,
                'TrackEvolution': 1.0,
                'TirePerformance': 1.0,
                'DRS': 0,  # DRS availability
                'SafetyCar': 0,  # Safety car probability
                'TrackGrip': 1.0,  # Track grip level
                'WindSpeed': 10,  # Typical wind speed in km/h
                'WindDirection': 0,  # Wind direction in degrees
                'CloudCover': 30  # Cloud cover percentage
            }

            # Define typical sector speeds for Montreal (in km/h)
            sector_speeds = {
                'SpeedI1': (315, 325),  # Speed trap 1 range
                'SpeedI2': (300, 310),  # Speed trap 2 range
                'SpeedFL': (290, 300),  # Flying lap speed range
                'SpeedST': (305, 315)   # Speed trap range
            }

            # Base lap time for Montreal (in seconds)
            base_lap_time = 75.0  # Typical race lap time for Montreal

            # Tire strategy options for Montreal
            tire_strategies = {
                'Red Bull Racing': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Ferrari': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'McLaren': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Mercedes': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Aston Martin': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Audi': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Alpine': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Williams': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Sauber': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'VCARB': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'},
                'Haas': {'stint1': 'MEDIUM', 'stint2': 'HARD', 'stint3': 'MEDIUM'}
            }

            # Generate race simulation data for each driver
            for driver_code, info in self.drivers_2025.items():
                # Get team and driver performance factors
                team_factor = self.team_factors[info['team']]
                driver_factor = info['performance_factor']
                team_strategy = tire_strategies[info['team']]

                # Simulate multiple laps for each driver
                for lap in range(1, 71):  # 70 laps
                    conditions = base_conditions.copy()

                    # Calculate stint and compound based on strategy
                    stint = min((lap - 1) // 23 + 1, 3)  # Cap at 3 stints
                    compound = team_strategy[f'stint{stint}']

                    # Calculate base lap time with team and driver factors
                    lap_time = base_lap_time * team_factor * driver_factor

                    # Apply tire wear effect
                    tire_wear = 1 + (conditions['TyreLife'] * 0.001)  # 0.1% degradation per lap
                    lap_time *= tire_wear

                    # Apply compound effect
                    compound_factor = 1.0
                    if compound == 'HARD':
                        compound_factor = 1.02  # Hard tires are 2% slower
                    elif compound == 'SOFT':
                        compound_factor = 0.98  # Soft tires are 2% faster
                    lap_time *= compound_factor

                    # Apply track evolution
                    track_evolution = 1.0 - (lap / 70) * 0.05  # Up to 5% improvement
                    lap_time *= track_evolution

                    # Apply DRS effect if available
                    if conditions['DRS']:
                        lap_time *= 0.98  # 2% faster with DRS

                    # Add small random variation (±0.2%)
                    lap_time *= (0.998 + np.random.random() * 0.004)

                    # Update conditions with calculated lap time
                    conditions.update({
                        'Driver': driver_code,
                        'Team': info['team'],
                        'LapNumber': lap,
                        'Stint': stint,
                        'Compound': compound,
                        'TyreLife': lap % 23,  # Reset after each pit stop
                        'TrackStatus': '1',  # Normal racing conditions
                        'Position': 1,  # Will be adjusted based on predictions
                        'TrackEvolution': track_evolution,
                        'BrakeWear': 1.0 - (lap / 70) * 0.15,  # Brake degradation
                        'DRS': 1 if lap > 2 else 0,  # DRS available after lap 2
                        'SafetyCar': 0.05 if lap > 10 else 0,  # Small chance of SC after lap 10
                        'TrackGrip': 1.0 + (lap / 70) * 0.05,  # Track grip improves slightly
                        'LapTime_seconds': lap_time  # Add the calculated lap time
                    })

                    # Add realistic speed variations with team and driver factors
                    for speed_col, (min_speed, max_speed) in sector_speeds.items():
                        base_speed = (min_speed + max_speed) / 2
                        # Add some random variation and account for tire wear
                        speed_factor = 1.0 - (conditions['TyreLife'] * 0.002)  # 0.2% degradation per lap
                        # Apply team and driver performance factors
                        speed_factor *= team_factor * driver_factor
                        # Apply DRS effect if available
                        if conditions['DRS']:
                            speed_factor *= 1.03  # 3% speed boost with DRS
                        conditions[speed_col] = base_speed * speed_factor * (0.98 + np.random.random() * 0.04)

                    race_conditions.append(conditions)

            # Convert to DataFrame
            race_df = pd.DataFrame(race_conditions)

            # Preprocess the race simulation data
            processed_data = self.preprocess_data(race_df)
            if processed_data is None:
                raise ValueError("Failed to preprocess race simulation data")

            # Ensure columns match training data
            if not hasattr(self, 'training_columns'):
                raise ValueError("Model hasn't been trained yet. No training columns available.")

            # Add missing columns with zeros using pd.concat to avoid fragmentation
            missing_cols = set(self.training_columns) - set(processed_data.columns)
            if missing_cols:
                missing_data = pd.DataFrame(0, index=processed_data.index, columns=list(missing_cols))
                processed_data = pd.concat([processed_data, missing_data], axis=1)

            # Remove extra columns
            processed_data = processed_data[self.training_columns]

            # Make predictions
            predictions = self.predict_performance(processed_data)
            if predictions is None:
                raise ValueError("Failed to generate predictions")

            # Calculate average lap time for each driver
            driver_performances = {}
            for i, pred in enumerate(predictions):
                driver = race_conditions[i]['Driver']
                if driver not in driver_performances:
                    driver_performances[driver] = []
                driver_performances[driver].append(pred)

            # Calculate average performance (excluding outliers)
            final_performances = {}
            for driver, times in driver_performances.items():
                times_array = np.array(times)
                # Remove outliers (times outside 1.5 IQR)
                Q1 = np.percentile(times_array, 25)
                Q3 = np.percentile(times_array, 75)
                IQR = Q3 - Q1
                mask = (times_array >= Q1 - 1.5 * IQR) & (times_array <= Q3 + 1.5 * IQR)
                final_performances[driver] = np.mean(times_array[mask])

            # Sort drivers by performance
            sorted_drivers = sorted(final_performances.items(), key=lambda x: x[1])

            # Print predictions
            print("\n2025 Canadian Grand Prix - Predicted Top 10:")
            print("\nPos  Driver                  Team                  Predicted Avg Lap")
            print("-" * 65)

            for pos, (driver_code, avg_time) in enumerate(sorted_drivers[:10], 1):
                driver_info = self.drivers_2025[driver_code]
                print(f"{pos:2d}.  {driver_info['name']:<20s} {driver_info['team']:<20s} {avg_time:.3f}s")

            return sorted_drivers

        except Exception as e:
            print(f"Error predicting 2025 race results: {e}")
            return None



In [13]:
def main():
    """Run the pipeline end to end: load, preprocess, train, predict.

    Prints an error and returns early when loading or preprocessing yields
    ``None``. Models are trained on the first ``TimeSeriesSplit`` fold only.
    """
    gp_predictor = CanadianGPPredictor()

    # Historical laps
    print("Loading historical data...")
    historical_laps = gp_predictor.load_historical_data()
    if historical_laps is None:
        print("Error: Could not load historical data")
        return

    # Feature engineering
    print("\nPreprocessing data...")
    model_frame = gp_predictor.preprocess_data(historical_laps)
    if model_frame is None:
        print("Error: Data preprocessing failed")
        return

    # Separate the target from the features
    target = model_frame['LapTime_seconds']
    features = model_frame.drop(columns=['LapTime_seconds'])

    # Only the first of the five folds is used for now
    fold_splitter = TimeSeriesSplit(n_splits=5)
    train_rows, val_rows = next(fold_splitter.split(features))
    gp_predictor.train_models(
        features.iloc[train_rows],
        target.iloc[train_rows],
        features.iloc[val_rows],
        target.iloc[val_rows],
    )

    # Race prediction
    print("\nGenerating predictions for 2025 Canadian Grand Prix...")
    gp_predictor.predict_2025_canadian_gp()


if __name__ == "__main__":
    main()

core           INFO 	Loading data for Canadian Grand Prix - Race [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for Canadian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info


Loading historical data...

Loading 2024 data...


req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO:fastf1.fastf1.req:Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
INFO:fastf1.fastf1.req:Using cached data for timing_app_data
core           INFO 	Processing timing data...
INFO:fastf1.fastf1.core:Processing timing data...
req            INFO 	Using cached data for car_data
INFO:fastf1.fastf1.req:Using cached data for car_data
req            INFO 	Using cached data for position_data
INFO:fastf1.fastf1.req:Using cached data for position_data
req            INFO 	Using cached data for weather_data
INFO:fastf1.f

No sprint data for 2024: Session type 'S' does not exist for this event

Loading 2023 data...


req            INFO 	Using cached data for car_data
INFO:fastf1.fastf1.req:Using cached data for car_data
req            INFO 	Using cached data for position_data
INFO:fastf1.fastf1.req:Using cached data for position_data
req            INFO 	Using cached data for weather_data
INFO:fastf1.fastf1.req:Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
INFO:fastf1.fastf1.req:Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '14', '44', '16', '55', '11', '23', '31', '18', '77', '81', '10', '4', '22', '27', '24', '20', '21', '63', '2']
INFO:fastf1.fastf1.core:Finished loading data for 20 drivers: ['1', '14', '44', '16', '55', '11', '23', '31', '18', '77', '81', '10', '4', '22', '27', '24', '20', '21', '63', '2']
core           INFO 	Loading data for Canadian Grand Prix - Qualifying [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for Canadian Grand Prix - Qualifying [v3.5.3]
req         

No sprint data for 2023: Session type 'S' does not exist for this event

Loading 2022 data...


req            INFO 	Using cached data for car_data
INFO:fastf1.fastf1.req:Using cached data for car_data
req            INFO 	Using cached data for position_data
INFO:fastf1.fastf1.req:Using cached data for position_data
req            INFO 	Using cached data for weather_data
INFO:fastf1.fastf1.req:Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
INFO:fastf1.fastf1.req:Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '55', '44', '63', '16', '31', '77', '24', '14', '18', '3', '5', '23', '10', '4', '6', '20', '22', '47', '11']
INFO:fastf1.fastf1.core:Finished loading data for 20 drivers: ['1', '55', '44', '63', '16', '31', '77', '24', '14', '18', '3', '5', '23', '10', '4', '6', '20', '22', '47', '11']
core           INFO 	Loading data for Canadian Grand Prix - Qualifying [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for Canadian Grand Prix - Qualifying [v3.5.3]
req            I

No sprint data for 2022: Session type 'S' does not exist for this event

Unique tire compounds found: ['INTERMEDIATE' 'MEDIUM' 'HARD' 'WET' 'SOFT']

Preprocessing data...

Starting data preprocessing...
Initial columns: ['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest', 'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime', 'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate', 'Session', 'Year', 'TirePerformance', 'TrackEvolution', 'BrakeWear']

Processing categorical column: Driver
Unique values in Driver: ['VER' 'GAS' 'PER' 'ALO' 'LEC' 'STR' 'SAR' 'MAG' 'TSU' 'ALB' 'ZHO' 'HUL'
 'RIC' 'OCO' 'NOR' 'HAM' 'SAI' 'RUS' 'BOT' 'PIA' 'DEV' 'MSC' 'VET' 'LAT']
Successfully created dummies for Driver

Processing categorical col

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training lightgbm model...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 0.348124
lightgbm Results:
Train MAE: 0.145, Train R2: 0.997
Val MAE: 2.370, Val R2: 0.908

Training xgboost model...
Error training xgboost model: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

Training catboost model...
catboost Results:
Train MAE: 0.409, Train R2: 0.998
Val MAE: 4.580, Val R2: 0.732

Training neural_net model...
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
neural_net Results:
Train MAE: 1.841, Train R2: 0.963
Val MAE: 4.543, Val R2: 0.779

Generating predictions for 2025 Canadian Grand Prix...

Predicting 2025 Canadian Grand Prix Results...

Starting data preprocessing...
Initial columns: ['TrackTemp', 'AirTemp', 'Humidity', 'WeatherCondition', 'Session', 'Year', 'LapNumber', 'Sti



[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

2025 Canadian Grand Prix - Predicted Top 10:

Pos  Driver                  Team                  Predicted Avg Lap
-----------------------------------------------------------------
 1.  Alexander Albon      Williams             125.252s
 2.  Max Verstappen       Red Bull Racing      126.000s
 3.  Esteban Ocon         Alpine               126.007s
 4.  Carlos Sainz         Audi                 126.017s
 5.  Pierre Gasly         Alpine               126.151s
 6.  Lewis Hamilton       Ferrari              126.243s
 7.  Charles Leclerc      Ferrari              126.261s
 8.  Lando Norris         McLaren              126.307s
 9.  Lance Stroll         Aston Martin         126.428s
10.  Oscar Piastri        McLaren              126.489s
