In [13]:
import pandas as pd
import numpy as np
import joblib
import os
import fastf1
import logging
from datetime import datetime

# --- 1. Configuration ---

In [14]:
# Season being predicted, the directory holding the trained artifacts, the
# name of the prediction target, and the on-disk FastF1 cache location.
SEASON_TO_PREDICT = 2023
MODEL_ARTIFACTS_DIR = 'model_artifacts'
TARGET_COLUMN = 'FinalPoints'
CACHE_PATH_PREDICT = 'fastf1_cache_predict'

# Root logger at INFO with timestamped records; FastF1 is asked to log at
# WARNING level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
fastf1.set_log_level('WARNING')

# Create the cache directory if it is not there yet, then enable the FastF1
# cache on it. Any failure is logged and the notebook continues without
# stopping.
try:
    if not os.path.exists(CACHE_PATH_PREDICT):
        os.makedirs(CACHE_PATH_PREDICT)
    fastf1.Cache.enable_cache(CACHE_PATH_PREDICT)
    logging.info(f"FastF1 cache enabled at: {CACHE_PATH_PREDICT}")
except Exception as cache_err:
    logging.error(f"Error enabling FastF1 cache: {cache_err}")

2025-05-06 21:50:37,655 - INFO - FastF1 cache enabled at: fastf1_cache_predict


# --- 2. Load Model Artifacts and Input Data ---

In [5]:
# Load the trained model, the fitted preprocessor, and the two feature-name
# lists (raw input columns and processed model columns) from
# MODEL_ARTIFACTS_DIR.
#
# NOTE(security): joblib.load unpickles its input, which can execute
# arbitrary code. Only load artifact files that come from a trusted source.
model_path = os.path.join(MODEL_ARTIFACTS_DIR, 'trained_f1_model_1.joblib')
preprocessor_path = os.path.join(MODEL_ARTIFACTS_DIR, 'fitted_f1_preprocessor_1.joblib')
raw_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'raw_feature_cols_for_preprocessor_1.joblib')
processed_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'processed_feature_names_1.joblib')

try:
    rf_model_loaded = joblib.load(model_path)
    preprocessor_loaded = joblib.load(preprocessor_path)
    raw_feature_cols_for_input = joblib.load(raw_feature_names_path)
    processed_feature_names_for_model = joblib.load(processed_feature_names_path)
    print("Trained model, preprocessor, and feature name lists loaded.")
except FileNotFoundError as e:
    print(f"Error: Could not load model artifacts: {e}")
    # Every later cell needs these objects. Stop here with the real cause
    # instead of letting a later cell fail with an unrelated NameError.
    raise

Trained model, preprocessor, and feature name lists loaded.


In [17]:
# 2023 driver line-up as (DriverId, TeamId) pairs.
# The feature-construction and results cells both read `roster_2023_df`, so it
# must be defined here for the notebook to run on a fresh kernel.
# DriverId is matched against the 'DriverId' column of the standings data;
# TeamId is passed through as the 'TeamId' feature.
roster_2023 = [
    ('max_verstappen', 'red_bull'), ('perez', 'red_bull'),
    ('leclerc', 'ferrari'), ('sainz', 'ferrari'),
    ('russell', 'mercedes'), ('hamilton', 'mercedes'),
    ('norris', 'mclaren'), ('piastri', 'mclaren'),
    ('alonso', 'aston_martin'), ('stroll', 'aston_martin'),
    ('gasly', 'alpine'), ('ocon', 'alpine'),
    ('zhou', 'alfa'), ('bottas', 'alfa'),
    ('tsunoda', 'rb'), ('de_vries', 'rb'),
    ('albon', 'williams'), ('sargeant', 'williams'),
    ('hulkenberg', 'haas'), ('kevin_magnussen', 'haas')
]

roster_2023_df = pd.DataFrame(roster_2023, columns=['DriverId', 'TeamId'])

# Previous-season (2022) championship standings, used for the PrevSeason*
# features.
path_standings = 'csv/f1_standings_data_raw.csv'
try:
    standings_df = pd.read_csv(path_standings)
except FileNotFoundError as e:
    print(f"Error: Could not load standings data: {e}")
    # Without the standings the feature cell cannot run; fail here with the
    # real cause rather than with a NameError further down.
    raise

# Explicit copy so the 2022 slice is an independent frame, not a view of
# `standings_df`.
standings_df_2022 = standings_df[standings_df['Season'] == 2022].copy()

In [None]:
def _avg_quick_lap_seconds(session, driver_abbr):
    """Return the driver's mean quick-lap time in seconds, or NaN.

    NaN is returned when the driver has no quick laps, when no valid
    'LapTime' values remain after coercion, or when the lap lookup raises.
    Lookup failures are logged at DEBUG level instead of being swallowed.
    """
    try:
        # `pick_drivers` replaces the deprecated `pick_driver`; the captured
        # log shows FastF1 v3.5.3, where it is available.
        driver_laps = session.laps.pick_drivers(driver_abbr).pick_quicklaps()
        if driver_laps.empty or 'LapTime' not in driver_laps.columns:
            return np.nan
        valid_laps = pd.to_timedelta(driver_laps['LapTime'], errors='coerce').dropna()
        if valid_laps.empty:
            return np.nan
        return valid_laps.mean().total_seconds()
    except Exception as lap_err:
        logging.debug(f"Could not compute average lap time for {driver_abbr}: {lap_err}")
        return np.nan


def _is_ergast_error(err):
    """Return True if `err` is an instance of a class named 'ErgastError' (or a subclass).

    NOTE(review): the check is by class name because `fastf1.ErgastError`
    does not appear to be exported at the package top level -- confirm
    against the installed FastF1 version. If the attribute is missing,
    naming it in an `except` clause raises AttributeError while another
    exception is being handled, which would abort the whole race loop.
    """
    return any(cls.__name__ == 'ErgastError' for cls in type(err).__mro__)


# One dict per (race, driver) result row for the season so far.
all_2023_race_stats = []
completed_races_count = 0
# current_date = datetime.now().date()
current_date = datetime(2023, 5, 1).date()  # For testing purposes

try:
    schedule_2023 = fastf1.get_event_schedule(SEASON_TO_PREDICT, include_testing=False)

    # Keep events whose fifth session is the race and whose event date is
    # strictly before `current_date`, ordered by round.
    potential_race_events = schedule_2023[
        (schedule_2023['Session5'].str.lower() == 'race')
        & (pd.to_datetime(schedule_2023['EventDate']).dt.date < current_date)
    ]
    potential_race_events = potential_race_events.sort_values(by='RoundNumber')

    logging.info(
        f"Found {len(potential_race_events)} potential past race events in {SEASON_TO_PREDICT} schedule."
    )

    for _, event in potential_race_events.iterrows():
        logging.info(
            f"  Attempting to load Round {event['RoundNumber']} - {event['EventName']}..."
        )
        try:
            session = fastf1.get_session(SEASON_TO_PREDICT, event['RoundNumber'], 'R')
            session.load(weather=False, messages=False, telemetry=False)

            if session.results is None or session.results.empty:
                logging.warning(
                    f"    --> No results found for {session.event['EventName']}. Assuming not yet completed or data unavailable."
                )
                continue

            completed_races_count += 1
            logging.info(
                f"    -> Results found. Processing {len(session.results)} drivers."
            )
            for _, result in session.results.iterrows():
                driver_abbr = result["Abbreviation"]
                all_2023_race_stats.append({
                    "Season": SEASON_TO_PREDICT,
                    "Round": event['RoundNumber'],
                    "EventName": event['EventName'],
                    "Driver": driver_abbr,
                    "DriverId": result["DriverId"],
                    "TeamId": result["TeamId"],
                    # Non-numeric grid/finish positions become NaN.
                    "GridPosition": pd.to_numeric(result["GridPosition"], errors='coerce'),
                    "FinishPosition": pd.to_numeric(result["Position"], errors='coerce'),
                    "PointsGained": result["Points"],
                    "AvgLapTimeSec": _avg_quick_lap_seconds(session, driver_abbr),
                    "Status": result["Status"],
                })
        except Exception as e:
            # A failure for one race must not stop the remaining races.
            if _is_ergast_error(e):
                logging.warning(f"    --> Ergast/API error for {event['EventName']}: {e}. Skipping.")
            else:
                logging.warning(
                    f"    --> Error loading session for {event['EventName']}: {e}. Skipping."
                )

    if not all_2023_race_stats:
        logging.error(
            f"No completed race data found for {SEASON_TO_PREDICT}. Cannot proceed."
        )
        raise ValueError("No completed race data found.")

    race_df_2023_raw = pd.DataFrame(all_2023_race_stats)
    logging.info(
        f"Successfully collected data for {completed_races_count} completed races in {SEASON_TO_PREDICT}."
    )

except Exception as e:
    # Logged rather than re-raised; `race_df_2023_raw` stays undefined if
    # this branch is reached.
    logging.error(f"Could not fetch schedule or race data for {SEASON_TO_PREDICT}: {e}")

2025-05-06 21:54:59,553 - INFO - Found 4 potential past race events in 2023 schedule.
2025-05-06 21:54:59,554 - INFO -   Attempting to load Round 1 - Bahrain Grand Prix...
2025-05-06 21:54:59,563 - INFO - Loading data for Bahrain Grand Prix - Race [v3.5.3]
2025-05-06 21:54:59,564 - INFO - No cached data found for session_info. Loading data...
2025-05-06 21:54:59,565 - INFO - Fetching session info data...
2025-05-06 21:54:59,988 - INFO - Data has been written to cache!
2025-05-06 21:54:59,989 - INFO - No cached data found for driver_info. Loading data...
2025-05-06 21:54:59,989 - INFO - Fetching driver list...
2025-05-06 21:55:00,289 - INFO - Data has been written to cache!
2025-05-06 21:55:00,487 - INFO - No cached data found for session_status_data. Loading data...
2025-05-06 21:55:00,488 - INFO - Fetching session status data...
2025-05-06 21:55:00,894 - INFO - Data has been written to cache!
2025-05-06 21:55:00,895 - INFO - No cached data found for lap_count. Loading data...
2025-05-

# --- 2. Construct Feature Rows for Each 2023 Driver (Pre-Season State) ---

In [None]:
# Baseline values given to every in-season feature, since no 2023 race is
# taken into account for the pre-season rows.
default_finish_pos_median = 10.0
default_grid_pos_median = 10.0

# Window length used in the "...Last{N}" feature names (loop-invariant, so it
# is set once here rather than on every iteration).
window_size_fe = 5

# Fallbacks for drivers that have no row in the 2022 standings.
NO_HISTORY_RANK = 25
NO_HISTORY_POINTS = 0
NO_HISTORY_WINS = 0

X_2023_preseason_list = []
for _, driver_info in roster_2023_df.iterrows():
    driver_id = driver_info['DriverId']
    team_id = driver_info['TeamId']

    # Previous-season record: first matching standings row, if any.
    prev_season_stats = standings_df_2022[standings_df_2022['DriverId'] == driver_id]
    if prev_season_stats.empty:
        prev_rank = NO_HISTORY_RANK
        prev_points = NO_HISTORY_POINTS
        prev_wins = NO_HISTORY_WINS
    else:
        prev_rank = prev_season_stats['position'].iloc[0]
        prev_points = prev_season_stats['points'].iloc[0]
        prev_wins = prev_season_stats['wins'].iloc[0]

    driver_features = {
        'TeamId': team_id,
        'RaceCount': 0,
        'DidFinish': 1.0,
        'PitLaneStart': 0,
        'CumulativePoints': 0.0,
        'AvgPoints': 0.0,
        'AvgFinishPos': default_finish_pos_median,
        'AvgGridPos': default_grid_pos_median,
        'AvgPosGained': 0.0,
        'FinishRate': 1.0,
        'DNFCount': 0.0,
        'StdDevFinishPos': 0.0,
        'Wins': 0.0,
        'Podiums': 0.0,
        'Top10s': 0.0,
        f'PointsLast{window_size_fe}': 0.0,
        f'AvgFinishLast{window_size_fe}': default_finish_pos_median,
        f'AvgGridLast{window_size_fe}': default_grid_pos_median,
        f'FinishRateLast{window_size_fe}': 1.0,
        f'DNFCountLast{window_size_fe}': 0.0,
        'PrevSeasonRank': prev_rank,
        'PrevSeasonPoints': prev_points,
        'PrevSeasonWins': prev_wins
    }
    X_2023_preseason_list.append(driver_features)

X_2023_preseason_raw = pd.DataFrame(X_2023_preseason_list)

# Align with the column list saved alongside the preprocessor: any expected
# column not built above is added as NaN (with a warning), then the frame is
# restricted to that list, in that order.
for col in raw_feature_cols_for_input:
    if col not in X_2023_preseason_raw.columns:
        print(f"Warning: Column '{col}' missing from constructed features. Adding with NaN.")
        X_2023_preseason_raw[col] = np.nan
X_2023_preseason_raw = X_2023_preseason_raw[raw_feature_cols_for_input]


print(f"Raw pre-season features for {SEASON_TO_PREDICT} (shape: {X_2023_preseason_raw.shape}):")
X_2023_preseason_raw

Raw pre-season features for 2023 (shape: (20, 23)):


Unnamed: 0,TeamId,RaceCount,DidFinish,PitLaneStart,CumulativePoints,AvgPoints,AvgFinishPos,AvgGridPos,AvgPosGained,FinishRate,...,Podiums,Top10s,PointsLast5,AvgFinishLast5,AvgGridLast5,FinishRateLast5,DNFCountLast5,PrevSeasonRank,PrevSeasonPoints,PrevSeasonWins
0,red_bull,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,1,454.0,15
1,red_bull,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,3,305.0,2
2,ferrari,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,2,308.0,3
3,ferrari,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,5,246.0,1
4,mercedes,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,4,275.0,1
5,mercedes,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,6,240.0,0
6,mclaren,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,7,122.0,0
7,mclaren,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,25,0.0,0
8,aston_martin,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,9,81.0,0
9,aston_martin,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,15,18.0,0


# --- 3. Preprocess and Predict ---

In [10]:
# Transform the raw pre-season rows with the loaded preprocessor, then wrap
# the result in a DataFrame labelled with the processed feature names and
# carrying the raw frame's row index.
print(f"\n--- Preprocessing {SEASON_TO_PREDICT} Pre-Season Data ---")
X_2023_preseason_processed_array = preprocessor_loaded.transform(X_2023_preseason_raw)
X_2023_preseason_predict = pd.DataFrame(
    X_2023_preseason_processed_array,
    index=X_2023_preseason_raw.index,
    columns=processed_feature_names_for_model,
)

# Run the loaded model on the processed rows.
# NOTE(review): presumably returns one value per input row -- confirm.
print(f"\n--- Predicting Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} (Pre-Season) ---")
preseason_predictions_values = rf_model_loaded.predict(X_2023_preseason_predict)


--- Preprocessing 2023 Pre-Season Data ---

--- Predicting Final FinalPoints for 2023 (Pre-Season) ---


# --- 4. Display Results ---

In [None]:
# Attach the predictions to a copy of the roster and rank drivers from the
# highest to the lowest predicted value; `roster_2023_df` is left unchanged.
prediction_col = f'Predicted_{TARGET_COLUMN}_PreSeason'
predictions_2023_preseason_df = (
    roster_2023_df
    .assign(**{prediction_col: preseason_predictions_values})
    .sort_values(by=prediction_col, ascending=False)
)

print(f"\nPredicted Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} (Pure Pre-Season Estimate):")
print(predictions_2023_preseason_df)
print("\nNOTE: These predictions are based SOLELY on 2020-2022 performance and 2023 team affiliations,")
print("with all 2023 in-season performance metrics set to a 'start of season' baseline.")


Predicted Final FinalPoints for 2023 (Pure Pre-Season Estimate):
           DriverId        TeamId  Predicted_FinalPoints_PreSeason
0    max_verstappen      red_bull                       384.202558
1             perez      red_bull                       360.317910
2           leclerc       ferrari                       306.665119
4           russell      mercedes                       216.652380
3             sainz       ferrari                       207.600159
5          hamilton      mercedes                       192.279291
11             ocon        alpine                       148.914434
8            alonso  aston_martin                       133.437893
6            norris       mclaren                       119.402761
13           bottas          alfa                       116.860278
7           piastri       mclaren                        96.744683
10            gasly        alpine                        77.129021
15         de_vries            rb                        58.305