In [56]:
import pandas as pd
import numpy as np
import joblib
import os
import fastf1
import logging
import ast

# --- 1. Configuration ---

In [57]:
# Notebook-wide settings: the season being forecast, the model's target column,
# and the on-disk locations for model artifacts and the FastF1 cache.
SEASON_TO_PREDICT = 2025
TARGET_COLUMN = 'FinalPoints'
MODEL_ARTIFACTS_DIR = 'model_artifacts'
CACHE_PATH_PREDICT = 'fastf1_cache_predict'

# Timestamped INFO-level logging for this notebook; FastF1's own log level is
# set to WARNING.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
fastf1.set_log_level('WARNING')

# Create the cache directory on first use, then point FastF1 at it. Any failure
# is logged rather than raised, so the notebook keeps running without the cache.
try:
    if not os.path.exists(CACHE_PATH_PREDICT):
        os.makedirs(CACHE_PATH_PREDICT)
    fastf1.Cache.enable_cache(CACHE_PATH_PREDICT)
except Exception as exc:
    logging.error(f"Error enabling FastF1 cache: {exc}")
else:
    logging.info(f"FastF1 cache enabled at: {CACHE_PATH_PREDICT}")

2025-05-07 23:59:53,821 - INFO - FastF1 cache enabled at: fastf1_cache_predict


# --- 2. Loading the files ---

In [58]:
# Load the four joblib artifacts stored under MODEL_ARTIFACTS_DIR: the trained
# model, the fitted preprocessor, the raw feature-column list fed to the
# preprocessor, and the processed feature names fed to the model.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# artifacts that come from a trusted source.
try:
    model_path = os.path.join(MODEL_ARTIFACTS_DIR, 'trained_f1_model_xgb_final.joblib')
    # Variable name kept as-is because later cells reference it.
    rf_model_loaded = joblib.load(model_path)

    preprocessor_path = os.path.join(MODEL_ARTIFACTS_DIR, 'fitted_f1_preprocessor_xgb_final.joblib')
    preprocessor_loaded = joblib.load(preprocessor_path)

    raw_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'raw_feature_cols_for_preprocessor_xgb_final.joblib')
    raw_feature_cols_for_input = joblib.load(raw_feature_names_path)

    processed_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'processed_feature_names_xgb_final.joblib')
    processed_feature_names_for_model = joblib.load(processed_feature_names_path)
    print("Trained model, preprocessor, and feature name lists loaded.")
except FileNotFoundError as e:
    print(f"Error: Could not load model artifacts: {e}")
    # Re-raise so execution stops here. Continuing would either fail later with
    # a NameError or silently reuse stale objects left in the kernel by an
    # earlier run.
    raise

Trained model, preprocessor, and feature name lists loaded.


In [59]:
# The roster is read from the file for the season being predicted; previous-season
# results are read from the file for the season before it. Both paths are derived
# from SEASON_TO_PREDICT so they stay in step with the 'Season' filter below.
path_roster = f'csv/f1_standings_data_raw_{SEASON_TO_PREDICT}.csv'
path_standings = f'csv/f1_standings_data_raw_{SEASON_TO_PREDICT - 1}.csv'
try:
    roster = pd.read_csv(path_roster)
    standings = pd.read_csv(path_standings)
    # Keep only the rows that belong to the previous season.
    standings = standings[standings['Season'] == SEASON_TO_PREDICT - 1]
except FileNotFoundError as e:
    print(f"Error: Could not load standings data: {e}")
    # Re-raise so execution stops here instead of failing later with a NameError
    # or silently reusing frames left in the kernel by an earlier run.
    raise

if standings.empty:
    # With no previous-season rows, every driver lookup downstream comes back empty.
    logging.warning(
        f"No standings rows found for season {SEASON_TO_PREDICT - 1} in {path_standings}; "
        "previous-season features will fall back to defaults."
    )
    


# --- 2. Construct Feature Rows for Each 2025 Driver (Pre-Season State) ---

In [60]:
# Build one feature row per roster driver describing the state before any race
# of the season has been run. Season-to-date and rolling-window features take
# the same placeholder values for everyone; only TeamId and the previous-season
# results differ between drivers.
default_finish_pos_median = 10.0
default_grid_pos_median = 10.0
window_size_fe = 5

# Placeholder values shared by every driver (zero races completed so far).
zero_race_features = {
    'RaceCount': 0,
    'DidFinish': 1.0,
    'PitLaneStart': 0,
    'CumulativePoints': 0.0,
    'AvgPoints': 0.0,
    'AvgFinishPos': default_finish_pos_median,
    'AvgGridPos': default_grid_pos_median,
    'AvgPosGained': 0.0,
    'FinishRate': 1.0,
    'DNFCount': 0.0,
    'StdDevFinishPos': 0.0,
    'Wins': 0.0,
    'Podiums': 0.0,
    'Top10s': 0.0,
    f'PointsLast{window_size_fe}': 0.0,
    f'AvgFinishLast{window_size_fe}': default_finish_pos_median,
    f'AvgGridLast{window_size_fe}': default_grid_pos_median,
    f'FinishRateLast{window_size_fe}': 1.0,
    f'DNFCountLast{window_size_fe}': 0.0,
}

X_preseason_list = []
for _, driver_info in roster.iterrows():
    driver_id = driver_info['driverId']
    # 'constructorIds' holds a stringified list; its last entry is used as the team.
    team_id = ast.literal_eval(driver_info['constructorIds'])[-1]

    prev_season_stats = standings[standings['DriverId'] == driver_id]
    if prev_season_stats.empty:
        # Driver has no row in the previous-season standings:
        # fall back to rank 25 with no points and no wins.
        prev_rank, prev_points, prev_wins = 25, 0, 0
    else:
        prev_rank = prev_season_stats['position'].iloc[0]
        prev_points = prev_season_stats['points'].iloc[0]
        prev_wins = prev_season_stats['wins'].iloc[0]

    driver_features = {
        'TeamId': team_id,
        **zero_race_features,
        'PrevSeasonRank': prev_rank,
        'PrevSeasonPoints': prev_points,
        'PrevSeasonWins': prev_wins,
    }
    X_preseason_list.append(driver_features)

X_preseason_raw = pd.DataFrame(X_preseason_list)

# Make sure every column the preprocessor expects is present (filled with NaN
# when it was not constructed above), then select them in the expected order.
for col in raw_feature_cols_for_input:
    if col in X_preseason_raw.columns:
        continue
    print(f"Warning: Column '{col}' missing from constructed features. Adding with NaN.")
    X_preseason_raw[col] = np.nan
X_preseason_raw = X_preseason_raw[raw_feature_cols_for_input]


print(f"Raw pre-season features for {SEASON_TO_PREDICT} (shape: {X_preseason_raw.shape}):")
X_preseason_raw

Raw pre-season features for 2025 (shape: (20, 23)):


Unnamed: 0,TeamId,RaceCount,DidFinish,PitLaneStart,CumulativePoints,AvgPoints,AvgFinishPos,AvgGridPos,AvgPosGained,FinishRate,...,Podiums,Top10s,PointsLast5,AvgFinishLast5,AvgGridLast5,FinishRateLast5,DNFCountLast5,PrevSeasonRank,PrevSeasonPoints,PrevSeasonWins
0,mclaren,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,4,292.0,2
1,mclaren,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,2,374.0,4
2,red_bull,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,1,437.0,9
3,mercedes,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,6,245.0,2
4,ferrari,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,3,356.0,3
5,mercedes,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,25,0.0,0
6,ferrari,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,7,223.0,2
7,williams,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,16,12.0,0
8,haas,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,14,23.0,0
9,aston_martin,0,1.0,0,0.0,0.0,10.0,10.0,0.0,1.0,...,0.0,0.0,0.0,10.0,10.0,1.0,0.0,13,24.0,0


# --- 3. Preprocess and Predict ---

In [61]:
print(f"\n--- Preprocessing {SEASON_TO_PREDICT} Pre-Season Data ---")
# Apply the already-fitted preprocessor (transform only) to the raw feature rows.
X_preseason_processed_array = preprocessor_loaded.transform(X_preseason_raw)
# Re-wrap the transformed array as a DataFrame labelled with the processed
# feature names and carrying the raw frame's row index.
X_preseason_predict = pd.DataFrame(
    data=X_preseason_processed_array,
    index=X_preseason_raw.index,
    columns=processed_feature_names_for_model,
)

print(f"\n--- Predicting Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} (Pre-Season) ---")
# One predicted value per row of X_preseason_predict.
preseason_predictions_values = rf_model_loaded.predict(X_preseason_predict)


--- Preprocessing 2025 Pre-Season Data ---

--- Predicting Final FinalPoints for 2025 (Pre-Season) ---


# --- 4. Display Results ---

In [62]:
# Attach the predictions to the roster's identifying columns and rank drivers
# from highest to lowest predicted value.
prediction_col = f'Predicted_{TARGET_COLUMN}_PreSeason'

predictions_preseason_df = roster[['driverId', 'givenName', 'constructorIds', 'constructorNames']].copy()
predictions_preseason_df[prediction_col] = preseason_predictions_values
predictions_preseason_df = predictions_preseason_df.sort_values(by=prediction_col, ascending=False)

# Three-column view with reader-friendly headers. 'Team' carries the raw
# 'constructorIds' value from the roster unchanged.
display_columns = ['driverId', 'constructorIds', prediction_col]
result_display = predictions_preseason_df[display_columns].rename(
    columns={
        'driverId': 'Driver',
        'constructorIds': 'Team',
        prediction_col: 'Predicted Points',
    }
)

print(f"\nPredicted Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} (Pure Pre-Season Estimate):")
result_display


Predicted Final FinalPoints for 2025 (Pure Pre-Season Estimate):


Unnamed: 0,Driver,Team,Predicted Points
2,max_verstappen,['red_bull'],430.416473
5,antonelli,['mercedes'],277.184143
1,norris,['mclaren'],254.394241
4,leclerc,['ferrari'],252.755768
6,hamilton,['ferrari'],215.814468
3,russell,['mercedes'],195.070923
10,tsunoda,"['rb', 'red_bull']",167.3246
0,piastri,['mclaren'],142.128647
16,alonso,['aston_martin'],78.533112
11,gasly,['alpine'],66.102554
