In [1]:
import pandas as pd
import numpy as np
import joblib
import os
import fastf1
import logging

# --- 1. Configuration ---

In [2]:
# Season whose end-of-year result is being predicted.
SEASON_TO_PREDICT = 2025
# Directory from which the joblib artifacts are loaded further down.
MODEL_ARTIFACTS_DIR = 'model_artifacts'
# Target name; used below to label the prediction column and messages.
TARGET_COLUMN = 'FinalPoints'
# On-disk location handed to the FastF1 cache.
CACHE_PATH_PREDICT = 'fastf1_cache_predict'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Only show FastF1 messages at WARNING level or above.
fastf1.set_log_level('WARNING')

try:
    # exist_ok=True creates the directory only when it is missing, without a
    # separate existence check that could race with another process.
    os.makedirs(CACHE_PATH_PREDICT, exist_ok=True)
    fastf1.Cache.enable_cache(CACHE_PATH_PREDICT)
    logging.info(f"FastF1 cache enabled at: {CACHE_PATH_PREDICT}")
except Exception as e:
    # A cache failure is logged and the notebook continues without raising.
    logging.error(f"Error enabling FastF1 cache: {e}")

2025-05-07 23:01:24,678 - INFO - FastF1 cache enabled at: fastf1_cache_predict


# --- 2. Loading the files ---

In [3]:
# Load the four artifacts stored under MODEL_ARTIFACTS_DIR: the trained model,
# the fitted preprocessor, and the raw / processed feature-name lists.
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifact
# files that come from a trusted source.
try:
    model_path = os.path.join(MODEL_ARTIFACTS_DIR, 'trained_f1_model_xgb_final.joblib')
    # The variable keeps its historical "rf_" prefix because later cells
    # reference it by this name; the file loaded here is the "xgb" artifact.
    rf_model_loaded = joblib.load(model_path)

    preprocessor_path = os.path.join(MODEL_ARTIFACTS_DIR, 'fitted_f1_preprocessor_xgb_final.joblib')
    preprocessor_loaded = joblib.load(preprocessor_path)

    raw_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'raw_feature_cols_for_preprocessor_xgb_final.joblib')
    raw_feature_cols_for_input = joblib.load(raw_feature_names_path)

    processed_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'processed_feature_names_xgb_final.joblib')
    processed_feature_names_for_model = joblib.load(processed_feature_names_path)
    print("Trained model, preprocessor, and feature name lists loaded.")
except FileNotFoundError as e:
    print(f"Error: Could not load model artifacts: {e}")
    # Every later cell needs these objects. Re-raise so the run stops here
    # with the real cause instead of a NameError further down.
    raise

Trained model, preprocessor, and feature name lists loaded.


In [None]:
# Read the raw standings CSV and keep only the rows whose Season is 2024.
# NOTE(review): 2024 is presumably "the season before SEASON_TO_PREDICT" -
# confirm before changing either value.
path_standings = 'csv/f1_standings_data_raw.csv'
try:
    standings_df = pd.read_csv(path_standings)
except FileNotFoundError as e:
    # A missing file is reported and the cell finishes without raising.
    print(f"Error: Could not load standings data: {e}")
else:
    is_2024 = standings_df['Season'] == 2024
    standings_df_predict = standings_df.loc[is_2024]

In [6]:
# Read the engineered feature table, then keep only the rows that belong to
# the season being predicted.
try:
    engineered_df = pd.read_csv('csv/f1_engineered_features.csv')
    print("Data loaded successfully.")
    print(f"Race DF shape: {engineered_df.shape}")
except FileNotFoundError:
    print("Error: CSV files not found. Make sure you ran the data collection script.")
    # Without the CSV the filter below would fail with a NameError; re-raise
    # so the reported failure is the missing file itself.
    raise

engineered_df_raw = engineered_df[engineered_df['Season'] == SEASON_TO_PREDICT]

engineered_df_raw

Data loaded successfully.
Race DF shape: (2896, 29)


Unnamed: 0,Season,Round,EventName,DriverId,TeamId,RaceCount,DidFinish,PitLaneStart,CumulativePoints,AvgPoints,...,PointsLast5,AvgFinishLast5,AvgGridLast5,FinishRateLast5,DNFCountLast5,PrevSeasonRank,PrevSeasonPoints,PrevSeasonWins,FinalRank,FinalPoints
2816,2025,1,Australian Grand Prix,albon,williams,0,1,0,0.0,0.000000,...,0.0,10.000000,11.000000,1.000000,0.0,16.0,12.0,0.0,8,30.0
2817,2025,1,Australian Grand Prix,alonso,aston_martin,0,0,0,0.0,0.000000,...,0.0,10.000000,11.000000,1.000000,0.0,9.0,70.0,0.0,17,0.0
2818,2025,1,Australian Grand Prix,antonelli,mercedes,0,1,0,0.0,0.000000,...,0.0,10.000000,11.000000,1.000000,0.0,25.0,0.0,0.0,6,48.0
2819,2025,1,Australian Grand Prix,bearman,haas,0,1,0,0.0,0.000000,...,0.0,10.000000,11.000000,1.000000,0.0,18.0,7.0,0.0,15,6.0
2820,2025,1,Australian Grand Prix,bortoleto,sauber,0,0,0,0.0,0.000000,...,0.0,10.000000,11.000000,1.000000,0.0,25.0,0.0,0.0,20,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2891,2025,5,Saudi Arabian Grand Prix,piastri,mclaren,3,1,0,42.0,14.000000,...,42.0,4.333333,2.000000,1.000000,0.0,4.0,292.0,2.0,1,131.0
2892,2025,5,Saudi Arabian Grand Prix,russell,mercedes,3,1,0,43.0,14.333333,...,43.0,3.333333,4.000000,1.000000,0.0,6.0,245.0,2.0,4,93.0
2893,2025,5,Saudi Arabian Grand Prix,sainz,williams,3,1,0,0.0,0.000000,...,0.0,17.000000,11.000000,0.333333,2.0,5.0,290.0,2.0,13,7.0
2894,2025,5,Saudi Arabian Grand Prix,stroll,aston_martin,3,0,0,8.0,2.666667,...,8.0,14.333333,17.333333,0.666667,1.0,13.0,24.0,0.0,10,14.0


# --- 3. Prepare 2025 Season Data for Prediction ---

In [7]:
# Build the raw feature matrix: one row per driver, taken from that driver's
# last row for SEASON_TO_PREDICT in engineered_df_raw.
# NOTE(review): tail(1) picks the last row in frame order - this assumes the
# rows are already in chronological (Round) order; confirm upstream.
latest_rows_per_driver = (
    engineered_df_raw[engineered_df_raw['Season'] == SEASON_TO_PREDICT]
    .groupby(['Season', 'DriverId'])
    .tail(1)
)

# Report every feature the preprocessor expects but the data does not provide.
for col in raw_feature_cols_for_input:
    if col not in latest_rows_per_driver.columns:
        logging.warning(f"Column '{col}' missing from {SEASON_TO_PREDICT} features. Adding with NaN.")

# reindex selects the expected columns in the expected order and fills any
# missing one with NaN, returning a new frame (no assignment into a slice).
X_for_prediction_raw = latest_rows_per_driver.reindex(columns=raw_feature_cols_for_input)

if X_for_prediction_raw.empty:
    message = (
        f"No data available for prediction for season {SEASON_TO_PREDICT} "
        "after feature engineering. Check data and filtering."
    )
    logging.error(message)
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down rather than reporting a failure in this cell.
    raise ValueError(message)
print(f"Raw features for {SEASON_TO_PREDICT} prediction (shape: {X_for_prediction_raw.shape}):")
X_for_prediction_raw

Raw features for 2025 prediction (shape: (20, 23)):


Unnamed: 0,TeamId,RaceCount,DidFinish,PitLaneStart,CumulativePoints,AvgPoints,AvgFinishPos,AvgGridPos,AvgPosGained,FinishRate,...,Podiums,Top10s,PointsLast5,AvgFinishLast5,AvgGridLast5,FinishRateLast5,DNFCountLast5,PrevSeasonRank,PrevSeasonPoints,PrevSeasonWins
2876,williams,3,1,0,12.0,4.0,8.666667,10.0,1.333333,1.0,...,0.0,2.0,12.0,8.666667,10.0,1.0,0.0,16.0,12.0,0.0
2877,aston_martin,3,1,0,0.0,0.0,14.333333,12.333333,-2.0,0.666667,...,0.0,0.0,0.0,14.333333,12.333333,0.666667,1.0,9.0,70.0,0.0
2878,mercedes,3,1,0,20.0,6.666667,7.0,9.0,2.0,1.0,...,0.0,2.0,20.0,7.0,9.0,1.0,0.0,25.0,0.0,0.0
2879,haas,3,1,0,2.0,0.666667,11.333333,16.666667,5.333333,1.0,...,0.0,2.0,2.0,11.333333,16.666667,1.0,0.0,18.0,7.0,0.0
2880,sauber,3,0,0,0.0,0.0,17.666667,16.666667,-1.0,0.666667,...,0.0,0.0,0.0,17.666667,16.666667,0.666667,1.0,25.0,0.0,0.0
2881,alpine,3,0,0,0.0,0.0,16.0,14.666667,-1.333333,0.666667,...,0.0,0.0,0.0,16.0,14.666667,0.666667,1.0,24.0,0.0,0.0
2882,alpine,3,0,0,6.0,2.0,10.333333,8.0,-2.333333,1.0,...,0.0,1.0,6.0,10.333333,8.0,1.0,0.0,10.0,42.0,0.0
2883,rb,3,1,0,4.0,1.333333,13.666667,10.0,-3.666667,0.666667,...,0.0,1.0,4.0,13.666667,10.0,0.666667,1.0,25.0,0.0,0.0
2884,ferrari,3,1,0,17.0,5.666667,7.333333,8.333333,1.0,1.0,...,0.0,3.0,17.0,7.333333,8.333333,1.0,0.0,7.0,223.0,2.0
2885,sauber,3,0,0,6.0,2.0,14.333333,16.333333,2.0,0.666667,...,0.0,1.0,6.0,14.333333,16.333333,0.666667,1.0,11.0,41.0,0.0


# --- 3. Preprocess and Predict ---

In [8]:
# Step 1: push the raw feature rows through the loaded preprocessor.
print(f"\n--- Preprocessing {SEASON_TO_PREDICT} Data ---")
X_processed_array = preprocessor_loaded.transform(X_for_prediction_raw)

# Wrap the transformed values in a DataFrame that carries the processed
# feature names as columns and keeps the row index of the raw input.
X_predict = pd.DataFrame(
    data=X_processed_array,
    index=X_for_prediction_raw.index,
    columns=processed_feature_names_for_model,
)

# Step 2: ask the loaded model for one prediction per row.
print(f"\n--- Predicting Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} ---")
predictions_values = rf_model_loaded.predict(X_predict)


--- Preprocessing 2025 Data ---

--- Predicting Final FinalPoints for 2025 ---


# --- 4. Display Results ---

In [10]:
# Look up driver and team labels for exactly the rows that were predicted on.
identifiers = engineered_df_raw.loc[X_for_prediction_raw.index, ['DriverId', 'TeamId']].copy()

# Attach the predictions positionally (after resetting the index) and rank
# the table from highest to lowest predicted value.
predicted_col = f'Predicted_{TARGET_COLUMN}'
predictions_df = (
    identifiers
    .reset_index(drop=True)
    .assign(**{predicted_col: predictions_values})
    .sort_values(by=predicted_col, ascending=False)
)

print(f"\nPredicted Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} (based on data from completed races):")
predictions_df


Predicted Final FinalPoints for 2025 (based on data from completed races):


Unnamed: 0,DriverId,TeamId,Predicted_FinalPoints
2,antonelli,mercedes,167.522659
15,piastri,mclaren,160.060333
16,russell,mercedes,156.363266
13,norris,mclaren,141.088547
12,max_verstappen,red_bull,109.018852
11,leclerc,ferrari,96.038483
19,tsunoda,red_bull,73.999611
8,hamilton,ferrari,68.543747
6,gasly,alpine,39.559521
0,albon,williams,38.055183
