In [29]:
import pandas as pd
import numpy as np
import joblib
import os
import fastf1
import logging

# --- 1. Configuration ---

In [20]:
# Season whose final standings are predicted further down in the notebook.
SEASON_TO_PREDICT = 2023
# Directory the joblib artifacts (model, preprocessor, feature lists) are read from.
MODEL_ARTIFACTS_DIR = 'model_artifacts'
# Target name; used to label the prediction column and the printed messages.
TARGET_COLUMN = 'FinalPoints'
# Directory handed to FastF1 as its on-disk cache location.
CACHE_PATH_PREDICT = 'fastf1_cache_predict'

logging.basicConfig(
    level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s'
)

# Restrict FastF1's own log output to the 'WARNING' level.
fastf1.set_log_level(
    'WARNING'
)

try:
    # exist_ok=True creates the directory only when it is missing and avoids
    # the check-then-create race of a separate os.path.exists() test.
    os.makedirs(CACHE_PATH_PREDICT, exist_ok=True)
    fastf1.Cache.enable_cache(CACHE_PATH_PREDICT)
    logging.info(f"FastF1 cache enabled at: {CACHE_PATH_PREDICT}")
except Exception as e:
    # Deliberately best-effort: a cache failure is logged and execution continues.
    logging.error(f"Error enabling FastF1 cache: {e}")

2025-05-06 21:57:51,719 - INFO - FastF1 cache enabled at: fastf1_cache_predict


# --- 2. Loading the files ---

In [21]:
# Load the four artifacts this notebook needs from MODEL_ARTIFACTS_DIR:
# the trained model, the fitted preprocessor, the raw input column list and
# the processed feature-name list.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code. Only load artifacts that come from a trusted source.
try:
    model_path = os.path.join(MODEL_ARTIFACTS_DIR, 'trained_f1_model_1.joblib')
    rf_model_loaded = joblib.load(model_path)

    preprocessor_path = os.path.join(MODEL_ARTIFACTS_DIR, 'fitted_f1_preprocessor_1.joblib')
    preprocessor_loaded = joblib.load(preprocessor_path)

    raw_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'raw_feature_cols_for_preprocessor_1.joblib')
    raw_feature_cols_for_input = joblib.load(raw_feature_names_path)

    processed_feature_names_path = os.path.join(MODEL_ARTIFACTS_DIR, 'processed_feature_names_1.joblib')
    processed_feature_names_for_model = joblib.load(processed_feature_names_path)
    print("Trained model, preprocessor, and feature name lists loaded.")
except FileNotFoundError as e:
    print(f"Error: Could not load model artifacts: {e}")
    # Re-raise so the run stops here. Swallowing the error would leave the
    # names above undefined (or stale from an earlier kernel session) and
    # later cells would fail with a confusing NameError or use old objects.
    raise

Trained model, preprocessor, and feature name lists loaded.


In [18]:
# Read the raw standings table and keep a view restricted to the 2022 season.
path_standings = 'csv/f1_standings_data_raw.csv'
try:
    standings_df = pd.read_csv(path_standings)
except FileNotFoundError as e:
    # A missing file is reported but does not stop the notebook.
    print(f"Error: Could not load standings data: {e}")
else:
    # Boolean-mask selection of the rows whose 'Season' equals 2022.
    standings_df_2022 = standings_df.loc[standings_df['Season'] == 2022]

In [33]:
# Last round (inclusive) whose rows are kept for prediction.
ROUND = 5 # !FOR TESTING ONLY

try:
    engineered_df = pd.read_csv('csv/f1_engineered_features_2023.csv')
    print("Data loaded successfully.")
    print(f"Race DF shape: {engineered_df.shape}")
except FileNotFoundError:
    print("Error: CSV files not found. Make sure you ran the data collection script.")
    # Re-raise: the filter below needs engineered_df, so continuing would
    # either hit a NameError or silently reuse a stale frame left in the kernel.
    raise

# Keep only the rows up to and including ROUND.
engineered_df_2023_raw = engineered_df[engineered_df['Round'] <= ROUND]

engineered_df_2023_raw

Data loaded successfully.
Race DF shape: (1756, 29)


Unnamed: 0,Season,Round,EventName,DriverId,TeamId,RaceCount,DidFinish,PitLaneStart,CumulativePoints,AvgPoints,...,PointsLast5,AvgFinishLast5,AvgGridLast5,FinishRateLast5,DNFCountLast5,PrevSeasonRank,PrevSeasonPoints,PrevSeasonWins,FinalRank,FinalPoints
0,2023,1,Bahrain Grand Prix,albon,williams,0,1,0,0.0,0.000000,...,0.0,10.0,10.0,1.0,0.0,25.0,0.0,0.0,13,27.0
1,2023,1,Bahrain Grand Prix,albon,williams,0,1,0,0.0,0.000000,...,0.0,10.0,10.0,1.0,0.0,25.0,0.0,0.0,13,27.0
2,2023,1,Bahrain Grand Prix,albon,williams,1,1,0,1.0,1.000000,...,1.0,10.0,15.0,1.0,0.0,25.0,0.0,0.0,13,27.0
3,2023,1,Bahrain Grand Prix,albon,williams,1,1,0,1.0,1.000000,...,1.0,10.0,15.0,1.0,0.0,25.0,0.0,0.0,13,27.0
4,2023,1,Bahrain Grand Prix,alonso,aston_martin,0,1,0,0.0,0.000000,...,0.0,10.0,10.0,1.0,0.0,25.0,0.0,0.0,4,206.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,2023,5,Miami Grand Prix,tsunoda,alphatauri,9,1,0,4.0,0.444444,...,4.0,10.2,11.4,1.0,0.0,25.0,0.0,0.0,14,17.0
396,2023,5,Miami Grand Prix,zhou,alfa,8,1,0,4.0,0.500000,...,4.0,13.8,15.0,0.6,2.0,25.0,0.0,0.0,18,6.0
397,2023,5,Miami Grand Prix,zhou,alfa,8,1,0,4.0,0.500000,...,4.0,13.8,15.0,0.6,2.0,25.0,0.0,0.0,18,6.0
398,2023,5,Miami Grand Prix,zhou,alfa,9,1,0,4.0,0.444444,...,4.0,14.4,15.6,0.6,2.0,25.0,0.0,0.0,18,6.0


# --- 3. Prepare 2023 Data for Prediction ---

In [35]:
# Build the raw model input: one row per driver (the last row of each
# (Season, DriverId) group, in the frame's existing row order), restricted
# to the season being predicted.
# NOTE(review): "last row" depends on the row order of engineered_df_2023_raw;
# presumably it is ordered by round — confirm against the data collection script.
latest_rows_per_driver = engineered_df_2023_raw.groupby(['Season', 'DriverId']).tail(1)
# Copy AFTER filtering so the result is an independent frame rather than a
# slice of another frame (avoids SettingWithCopyWarning on later writes).
X_2023_for_prediction_raw = latest_rows_per_driver[
    latest_rows_per_driver['Season'] == SEASON_TO_PREDICT
].copy()

if X_2023_for_prediction_raw.empty:
    empty_input_message = (
        f"No data available for prediction for season {SEASON_TO_PREDICT} after feature engineering. Check data and filtering."
    )
    logging.error(empty_input_message)
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception stops the run and keeps the session usable.
    raise ValueError(empty_input_message)

# Warn about every expected input column that the engineered data lacks.
for col in raw_feature_cols_for_input:
    if col not in X_2023_for_prediction_raw.columns:
        logging.warning(f"Column '{col}' missing from 2023 features. Adding with NaN.")

# reindex selects the expected columns in the expected order and creates any
# missing ones filled with NaN, in a single step.
X_2023_for_prediction_raw = X_2023_for_prediction_raw.reindex(columns=list(raw_feature_cols_for_input))

print(f"Raw features for {SEASON_TO_PREDICT} prediction (shape: {X_2023_for_prediction_raw.shape}):")
X_2023_for_prediction_raw

Raw features for 2023 prediction (shape: (20, 23)):


Unnamed: 0,TeamId,RaceCount,DidFinish,PitLaneStart,CumulativePoints,AvgPoints,AvgFinishPos,AvgGridPos,AvgPosGained,FinishRate,...,Podiums,Top10s,PointsLast5,AvgFinishLast5,AvgGridLast5,FinishRateLast5,DNFCountLast5,PrevSeasonRank,PrevSeasonPoints,PrevSeasonWins
323,williams,9,1,0,2.0,0.222222,14.888889,12.777778,-2.111111,0.555556,...,0.0,2.0,0.0,15.2,10.2,0.6,2.0,25.0,0.0,0.0
327,aston_martin,9,1,0,129.0,14.333333,3.222222,4.0,0.777778,1.0,...,7.0,9.0,69.0,3.4,4.4,1.0,0.0,25.0,0.0,0.0
331,alfa,9,1,0,8.0,0.888889,13.666667,14.0,0.333333,0.555556,...,0.0,2.0,0.0,14.2,14.8,0.6,2.0,25.0,0.0,0.0
335,alphatauri,9,1,0,0.0,0.0,16.0,17.222222,1.222222,0.555556,...,0.0,0.0,0.0,17.6,16.2,0.6,2.0,25.0,0.0,0.0
339,alpine,9,1,0,12.0,1.333333,10.888889,12.777778,1.888889,1.0,...,0.0,5.0,4.0,12.4,11.4,1.0,0.0,25.0,0.0,0.0
343,mercedes,9,1,0,100.0,11.111111,4.666667,6.333333,1.666667,1.0,...,2.0,9.0,60.0,4.4,5.8,1.0,0.0,25.0,0.0,0.0
347,haas,9,1,0,12.0,1.333333,13.0,12.444444,-0.555556,0.555556,...,0.0,2.0,12.0,12.6,14.4,0.6,2.0,25.0,0.0,0.0
351,haas,9,1,0,3.0,0.333333,12.888889,13.777778,0.888889,0.777778,...,0.0,3.0,1.0,14.0,12.8,1.0,0.0,25.0,0.0,0.0
355,ferrari,9,1,0,48.0,5.333333,11.666667,5.888889,-5.777778,0.555556,...,2.0,5.0,36.0,10.6,4.6,0.6,2.0,25.0,0.0,0.0
359,red_bull,9,1,0,200.0,22.222222,1.444444,5.222222,3.777778,1.0,...,9.0,9.0,112.0,1.4,3.0,1.0,0.0,25.0,0.0,0.0


# --- 3. Preprocess and Predict ---

In [37]:
# Step 1: run the fitted preprocessor over the raw feature rows.
print(f"\n--- Preprocessing {SEASON_TO_PREDICT} Data ---")
X_2023_processed_array = preprocessor_loaded.transform(X_2023_for_prediction_raw)

# Wrap the transformed values in a DataFrame that carries the processed
# feature names as columns and keeps the raw frame's index labels.
# NOTE(review): assumes transform() returns a dense 2-D array whose columns
# line up with processed_feature_names_for_model — TODO confirm.
X_2023_predict = pd.DataFrame(
    data=X_2023_processed_array,
    index=X_2023_for_prediction_raw.index,
    columns=processed_feature_names_for_model,
)

# Step 2: ask the loaded model for one prediction per row.
print(f"\n--- Predicting Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} ---")
predictions_2023_values = rf_model_loaded.predict(X_2023_predict)


--- Preprocessing 2023 Data ---

--- Predicting Final FinalPoints for 2023 ---


# --- 4. Display Results ---

In [39]:
# Pair each prediction with its driver/team and rank the drivers by predicted value.
# NOTE(review): the "_2025" variable names are kept for compatibility with any
# other cells that reference them, but the rows describe SEASON_TO_PREDICT.
identifiers_2025 = engineered_df_2023_raw.loc[X_2023_for_prediction_raw.index, ['DriverId', 'TeamId']].copy()
# Reset to a positional index so the prediction array is attached row by row
# in the same order as the rows that were fed to the model.
predictions_2025_df = identifiers_2025.reset_index(drop=True)
predictions_2025_df[f'Predicted_{TARGET_COLUMN}'] = predictions_2023_values
# Reassign instead of inplace=True so re-running the cell stays side-effect free.
predictions_2025_df = predictions_2025_df.sort_values(by=f'Predicted_{TARGET_COLUMN}', ascending=False)

# Number of distinct rounds present in the filtered data for the predicted
# season; replaces the placeholder text that was left in the message.
completed_races = engineered_df_2023_raw.loc[
    engineered_df_2023_raw['Season'] == SEASON_TO_PREDICT, 'Round'
].nunique()

print(f"\nPredicted Final {TARGET_COLUMN} for {SEASON_TO_PREDICT} (based on data from {completed_races} completed races):")
predictions_2025_df


Predicted Final FinalPoints for 2023 (based on data from lol completed races):


Unnamed: 0,DriverId,TeamId,Predicted_FinalPoints
5,hamilton,mercedes,249.345209
9,max_verstappen,red_bull,231.189709
12,perez,red_bull,224.584581
14,russell,mercedes,204.056141
1,alonso,aston_martin,153.44445
15,sainz,ferrari,143.555346
17,stroll,aston_martin,102.912897
8,leclerc,ferrari,98.952947
4,gasly,alpine,77.604056
11,ocon,alpine,77.474243
