In [2]:
import pandas as pd
import fastf1
import numpy as np

In [3]:
df = pd.read_parquet('../data/raw.parquet')
df

Unnamed: 0,season,round,event_name,circuit,driver,fp1_time,fp2_time,fp3_time,quali_time
0,2018,1,Australian Grand Prix,Melbourne,GRO,85.730,84.648,96.171,83.339
1,2018,1,Australian Grand Prix,Melbourne,HAR,87.745,85.925,95.438,84.532
2,2018,1,Australian Grand Prix,Melbourne,STR,86.636,85.543,95.828,84.230
3,2018,1,Australian Grand Prix,Melbourne,VAN,86.482,85.285,94.233,83.853
4,2018,1,Australian Grand Prix,Melbourne,ERI,87.964,86.814,88.890,84.556
...,...,...,...,...,...,...,...,...,...
2162,2025,11,Austrian Grand Prix,Spielberg,GAS,65.780,65.613,65.366,64.846
2163,2025,11,Austrian Grand Prix,Spielberg,BOR,65.874,65.411,65.182,64.846
2164,2025,11,Austrian Grand Prix,Spielberg,HAM,66.099,65.511,64.790,64.582
2165,2025,11,Austrian Grand Prix,Spielberg,STR,66.160,65.022,65.062,65.329


In [4]:
podium_dict = {}

for season in df['season'].unique():
    for rnd in df[df['season'] == season]['round'].unique():
        try:
            session = fastf1.get_session(season, rnd, 'R')
            session.load()
            results = session.results
            podium_drivers = set(results.iloc[:3]['Abbreviation']) 
            podium_dict[(season, rnd)] = podium_drivers
        except Exception as e:
            print(f"Error processing season {season}, round {rnd}: {e}")
            podium_dict[(season, rnd)] = set()
            user_input = input("Press enter to continue: ")







        


core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	No cached data found for position_data. Loading data...
_api           INFO 	Fetching position data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['5', '44', '7', '3', '14', '33', '27', '77', '2', '55', '11', '31', '16', '18', '28', '8', '20', '10', '9', 

In [5]:
def is_podium(row):
    return int(row['driver'] in podium_dict.get((row['season'], row['round']), set()))

df['podium_finish'] = df.apply(is_podium, axis=1)

In [10]:

selected_features = [
    'circuit', 'fp1_time', 'fp2_time', 'fp3_time', 'quali_time', 'podium_finish'
]
filtered_df = df[selected_features].copy()

# Add engineered features if desired
filtered_df['fp2_minus_fp1'] = filtered_df['fp2_time'] - filtered_df['fp1_time']
filtered_df['fp3_minus_fp2'] = filtered_df['fp3_time'] - filtered_df['fp2_time']

filtered_df.to_parquet('../data/processed.parquet')




