In [12]:
import fastf1
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

In [13]:
# CONFIGURATION AND METADATA
YEAR = 2025      
RACE = 'Abu Dhabi' 
SESSION = 'R' # 'R' for race session    
SPATIAL_RESOLUTION = 10 # meters
SELECTED_DRIVERS = ['VER', 'NOR', 'PIA', 'RUS', 'ANT', 'LEC', 'HAM']

fastf1.Cache.enable_cache('f1_cache')

print(f"1 - DATA EXTRACTION ({RACE} {YEAR})")
session = fastf1.get_session(YEAR, RACE, SESSION)
session.load()

# DATA PRE-PROCESSING AND FILTRATION
# Filtering for representative laps: excluding outlier laps (first lap, pit stops, yellow flags, slow laps)
laps = session.laps.pick_drivers(SELECTED_DRIVERS).pick_quicklaps()
laps = laps[laps['LapNumber'] > 1] 
laps = laps[laps['TrackStatus'] == '1'] # strict inclusion of Green Flag conditions
laps = laps[laps['PitInTime'].isnull() & laps['PitOutTime'].isnull()] 

print(f"Valid laps identified for analysis: {len(laps)}")

# SPATIAL DOMAIN DEFINITION
# Defining the reference spatial grid based on the fastest lap telemetry
print("Defining circuit spatial domain")
fastest_lap = session.laps.pick_fastest()
telemetry_fastest = fastest_lap.get_telemetry()
circuit_length = telemetry_fastest['Distance'].max()

# Yas Marina Circuit standard length is approx 5281m (post-2021 layout)
if pd.isna(circuit_length) or circuit_length < 2000:
    circuit_length = 5281.0 

spatial_grid = np.arange(0, int(circuit_length), SPATIAL_RESOLUTION)
print(f"Spatial Grid: {len(spatial_grid)} nodes (Step size: {SPATIAL_RESOLUTION}m)")


1 - DATA EXTRACTION (Abu Dhabi 2025)


core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No 

Valid laps identified for analysis: 379
Defining circuit spatial domain
Spatial Grid: 522 nodes (Step size: 10m)


In [14]:
# FEATURE ENGINEERING (MECHANICAL & KINEMATIC)
def calculate_curvature(x, y):
    """
    Computes continuous curvature k(s) using numerical gradients.
    k = |x'y'' - y'x''| / (x'^2 + y'^2)^(3/2)
    """
    dx = np.gradient(x)
    dy = np.gradient(y)
    ddx = np.gradient(dx)
    ddy = np.gradient(dy)
    numerator = dx * ddy - dy * ddx
    denominator = (dx**2 + dy**2)**1.5
    with np.errstate(divide='ignore', invalid='ignore'):
        k = numerator / denominator
        k[np.isnan(k)] = 0
    return np.abs(k)

def classify_corner_type(k):
    """
    Discretizes curvature values into categorical corner types for 
    qualitative interpretability based on F1 aerodynamic/mechanical thresholds.
    - < 0.002: Straight (R > 500m)
    - < 0.01:  HighSpeed (R 100-500m)
    - < 0.03:  Medium (R 33-100m)
    - < 0.08:  Slow (R 12-33m)
    - > 0.08:  Hairpin (R < 12m)
    """
    if k < 0.002:
        return 'Straight'
    elif k < 0.01:
        return 'HighSpeed'
    elif k < 0.03:
        return 'Medium'
    elif k < 0.08:
        return 'Slow'
    else:
        return 'Hairpin'

def estimate_fuel_load(total_laps, current_lap):
    """
    Estimates instantaneous fuel mass assuming a linear burn rate 
    (approx. 110kg start, 0kg finish in accordance with the regulations).
    """
    fuel_per_lap = 110.0 / total_laps
    return 110.0 - (fuel_per_lap * current_lap)

# Abu Dhabi GP standard race distance is 58 laps
total_race_laps = session.total_laps
if pd.isna(total_race_laps) or total_race_laps == 0: 
    total_race_laps = 58 


In [15]:
# FUNCTIONAL DATA PROCESSING LOOP
data_list = []
print("2 - FUNCTIONAL TRANSFORMATION")

for i, lap in laps.iterrows():
    try:
        tel = lap.get_telemetry()
        if len(tel) < 50: continue 
        
        # Signal interpolation (temporal to spatial mapping)
        f_speed = interp1d(tel['Distance'], tel['Speed'], kind='linear', fill_value="extrapolate")
        f_x = interp1d(tel['Distance'], tel['X'], kind='linear', fill_value="extrapolate")
        f_y = interp1d(tel['Distance'], tel['Y'], kind='linear', fill_value="extrapolate")
        
        # DRS is treated as a boolean indicator (flap activation > 10)
        drs_binary_raw = (tel['DRS'] >= 10).astype(int)
        f_drs = interp1d(tel['Distance'], drs_binary_raw, kind='nearest', fill_value="extrapolate")

        # Projection onto the common spatial grid
        s_speed = f_speed(spatial_grid)
        s_x = f_x(spatial_grid)
        s_y = f_y(spatial_grid)
        s_drs_binary = f_drs(spatial_grid).astype(int)
        
        # Physics-based regressor calculation
        s_curvature = calculate_curvature(s_x, s_y)
        fuel = estimate_fuel_load(total_race_laps, lap['LapNumber'])
        
        # Data structuring
        lap_df = pd.DataFrame({
            'Driver': lap['Driver'],
            'Team': lap['Team'],
            'LapNumber': lap['LapNumber'],
            'Space_Distance': spatial_grid,
            'Speed': s_speed,
            'Curvature': s_curvature,
            'DRS': s_drs_binary,
            'Fuel_Load': fuel,
            'TyreLife': lap['TyreLife'],
            'Compound': lap['Compound'], 
            'X_Coord': s_x,
            'Y_Coord': s_y
        })
        data_list.append(lap_df)
            
    except Exception as e:
        print(f"Error processing lap {lap['LapNumber']} of driver {lap['Driver']}: {e}")
        continue 

df_final = pd.concat(data_list, ignore_index=True)


2 - FUNCTIONAL TRANSFORMATION


In [16]:
# POST-PROCESSING AND FORMATTING
print("3 - POST-PROCESSING AND FORMATTING")

# Corner classification (continuous + categorical)
# Temporal categorization
df_final['Corner_Category'] = df_final['Curvature'].apply(classify_corner_type)
# One-Hot Encoding for categorical regressors (D-STEM ready)
df_final = pd.get_dummies(df_final, columns=['Corner_Category'], prefix='Type')
# Cast to integer for computational efficiency
corner_cols = [c for c in df_final.columns if 'Type_' in c]
for c in corner_cols:
    df_final[c] = df_final[c].astype(int)

# Tyre compound encoding
df_final['Compound_Temp'] = df_final['Compound']
df_final = pd.get_dummies(df_final, columns=['Compound_Temp'], prefix='Tyre')
dry_compounds = ['Tyre_SOFT', 'Tyre_MEDIUM', 'Tyre_HARD']

for col in dry_compounds:
    if col not in df_final.columns:
        df_final[col] = 0
    else:
        df_final[col] = df_final[col].astype(int)

# Drop redundant or non-dry compound columns if any (e.g., INTERMEDIATE and WET)
cols_to_drop = [c for c in df_final.columns if 'Tyre_' in c and c not in dry_compounds]
if cols_to_drop:
    df_final.drop(columns=cols_to_drop, inplace=True)

# Numerical precision standardization
print("Applying scientific precision formatting")

# Integer casting
df_final['LapNumber'] = df_final['LapNumber'].astype(int)
df_final['TyreLife'] = df_final['TyreLife'].fillna(0).astype(int) 

# Float
df_final['Speed'] = df_final['Speed'].round(1)        
df_final['Fuel_Load'] = df_final['Fuel_Load'].round(3) 
df_final['Curvature'] = df_final['Curvature'].round(6) 


3 - POST-PROCESSING AND FORMATTING
Applying scientific precision formatting


In [17]:
# EXPORT
filename = f"dataset_{RACE}_{YEAR}.csv"
df_final.to_csv(filename, index=False)

print(f"PROCESS COMPLETED. Exported file: {filename}")
print(f"Dataset Dimensions: {df_final.shape}")
print("Variable Preview (Formatted):")
print(df_final[['LapNumber', 'TyreLife', 'Speed', 'Curvature', 'Type_Straight']].head(3))

PROCESS COMPLETED. Exported file: dataset_Abu Dhabi_2025.csv
Dataset Dimensions: (197838, 20)
Variable Preview (Formatted):
   LapNumber  TyreLife  Speed  Curvature  Type_Straight
0          2         2  222.4   0.000015              1
1          2         2  226.3   0.000043              1
2          2         2  229.6   0.000083              1
