In [47]:
from pathlib import Path

import fastf1
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter


In [48]:
# CONFIGURATION AND METADATA
YEAR = 2025
RACE = 'Abu Dhabi'
SESSION = 'R'  # 'R' for race session
SPATIAL_RESOLUTION = 10  # meters
SELECTED_DRIVERS = ['VER', 'NOR', 'PIA', 'RUS', 'ANT', 'LEC', 'HAM']
CACHE_DIR = 'dataset/f1_cache'  # on-disk cache used by FastF1 for downloaded session data

# FastF1 refuses to enable a cache in a directory that does not exist yet, so
# create it up front. This keeps "Restart Kernel -> Run All" working on a
# fresh checkout where the cache folder has never been created.
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
fastf1.Cache.enable_cache(CACHE_DIR)

print(f"1 - DATA EXTRACTION ({RACE} {YEAR})")
session = fastf1.get_session(YEAR, RACE, SESSION)
session.load()

# DATA PRE-PROCESSING AND FILTRATION
# Keep only representative laps of the selected drivers. Starting from the
# "quick laps" subset, a lap is retained only if ALL of the following hold:
#   - it is not the first lap of the race,
#   - its track status is exactly '1' for the whole lap,
#   - it has neither a pit-in nor a pit-out timestamp.
laps = session.laps.pick_drivers(SELECTED_DRIVERS).pick_quicklaps()

is_after_first_lap = laps['LapNumber'] > 1
is_green_flag = laps['TrackStatus'] == '1'
has_no_pit_event = laps['PitInTime'].isna() & laps['PitOutTime'].isna()
laps = laps[is_after_first_lap & is_green_flag & has_no_pit_event]

print(f"Valid laps identified for analysis: {len(laps)}")

# SPATIAL DOMAIN DEFINITION
# The reference spatial grid is derived from the telemetry of the session's
# fastest lap: its maximum 'Distance' value is used as the circuit length.
fastest_lap = session.laps.pick_fastest()
telemetry_fastest = fastest_lap.get_telemetry()
circuit_length = telemetry_fastest['Distance'].max()

# Fall back to the nominal Yas Marina length (approx. 5281 m, post-2021 layout)
# when the telemetry-derived length is missing or implausibly short.
if pd.isna(circuit_length) or circuit_length < 2000:
    circuit_length = 5281.0

# Grid nodes at 0, SPATIAL_RESOLUTION, 2*SPATIAL_RESOLUTION, ... strictly below
# the (integer-truncated) circuit length.
spatial_grid = np.arange(0, int(circuit_length), SPATIAL_RESOLUTION)
print(f"Spatial Grid: {len(spatial_grid)} nodes (Step size: {SPATIAL_RESOLUTION}m)")

# Abu Dhabi GP standard race distance is 58 laps; use it as fallback.
# The attribute is read once and checked for None before the numeric
# comparison, because comparing None with 0 would raise a TypeError
# (presumably the case when no lap-count data is available for the session).
scheduled_laps = session.total_laps
total_race_laps = scheduled_laps if scheduled_laps is not None and scheduled_laps > 0 else 58


core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


1 - DATA EXTRACTION (Abu Dhabi 2025)


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '81', '4', '16', '63', '14', '31', '44', '27', '18', '5', '87', '55', '22', '12', '23', '6', '30', '10', '43']


Valid laps identified for analysis: 379
Spatial Grid: 522 nodes (Step size: 10m)


### **Data Cleaning**
Our main goal here is to enforce stationarity in the dataset. The D-STEM model assumes that the degradation trend is somewhat predictable; however, external shocks like Safety Cars (SC) or Virtual Safety Cars (VSC) introduce structural breaks in tire temperature and consumption.
To avoid biasing the estimation, we strictly filter the data to keep only 'Green Flag' racing conditions (TrackStatus == '1'). We also remove the first lap (standing start anomaly) and pit-in/out laps to ensure that every functional observation represents a comparable driving state.

### **Why Abu Dhabi?**
The selection of the Abu Dhabi GP (Yas Marina Circuit) is a deliberate design choice aimed at maximizing the Signal-to-Noise Ratio for the D-STEM estimation.
As discussed in Lesson 4 (Model Complexity), introducing too many exogenous covariates increases the computational burden and risk of overfitting. Yas Marina, being a desert track, guarantees stable atmospheric conditions (Dry), eliminating humidity and rain as confounding variables.
Furthermore, the circuit layout is ideal for the FDA approach (Lesson 6): the distinct mix of high-speed straights and a highly technical Sector 3 forces the B-Spline basis to capture significant local variations in the tire degradation coefficient ($\beta_{tyre}$), making the analysis much more robust compared to simpler tracks.

### **From Time to Space**
Finally, we address the "misalignment problem" inherent in telemetry data: because a fast lap takes less time than a slow lap, the same time index $t$ corresponds to different track positions on different laps, so time-indexed samples do not align across observations.
To solve this, we shift from the Time Domain to the Spatial Domain. We define a reference spatial grid with a 10-meter resolution (SPATIAL_RESOLUTION) based on the session's fastest lap. This establishes a common coordinate system $s \in [0, L]$ where $L$ is the circuit length, allowing for the direct comparison of functional curves $v_t(s)$.

In [None]:
# FEATURE ENGINEERING FUNCTIONS
def calculate_curvature(x, y):
    """
    Estimate the unsigned curvature along a sampled planar path.

    First and second derivatives of each coordinate sequence are approximated
    with ``np.gradient`` (unit sample spacing) and combined as
        k = |x'y'' - y'x''| / (x'^2 + y'^2)^(3/2)

    Samples where the ratio is undefined (NaN, e.g. 0/0 where consecutive
    points coincide) are reported as 0.

    Args:
        x: 1-D sequence of x coordinates.
        y: 1-D sequence of y coordinates, same length as ``x``.

    Returns:
        Array of non-negative curvature values, one per input sample.
    """
    x_d1, y_d1 = np.gradient(x), np.gradient(y)
    x_d2, y_d2 = np.gradient(x_d1), np.gradient(y_d1)

    cross_term = x_d1 * y_d2 - y_d1 * x_d2
    speed_cubed = (x_d1**2 + y_d1**2)**1.5

    # Division by zero is expected at stationary samples; suppress the numpy
    # warnings and replace the resulting NaNs afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = cross_term / speed_cubed

    return np.abs(np.where(np.isnan(ratio), 0, ratio))

def classify_corner_type(k):
    """
    Map a curvature value to a qualitative corner category.

    Thresholds (upper bounds are exclusive; R = 1/k is the matching radius):
    - k < 0.005:            'Straight'     (R > 200m)
    - 0.005 <= k < 0.015:   'HighSpeed'    (R 67-200m)
    - 0.015 <= k < 0.028:   'MediumSpeed'  (R 36-67m)
    - k >= 0.028:           'SlowSpeed'    (R < 36m)

    Any value that is not below one of the bounds (including NaN, for which
    every comparison is False) falls through to 'SlowSpeed'.
    """
    upper_bounds = (
        (0.005, 'Straight'),
        (0.015, 'HighSpeed'),
        (0.028, 'MediumSpeed'),
    )
    for limit, label in upper_bounds:
        if k < limit:
            return label
    return 'SlowSpeed'

def estimate_fuel_load(total_laps, current_lap, initial_fuel_kg=110.0):
    """
    Estimate the fuel mass on board assuming a constant (linear) burn rate.

    The tank is assumed to hold ``initial_fuel_kg`` at lap 0 and to be empty
    once ``current_lap`` equals ``total_laps``.

    Args:
        total_laps: Number of laps over which the fuel is burned. Must be
            non-zero (a zero value raises ZeroDivisionError).
        current_lap: Lap number for which the fuel mass is estimated.
        initial_fuel_kg: Fuel mass at the start, in kg. Defaults to 110.0,
            the start load assumed throughout this notebook.

    Returns:
        Estimated fuel mass in kg (same unit as ``initial_fuel_kg``).
    """
    fuel_per_lap = initial_fuel_kg / total_laps
    return initial_fuel_kg - (fuel_per_lap * current_lap)


### **Spatial Covariates**
Since the maximum speed in a corner is physically limited by lateral grip ($v \approx \sqrt{\mu g R}$), the Curvature $\kappa(s)$ acts as our primary invariant regressor. We calculate it numerically using the second derivatives of the coordinates.
From a modeling perspective (Lesson 4), including $\kappa(s)$ is crucial: it allows the D-STEM model to understand that a drop in speed at a specific location is due to the track layout (a tight corner) and not due to driver error or tire degradation. We also discretize these values into "Corner Types" to help with the visual interpretation of the results later on.

### **Temporal Covariates**
Here we handle the Fuel Load, which acts as a deterministic temporal covariate.
A Formula 1 car consumes approx. 110kg of fuel during a race, making the car significantly lighter and faster as laps progress (~0.03s faster per lap).
By modeling the fuel load as a linear decay, we force the model's tire coefficient ($\beta_{tyre}$) to capture only the pure degradation, cleaning it from the mechanical advantage of the weight loss.

In [None]:
# FUNCTIONAL DATA PROCESSING LOOP
# For every retained lap: resample the telemetry onto the common spatial grid,
# derive the curvature and fuel covariates, and collect one DataFrame per lap.
MIN_TELEMETRY_SAMPLES = 50  # laps with fewer telemetry samples are skipped
SAVGOL_WINDOW = 11          # curvature smoothing window, in grid nodes
SAVGOL_POLYORDER = 3        # polynomial order of the smoothing filter

data_list = []
skipped_laps = 0
print("2 - FUNCTIONAL TRANSFORMATION & SMOOTHING")

for _, lap in laps.iterrows():
    try:
        tel = lap.get_telemetry()
        if len(tel) < MIN_TELEMETRY_SAMPLES:
            skipped_laps += 1
            continue

        # Signal interpolation (temporal to spatial mapping)
        # Coordinates converted from decimeters to meters
        f_speed = interp1d(tel['Distance'], tel['Speed'], kind='linear', fill_value="extrapolate")
        f_x = interp1d(tel['Distance'], tel['X'] / 10, kind='linear', fill_value="extrapolate")
        f_y = interp1d(tel['Distance'], tel['Y'] / 10, kind='linear', fill_value="extrapolate")

        # DRS is reduced to a boolean indicator: raw values >= 10 are taken as
        # "flap open" (threshold chosen by the author from the API docs).
        # Nearest-neighbour interpolation keeps the resampled signal in {0, 1}.
        drs_binary_raw = (tel['DRS'] >= 10).astype(int)
        f_drs = interp1d(tel['Distance'], drs_binary_raw, kind='nearest', fill_value="extrapolate")

        # Projection onto the common spatial grid
        s_speed = f_speed(spatial_grid)
        s_x = f_x(spatial_grid)
        s_y = f_y(spatial_grid)
        s_drs_binary = f_drs(spatial_grid).astype(int)

        # Physics-based regressor calculations + smoothing.
        # The smoothing filter can produce small negative values, hence the
        # second absolute value after filtering.
        raw_curvature = calculate_curvature(s_x, s_y)
        s_curvature = savgol_filter(raw_curvature, window_length=SAVGOL_WINDOW, polyorder=SAVGOL_POLYORDER)
        s_curvature = np.abs(s_curvature)
        fuel = estimate_fuel_load(total_race_laps, lap['LapNumber'])

        # Data structuring: scalar lap attributes are broadcast to every grid node
        lap_df = pd.DataFrame({
            'Driver': lap['Driver'],
            'Team': lap['Team'],
            'LapNumber': lap['LapNumber'],
            'Space_Distance': spatial_grid,
            'Speed': s_speed,
            'Curvature': s_curvature,
            'DRS': s_drs_binary,
            'Fuel_Load': fuel,
            'TyreLife': lap['TyreLife'],
            'Compound': lap['Compound'],
            'X_Coord': s_x,
            'Y_Coord': s_y
        })
        data_list.append(lap_df)

    except Exception as e:
        # Deliberate best-effort: a single broken lap must not abort the whole
        # extraction, but it is reported and counted.
        skipped_laps += 1
        print(f"Error processing lap {lap['LapNumber']} of driver {lap['Driver']}: {e}")
        continue

if skipped_laps:
    print(f"Skipped laps (too few telemetry samples or processing error): {skipped_laps}")

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# fail early with a message that points at the actual cause instead.
if not data_list:
    raise ValueError(
        "No lap could be processed: check the lap filters and the telemetry availability."
    )

df_final = pd.concat(data_list, ignore_index=True)
print("Main loop completed")


2 - FUNCTIONAL TRANSFORMATION & SMOOTHING
Main loop completed


We apply specific signal processing techniques based on the physical nature of the variables:
1.  Continuous Variables (Speed, Coordinates): We use Linear Interpolation. This is a standard choice for continuous physical signals and keeps the resampled values continuous (piecewise linear) between telemetry samples.
2.  Binary Variables (DRS): We switch to Nearest Neighbor interpolation. Since the DRS flap is mechanically either Open (1) or Closed (0), linear interpolation would introduce meaningless artifacts (e.g., 0.5) that represent physically impossible states.

In [51]:
# POST-PROCESSING AND FORMATTING
print("3 - POST-PROCESSING AND FORMATTING")

# Corner classification, one-hot encoded against a FIXED schema.
# Indicator columns are assigned by name instead of via pd.get_dummies so that
#   (a) all four Type_* columns exist even if a category never occurs, and
#   (b) re-running this cell overwrites the columns instead of appending
#       duplicates.
# The list is in alphabetical order, i.e. the column order get_dummies produced.
corner_types = ['HighSpeed', 'MediumSpeed', 'SlowSpeed', 'Straight']
corner_category = df_final['Curvature'].apply(classify_corner_type)
for corner_type in corner_types:
    df_final[f'Type_{corner_type}'] = (corner_category == corner_type).astype(int)
corner_cols = [f'Type_{corner_type}' for corner_type in corner_types]

# Tyre compound encoding: only the two indicators below are part of the schema;
# the original 'Compound' column is kept alongside them.
# NOTE(review): laps on any other compound (including SOFT) get 0 in both
# indicators - confirm that treating them as the implicit baseline is intended.
dry_compounds = ['Tyre_MEDIUM', 'Tyre_HARD']
for col in sorted(dry_compounds):  # sorted -> Tyre_HARD before Tyre_MEDIUM, as before
    compound_name = col[len('Tyre_'):]
    df_final[col] = (df_final['Compound'] == compound_name).astype(int)

# Numerical precision standardization and casting
df_final['LapNumber'] = df_final['LapNumber'].astype(int)
df_final['TyreLife'] = df_final['TyreLife'].fillna(0).astype(int)  # missing tyre life is stored as 0
df_final['Speed'] = df_final['Speed'].round(1)
df_final['Fuel_Load'] = df_final['Fuel_Load'].round(3)
df_final['Curvature'] = df_final['Curvature'].round(6)

print("Post-processing completed")

3 - POST-PROCESSING AND FORMATTING
Post-processing completed


### **Categorical Encoding**
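For the tire compounds, we apply One-Hot Encoding. Crucially, we enforce a strict schema: the output always contains exactly the `Tyre_MEDIUM` and `Tyre_HARD` indicator columns (all zeros if a compound never appears), and no indicator column is kept for any other compound, so the exported column layout does not depend on which tires were actually used in the race.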

In [52]:
# EXPORT
filename = f"dataset/dataset_{RACE}_{YEAR}.csv"

# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(filename, index=False)

print(f"Exported file: {filename}")
print(f"Dataset dimensions: {df_final.shape}")
df_final.head()

Exported file: dataset/dataset_Abu Dhabi_2025.csv
Dataset dimensions: (197838, 18)


Unnamed: 0,Driver,Team,LapNumber,Space_Distance,Speed,Curvature,DRS,Fuel_Load,TyreLife,Compound,X_Coord,Y_Coord,Type_HighSpeed,Type_MediumSpeed,Type_SlowSpeed,Type_Straight,Tyre_HARD,Tyre_MEDIUM
0,VER,Red Bull Racing,2,0,222.4,0.000255,0,106.207,2,MEDIUM,55.520763,207.78898,0,0,0,1,0,1
1,VER,Red Bull Racing,2,10,226.3,0.00046,0,106.207,2,MEDIUM,64.044383,208.89263,0,0,0,1,0,1
2,VER,Red Bull Racing,2,20,229.6,0.000538,0,106.207,2,MEDIUM,72.508031,209.966034,0,0,0,1,0,1
3,VER,Red Bull Racing,2,30,232.1,0.000519,0,106.207,2,MEDIUM,80.848785,210.941597,0,0,0,1,0,1
4,VER,Red Bull Racing,2,40,235.9,0.000434,0,106.207,2,MEDIUM,89.344626,211.885675,0,0,0,1,0,1
