# F1 Lap Time Data Pipeline
Load FastF1 race sessions for the 2022–2024 seasons, filter laps, and build the feature table.


In [1]:
# Install the dependencies listed in requirements.txt into the kernel's environment.
# NOTE(review): the output recorded for this cell shows pip failing with
# "No such file or directory: 'requirements.txt'", i.e. the path did not resolve
# from the working directory the cell ran in — confirm the intended location.
%pip install -r requirements.txt


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'

[notice] A new release of pip is available: 23.3.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from IPython.display import Javascript, display

# Hand a JavaScript snippet to the front-end that asks it to restart the kernel
# (intended to follow a package install).
# NOTE(review): the snippet relies on a global `Jupyter.notebook` object in the
# browser; presumably only the classic Notebook UI provides it — confirm it has
# any effect in the front-end this notebook is actually run in.
restart_kernel_js = Javascript("Jupyter.notebook.kernel.restart()")
display(restart_kernel_js)

<IPython.core.display.Javascript object>

In [3]:
from pathlib import Path
import sys

def find_project_root(start: Path) -> Path:
    """Return the nearest directory, at or above ``start``, that looks like the project root.

    A directory qualifies when it contains both a ``src`` sub-directory and a
    ``requirements.txt`` entry. The search begins with ``start`` itself and then
    visits each ancestor, closest first.

    Args:
        start: Directory to begin the upward search from.

    Returns:
        The first qualifying directory, or ``start`` unchanged when neither it
        nor any ancestor qualifies.
    """

    def looks_like_root(directory: Path) -> bool:
        # Both markers are required; a lone "src" folder is not enough.
        return (directory / "src").is_dir() and (directory / "requirements.txt").exists()

    search_order = (start, *start.parents)
    return next((directory for directory in search_order if looks_like_root(directory)), start)

# Locate the project root by walking up from the kernel's (resolved) working
# directory, so that top-level packages living there can be imported.
project_root = find_project_root(Path.cwd().resolve())

# Keep the root as the first sys.path entry without stacking another duplicate
# every time this cell is re-run: drop existing copies, then put it in front.
# The list is updated in place so other references to sys.path stay valid.
_root_entry = str(project_root)
sys.path[:] = [_root_entry] + [entry for entry in sys.path if entry != _root_entry]


In [4]:
from pathlib import Path
import numpy as np
import pandas as pd
import random

from src.data_loader import enable_cache, load_laps_for_seasons, clean_laps
from src.features import build_feature_table

# Seed Python's `random` module and NumPy's global RNG with the same value.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# On-disk layout. These are relative paths, so they resolve against the
# kernel's current working directory.
# NOTE(review): they are not anchored to `project_root` — confirm that is intended.
DATA_DIR = Path("data")
CACHE_DIR = DATA_DIR.joinpath("cache")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
FEATURES_PATH = PROCESSED_DIR.joinpath("feature_table.parquet")


In [5]:
# Seasons whose race sessions are pulled into the feature table.
seasons_to_load = [2022, 2023, 2024]

# Point the loader's cache at CACHE_DIR before requesting any session data.
enable_cache(CACHE_DIR)

# Stage 1: raw laps for the requested seasons, then the cleaned subset.
raw_laps = load_laps_for_seasons(seasons_to_load, cache_dir=CACHE_DIR)
clean_laps_df = clean_laps(raw_laps)

# Stage 2: feature table plus the lists of numeric / categorical feature names.
feature_bundle = build_feature_table(clean_laps_df)
feature_df, numeric_features, categorical_features = feature_bundle

# Stage 3: persist the table as Parquet (without the index), creating the
# output directory first if it does not exist yet.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
feature_df.to_parquet(FEATURES_PATH, index=False)

# Preview the first five rows as the cell's output.
feature_df.head(5)


Race sessions:   0%|          | 0/68 [00:00<?, ?it/s]core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']
Race sessions:   1%|▏         | 1/68 [00:01<01:34,  1.41s/it]core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.7.0]
req            INFO 	Using cached 

Unnamed: 0,LapNumber,Stint,TyreLife,LapTimeLag1,LapTimeLag2,LapTimeLag3,RollingMean3,Driver,Team,Compound,TrackStatusFlag,Circuit,LapTimeSeconds,Season,RoundNumber,EventName
0,2.0,1.0,2.0,,,,,ALB,Williams,SOFT,green,Sakhir,100.548,2022,1,Bahrain Grand Prix
1,3.0,1.0,3.0,100.548,,,100.548,ALB,Williams,SOFT,green,Sakhir,100.664,2022,1,Bahrain Grand Prix
2,4.0,1.0,4.0,100.664,100.548,,100.606,ALB,Williams,SOFT,green,Sakhir,101.126,2022,1,Bahrain Grand Prix
3,5.0,1.0,5.0,101.126,100.664,100.548,100.779333,ALB,Williams,SOFT,green,Sakhir,102.303,2022,1,Bahrain Grand Prix
4,6.0,1.0,6.0,102.303,101.126,100.664,101.364333,ALB,Williams,SOFT,green,Sakhir,101.708,2022,1,Bahrain Grand Prix


In [6]:
# Quick sanity summary of the feature table: size and which seasons it covers.
row_count = len(feature_df)
column_count = feature_df.shape[1]
seasons_present = sorted(feature_df["Season"].unique().tolist())

feature_summary = {
    "rows": row_count,
    "columns": column_count,
    "seasons": seasons_present,
}
pd.Series(feature_summary)


rows                    64479
columns                    16
seasons    [2022, 2023, 2024]
dtype: object