In [53]:
import fastf1 as f
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import tomli

In [54]:
with open("Data/compound_selection.toml", "rb") as toml:
    compound_selection = tomli.load(toml)

In [55]:
cache_path = os.getcwd() + "/Cache"
f.Cache.enable_cache(cache_path)

## Load Dataframes

In [56]:
def read_csv(path):
    '''
    Read csv file at path location and filter for relevant columns

    Requires:
    csv file located at path location is derived from a fastf1 laps object
    '''

    return pd.read_csv(path, 
                       header=0, 
                       true_values=["TRUE"], 
                       false_values=["FALSE"],
                       usecols=["Time", "DriverNumber", "LapTime", "LapNumber", "Stint", 
                                "PitOutTime", "PitInTime", "Compound", "TyreLife", "FreshTyre", 
                                "Team", "Driver", "TrackStatus", "IsAccurate", "RoundNumber", 
                                "EventName"]
                        )

In [68]:
def convert_timedelta(df_laps):
    '''
    Requires: 
    df_laps has the following columns: ["Time", "LapTime", "PitInTime", "PitOutTime"]
    '''

    df_laps[["Time", "LapTime", "PitInTime", "PitOutTime"]] = df_laps[["Time", "LapTime", "PitInTime", "PitOutTime"]].apply(pd.to_timedelta)
    df_laps["LapTime"] = df_laps["LapTime"].apply(lambda x: x.total_seconds())

    return df_laps

In [58]:
def load_laps():
    df_dict = {}
    data_files = [file for file in os.listdir("Data") if os.path.isfile("Data/"+file)]

    for file in data_files:
        if file.startswith("all"):
            year = file.split("_")[2]
            year = int(year[:year.find(".")])
            df = read_csv("Data/" + file)
            convert_timedelta(df)
            df_dict[year] = df

    return df_dict

## Data Transformation

### Add Tyre Information Columns

In [59]:
def add_is_slick(df_laps):
    '''
    Requires:
    df_laps has the following column: ["Compound"]
    '''
    
    df_laps["IsSlick"] = df_laps["Compound"].apply(lambda x: x in ["SOFT", "MEDIUM", "HARD"])

    return None

FastF1 provides relative compound information (soft, medium, hard) as the `Compound` column in its `Laps` objects

The actual compound names (C1, C2, C3 etc. or ultrasoft, supersoft etc.) needs to be added to maintain consistency. These will be recorded in the `CompoundName` column.

In [60]:
def add_compound_name(df_laps, compound_selection, year):
    '''
    Requires:
    df_laps has the following columns: ["Compound", "RoundNumber"]

    Assumes:
        - all data contained in compound_selection is from the same season 
        - df_laps contain data from the same season as compound_selection
    '''
    
    def convert_compound_name(row):
        if year == 2018:
            compound_to_index = {"SOFT":0, "MEDIUM":1, "HARD":2}
        else:
            compound_to_index = {"SOFT":2, "MEDIUM":1, "HARD":0}

        try:
            if row.loc["Compound"] not in compound_to_index:
                return row.loc["Compound"]
            else:
                return compound_selection[str(row.loc["RoundNumber"])][compound_to_index[row.loc["Compound"]]]
        except:
            # error handling for when compound_selection.toml is not up-to-date
            print("Compound selection record is missing for round " + str(row.loc["RoundNumber"]))

            # terminate cell 
            assert False

    df_laps["CompoundName"] = df_laps.apply(convert_compound_name, axis=1)

    return df_laps

### Add Timing Columns


A *representative lap time* is calculated by finding the median of the laps that meet the following condition:

- Raced on slick tyres (`IsSlick = True`).
- `IsAccurate = True`, see definition [here](https://theoehrly.github.io/Fast-F1/core.html#fastf1.core.Laps)
- Is completed under green flag (`TrackStatus == 1`), note that this definition is stricter than the one used for `IsAccurate`

Define *valid laps* as the laps that meet all above conditions. This is recorded in the new `IsValid` column.

The fastest lap time for the session is the fastest time out of the laps where `IsPersonalBest = True` ([definition](https://theoehrly.github.io/Fast-F1/core.html#laps)). Note that this is the same definiton used by the FastF1 `pick_fastest()` method.

Using these two times as benchmarks, the following columns are added:

- `DeltaToRep`
- `DeltaToFastest`
- `PctFromRep`
- `PctFromFastest`

**Caveat**: Metrics are invalid for wet races

In [61]:
def add_is_valid(df_laps):
    '''
    Requires:
    df_laps has the following columns: ["IsSlick", "IsAccurate", "TrackStatus"]
    '''

    def check_lap_valid(row):
        return row.loc["IsSlick"] and row.loc["IsAccurate"] and row.loc["TrackStatus"] == 1

    df_laps["IsValid"] = df_laps.apply(check_lap_valid, axis=1)

    return df_laps

In [62]:
def find_rep_times(df_laps):
    '''
    Requires:
    df_laps has the following columns: ["RoundNumber", "IsValid", "LapTime"]
    '''

    rounds = df_laps["RoundNumber"].unique()
    rep_times = {}

    for round_number in rounds:
        median = df_laps[(df_laps["RoundNumber"] == round_number) & (df_laps["IsValid"] == True)]["LapTime"].median()
        rep_times[round_number] = round(median, 3)

    return rep_times

def add_rep_deltas(df_laps):
    '''
    Requires:
    df_laps has the following columns: ["RoundNumber", "IsValid", "LapTime"]
    '''

    rep_times = find_rep_times(df_laps)
    
    def delta_to_rep(row):
        return row.loc["LapTime"] - rep_times[row.loc["RoundNumber"]]

    def pct_from_rep(row):
        delta = row.loc["LapTime"] - rep_times[row.loc["RoundNumber"]]
        return round(delta / rep_times[row.loc["RoundNumber"]] * 100, 3)

    df_laps["DeltaToRep"] = df_laps.apply(delta_to_rep, axis=1)
    df_laps["PctFromRep"] = df_laps.apply(pct_from_rep, axis=1)

    return df_laps

        

In [63]:
def find_fastest_times(df_laps):
    '''
    Requires:
    df_laps has the following columns: ["RoundNumber", "IsPersonalBest", "LapTime"]
    '''

    rounds = df_laps["RoundNumber"].unique()
    fastest_times = {}

    for round_number in rounds:
        median = df_laps[(df_laps["RoundNumber"] == round_number) & (df_laps["IsPersonalBest"] == True)]["LapTime"].median()
        fastest_times[round_number] = round(median, 3)
    
    return fastest_times

def add_fastest_deltas(df_laps):
    '''
    Requires:
    df_laps has the following columns: ["RoundNumber", "IsPersonalBest", "LapTime"]
    '''

    fastest_times = find_rep_times(df_laps)
    
    def delta_to_fastest(row):
        return row.loc["LapTime"] - fastest_times[row.loc["RoundNumber"]]

    def pct_from_fastest(row):
        delta = row.loc["LapTime"] - fastest_times[row.loc["RoundNumber"]]
        return round(delta / fastest_times[row.loc["RoundNumber"]] * 100, 3)

    df_laps["DeltaToFastest"] = df_laps.apply(delta_to_fastest, axis=1)
    df_laps["PctFromFastest"] = df_laps.apply(pct_from_fastest, axis=1)

    return df_laps

Track evolution and diminishing fuel load have significant influences on lap times.

The following columns are added to control for these confounding factors. Instead of comparing the lap times to a representative time for the entire event, they will be compared against a representative time at the same stage of the Grand Prix:

- `DeltaToLapRep`
- `PctFromLapRep`

The definition for the per lap representative lap times is the same as the definition for the event representative lap time.

In [73]:
def find_lap_reps(df_laps):
    '''
    Requires:
    df_laps has the following columns: ["RoundNumber", "LapNumber", "IsValid", "LapTime"]
    '''
    
    lap_reps = {}

    for round_number in df_laps["RoundNumber"].unique():
        round_lap_reps = {}
        round_laps = df_laps[(df_laps["RoundNumber"] == round_number) & (df_laps["IsValid"] == True)]

        for lap_number in round_laps["LapNumber"].unique():
            median = round_laps[round_laps["LapNumber"] == lap_number]["LapTime"].median()
            round_lap_reps[lap_number] = round(median, 3)
        
        lap_reps[round_number] = round_lap_reps

    return lap_reps

def add_lap_rep_deltas(df_laps):
    '''
    Requires:
    df_laps has the following columns: ["RoundNumber", "LapNumber", "IsValid", "LapTime"]
    '''

    lap_reps = find_lap_reps(df_laps)
    print(lap_reps[1])

    def delta_to_lap_rep(row):
        return row.loc["LapTime"] - lap_reps[row.loc["RoundNumber"]][row.loc["LapNumber"]]

    def pct_from_lap_rep(row):
        delta = row.loc["LapTime"] - lap_reps[row.loc["RoundNumber"]][row.loc["LapNumber"]]
        return round(delta / lap_reps[row.loc["RoundNumber"]][row.loc["LapNumber"]] * 100, 3)

    df_laps["DeltaToLapRep"] = df_laps.apply(delta_to_lap_rep, axis=1)
    df_laps["PctFromLapRep"] = df_laps.apply(pct_from_lap_rep, axis=1)

    return df_laps
    

# TEST

In [69]:
dfs = load_laps()
df_test = dfs[2022]

In [70]:
add_is_slick(df_test)
add_compound_name(df_test, compound_selection["2022"], 2022)
add_is_valid(df_test)
add_rep_deltas(df_test)
add_fastest_deltas(df_test)

Unnamed: 0,Time,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Compound,TyreLife,FreshTyre,...,IsAccurate,RoundNumber,EventName,IsSlick,CompoundName,IsValid,DeltaToRep,PctFromRep,DeltaToFastest,PctFromFastest
0,0 days 01:04:14.256000,16,,1.0,1.0,0 days 00:24:54.765000,NaT,SOFT,1.0,True,...,False,1,Bahrain Grand Prix,True,C3,False,,,,
1,0 days 01:05:52.109000,16,97.853,2.0,1.0,NaT,NaT,SOFT,2.0,True,...,True,1,Bahrain Grand Prix,True,C3,True,-1.4695,-1.480,-1.4695,-1.480
2,0 days 01:07:30.381000,16,98.272,3.0,1.0,NaT,NaT,SOFT,3.0,True,...,True,1,Bahrain Grand Prix,True,C3,True,-1.0505,-1.058,-1.0505,-1.058
3,0 days 01:09:08.795000,16,98.414,4.0,1.0,NaT,NaT,SOFT,4.0,True,...,True,1,Bahrain Grand Prix,True,C3,True,-0.9085,-0.915,-0.9085,-0.915
4,0 days 01:10:47.266000,16,98.471,5.0,1.0,NaT,NaT,SOFT,5.0,True,...,True,1,Bahrain Grand Prix,True,C3,True,-0.8515,-0.857,-0.8515,-0.857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23541,0 days 01:37:58.484000,14,91.819,23.0,2.0,NaT,NaT,HARD,4.0,True,...,True,22,Abu Dhabi Grand Prix,True,C3,True,0.5430,0.595,0.5430,0.595
23542,0 days 01:39:29.063000,14,90.579,24.0,2.0,NaT,NaT,HARD,5.0,True,...,True,22,Abu Dhabi Grand Prix,True,C3,True,-0.6970,-0.764,-0.6970,-0.764
23543,0 days 01:41:00.128000,14,91.065,25.0,2.0,NaT,NaT,HARD,6.0,True,...,True,22,Abu Dhabi Grand Prix,True,C3,True,-0.2110,-0.231,-0.2110,-0.231
23544,0 days 01:42:30.816000,14,90.688,26.0,2.0,NaT,NaT,HARD,7.0,True,...,True,22,Abu Dhabi Grand Prix,True,C3,True,-0.5880,-0.644,-0.5880,-0.644


In [74]:
add_lap_rep_deltas(df_test)
df_test.to_csv("Data/test_2022.csv")

{2.0: 100.48, 3.0: 100.433, 4.0: 100.301, 5.0: 100.826, 6.0: 100.635, 7.0: 100.768, 8.0: 101.478, 9.0: 100.872, 10.0: 101.267, 11.0: 101.085, 12.0: 101.281, 13.0: 101.287, 14.0: 99.629, 17.0: 98.868, 18.0: 98.894, 19.0: 98.731, 20.0: 98.954, 21.0: 99.132, 22.0: 99.379, 23.0: 99.325, 24.0: 99.576, 25.0: 99.731, 26.0: 99.513, 27.0: 99.84, 28.0: 99.774, 29.0: 99.877, 30.0: 99.8, 33.0: 99.751, 34.0: 99.285, 35.0: 98.523, 36.0: 98.647, 37.0: 98.289, 38.0: 97.942, 39.0: 97.721, 40.0: 97.924, 41.0: 98.242, 42.0: 98.111, 43.0: 98.579, 44.0: 98.288, 45.0: 96.58, 52.0: 97.367, 53.0: 97.11, 54.0: 97.284, 55.0: 97.257, 56.0: 97.603, 57.0: 97.805, 16.0: 100.079, 31.0: 99.806, 32.0: 99.738, 15.0: 101.066}


KeyError: 1.0