# Setup

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import os
# Collect the stem (text before the first '.') and the full path of every file
# found under the Kaggle data folder, echoing each path as it is discovered.
fnames = []
fpaths = []
for root, _subdirs, files in os.walk('../data/kaggle/'):
    for entry in files:
        full_path = os.path.join(root, entry)
        fnames.append(entry.split('.')[0])
        fpaths.append(full_path)
        print(full_path)

../data/kaggle/circuits.csv
../data/kaggle/status.csv
../data/kaggle/lap_times.csv
../data/kaggle/sprint_results.csv
../data/kaggle/drivers.csv
../data/kaggle/races.csv
../data/kaggle/constructors.csv
../data/kaggle/constructor_standings.csv
../data/kaggle/qualifying.csv
../data/kaggle/driver_standings.csv
../data/kaggle/constructor_results.csv
../data/kaggle/pit_stops.csv
../data/kaggle/seasons.csv
../data/kaggle/results.csv


## Read Data

In [3]:
# Read data
# All tables live in one folder. The reads below treat the literal string \N
# as a missing value; the marker is applied to every table (lap_times and
# pit_stops previously skipped it) so NA handling is consistent.
fpath = '../data/kaggle/'
NA_MARKER = r'\N'

# Tables whose first CSV column becomes the DataFrame index.
circuits = pd.read_csv(f'{fpath}circuits.csv', index_col=0, na_values=NA_MARKER)
constructorResults = pd.read_csv(f'{fpath}constructor_results.csv', index_col=0, na_values=NA_MARKER)
constructors = pd.read_csv(f'{fpath}constructors.csv', index_col=0, na_values=NA_MARKER)
constructorStandings = pd.read_csv(f'{fpath}constructor_standings.csv', index_col=0, na_values=NA_MARKER)
drivers = pd.read_csv(f'{fpath}drivers.csv', index_col=0, na_values=NA_MARKER)
driverStandings = pd.read_csv(f'{fpath}driver_standings.csv', index_col=0, na_values=NA_MARKER)
qualifying = pd.read_csv(f'{fpath}qualifying.csv', index_col=0, na_values=NA_MARKER)
results = pd.read_csv(f'{fpath}results.csv', index_col=0, na_values=NA_MARKER)
seasons = pd.read_csv(f'{fpath}seasons.csv', index_col=0, na_values=NA_MARKER)
status = pd.read_csv(f'{fpath}status.csv', index_col=0, na_values=NA_MARKER)

# Tables read with the default integer index: every CSV column stays a regular
# column (the later merges join lapTimes and races on their raceId column).
lapTimes = pd.read_csv(f'{fpath}lap_times.csv', na_values=NA_MARKER)
pitStops = pd.read_csv(f'{fpath}pit_stops.csv', na_values=NA_MARKER)
races = pd.read_csv(f'{fpath}races.csv', na_values=NA_MARKER)

In [6]:
# Post-read in formatting
# Generic column names (name, url, time, ...) get a table-specific prefix so
# they remain distinguishable once the tables are merged, and a few derived
# columns are added. Each table is renamed first and every derived column is
# computed from the renamed columns, so re-running this cell repeats the same
# work instead of failing on a column that no longer exists.
circuits = circuits.rename(columns={'name':'circuitName','location':'circuitLocation','country':'circuitCountry','url':'circuitUrl'})

drivers = drivers.rename(columns={'number':'driverNumber','nationality':'driverNationality','url':'driverUrl'})
drivers['driverName'] = drivers['forename']+' '+drivers['surname']

constructors = constructors.rename(columns={'name':'constructorName','nationality':'constructorNationality','url':'constructorUrl'})

races = races.rename(columns={'year':'raceYear','name':'raceName','date':'raceDate','time':'raceTime','url':'raceUrl','round':'raceRound'})
# Vectorised parse of YYYY-MM-DD strings; missing dates become NaT rather than
# raising, and an already-parsed datetime column passes through unchanged.
races['raceDate'] = pd.to_datetime(races['raceDate'], format='%Y-%m-%d')

lapTimes = lapTimes.rename(columns={'time':'lapTime','position':'lapPosition','milliseconds':'lapMilliseconds'})
lapTimes['lapSeconds'] = lapTimes['lapMilliseconds'] / 1000

pitStops = pitStops.rename(columns={'time':'pitTime','milliseconds':'pitMilliseconds'})
pitStops['pitSeconds'] = pitStops['pitMilliseconds'] / 1000

results = results.rename(columns={'position':'resultsPosition','time':'resultsTime','milliseconds':'resultsMilliseconds','number':'resultsNumber'})
# resultsMilliseconds was read with \N mapped to NaN; the division keeps NaN.
results['resultsSeconds'] = results['resultsMilliseconds'] / 1000

Color coding for constructors

In [7]:
# Constructor color mapping
# Maps a constructorName value to the hex colour used for its plot traces.
# It is passed to plotly express as color_discrete_map.
constructor_color_map = {
    'Toro Rosso': '#0000FF',
    'Mercedes': '#6CD3BF',
    'Red Bull': '#1E5BC6',
    'Ferrari': '#ED1C24',
    'Williams': '#37BEDD',
    'Force India': '#FF80C7',
    'Virgin': '#c82e37',
    'Renault': '#FFD800',
    'McLaren': '#F58020',
    'Sauber': '#006EFF',
    'Lotus': '#FFB800',
    'HRT': '#b2945e',
    'Caterham': '#0b361f',
    'Lotus F1': '#FFB800',  # same colour as 'Lotus'
    'Marussia': '#6E0000',
    'Manor Marussia': '#6E0000',  # same colour as 'Marussia'
    'Haas F1 Team': '#B6BABD',
    'Racing Point': '#F596C8',
    'Aston Martin': '#2D826D',
    'Alfa Romeo': '#B12039',
    'AlphaTauri': '#4E7C9B',
    'Alpine F1 Team': '#2293D1',
}

# Processing the Data

In [8]:
# Enrich every result row with the attributes of its race, circuit,
# constructor and driver. All joins are left joins, so result rows are kept
# even when a lookup finds no match.
resultsAnalysis = (
    results
    # races keeps raceId as a regular column, so join column-to-column
    .merge(races, on='raceId', how='left')
    # the lookup tables below are indexed by their id, so join column-to-index
    .merge(circuits, left_on='circuitId', right_index=True, how='left')
    .merge(constructors, left_on='constructorId', right_index=True, how='left')
    .merge(drivers, left_on='driverId', right_index=True, how='left')
)
resultsAnalysis

Unnamed: 0,raceId,driverId,constructorId,resultsNumber,grid,resultsPosition,positionText,positionOrder,points,laps,...,constructorUrl,driverRef,driverNumber,code,forename,surname,dob,driverNationality,driverUrl,driverName
0,18,1,1,22.0,1,1.0,1,1,10.0,58,...,http://en.wikipedia.org/wiki/McLaren,hamilton,44.0,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,Lewis Hamilton
1,18,2,2,3.0,5,2.0,2,2,8.0,58,...,http://en.wikipedia.org/wiki/BMW_Sauber,heidfeld,,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld,Nick Heidfeld
2,18,3,3,7.0,7,3.0,3,3,6.0,58,...,http://en.wikipedia.org/wiki/Williams_Grand_Pr...,rosberg,6.0,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg,Nico Rosberg
3,18,4,4,5.0,11,4.0,4,4,5.0,58,...,http://en.wikipedia.org/wiki/Renault_in_Formul...,alonso,14.0,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,Fernando Alonso
4,18,5,1,23.0,3,5.0,5,5,4.0,58,...,http://en.wikipedia.org/wiki/McLaren,kovalainen,,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen,Heikki Kovalainen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25835,1096,854,210,47.0,12,16.0,16,16,0.0,57,...,http://en.wikipedia.org/wiki/Haas_F1_Team,mick_schumacher,47.0,MSC,Mick,Schumacher,1999-03-22,German,http://en.wikipedia.org/wiki/Mick_Schumacher,Mick Schumacher
25836,1096,825,210,20.0,16,17.0,17,17,0.0,57,...,http://en.wikipedia.org/wiki/Haas_F1_Team,kevin_magnussen,20.0,MAG,Kevin,Magnussen,1992-10-05,Danish,http://en.wikipedia.org/wiki/Kevin_Magnussen,Kevin Magnussen
25837,1096,1,131,44.0,5,18.0,18,18,0.0,55,...,http://en.wikipedia.org/wiki/Mercedes-Benz_in_...,hamilton,44.0,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,Lewis Hamilton
25838,1096,849,3,6.0,20,19.0,19,19,0.0,55,...,http://en.wikipedia.org/wiki/Williams_Grand_Pr...,latifi,6.0,LAT,Nicholas,Latifi,1995-06-29,Canadian,http://en.wikipedia.org/wiki/Nicholas_Latifi,Nicholas Latifi


In [9]:
# Attach race attributes to every lap, then attach the matching result row
# (with its circuit, constructor and driver attributes) for that driver in that race.
# NOTE(review): race columns that are not part of the key list below
# (e.g. raceDate, raceTime) exist on both sides of the second merge and so
# come back with pandas' _x/_y suffixes.
lap_result_keys = ['raceId','driverId','raceYear','raceRound','circuitId','raceName','raceUrl']
lapTimesAnalysis = (
    lapTimes
    .merge(races, on='raceId', how='left')
    .merge(resultsAnalysis, on=lap_result_keys, how='left')
)
lapTimesAnalysis

Unnamed: 0,raceId,driverId,lap,lapPosition,lapTime,lapMilliseconds,lapSeconds,raceYear,raceRound,circuitId,...,constructorUrl,driverRef,driverNumber,code,forename,surname,dob,driverNationality,driverUrl,driverName
0,841,20,1,1,1:38.109,98109,98.109,2011,1,1,...,http://en.wikipedia.org/wiki/Red_Bull_Racing,vettel,5.0,VET,Sebastian,Vettel,1987-07-03,German,http://en.wikipedia.org/wiki/Sebastian_Vettel,Sebastian Vettel
1,841,20,2,1,1:33.006,93006,93.006,2011,1,1,...,http://en.wikipedia.org/wiki/Red_Bull_Racing,vettel,5.0,VET,Sebastian,Vettel,1987-07-03,German,http://en.wikipedia.org/wiki/Sebastian_Vettel,Sebastian Vettel
2,841,20,3,1,1:32.713,92713,92.713,2011,1,1,...,http://en.wikipedia.org/wiki/Red_Bull_Racing,vettel,5.0,VET,Sebastian,Vettel,1987-07-03,German,http://en.wikipedia.org/wiki/Sebastian_Vettel,Sebastian Vettel
3,841,20,4,1,1:32.803,92803,92.803,2011,1,1,...,http://en.wikipedia.org/wiki/Red_Bull_Racing,vettel,5.0,VET,Sebastian,Vettel,1987-07-03,German,http://en.wikipedia.org/wiki/Sebastian_Vettel,Sebastian Vettel
4,841,20,5,1,1:32.342,92342,92.342,2011,1,1,...,http://en.wikipedia.org/wiki/Red_Bull_Racing,vettel,5.0,VET,Sebastian,Vettel,1987-07-03,German,http://en.wikipedia.org/wiki/Sebastian_Vettel,Sebastian Vettel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538116,1096,822,53,16,1:32.998,92998,92.998,2022,22,24,...,http://en.wikipedia.org/wiki/Alfa_Romeo_in_For...,bottas,77.0,BOT,Valtteri,Bottas,1989-08-28,Finnish,http://en.wikipedia.org/wiki/Valtteri_Bottas,Valtteri Bottas
538117,1096,822,54,16,1:32.995,92995,92.995,2022,22,24,...,http://en.wikipedia.org/wiki/Alfa_Romeo_in_For...,bottas,77.0,BOT,Valtteri,Bottas,1989-08-28,Finnish,http://en.wikipedia.org/wiki/Valtteri_Bottas,Valtteri Bottas
538118,1096,822,55,16,1:31.236,91236,91.236,2022,22,24,...,http://en.wikipedia.org/wiki/Alfa_Romeo_in_For...,bottas,77.0,BOT,Valtteri,Bottas,1989-08-28,Finnish,http://en.wikipedia.org/wiki/Valtteri_Bottas,Valtteri Bottas
538119,1096,822,56,15,1:30.566,90566,90.566,2022,22,24,...,http://en.wikipedia.org/wiki/Alfa_Romeo_in_For...,bottas,77.0,BOT,Valtteri,Bottas,1989-08-28,Finnish,http://en.wikipedia.org/wiki/Valtteri_Bottas,Valtteri Bottas


# Lap Times Exploratory Analysis

## Lap Times over the Years

In [15]:
# Choose the circuit to analyse by position in the order circuits first appear
# in lapTimesAnalysis; change the index to look at a different track.
circuitName = lapTimesAnalysis['circuitName'].unique()[8]

# Average lap time per season and constructor at that circuit. Only lapSeconds
# is aggregated: averaging the whole frame would include its text columns,
# which pandas >= 2.0 rejects with a TypeError.
circuit_laps = lapTimesAnalysis[lapTimesAnalysis['circuitName'] == circuitName]
df = circuit_laps.groupby(['raceYear', 'constructorName'], as_index=False)['lapSeconds'].mean()

# create figure: one line per constructor; constructors missing from
# constructor_color_map are coloured by plotly's default sequence
fig = px.line(
    df,
    x='raceYear',
    y='lapSeconds',
    color='constructorName',
    color_discrete_map=constructor_color_map,
)

fig.update_layout(
    title_text=f'Lap Time Trend by Constructor - {circuitName}',
)

fig.update_traces(opacity=0.65)
fig.show()

## Lap Times Over the Course of a Season (by Circuit)

In [11]:
# Parameters shared by the season- and race-level plots below.
year = 2022
# Circuit chosen by position in the order circuits first appear in the data.
circuitName = lapTimesAnalysis['circuitName'].unique()[5]
# Every driver with at least one recorded lap in the selected season.
in_selected_year = lapTimesAnalysis['raceYear'] == year
driverList = lapTimesAnalysis.loc[in_selected_year, 'driverName'].unique()

In [12]:
# Laps of the selected season, restricted to the drivers in driverList.
season_laps = lapTimesAnalysis[
    (lapTimesAnalysis['raceYear'] == year)
    & (lapTimesAnalysis['driverName'].isin(driverList))
]

# Average per circuit and driver. Only the numeric columns needed downstream
# are aggregated: averaging the whole frame would include its text columns,
# which pandas >= 2.0 rejects with a TypeError. raceId is carried along solely
# so the circuits can be sorted by it before plotting.
df = (
    season_laps
    .groupby(['circuitName', 'raceYear', 'driverName'], as_index=False)[['raceId', 'lap', 'lapSeconds']]
    .mean()
    .sort_values(by='raceId')
    [['raceYear', 'lap', 'lapSeconds', 'driverName', 'circuitName']]
)

fig = px.line(
    data_frame = df,
    x='circuitName',
    y='lapSeconds',
    color='driverName',
)

fig.update_layout(
    title_text=f'Average Lap Times by Circuit - {year}',
)
fig.show()

## Lap Times over the course of a Race

In [10]:
# Every recorded lap of the selected race (season + circuit) for the drivers
# in driverList, keeping only the columns relevant to the plot.
race_mask = (
    (lapTimesAnalysis['raceYear'] == year)
    & (lapTimesAnalysis['circuitName'] == circuitName)
    & (lapTimesAnalysis['driverName'].isin(driverList))
)
plot_columns = ['raceYear', 'lap', 'lapSeconds', 'driverName', 'circuitName', 'constructorName']
df = lapTimesAnalysis.loc[race_mask, plot_columns]

# One line per driver: lap number on x, lap time in seconds on y.
fig = px.line(data_frame=df, x='lap', y='lapSeconds', color='driverName')
fig.update_layout(title_text=f'Lap Time by Lap - {circuitName} {year}')
fig.show()

## Lap time distribution over the course of a race

In [11]:
# Restrict to the laps of the selected race (season + circuit).
is_selected_race = (
    (lapTimesAnalysis['raceYear'] == year)
    & (lapTimesAnalysis['circuitName'] == circuitName)
)
df = lapTimesAnalysis[is_selected_race]

# One translucent histogram of lap times per constructor, overlaid on shared axes.
fig = px.histogram(
    df,
    x='lapSeconds',
    color='constructorName',
    color_discrete_map=constructor_color_map,
)
fig.update_layout(
    title_text=f'Lap Time Distribution by Constructor - {year} {circuitName}',
    barmode='overlay',
)
fig.update_traces(opacity=0.65)
fig.show()

## Average Historical Lap Times by Circuit

In [12]:
# Average lap time per circuit and season. Only lapSeconds is aggregated:
# averaging the whole frame would include its text columns, which
# pandas >= 2.0 rejects with a TypeError.
circuit_year_avg = (
    lapTimesAnalysis
    .groupby(['circuitName', 'raceYear'], as_index=False)['lapSeconds']
    .mean()
)

# One line per circuit across seasons.
fig = px.line(
    circuit_year_avg,
    x='raceYear',
    y='lapSeconds',
    color='circuitName',
)
fig.update_layout(
    title_text='Average Lap Times Over Time by Circuit',
)
fig.show()

## Where in the race can we expect to see the fastest lap in a race?

In [13]:
# Per circuit: the mean lap number on which the fastest lap was set and the
# largest lap count seen, ordered by the former. A single named aggregation
# touches only these two columns; calling mean()/max() on the whole frame
# triggers the "dropping invalid columns" deprecation and fails on
# pandas >= 2.0 because of the text columns.
# NOTE(review): lapTimesAnalysis has one row per lap, so a driver's race-level
# fastestLap value is repeated for every lap they completed and the mean is
# weighted accordingly - confirm this weighting is intended.
df_temp = (
    lapTimesAnalysis
    .groupby('circuitName')
    .agg(fastestLap=('fastestLap', 'mean'), laps=('laps', 'max'))
    .sort_values(by='fastestLap', ascending=True)
)
# Position of the fastest lap as a percentage of the circuit's lap count.
df_temp['fastestLapRacePct'] = df_temp['fastestLap']/df_temp['laps']*100
print(df_temp['fastestLapRacePct'].mean())
df_temp

74.29524933530688



Dropping invalid columns in DataFrameGroupBy.max is deprecated. In a future version, a TypeError will be raised. Before calling .max, select only columns which should be valid for the function.



Unnamed: 0_level_0,fastestLap,laps,fastestLapRacePct
circuitName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Circuit de Spa-Francorchamps,31.909388,44,72.521336
Suzuka Circuit,37.57001,53,70.886811
Circuit de Nevers Magny-Cours,37.938336,72,52.692133
Silverstone Circuit,38.94707,61,63.847656
Sepang International Circuit,39.212734,56,70.022739
Bahrain International Circuit,40.050483,87,46.035038
Shanghai International Circuit,40.734429,56,72.740052
Nürburgring,41.402996,67,61.795516
Fuji Speedway,41.530085,67,61.985201
Autodromo Nazionale di Monza,41.867641,53,78.99555


In [14]:
# Histogram of the per-circuit fastest-lap position, in percent of race
# distance, taken from df_temp.
fig = px.histogram(df_temp, x='fastestLapRacePct')

fig.update_layout(
    title_text='Fastest Lap Time Lap Distribution',
    barmode='overlay',
)
fig.update_traces(opacity=0.65)
fig.show()

# Conclusions

We set out to explore the lap time data for the F1 races available in the dataset. Stepping through a season race by race does not yield much information, since lap times are highly dependent on the circuit being raced on. Fixing the track gives slightly more meaningful results when comparing lap times year to year.

## How have lap times changed over the years?
Generally speaking, lap times decrease over time, but there are points where they jump up. Looking at this behaviour leads me to believe the jumps are step changes related to rule changes in F1. In particular, there is a meaningful increase in lap times going from 2021 to 2022, due in large part to the major design changes implemented.

## How do lap times change over the course of a season?
Lap times do change over the course of a season. However, the changes are largely due to the track, so not much can really be said about this particular question.

## How do lap times change through an entire race?
As expected, lap times decrease as we progress through the race as fuel weight goes down.

## When can we expect to see the fastest lap in a race?
The fastest lap typically occurs about 74% of the way into a race. This makes sense: the cars are running on lower fuel and, depending on the current standings, teams opt for different strategies to gain the extra point for setting the fastest lap of the race.