# Final Project: F1 EDA

Author: Alex Searle

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import swifter
import warnings
%matplotlib inline
sns.set_theme(style='darkgrid')
warnings.filterwarnings(action='once')

## Data Preparation

In [2]:
# Reading in all the necessary data
driver_standings_df = pd.read_csv('Data/driver_standings.csv')
races_df = pd.read_csv('Data/races.csv')
results_df = pd.read_csv('Data/results.csv')
constructor_standings_df = pd.read_csv('Data/constructor_standings.csv')

In [3]:
results_df = results_df.merge(races_df[['year','round', 'raceId']], left_on='raceId', right_on='raceId', how='left', suffixes=['','_drop'])
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26080 entries, 0 to 26079
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         26080 non-null  int64  
 1   raceId           26080 non-null  int64  
 2   driverId         26080 non-null  int64  
 3   constructorId    26080 non-null  int64  
 4   number           26080 non-null  object 
 5   grid             26080 non-null  int64  
 6   position         26080 non-null  object 
 7   positionText     26080 non-null  object 
 8   positionOrder    26080 non-null  int64  
 9   points           26080 non-null  float64
 10  laps             26080 non-null  int64  
 11  time             26080 non-null  object 
 12  milliseconds     26080 non-null  object 
 13  fastestLap       26080 non-null  object 
 14  rank             26080 non-null  object 
 15  fastestLapTime   26080 non-null  object 
 16  fastestLapSpeed  26080 non-null  object 
 17  statusId    

In [4]:
drop_columns = []
for column in results_df.columns:
    if '_extra' in column:
        drop_columns.append(column)
results_df.drop(columns=drop_columns, inplace=True)
results_df.drop(columns=['number','position','positionText', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed'], inplace=True)

In [5]:
results_df.dropna(inplace=True)
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26080 entries, 0 to 26079
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   resultId       26080 non-null  int64  
 1   raceId         26080 non-null  int64  
 2   driverId       26080 non-null  int64  
 3   constructorId  26080 non-null  int64  
 4   grid           26080 non-null  int64  
 5   positionOrder  26080 non-null  int64  
 6   points         26080 non-null  float64
 7   statusId       26080 non-null  int64  
 8   year           26080 non-null  int64  
 9   round          26080 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 2.0 MB


## Feature Engineering

In [6]:
def top3_finishes(row, df):
    variable  = df[(df.year == row.year) & (df['round'] < row['round']) & (df.positionOrder < 4) & (df.driverId == row.driverId)].positionOrder.count() / (row['round'] - 1) * 100
    if variable == np.NAN:
        return 0
    else:
        return variable

In [7]:
results_df['top3_driver_season_percentage'] = results_df.swifter.apply(top3_finishes, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/26080 [00:00<?, ?it/s]

  variable  = df[(df.year == row.year) & (df['round'] < row['round']) & (df.positionOrder < 4) & (df.driverId == row.driverId)].positionOrder.count() / (row['round'] - 1) * 100


In [8]:
results_df['top_3'] = results_df.positionOrder < 4

In [9]:
def avg_finish_position_season(row, df):
    return df[(df.driverId == row.driverId) & (df.year == df.year) & (df['round'] < row['round'])].positionOrder.mean()

In [10]:
results_df['driver_avg_finish_pos_season'] = results_df.swifter.apply(avg_finish_position_season, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/26080 [00:00<?, ?it/s]

In [11]:
def constructor_top_3(row, df):
    return (df[(df['year'] == row.year) & (df.constructorId == row.constructorId) & (
                df['round'] < row['round'])].top_3.sum()) / ((row['round'] - 1) * 2) * 100

In [12]:
results_df['Constructor_Top3_Percent'] = results_df.swifter.apply(constructor_top_3, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/26080 [00:00<?, ?it/s]

  return (df[(df['year'] == row.year) & (df.constructorId == row.constructorId) & (


In [13]:
results_df.sort_values(['year', 'round'], ascending=False, inplace=True, ignore_index=True)

In [14]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26080 entries, 0 to 26079
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   resultId                       26080 non-null  int64  
 1   raceId                         26080 non-null  int64  
 2   driverId                       26080 non-null  int64  
 3   constructorId                  26080 non-null  int64  
 4   grid                           26080 non-null  int64  
 5   positionOrder                  26080 non-null  int64  
 6   points                         26080 non-null  float64
 7   statusId                       26080 non-null  int64  
 8   year                           26080 non-null  int64  
 9   round                          26080 non-null  int64  
 10  top3_driver_season_percentage  24362 non-null  float64
 11  top_3                          26080 non-null  bool   
 12  driver_avg_finish_pos_season   23791 non-null 

In [15]:
lag_df = results_df.copy()
lag_df['year'] = lag_df['year'] + 1
results_df = results_df.merge(lag_df[['top3_driver_season_percentage', 'driver_avg_finish_pos_season', 'Constructor_Top3_Percent','year', 'round', 'driverId']],on=['year', 'round', 'driverId'], how='inner', suffixes=('', '_lag'))

In [16]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17211 entries, 0 to 17210
Data columns (total 17 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   resultId                           17211 non-null  int64  
 1   raceId                             17211 non-null  int64  
 2   driverId                           17211 non-null  int64  
 3   constructorId                      17211 non-null  int64  
 4   grid                               17211 non-null  int64  
 5   positionOrder                      17211 non-null  int64  
 6   points                             17211 non-null  float64
 7   statusId                           17211 non-null  int64  
 8   year                               17211 non-null  int64  
 9   round                              17211 non-null  int64  
 10  top3_driver_season_percentage      16060 non-null  float64
 11  top_3                              17211 non-null  boo

In [17]:
results_df

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,positionOrder,points,statusId,year,round,top3_driver_season_percentage,top_3,driver_avg_finish_pos_season,Constructor_Top3_Percent,top3_driver_season_percentage_lag,driver_avg_finish_pos_season_lag,Constructor_Top3_Percent_lag
0,26066,1110,830,9,6,1,25.0,1,2023,12,100.000000,True,6.646465,77.272727,72.727273,6.646465,63.636364
1,26067,1110,815,9,2,2,18.0,1,2023,12,54.545455,True,9.607143,77.272727,54.545455,9.607143,63.636364
2,26068,1110,844,6,1,3,15.0,1,2023,12,18.181818,True,8.772727,9.090909,45.454545,8.772727,50.000000
3,26069,1110,1,131,3,4,13.0,1,2023,12,36.363636,False,4.379679,22.727273,36.363636,4.379679,31.818182
4,26070,1110,4,117,9,5,10.0,1,2023,12,54.545455,False,7.976959,27.272727,0.000000,7.976959,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17206,19857,825,589,105,19,7,0.0,12,1951,1,,False,,,,,
17207,19859,825,627,154,8,9,0.0,13,1951,1,,False,,,,,
17208,19860,825,741,154,12,10,0.0,13,1951,1,,False,,,,,
17209,19863,825,660,154,18,13,0.0,17,1951,1,,False,,,,,


In [18]:
results_df.dropna(inplace=True)
results_df.drop(columns=['resultId', 'raceId', 'constructorId'], inplace=True)
results_df[['positionOrder', 'grid']].astype('int')
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15923 entries, 0 to 17202
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   driverId                           15923 non-null  int64  
 1   grid                               15923 non-null  int64  
 2   positionOrder                      15923 non-null  int64  
 3   points                             15923 non-null  float64
 4   statusId                           15923 non-null  int64  
 5   year                               15923 non-null  int64  
 6   round                              15923 non-null  int64  
 7   top3_driver_season_percentage      15923 non-null  float64
 8   top_3                              15923 non-null  bool   
 9   driver_avg_finish_pos_season       15923 non-null  float64
 10  Constructor_Top3_Percent           15923 non-null  float64
 11  top3_driver_season_percentage_lag  15923 non-null  float64


In [19]:
results_df.to_csv('Data/Modeling_v1.csv')

In [20]:
results_df.head(40)

Unnamed: 0,driverId,grid,positionOrder,points,statusId,year,round,top3_driver_season_percentage,top_3,driver_avg_finish_pos_season,Constructor_Top3_Percent,top3_driver_season_percentage_lag,driver_avg_finish_pos_season_lag,Constructor_Top3_Percent_lag
0,830,6,1,25.0,1,2023,12,100.0,True,6.646465,77.272727,72.727273,6.646465,63.636364
1,815,2,2,18.0,1,2023,12,54.545455,True,9.607143,77.272727,54.545455,9.607143,63.636364
2,844,1,3,15.0,1,2023,12,18.181818,True,8.772727,9.090909,45.454545,8.772727,50.0
3,1,3,4,13.0,1,2023,12,36.363636,False,4.379679,22.727273,36.363636,4.379679,31.818182
4,4,9,5,10.0,1,2023,12,54.545455,False,7.976959,27.272727,0.0,7.976959,0.0
5,847,8,6,8.0,1,2023,12,9.090909,False,11.818182,22.727273,27.272727,11.818182,31.818182
6,846,7,7,6.0,1,2023,12,18.181818,False,9.181818,9.090909,9.090909,9.181818,4.545455
7,839,14,8,4.0,1,2023,12,9.090909,False,10.469697,4.545455,0.0,10.469697,0.0
8,840,10,9,2.0,1,2023,12,0.0,False,11.986842,27.272727,0.0,11.986842,0.0
9,852,11,10,1.0,1,2023,12,0.0,False,12.818182,0.0,0.0,12.818182,0.0
