# Final Project: F1 EDA

Author: Alex Searle

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import swifter
from datetime import datetime
import warnings
%matplotlib inline
sns.set_theme(style='darkgrid')
warnings.filterwarnings(action='once')

## Data Preparation

In [2]:
# Reading in all the necessary data
driver_standings_df = pd.read_csv('Data/driver_standings.csv')
races_df = pd.read_csv('Data/races.csv')
results_df = pd.read_csv('Data/results.csv')
constructor_standings_df = pd.read_csv('Data/constructor_standings.csv')

In [3]:
results_df = results_df.merge(races_df[['year','round', 'raceId']], left_on='raceId', right_on='raceId', how='left', suffixes=['','_drop'])
results_df = results_df.merge(driver_standings_df, left_on=['driverId', 'raceId'], right_on=['driverId', 'raceId'], how='outer', suffixes=['', '_extra'])
results_df = results_df.merge(constructor_standings_df, left_on=['constructorId', 'raceId'], right_on=['constructorId', 'raceId'], how='left', suffixes=['_driver', '_constructor'])
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34684 entries, 0 to 34683
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   resultId                  26080 non-null  float64
 1   raceId                    34684 non-null  int64  
 2   driverId                  34684 non-null  int64  
 3   constructorId             26080 non-null  float64
 4   number                    26080 non-null  object 
 5   grid                      26080 non-null  float64
 6   position_driver           26080 non-null  object 
 7   positionText_driver       26080 non-null  object 
 8   positionOrder             26080 non-null  float64
 9   points_driver             26080 non-null  float64
 10  laps                      26080 non-null  float64
 11  time                      26080 non-null  object 
 12  milliseconds              26080 non-null  object 
 13  fastestLap                26080 non-null  object 
 14  rank  

In [4]:
drop_columns = []
for column in results_df.columns:
    if '_extra' in column:
        drop_columns.append(column)
results_df.drop(columns=drop_columns, inplace=True)
results_df.drop(columns=['constructorStandingsId', 'constructorStandingsId', 'positionText_constructor', 'driverStandingsId', 'positionText_driver', 'points_driver', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed'], inplace=True)

In [5]:
results_df.dropna(inplace=True)
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23939 entries, 0 to 26079
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   resultId              23939 non-null  float64
 1   raceId                23939 non-null  int64  
 2   driverId              23939 non-null  int64  
 3   constructorId         23939 non-null  float64
 4   number                23939 non-null  object 
 5   grid                  23939 non-null  float64
 6   position_driver       23939 non-null  object 
 7   positionOrder         23939 non-null  float64
 8   statusId              23939 non-null  float64
 9   year                  23939 non-null  float64
 10  round                 23939 non-null  float64
 11  wins_driver           23939 non-null  float64
 12  points_constructor    23939 non-null  float64
 13  position_constructor  23939 non-null  float64
 14  wins_constructor      23939 non-null  float64
dtypes: float64(11), int64(2)

## Feature Engineering

In [6]:
def top3_finishes(row, df):
    return df[(df.year == row.year) & (df['round'] < row['round']) & (df.positionOrder < 4) & (df.driverId == row.driverId)].positionOrder.count() / (row['round'] - 1) * 100

In [7]:
results_df['top3_driver_season_percentage'] = results_df.swifter.apply(top3_finishes, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/23939 [00:00<?, ?it/s]

  return df[(df.year == row.year) & (df['round'] < row['round']) & (df.positionOrder < 4) & (df.driverId == row.driverId)].positionOrder.count() / (row['round'] - 1) * 100


In [8]:
results_df['top_3'] = results_df.positionOrder < 4

In [9]:
def avg_finish_position_season(row, df):
    return df[(df.driverId == row.driverId) & (df.year == df.year) & (df['round'] < row['round'])].positionOrder.mean()

In [10]:
results_df['driver_avg_finish_pos_season'] = results_df.swifter.apply(avg_finish_position_season, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/23939 [00:00<?, ?it/s]

In [11]:
def constructor_top_3(row, df):
    return (df[(df['year'] == row.year) & (df.constructorId == row.constructorId) & (
                df['round'] < row['round'])].top_3.sum()) / ((row['round'] - 1) * 2) * 100

In [12]:
results_df['Constructor_Top3_Percent'] = results_df.swifter.apply(constructor_top_3, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/23939 [00:00<?, ?it/s]

  return (df[(df['year'] == row.year) & (df.constructorId == row.constructorId) & (


In [13]:
results_df.sort_values(['year', 'round'], ascending=False, inplace=True, ignore_index=True)

In [14]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23939 entries, 0 to 23938
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   resultId                       23939 non-null  float64
 1   raceId                         23939 non-null  int64  
 2   driverId                       23939 non-null  int64  
 3   constructorId                  23939 non-null  float64
 4   number                         23939 non-null  object 
 5   grid                           23939 non-null  float64
 6   position_driver                23939 non-null  object 
 7   positionOrder                  23939 non-null  float64
 8   statusId                       23939 non-null  float64
 9   year                           23939 non-null  float64
 10  round                          23939 non-null  float64
 11  wins_driver                    23939 non-null  float64
 12  points_constructor             23939 non-null 

In [15]:
lag_df = results_df.copy()
lag_df['year'] = lag_df['year'] + 1
results_df = results_df.merge(lag_df[['top3_driver_season_percentage', 'driver_avg_finish_pos_season', 'Constructor_Top3_Percent','year', 'round', 'driverId']],on=['year', 'round', 'driverId'], how='left', suffixes=('', '_lag'))

In [16]:
one_race_lag_df = results_df.copy()
one_race_lag_df.index = one_race_lag_df.index + 1
results_df = pd.merge(results_df, one_race_lag_df[['position_driver', 'position_constructor', 'year', 'round', 'driverId']], on=['year', 'round', 'driverId'], how='left', suffixes=('_drop', ''))
results_df.head(20)

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position_driver_drop,positionOrder,statusId,year,...,wins_constructor,top3_driver_season_percentage,top_3,driver_avg_finish_pos_season,Constructor_Top3_Percent,top3_driver_season_percentage_lag,driver_avg_finish_pos_season_lag,Constructor_Top3_Percent_lag,position_driver,position_constructor
0,26066.0,1110,830,9.0,1,6.0,1,1.0,1.0,2023.0,...,12.0,100.0,True,6.646465,77.272727,72.727273,6.646465,63.636364,1,1.0
1,26067.0,1110,815,9.0,11,2.0,2,2.0,1.0,2023.0,...,12.0,54.545455,True,9.517986,77.272727,54.545455,9.517986,63.636364,2,1.0
2,26068.0,1110,844,6.0,16,1.0,3,3.0,1.0,2023.0,...,0.0,18.181818,True,8.772727,9.090909,45.454545,8.772727,50.0,3,4.0
3,26069.0,1110,1,131.0,44,3.0,4,4.0,1.0,2023.0,...,0.0,36.363636,False,4.295699,22.727273,36.363636,4.295699,31.818182,4,2.0
4,26070.0,1110,4,117.0,14,9.0,5,5.0,1.0,2023.0,...,0.0,54.545455,False,7.976959,27.272727,0.0,7.976959,0.0,5,3.0
5,26071.0,1110,847,131.0,63,8.0,6,6.0,1.0,2023.0,...,0.0,9.090909,False,11.818182,22.727273,27.272727,11.818182,31.818182,6,2.0
6,26072.0,1110,846,1.0,4,7.0,7,7.0,1.0,2023.0,...,0.0,18.181818,False,9.181818,9.090909,9.090909,9.181818,4.545455,7,5.0
7,26073.0,1110,839,214.0,31,14.0,8,8.0,1.0,2023.0,...,0.0,9.090909,False,10.469697,4.545455,0.0,10.469697,0.0,8,6.0
8,26074.0,1110,840,117.0,18,10.0,9,9.0,1.0,2023.0,...,0.0,0.0,False,11.986842,27.272727,0.0,11.986842,0.0,9,3.0
9,26075.0,1110,852,213.0,22,11.0,10,10.0,1.0,2023.0,...,0.0,0.0,False,12.818182,0.0,0.0,12.818182,0.0,10,10.0


In [17]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23967 entries, 0 to 23966
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   resultId                           23967 non-null  float64
 1   raceId                             23967 non-null  int64  
 2   driverId                           23967 non-null  int64  
 3   constructorId                      23967 non-null  float64
 4   number                             23967 non-null  object 
 5   grid                               23967 non-null  float64
 6   position_driver_drop               23967 non-null  object 
 7   positionOrder                      23967 non-null  float64
 8   statusId                           23967 non-null  float64
 9   year                               23967 non-null  float64
 10  round                              23967 non-null  float64
 11  wins_driver                        23967 non-null  flo

In [18]:
results_df.dropna(inplace=True)
results_df.drop(columns=['position_driver'], inplace=True)
results_df.drop(columns=['position_driver_drop', 'wins_driver', 'position_constructor_drop',
                        'wins_constructor', 'resultId', 'raceId', 'driverId', 'constructorId', 'round'], inplace=True)
results_df[['positionOrder', 'grid']].astype('int')
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15134 entries, 0 to 23728
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   number                             15134 non-null  object 
 1   grid                               15134 non-null  float64
 2   positionOrder                      15134 non-null  float64
 3   statusId                           15134 non-null  float64
 4   year                               15134 non-null  float64
 5   points_constructor                 15134 non-null  float64
 6   top3_driver_season_percentage      15134 non-null  float64
 7   top_3                              15134 non-null  bool   
 8   driver_avg_finish_pos_season       15134 non-null  float64
 9   Constructor_Top3_Percent           15134 non-null  float64
 10  top3_driver_season_percentage_lag  15134 non-null  float64
 11  driver_avg_finish_pos_season_lag   15134 non-null  float64


In [19]:
results_df.to_csv('Data/Modeling_v1.csv')