# Final Project: F1 EDA

Author: Alex Searle

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import swifter
from datetime import datetime
import warnings
%matplotlib inline
sns.set_theme(style='darkgrid')
warnings.filterwarnings(action='once')



## Data Preparation

In [2]:
# Reading in all the necessary data
races_df = pd.read_csv('Data/races.csv')
laptimes_df = pd.read_csv('Data/lap_times.csv')
driver_standings_df = pd.read_csv('Data/driver_standings.csv')
results_df = pd.read_csv('Data/results.csv')
circuits_df = pd.read_csv('Data/circuits.csv')
constructor_standings_df = pd.read_csv('Data/constructor_standings.csv')

In [3]:
results_df = results_df.join(races_df, on='raceId', lsuffix='_results', rsuffix='_races', how='outer')
results_df = results_df.merge(driver_standings_df, left_on=['driverId', 'raceId_results'], right_on=['driverId', 'raceId'], how='outer', suffixes=['_x', ''])
results_df = results_df.merge(constructor_standings_df, left_on=['constructorId', 'raceId_results'], right_on=['constructorId', 'raceId'], how='outer', suffixes=['_driver', '_constructor'])
results_df = results_df.merge(circuits_df, on='circuitId', how='outer')
results_df.date = pd.to_datetime(results_df.date)

In [4]:
drop_columns = []
for column in results_df.columns.to_list():
    if 'raceId' in column:
        drop_columns.append(column)
results_df.drop(columns=drop_columns, inplace=True)

more_drop_columns = []
for column in results_df.columns.to_list():
    if '_x' in column or '_left' in column or '_races' in column:
        more_drop_columns.append(column)
results_df.drop(columns=more_drop_columns, inplace=True)

results_df.drop(columns=['resultId','laps', 'rank','fastestLapTime', 'fastestLapSpeed', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time','fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date','sprint_time', 'driverStandingsId',
                         'positionText_driver','constructorStandingsId','positionText_constructor','location', 'country','alt',  'milliseconds', 'number', 'fastestLap', 'points_driver', 'points_constructor'], inplace=True)

In [5]:
results_df.columns = results_df.columns.str.replace('_y', '')
results_df.columns = results_df.columns.str.replace('_right', '')
results_df.columns = results_df.columns.str.replace('_results', '')
results_df.fillna(0, inplace=True)
results_df.drop(index=results_df[results_df['year'] == 0].index, inplace=True)

  results_df.fillna(0, inplace=True)


## Feature Engineering

In [6]:
def top3_finishes(row, df):
    return df[(df.year == row.year) & (df['round'] < row['round']) & (df.positionOrder < 4) & (df.driverId == row.driverId)].positionOrder.count() / (row['round'] - 1) * 100

In [7]:
results_df['top3_driver_season_percentage'] = results_df.swifter.apply(top3_finishes, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/25919 [00:00<?, ?it/s]

  return df[(df.year == row.year) & (df['round'] < row['round']) & (df.positionOrder < 4) & (df.driverId == row.driverId)].positionOrder.count() / (row['round'] - 1) * 100


In [8]:
results_df['top_3'] = results_df.positionOrder < 4

In [9]:
def avg_finish_position_season(row, df):
    return df[(df.driverId == row.driverId) & (df.year == df.year) & (df['round'] < row['round'])].positionOrder.mean()

In [10]:
results_df['driver_avg_finish_pos_season'] = results_df.swifter.apply(avg_finish_position_season, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/25919 [00:00<?, ?it/s]

In [11]:
def constructor_top_3(row, df):
    return (df[(df['year'] == row.year) & (df.constructorId == row.constructorId) & (
                df['round'] < row['round'])].top_3.sum()) / ((row['round'] - 1) * 2) * 100

In [12]:
results_df['Constructor_Top3_Percent'] = results_df.swifter.apply(constructor_top_3, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/25919 [00:00<?, ?it/s]

  return (df[(df['year'] == row.year) & (df.constructorId == row.constructorId) & (


In [13]:
def percent_wins_at_circuit(row, df):
    return (df[(df.circuitId == row.circuitId) & (df.driverId == row.driverId) & (df.positionOrder == 1) & (df['year'] < row['year'])].circuitId.count() / df[(df.circuitId == row.circuitId) & (df.driverId == row.driverId)].circuitId.count()) * 100

In [14]:
results_df['percent_wins_at_circuit'] = results_df.swifter.apply(percent_wins_at_circuit, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/25919 [00:00<?, ?it/s]

In [15]:
results_df.sort_values(['year', 'round'], ascending=False, inplace=True, ignore_index=True)

In [16]:
def races_since_last_win(row, df):
    last_win = df[(df['date'] < row['date']) & (df.positionOrder == 1) & (df.driverId == row.driverId)].date.max()
    if pd.isnull(last_win):
        return 0
    else:
        last_win_index = df.index[(df.date == last_win) & (df.driverId == row.driverId)].to_list()[0]
        current_race_index = df.index[(df.date == row.date) & (df.driverId == row.driverId)].to_list()[0]
        return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()

In [17]:
results_df['races_since_last_win'] = results_df.swifter.apply(races_since_last_win, axis=1, args=(results_df,))

  self.comm = Comm(**args)


Pandas Apply:   0%|          | 0/25919 [00:00<?, ?it/s]

  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.driverId == row.driverId].date.count()
  return df.iloc[current_race_index:last_win_index][df.drive

In [18]:
results_df.drop(columns=['time', 'url'], inplace=True)

In [19]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25919 entries, 0 to 25918
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   driverId                       25919 non-null  float64
 1   constructorId                  25919 non-null  float64
 2   grid                           25919 non-null  float64
 3   positionOrder                  25919 non-null  float64
 4   statusId                       25919 non-null  float64
 5   year                           25919 non-null  float64
 6   round                          25919 non-null  float64
 7   circuitId                      25919 non-null  float64
 8   date                           25919 non-null  object 
 9   position_driver                25919 non-null  float64
 10  wins_driver                    25919 non-null  float64
 11  position_constructor           25919 non-null  float64
 12  wins_constructor               25919 non-null 

In [20]:
lag_df = results_df.copy()
lag_df['year'] = lag_df['year'] + 1
results_df = results_df.merge(lag_df[['top3_driver_season_percentage', 'driver_avg_finish_pos_season', 'Constructor_Top3_Percent','year', 'round', 'driverId']],on=['year', 'round', 'driverId'], how='left', suffixes=('', '_lag'))

In [21]:
one_race_lag_df = results_df.copy()
one_race_lag_df.index = one_race_lag_df.index + 1
results_df = pd.merge(results_df, one_race_lag_df[['position_driver', 'position_constructor', 'year', 'round', 'driverId']], on=['year', 'round', 'driverId'], how='left', suffixes=('_drop', ''))
results_df.head(20)

Unnamed: 0,driverId,constructorId,grid,positionOrder,statusId,year,round,circuitId,date,position_driver_drop,...,top_3,driver_avg_finish_pos_season,Constructor_Top3_Percent,percent_wins_at_circuit,races_since_last_win,top3_driver_season_percentage_lag,driver_avg_finish_pos_season_lag,Constructor_Top3_Percent_lag,position_driver,position_constructor
0,830.0,9.0,1.0,1.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,1.0,...,True,6.435583,64.285714,11.111111,2,71.428571,6.435583,50.0,1.0,1.0
1,815.0,9.0,0.0,5.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,2.0,...,False,9.616034,64.285714,8.333333,1,28.571429,9.616034,50.0,2.0,1.0
2,1.0,131.0,3.0,2.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,4.0,...,True,4.751613,35.714286,40.0,26,66.666667,4.751613,57.142857,4.0,3.0
3,847.0,131.0,2.0,18.0,5.0,2023.0,22.0,24.0,2023-11-26 00:00:00,7.0,...,False,12.158537,35.714286,0.0,4,9.52381,12.158537,57.142857,7.0,3.0
4,4.0,117.0,4.0,3.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,3.0,...,True,8.463687,4.761905,0.0,158,0.0,8.463687,2.380952,3.0,2.0
5,840.0,117.0,6.0,4.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,6.0,...,False,12.360656,4.761905,0.0,0,0.0,12.360656,0.0,6.0,2.0
6,846.0,1.0,13.0,6.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,8.0,...,False,8.97561,0.0,0.0,0,14.285714,8.97561,9.52381,8.0,5.0
7,857.0,1.0,16.0,8.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,13.0,...,False,17.5,0.0,0.0,0,,,,13.0,5.0
8,807.0,210.0,10.0,7.0,1.0,2023.0,22.0,24.0,2023-11-26 00:00:00,9.0,...,False,11.672043,0.0,0.0,0,,,,9.0,7.0
9,825.0,210.0,14.0,17.0,3.0,2023.0,22.0,24.0,2023-11-26 00:00:00,17.0,...,False,13.244755,0.0,0.0,0,0.0,13.244755,0.0,17.0,7.0


In [22]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26366 entries, 0 to 26365
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   driverId                           26366 non-null  float64
 1   constructorId                      26366 non-null  float64
 2   grid                               26366 non-null  float64
 3   positionOrder                      26366 non-null  float64
 4   statusId                           26366 non-null  float64
 5   year                               26366 non-null  float64
 6   round                              26366 non-null  float64
 7   circuitId                          26366 non-null  float64
 8   date                               26366 non-null  object 
 9   position_driver_drop               26366 non-null  float64
 10  wins_driver                        26366 non-null  float64
 11  position_constructor_drop          26366 non-null  flo

In [23]:
results_df.fillna(0)

Unnamed: 0,driverId,constructorId,grid,positionOrder,statusId,year,round,circuitId,date,position_driver_drop,...,top_3,driver_avg_finish_pos_season,Constructor_Top3_Percent,percent_wins_at_circuit,races_since_last_win,top3_driver_season_percentage_lag,driver_avg_finish_pos_season_lag,Constructor_Top3_Percent_lag,position_driver,position_constructor
0,830.0,9.0,1.0,1.0,1.0,2023.0,22.0,24.0,2023-11-26,1.0,...,True,6.435583,64.285714,11.111111,2,71.428571,6.435583,50.000000,1.0,1.0
1,815.0,9.0,0.0,5.0,1.0,2023.0,22.0,24.0,2023-11-26,2.0,...,False,9.616034,64.285714,8.333333,1,28.571429,9.616034,50.000000,2.0,1.0
2,1.0,131.0,3.0,2.0,1.0,2023.0,22.0,24.0,2023-11-26,4.0,...,True,4.751613,35.714286,40.000000,26,66.666667,4.751613,57.142857,4.0,3.0
3,847.0,131.0,2.0,18.0,5.0,2023.0,22.0,24.0,2023-11-26,7.0,...,False,12.158537,35.714286,0.000000,4,9.523810,12.158537,57.142857,7.0,3.0
4,4.0,117.0,4.0,3.0,1.0,2023.0,22.0,24.0,2023-11-26,3.0,...,True,8.463687,4.761905,0.000000,158,0.000000,8.463687,2.380952,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26361,609.0,141.0,10.0,12.0,5.0,1950.0,1.0,9.0,1950-05-13,22.0,...,False,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,22.0,0.0
26362,427.0,141.0,11.0,15.0,5.0,1950.0,1.0,9.0,1950-05-13,45.0,...,False,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,45.0,0.0
26363,580.0,105.0,17.0,10.0,88.0,1950.0,1.0,9.0,1950-05-13,39.0,...,False,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,39.0,0.0
26364,669.0,105.0,19.0,19.0,5.0,1950.0,1.0,9.0,1950-05-13,62.0,...,False,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,62.0,0.0


In [24]:
results_df.to_csv('Data/Modeling_v1.csv')