In [None]:
# Data transformation for Static Model
# Initial Tests with code

In [1]:
import jupyterthemes as jt
from jupyterthemes import get_themes
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme("monokai")

In [2]:
import pandas as pd 
from pandas import DataFrame as df 
import numpy as np
import os 

import fastf1 as f1 
from pyergast import pyergast 

In [3]:
# Notebook-wide pandas display settings: no limit on displayed columns,
# at most 40 rows per rendered frame.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)



# Load the data

In [4]:
# Read the six raw CSV tables, each resolved against the current working directory.
(results, races, quali, drivers, constructors, circuit) = (
    pd.read_csv(os.path.join(os.getcwd(), csv_path))
    for csv_path in (
        "data/results.csv",
        "data/races.csv",
        "data/qualifying.csv",
        "data/drivers.csv",
        "data/constructors.csv",
        "data/circuits.csv",
    )
)

In [5]:
# Join the raw tables one at a time. Every join is an inner join, so a row
# without a match in the next table is dropped.
df1 = races.merge(results, how="inner", on=["raceId"])
df2 = df1.merge(quali, how="inner", on=["raceId", "driverId", "constructorId"])
df3 = df2.merge(drivers, how="inner", on=["driverId"])
df4 = df3.merge(constructors, how="inner", on=["constructorId"])
df5 = df4.merge(circuit, how="inner", on=["circuitId"])



In [6]:
# Constructor names that get the "active_constructor" flag.
# NOTE(review): looks like the 2020 grid - confirm before reusing for another season.
active_constructors = [
    'Renault',
    'Williams',
    'McLaren',
    'Ferrari',
    'Mercedes',
    'AlphaTauri',
    'Racing Point',
    'Alfa Romeo',
    'Red Bull',
    'Haas F1 Team',
]

# Full driver names ("forename surname") that get the "active_driver" flag.
active_drivers = [
    'Daniel Ricciardo',
    'Kevin Magnussen',
    'Carlos Sainz',
    'Valtteri Bottas',
    'Lance Stroll',
    'George Russell',
    'Lando Norris',
    'Sebastian Vettel',
    'Kimi Räikkönen',
    'Charles Leclerc',
    'Lewis Hamilton',
    'Daniil Kvyat',
    'Max Verstappen',
    'Pierre Gasly',
    'Alexander Albon',
    'Sergio Pérez',
    'Esteban Ocon',
    'Antonio Giovinazzi',
    'Romain Grosjean',
    'Nicholas Latifi',
]

In [7]:
# `data` is a second name for the fully merged frame df5 (same object, not a copy).
data = df5


# Data for Static model

In [28]:
# Keep seasons from 2009 onwards. The model itself only uses 2010+ (the year
# filter applied after feature engineering); 2009 is kept here so that the
# look-back features of the 2010 races have an earlier season to read from.
data_static = df5[df5["year"] >= 2009].sort_values(by=["year","round"],ascending=True)

# Replace the merge-suffixed / raw column names with descriptive ones.
# rename() returns a new frame, so re-running this cell is harmless.
data_static = data_static.rename(columns={'name_x':'GP name',
                                          'name_y':'constructor',
                                          'position_y':'quali_position',
                                          'grid':'starting_grid_position',
                                          'positionOrder':'end_position'})

# Driver age on race day as a whole number of days.
# `.dt.days` keeps the column numeric instead of parsing the timedelta's text form.
data_static['date'] = pd.to_datetime(data_static['date'])
data_static['dob'] = pd.to_datetime(data_static['dob'])
data_static['age_at_gp_in_days'] = abs(data_static['dob']-data_static['date']).dt.days

# Full driver name, "forename surname".
data_static['driver'] = data_static['forename']+' '+data_static['surname']

# 1/0 flags: is the driver / constructor in the active lists?
# The constructor flag is evaluated on the constructor names exactly as they
# are stored at this point of the notebook.
data_static["active_driver"] = data_static['driver'].isin(active_drivers).astype(int)
data_static["active_constructor"] = data_static['constructor'].isin(active_constructors).astype(int)

In [29]:

# Some constructors changed their name over the years; map every historical
# name to the name the team currently races under.
constructor_renames = {
    'Force India': 'Racing Point',
    'Sauber': 'Alfa Romeo',
    'Lotus F1': 'Renault',
    'Toro Rosso': 'AlphaTauri',
}
data_static['constructor'] = data_static['constructor'].replace(constructor_renames)

# Re-evaluate the active flag on the normalised names. A flag computed on the
# historical names marks the renamed teams as inactive, and the later
# "active_constructor == 1" filter would then drop all of their rows.
data_static["active_constructor"] = data_static['constructor'].isin(active_constructors).astype(int)

In [30]:
# statusId values that are NOT counted as a DNF. The list might need some
# tweaking; see http://ergast.com/api/f1/status?limit=30 for the status table.
not_dnf = [1,11,12]

def get_dnf(row):
    """Return 0 when the row's ``statusId`` is listed in ``not_dnf``, else 1.

    ``row`` is anything indexable by ``"statusId"`` (e.g. a DataFrame row).
    """
    return 0 if row["statusId"] in not_dnf else 1
    
# Evaluate get_dnf on every result row and store the 0/1 outcome as "dnf".
data_static["dnf"] = data_static.apply(get_dnf, axis="columns")

In [31]:
# Grid position minus finishing position: positive when the driver finished
# ahead of where they started.
data_static["start_and_end_position_diff"] = (
    data_static["starting_grid_position"] - data_static["end_position"]
)


def get_the_average(row):
    """Return the mean grid-to-finish change of ``row``'s driver at ``row``'s GP.

    The mean is taken over every row of ``data_static`` with the same
    "driver" and "GP name" as ``row`` (all seasons present in the frame).
    """
    same_driver_same_gp = (
        (data_static["driver"] == row["driver"])
        & (data_static["GP name"] == row["GP name"])
    )
    # Select the column first: averaging the whole frame also tries to average
    # text/date columns, which newer pandas versions reject.
    return data_static.loc[same_driver_same_gp, "start_and_end_position_diff"].mean()


# One grouped pass gives the same per-row value as applying get_the_average to
# every row, without rescanning the whole frame for each row.
# NOTE(review): the mean includes the current and later races of the same
# driver/GP, so the feature contains the result being predicted - confirm
# whether that is intended.
data_static["average_position_change_for_this_driver"] = (
    data_static
    .groupby(["driver", "GP name"])["start_and_end_position_diff"]
    .transform("mean")
)

  mean = data_static[(data_static["driver"]==row["driver"])&


In [33]:
def find_position_prev_gp(row, data=None):
    """Return the driver's finishing position at the most recent earlier edition of this GP.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide "year", "driver" and "GP name".
    data : pandas.DataFrame, optional
        Frame to search, with "driver", "GP name", "year" and "end_position"
        columns. Defaults to the notebook-level ``data_static``.

    Returns
    -------
    The "end_position" of the same driver at the same GP in the latest year
    before ``row["year"]``; 0 when ``row["year"]`` is 2009 (the first season
    kept in the data) or when no earlier edition exists for that driver.
    """
    if data is None:
        data = data_static

    if row["year"] == 2009:
        return 0

    earlier_editions = data[(data["driver"] == row["driver"]) &
                            (data["GP name"] == row["GP name"]) &
                            (data["year"] < row["year"])]

    # Explicit "nothing found" check instead of catching every exception, so
    # real problems (e.g. a missing column) are no longer turned into a 0.
    if earlier_editions.empty:
        return 0

    latest_year = earlier_editions["year"].max()
    return earlier_editions.loc[earlier_editions["year"] == latest_year, "end_position"].iloc[0]

# Row by row: finishing position of the same driver at the previous edition of this GP.
data_static["previous_finish_position_at_this_gp"] = data_static.apply(
    find_position_prev_gp,
    axis="columns",
)

In [34]:
def get_last_round(year,data):
    """Return the largest "round" value among the rows of ``data`` whose "year" equals ``year``."""
    return data.loc[data["year"] == year, "round"].max()

def find_position(row, data=None):
    """Return the driver's finishing position in the race held just before this one.

    For round 1 that is the final round of the previous season; for any other
    round it is round ``row["round"] - 1`` of the same season.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide "year", "round" and "driver".
    data : pandas.DataFrame, optional
        Frame to search, with "year", "round", "driver" and "end_position"
        columns. Defaults to the notebook-level ``data_static``.

    Returns
    -------
    The "end_position" found, or 0 when ``row["year"]`` is 2009 (first season
    kept in the data) or the driver has no entry for the previous race.
    """
    if data is None:
        data = data_static

    if row["year"] == 2009:
        return 0

    same_driver = data["driver"] == row["driver"]

    if row["round"] == 1:
        previous_year = row["year"] - 1
        # Last round of the PREVIOUS season, over all drivers (same rule as
        # get_last_round, inlined so this function has no external helper).
        final_round = data.loc[data["year"] == previous_year, "round"].max()
        previous_race = data[same_driver &
                             (data["year"] == previous_year) &
                             (data["round"] == final_round)]
    else:
        previous_race = data[same_driver &
                             (data["year"] == row["year"]) &
                             (data["round"] == row["round"] - 1)]

    # Explicit "nothing found" check instead of a catch-all except.
    if previous_race.empty:
        return 0

    # The finishing position lives in "end_position": "positionOrder" was
    # renamed when data_static was built.
    return previous_race["end_position"].iloc[0]
    
# Row by row: finishing position of the same driver in the preceding race.
data_static["position_at_previous_race"] = data_static.apply(
    find_position,
    axis="columns",
)

In [35]:
# Keep rows of active drivers driving for active constructors, from the 2010
# season onwards, in chronological order.
data_static = (
    data_static
    .loc[lambda frame: frame["active_driver"] == 1]
    .loc[lambda frame: frame["active_constructor"] == 1]
    .loc[lambda frame: frame["year"] >= 2010]
    .sort_values(by=["year", "round"], ascending=True)
)

In [None]:
# Prediction target: the finishing position.
target = data_static["end_position"]

# Feature table: the columns handed to the model.
# NOTE(review): "dnf" is derived from the race result status and
# "average_position_change_for_this_driver" from finishing positions - confirm
# both are meant to be available as inputs when predicting "end_position".
data_static_final = data_static.loc[:, [
    "year",
    "GP name",
    "round",
    "driver",
    "constructor",
    "age_at_gp_in_days",
    "quali_position",
    "starting_grid_position",
    "position_at_previous_race",
    "previous_finish_position_at_this_gp",
    "dnf",
    "average_position_change_for_this_driver",
]]

In [None]:
# Write features and target to CSV files in the working directory, without the index column.
data_static_final.to_csv(path_or_buf="features.csv", index=False)
target.to_csv(path_or_buf="target.csv", index=False)

In [39]:
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
)

# One shared encoder instance for the cells that follow.
le = LabelEncoder()

# fit() returns the fitted encoder itself, so `classes` is the same object as
# `le`; the learned labels are available as `le.classes_`.
classes = le.fit(data_static["driver"])

# Encodings of other columns, currently switched off:
# data_static["GP name"] = le.fit_transform(data_static["GP name"])
# data_static["constructor"] = le.fit_transform(data_static["constructor"])
# data_static["position_at_previous_race"] = le.fit_transform(data_static["position_at_previous_race"])
# data_static["previous_finish_position_at_this_gp"] = le.fit_transform(data_static["previous_finish_position_at_this_gp"])



In [51]:
# Encode the GP names with their own encoder. Refitting the shared `le` on the
# drivers right afterwards would otherwise discard the GP-name mapping, leaving
# `transformed2` impossible to decode.
gp_name_encoder = LabelEncoder()
transformed2 = gp_name_encoder.fit_transform(data_static["GP name"])

# `le` ends up fitted on the driver names.
transformed = le.fit_transform(data_static["driver"])


# Driver labels in encoding order (index == integer code).
le.classes_

array(['Alexander Albon', 'Antonio Giovinazzi', 'Carlos Sainz',
       'Charles Leclerc', 'Daniel Ricciardo', 'Daniil Kvyat',
       'Esteban Ocon', 'George Russell', 'Kevin Magnussen',
       'Kimi Räikkönen', 'Lance Stroll', 'Lando Norris', 'Lewis Hamilton',
       'Max Verstappen', 'Nicholas Latifi', 'Pierre Gasly',
       'Romain Grosjean', 'Sebastian Vettel', 'Sergio Pérez',
       'Valtteri Bottas'], dtype=object)

In [45]:
# Map the integer codes back to the original labels with the same encoder.
inverse = le.inverse_transform(transformed)

In [47]:
inverse
transformed

array([12, 17, 12, ..., 10,  5, 15])

# Dynamic model

In [4]:
# Load a race ("R") session through fastf1.
# NOTE(review): this cell was labelled "first race of 2021", but the call asks
# for 2018 / "Bahrain", while the qualifying and grid lookups further down use
# (2021, 5). Confirm which session is intended.
race = f1.get_session(2018,"Bahrain","R")



TypeError: 'NoneType' object is not subscriptable

In [5]:
# Get the per-lap table of the session, with telemetry data.
# The original note asks to use cached data; no cache is configured in this cell.
laps = race.load_laps(with_telemetry=True)

NameError: name 'race' is not defined

In [85]:
laps

Unnamed: 0,Time,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,Compound,TyreLife,FreshTyre,LapStartTime,Team,Driver,TrackStatus,IsAccurate,LapStartDate
0,0 days 00:34:16.815000,16,NaT,0.0,1.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,SOFT,5.0,False,0 days 00:32:54.677000,Ferrari,LEC,1,False,2021-05-23 13:02:55.440
1,0 days 00:34:16.815000,33,NaT,1.0,1.0,0 days 00:00:04.827000,NaT,NaT,0 days 00:00:36.535000,0 days 00:00:20.724000,NaT,0 days 00:33:56.247000,0 days 00:34:17.009000,192.0,187.0,250.0,258.0,SOFT,6.0,False,0 days 00:32:54.677000,Red Bull,VER,1,False,2021-05-23 13:02:55.440
2,0 days 00:35:34.295000,33,0 days 00:01:17.480000,2.0,1.0,NaT,NaT,0 days 00:00:20.615000,0 days 00:00:36.346000,0 days 00:00:20.519000,0 days 00:34:37.430000,0 days 00:35:13.776000,0 days 00:35:34.295000,193.0,191.0,250.0,259.0,SOFT,7.0,False,0 days 00:34:16.815000,Red Bull,VER,1,True,2021-05-23 13:04:17.578
3,0 days 00:36:51.593000,33,0 days 00:01:17.298000,3.0,1.0,NaT,NaT,0 days 00:00:20.460000,0 days 00:00:36.305000,0 days 00:00:20.533000,0 days 00:35:54.755000,0 days 00:36:31.060000,0 days 00:36:51.593000,199.0,191.0,251.0,258.0,SOFT,8.0,False,0 days 00:35:34.295000,Red Bull,VER,1,True,2021-05-23 13:05:35.058
4,0 days 00:38:08.830000,33,0 days 00:01:17.237000,4.0,1.0,NaT,NaT,0 days 00:00:20.609000,0 days 00:00:36.124000,0 days 00:00:20.504000,0 days 00:37:12.202000,0 days 00:37:48.326000,0 days 00:38:08.830000,,192.0,251.0,260.0,SOFT,9.0,False,0 days 00:36:51.593000,Red Bull,VER,1,True,2021-05-23 13:06:52.356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,0 days 02:07:22.450000,47,0 days 00:01:16.778000,71.0,2.0,NaT,NaT,0 days 00:00:20.242000,0 days 00:00:35.893000,0 days 00:00:20.643000,0 days 02:06:25.914000,0 days 02:07:01.807000,0 days 02:07:22.450000,198.0,196.0,258.0,281.0,HARD,34.0,True,0 days 02:06:05.672000,Haas F1 Team,MSC,1,True,2021-05-23 14:36:06.435
1416,0 days 02:08:39.287000,47,0 days 00:01:16.837000,72.0,2.0,NaT,NaT,0 days 00:00:20.377000,0 days 00:00:35.874000,0 days 00:00:20.586000,0 days 02:07:42.827000,0 days 02:08:18.701000,0 days 02:08:39.287000,194.0,192.0,259.0,281.0,HARD,35.0,True,0 days 02:07:22.450000,Haas F1 Team,MSC,1,True,2021-05-23 14:37:23.213
1417,0 days 02:09:56.351000,47,0 days 00:01:17.064000,73.0,2.0,NaT,NaT,0 days 00:00:20.331000,0 days 00:00:36.037000,0 days 00:00:20.696000,0 days 02:08:59.618000,0 days 02:09:35.655000,0 days 02:09:56.351000,192.0,196.0,258.0,281.0,HARD,36.0,True,0 days 02:08:39.287000,Haas F1 Team,MSC,1,True,2021-05-23 14:38:40.050
1418,0 days 02:11:13.513000,47,0 days 00:01:17.162000,74.0,2.0,NaT,NaT,0 days 00:00:20.341000,0 days 00:00:36.049000,0 days 00:00:20.772000,0 days 02:10:16.692000,0 days 02:10:52.741000,0 days 02:11:13.513000,195.0,194.0,259.0,280.0,HARD,37.0,True,0 days 02:09:56.351000,Haas F1 Team,MSC,1,True,2021-05-23 14:39:57.114


In [97]:
# Keep only lap number, driver code, tyre compound and car number. The car
# number column is renamed to "number", the column name the grid and
# qualifying lookups filter on.
laps_info = (
    laps[["LapNumber", "Driver", "Compound", "DriverNumber"]]
    .rename(columns={"DriverNumber": "number"})
)

In [98]:
# Race names ("raceName" column) of the schedule pyergast returns for 2021.
GP_name_list = pyergast.get_schedule(2021)["raceName"]

In [99]:
# Qualifying position and starting grid slot per car number, both requested
# with (2021, 5) - presumably season 2021, round 5; confirm against pyergast.
quali_info = pyergast.get_qualifying_result(2021,5)[["number","position"]]
grid_info = pyergast.get_race_result(2021,5)[["number","grid"]]

In [106]:
quali_info

Unnamed: 0,number,position
0,16,1
1,33,2
2,77,3
3,55,4
4,4,5
5,10,6
6,44,7
7,5,8
8,11,9
9,99,10


In [100]:
# This could be automated with a loop over rounds, but that needs more work.
# `round_number` is used as a zero-based position into GP_name_list; the value
# stored in the "round" column is round_number + 1 (here 5, matching the
# hard-coded 5 passed to the pyergast qualifying/result calls).
round_number = 4

# Add two constant columns: the name of the GP and its round number.
laps_info["GP_name"] = GP_name_list.iloc[round_number]
laps_info["round"] = round_number + 1
laps_info

Unnamed: 0,LapNumber,Driver,Compound,number,GP_name,round
0,0.0,LEC,SOFT,16,Monaco Grand Prix,5
1,1.0,VER,SOFT,33,Monaco Grand Prix,5
2,2.0,VER,SOFT,33,Monaco Grand Prix,5
3,3.0,VER,SOFT,33,Monaco Grand Prix,5
4,4.0,VER,SOFT,33,Monaco Grand Prix,5
5,5.0,VER,SOFT,33,Monaco Grand Prix,5
6,6.0,VER,SOFT,33,Monaco Grand Prix,5
7,7.0,VER,SOFT,33,Monaco Grand Prix,5
8,8.0,VER,SOFT,33,Monaco Grand Prix,5
9,9.0,VER,SOFT,33,Monaco Grand Prix,5


In [110]:
# Look up the starting grid position of the driver a lap row belongs to.
def get_grid(x, grid_table=None):
    """Return the "grid" value of the driver whose car number is ``x["number"]``.

    Parameters
    ----------
    x : mapping / pandas.Series
        A lap row; only ``x["number"]`` is read.
    grid_table : pandas.DataFrame, optional
        Table with "number" and "grid" columns. Defaults to the notebook-level
        ``grid_info``.

    Returns
    -------
    The first matching "grid" value, or None when the car number does not
    appear in the table.
    """
    if grid_table is None:
        grid_table = grid_info

    matches = grid_table.loc[grid_table["number"] == x["number"], "grid"]

    # Explicit "driver not found" check instead of a catch-all except, so
    # unrelated errors (e.g. a missing column) are not silently hidden.
    if matches.empty:
        return None

    return matches.iloc[0]

# Create the "grid" column: get_grid is evaluated once per lap row and yields
# the starting grid position of that row's driver.
laps_info["grid"] = laps_info.apply(get_grid, axis="columns")

In [111]:
# Look up the qualifying position of the driver a lap row belongs to.
def get_quali_position(x, quali_table=None):
    """Return the qualifying "position" of the driver whose car number is ``x["number"]``.

    Parameters
    ----------
    x : mapping / pandas.Series
        A lap row; only ``x["number"]`` is read.
    quali_table : pandas.DataFrame, optional
        Table with "number" and "position" columns. Defaults to the
        notebook-level ``quali_info``.

    Returns
    -------
    The first matching "position" value, or None when the car number does not
    appear in the table.
    """
    if quali_table is None:
        quali_table = quali_info

    matches = quali_table.loc[quali_table["number"] == x["number"], "position"]

    # Explicit "driver not found" check instead of a catch-all except.
    if matches.empty:
        return None

    return matches.iloc[0]

# Create the "quali_position" column from a per-row lookup of the driver's qualifying result.
laps_info["quali_position"] = laps_info.apply(get_quali_position, axis="columns")

In [112]:
# Remove the row cap so the complete table is rendered.
# NOTE(review): this changes the global display option, so every later cell
# also renders frames without a row limit.
pd.set_option('display.max_rows', None)

laps_info

Unnamed: 0,LapNumber,Driver,Compound,number,GP_name,round,grid,quali_position
0,0.0,LEC,SOFT,16,Monaco Grand Prix,5,1,1.0
1,1.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
2,2.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
3,3.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
4,4.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
5,5.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
6,6.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
7,7.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
8,8.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0
9,9.0,VER,SOFT,33,Monaco Grand Prix,5,2,2.0


In [58]:
# Second race session, requested as (2021, 2, "R"), and its per-lap table with telemetry.
race2 = f1.get_session(2021,2,"R")
laps_2 = race2.load_laps(with_telemetry=True)

core           INFO 	Loading laps for Emilia Romagna Grand Prix - Race [v2.1.13]
api            INFO 	Fetching timing data...
api            INFO 	Parsing timing data...
api            INFO 	Fetching timing app data...
core           INFO 	Processing timing data...
api            INFO 	Fetching driver list...
api            INFO 	Fetching session status data...
api            INFO 	Fetching track status data...
api            INFO 	Fetching car data...
api            INFO 	Parsing car data...
api            INFO 	Fetching position data...
api            INFO 	Parsing position data...
api            INFO 	Fetching weather data...
core           INFO 	Loaded data for 20 drivers: ['3', '10', '9', '77', '33', '6', '5', '31', '63', '14', '22', '16', '7', '18', '11', '44', '99', '55', '47', '4']


In [61]:
# Same four-column subset for both lap tables.
# NOTE(review): this rebinds `laps_info` to a fresh slice of `laps`, so the
# GP_name / round / grid / quali_position columns and the "number" rename
# built on the earlier `laps_info` are not present on it.
laps_2_info = laps_2[["LapNumber","Driver","Compound","DriverNumber"]]
laps_info = laps[["LapNumber","Driver","Compound","DriverNumber"]]
laps_info

Unnamed: 0,LapNumber,Driver,Compound,DriverNumber
0,1.0,VER,MEDIUM,33
1,2.0,VER,MEDIUM,33
2,3.0,VER,MEDIUM,33
3,4.0,VER,MEDIUM,33
4,5.0,VER,MEDIUM,33
5,6.0,VER,MEDIUM,33
6,7.0,VER,MEDIUM,33
7,8.0,VER,MEDIUM,33
8,9.0,VER,MEDIUM,33
9,10.0,VER,MEDIUM,33


In [62]:
laps_2_info

Unnamed: 0,LapNumber,Driver,Compound,DriverNumber
0,1.0,HAM,INTERMEDIATE,44
1,2.0,HAM,INTERMEDIATE,44
2,3.0,HAM,INTERMEDIATE,44
3,4.0,HAM,INTERMEDIATE,44
4,5.0,HAM,INTERMEDIATE,44
5,6.0,HAM,INTERMEDIATE,44
6,7.0,HAM,INTERMEDIATE,44
7,8.0,HAM,INTERMEDIATE,44
8,9.0,HAM,INTERMEDIATE,44
9,10.0,HAM,INTERMEDIATE,44


In [63]:
# Stack the lap rows of both races into one frame.
# NOTE(review): the result is only displayed, not assigned to a name, and the
# rows carry no column telling the two races apart.
pd.concat([laps_info, laps_2_info])

Unnamed: 0,LapNumber,Driver,Compound,DriverNumber
0,1.0,VER,MEDIUM,33
1,2.0,VER,MEDIUM,33
2,3.0,VER,MEDIUM,33
3,4.0,VER,MEDIUM,33
4,5.0,VER,MEDIUM,33
5,6.0,VER,MEDIUM,33
6,7.0,VER,MEDIUM,33
7,8.0,VER,MEDIUM,33
8,9.0,VER,MEDIUM,33
9,10.0,VER,MEDIUM,33
