In [56]:
## PRELIM STEP: refine, organize, and merge data

In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Store filepaths into variable
drivers_csv = Path("data/drivers.csv")
fcyphases_csv = Path("data/fcyphases.csv")
laps_csv = Path("data/laps.csv")
qualifyings_csv = Path("data/qualifyings.csv")
races_csv = Path("data/races.csv")
retirements_csv = Path("data/retirements.csv")
starterfields_csv = Path("data/starterfields.csv")
weather = Path("data/merged_weather.csv")
weather_long = Path("data/weather_f1_long.csv")

In [3]:
# Read in files and make dfs
drivers_df = pd.read_csv(drivers_csv)
fcyphases_df = pd.read_csv(fcyphases_csv)
laps_df = pd.read_csv(laps_csv)
qualifyings_df = pd.read_csv(qualifyings_csv)
races_df = pd.read_csv(races_csv)
retirements_df = pd.read_csv(retirements_csv)
starterfields_df = pd.read_csv(starterfields_csv)
weather_df = pd.read_csv(weather)
weather_long_df = pd.read_csv(weather_long)

FileNotFoundError: [Errno 2] No such file or directory: 'data\\weather_f1_long.csv'

In [None]:
#display dfs

In [None]:
display(drivers_df)
display(fcyphases_df)
display(laps_df)
display(qualifyings_df)
display(races_df)
display(retirements_df)
display(starterfields_df)
display(weather_df)
display(weather_long_df)

In [62]:
# make column name variables constant throughout all dfs
drivers_df.rename(columns = {'id':'driver_id'}, inplace = True)
# drivers_df.to_csv('drivers.csv') only do this if create a seperate data file
races_df.rename(columns = {'id':'race_id'}, inplace = True)

In [63]:
# make adjustments to column names to better represent data 

# for "retirements" df... NOTE: accidents and failures is per driver per season (not per race)
retirements_df.rename(columns = {'accidents':'accidents per driver, per season'}, inplace = True)
retirements_df.rename(columns = {'failures':'failures per driver, per season'}, inplace = True)
#for weather_df...
weather_df.rename(columns = {'constructor': 'team'}, inplace = True)
weather_df.rename(columns = {'length': 'tracklength'}, inplace = True)
# Convert the 'date' column to datetime format
weather_df['date'] = pd.to_datetime(weather_df['date'], format='%m/%d/%Y')
# Convert the 'date' column to the desired format with dashes
weather_df['date'] = weather_df['date'].dt.strftime('%Y-%m-%d')

In [64]:
# make adjustments to data sets to make more concise/eliminate redundancies
weather_df

Unnamed: 0.1,Unnamed: 0,season,round,race_name,circuitId,date,distance,weather,driver_name,finish_position,...,initials,dateOfBirth,circuitName,lat,long,location,country,type,direction,tracklength
0,0,2014,1,australian,albert_park,2014-03-16,302.271,"overcast, 19°c (66°f) dry",rosberg,1.0,...,ROS,6/27/1985,Albert Park Grand Prix Circuit,-37.8497,144.96800,Melbourne,Australia,Street circuit,Clockwise,5.303
1,1,2015,1,australian,albert_park,2015-03-15,307.574,partly cloudy 17.5°c (63.5°f) air temperature...,rosberg,2.0,...,ROS,6/27/1985,Albert Park Grand Prix Circuit,-37.8497,144.96800,Melbourne,Australia,Street circuit,Clockwise,5.303
2,2,2016,1,australian,albert_park,2016-03-20,302.271,partly cloudy 22.5°c (72.5°f) air temperature...,rosberg,1.0,...,ROS,6/27/1985,Albert Park Grand Prix Circuit,-37.8497,144.96800,Melbourne,Australia,Street circuit,Clockwise,5.303
3,3,2014,1,australian,albert_park,2014-03-16,302.271,"overcast, 19°c (66°f) dry",kevin_magnussen,2.0,...,MAG,10/5/1992,Albert Park Grand Prix Circuit,-37.8497,144.96800,Melbourne,Australia,Street circuit,Clockwise,5.303
4,4,2015,1,australian,albert_park,2015-03-15,307.574,partly cloudy 17.5°c (63.5°f) air temperature...,kevin_magnussen,17.0,...,MAG,10/5/1992,Albert Park Grand Prix Circuit,-37.8497,144.96800,Melbourne,Australia,Street circuit,Clockwise,5.303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3045,3045,2020,14,turkish,istanbul,2020-11-15,309.396,overcast. wet and drying track. air 12.5°c (...,russell,16.0,...,RUS,2/15/1998,Istanbul Park,40.9517,29.40500,Istanbul,Turkey,Race circuit,Anti-clockwise,5.338
3046,3046,2020,14,turkish,istanbul,2020-11-15,309.396,overcast. wet and drying track. air 12.5°c (...,latifi,19.0,...,LAT,6/29/1995,Istanbul Park,40.9517,29.40500,Istanbul,Turkey,Race circuit,Anti-clockwise,5.338
3047,3047,2021,16,turkish,istanbul,2021-10-03,,,,,...,,,Istanbul Park,40.9517,29.40500,Istanbul,Turkey,Race circuit,Anti-clockwise,5.338
3048,3048,2021,13,dutch,zandvoort,2021-09-05,,,,,...,,,Circuit Park Zandvoort,52.3888,4.54092,Zandvoort,Netherlands,Race circuit,Clockwise,4.259


In [65]:
# make adjustments to data sets to make more concise/eliminate redundancies
weather_df_unique_weather = weather_df.drop_duplicates(subset=['date'])
weather_df_unique_weather = weather_df_unique_weather.set_index(['season','date'])
weather_df_unique_weather.sort_index()

# save large weather dataset to weather_df_inclusive csv
weather_df_unique_weather.to_csv('data/weather_df_inclusive.csv')


# refine weather df
weather_df_unique_weather_refined = weather_df_unique_weather.drop(columns=['distance','race_name','circuitId','driver_name','time','dateOfBirth','type','direction','tracklength','nationality'])
weather_df_unique_weather_refined.sort_index()

# save medium weather dataset to weather_df_inclusive_refined csv
weather_df_unique_weather_refined.to_csv('data/weather_df_inclusive_refined.csv')


# refine weather_df_unique_weather_refined weather df further
weather_df_unique_weather_refined2 = weather_df_unique_weather_refined.drop(columns=['Unnamed: 0','round','name','team','initials','finish_position','grid','points','qual_position','q_best', 'q_worst','q_mean', 'status'])
weather_df_unique_weather_refined2.sort_index()

# save refined weather dataset to weather_df_refined csv
weather_df_unique_weather_refined2.to_csv('data/weather_df_refined.csv')

In [66]:
weather_df_unique_weather_refined2.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,weather,circuitName,lat,long,location,country
season,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014,2014-03-16,"overcast, 19°c (66°f) dry",Albert Park Grand Prix Circuit,-37.84970,144.96800,Melbourne,Australia
2014,2014-03-30,"dry , 32°c dry",Sepang International Circuit,2.76083,101.73800,Kuala Lumpur,Malaysia
2014,2014-04-06,dry,Bahrain International Circuit,26.03250,50.51060,Sakhir,Bahrain
2014,2014-04-20,"mostly cloudy , dry",Shanghai International Circuit,31.33890,121.22000,Shanghai,China
2014,2014-05-11,sunny,Circuit de Barcelona-Catalunya,41.57000,2.26111,Montmeló,Spain
...,...,...,...,...,...,...,...
2021,2021-10-31,,Autódromo Hermanos Rodríguez,19.40420,-99.09070,Mexico City,Mexico
2021,2021-11-07,,Autódromo José Carlos Pace,-23.70360,-46.69970,São Paulo,Brazil
2021,2021-11-21,,Albert Park Grand Prix Circuit,-37.84970,144.96800,Melbourne,Australia
2021,2021-12-05,,Jeddah Street Circuit,21.54330,39.17280,Jeddah,Saudi Arabia


In [67]:
#start merging

In [68]:
merge1 = pd.merge(drivers_df, starterfields_df)

In [69]:
merge2 = pd.merge(merge1, races_df)

In [70]:
merge3 = pd.merge(merge2, retirements_df)

In [71]:
meta_data = pd.merge(weather_df_unique_weather_refined2, merge3)
meta_data

Unnamed: 0,weather,circuitName,lat,long,location,country,driver_id,carno,initials,name,...,speedtrap,date,season,availablecompounds,comment,nolaps,nolapsplanned,tracklength,"accidents per driver, per season","failures per driver, per season"
0,"overcast, 19°c (66°f) dry",Albert Park Grand Prix Circuit,-37.8497,144.9680,Melbourne,Australia,1,44,HAM,Lewis Hamilton,...,252.8,2014-03-16,2014,"A2,A3,I,W",,57,58,5303,0.0,3.0
1,"overcast, 19°c (66°f) dry",Albert Park Grand Prix Circuit,-37.8497,144.9680,Melbourne,Australia,2,3,RIC,Daniel Ricciardo,...,292.7,2014-03-16,2014,"A2,A3,I,W",,57,58,5303,0.0,2.0
2,"overcast, 19°c (66°f) dry",Albert Park Grand Prix Circuit,-37.8497,144.9680,Melbourne,Australia,3,6,ROS,Nico Rosberg,...,299.1,2014-03-16,2014,"A2,A3,I,W",,57,58,5303,0.0,2.0
3,"overcast, 19°c (66°f) dry",Albert Park Grand Prix Circuit,-37.8497,144.9680,Melbourne,Australia,4,20,MAG,Kevin Magnussen,...,316.9,2014-03-16,2014,"A2,A3,I,W",,57,58,5303,0.0,1.0
4,"overcast, 19°c (66°f) dry",Albert Park Grand Prix Circuit,-37.8497,144.9680,Melbourne,Australia,5,14,ALO,Fernando Alonso,...,304.5,2014-03-16,2014,"A2,A3,I,W",,57,58,5303,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10936,sunny dry 24.5°c,Baku City Circuit,40.3725,49.8533,Baku,Azerbaijan,40,16,LEC,Charles Leclerc,...,325.8,2019-04-28,2019,"A3,A4,A6,I,W",,51,51,6003,1.0,1.0
10937,sunny dry 24.5°c,Baku City Circuit,40.3725,49.8533,Baku,Azerbaijan,42,4,NOR,Lando Norris,...,321.6,2019-04-28,2019,"A3,A4,A6,I,W",,51,51,6003,1.0,3.0
10938,sunny dry 24.5°c,Baku City Circuit,40.3725,49.8533,Baku,Azerbaijan,43,23,ALB,Alexander Albon,...,327.9,2019-04-28,2019,"A3,A4,A6,I,W",,51,51,6003,0.0,1.0
10939,sunny dry 24.5°c,Baku City Circuit,40.3725,49.8533,Baku,Azerbaijan,44,63,RUS,George Russell,...,320.0,2019-04-28,2019,"A3,A4,A6,I,W",,51,51,6003,2.0,0.0


In [72]:
meta_data_refined = meta_data[['season', 'date', 'race_id','location','circuitName','weather','lat','long','driver_id','name','team','enginemanufacturer','resultposition','gridposition','status','completedlaps','comment','nolaps','tracklength']]
meta_data_refined = meta_data_refined.set_index(['season', 'date'])
meta_data_refined.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,race_id,location,circuitName,weather,lat,long,driver_id,name,team,enginemanufacturer,resultposition,gridposition,status,completedlaps,comment,nolaps,tracklength
season,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2014,2014-03-16,1,Melbourne,Albert Park Grand Prix Circuit,"overcast, 19°c (66°f) dry",-37.8497,144.9680,1,Lewis Hamilton,Mercedes,Mercedes,19,1,DNF,2,,57,5303
2014,2014-03-16,1,Melbourne,Albert Park Grand Prix Circuit,"overcast, 19°c (66°f) dry",-37.8497,144.9680,2,Daniel Ricciardo,RedBull,Renault,22,2,DQ,57,,57,5303
2014,2014-03-16,1,Melbourne,Albert Park Grand Prix Circuit,"overcast, 19°c (66°f) dry",-37.8497,144.9680,3,Nico Rosberg,Mercedes,Mercedes,1,3,F,57,,57,5303
2014,2014-03-16,1,Melbourne,Albert Park Grand Prix Circuit,"overcast, 19°c (66°f) dry",-37.8497,144.9680,4,Kevin Magnussen,McLaren,Mercedes,2,4,F,57,,57,5303
2014,2014-03-16,1,Melbourne,Albert Park Grand Prix Circuit,"overcast, 19°c (66°f) dry",-37.8497,144.9680,5,Fernando Alonso,Ferrari,Ferrari,4,5,F,57,,57,5303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,2019-11-03,119,Austin,Circuit of the Americas,,30.1328,-97.6411,40,Charles Leclerc,Ferrari,Ferrari,4,4,F,56,,56,5513
2019,2019-11-03,119,Austin,Circuit of the Americas,,30.1328,-97.6411,42,Lando Norris,McLaren,Renault,7,8,F,56,,56,5513
2019,2019-11-03,119,Austin,Circuit of the Americas,,30.1328,-97.6411,43,Alexander Albon,RedBull,Honda,5,6,F,56,,56,5513
2019,2019-11-03,119,Austin,Circuit of the Americas,,30.1328,-97.6411,44,George Russell,Williams,Mercedes,17,18,F,54,,56,5513


In [73]:
# save meta data frame to files
meta_data.to_csv('meta_data_inclusive.csv')
meta_data_refined.to_csv('data/meta_data_refined.csv')

In [74]:
## creating refined weather data sets filtered on year for each track location

#2014
weather_df_2014 = weather_df_unique_weather_refined2[weather_df_unique_weather_refined2.index.get_level_values('season') == 2014]
weather_df_2014.sort_index()
weather_df_2014.to_csv('data/weathermap/weather_df_2014.csv')

#2015
weather_df_2015 = weather_df_unique_weather_refined2[weather_df_unique_weather_refined2.index.get_level_values('season') == 2015]
weather_df_2015.sort_index()
weather_df_2015.to_csv('data/weathermap/weather_df_2015.csv')

#2016
weather_df_2016 = weather_df_unique_weather_refined2[weather_df_unique_weather_refined2.index.get_level_values('season') == 2016]
weather_df_2016.sort_index()
weather_df_2016.to_csv('data/weathermap/weather_df_2016.csv')

#2017
weather_df_2017 = weather_df_unique_weather_refined2[weather_df_unique_weather_refined2.index.get_level_values('season') == 2017]
weather_df_2017.sort_index()
weather_df_2017.to_csv('data/weathermap/weather_df_2017.csv')

#2018
weather_df_2018 = weather_df_unique_weather_refined2[weather_df_unique_weather_refined2.index.get_level_values('season') == 2018]
weather_df_2018.sort_index()
weather_df_2018.to_csv('data/weathermap/weather_df_2018.csv')

#2019
weather_df_2019 = weather_df_unique_weather_refined2[weather_df_unique_weather_refined2.index.get_level_values('season') == 2019]
weather_df_2019.sort_index()
weather_df_2019.to_csv('data/weathermap/weather_df_2019.csv')

In [75]:
## creating refined incident map data sets filtered on year for each track location
    # first refine incident data frame from meta data
incident_report_df =  meta_data[['season', 'date', 'race_id','location','circuitName','weather','lat','long','name','status', 'accidents per driver, per season','failures per driver, per season']]
incident_report_df.sort_index()
incident_report_df.to_csv('data/incidentmap/incident_report_df_INCLUSIVE.csv')
# refine the incident data per location per year
incident_report_df_refined = incident_report_df[['season', 'date', 'race_id','location','circuitName','weather','lat','long','name','status']]
incident_report_df_refined = incident_report_df_refined.set_index(['season', 'date','location'])
incident_report_df_refined.sort_index()
incident_report_df_refined.to_csv('data/incidentmap/incident_report_df_REFINED.csv')


#2014
incident_df_2014 = incident_report_df_refined[incident_report_df_refined.index.get_level_values('season') == 2014]
incident_df_2014.sort_index()
incident_df_2014.to_csv('data/incidentmap/incident_df_2014.csv')

#2015
weather_df_2015 = incident_report_df_refined[incident_report_df_refined.index.get_level_values('season') == 2015]
weather_df_2015.sort_index()
weather_df_2015.to_csv('data/incidentmap/incident_df_2015.csv')

#2016
weather_df_2016 = incident_report_df_refined[incident_report_df_refined.index.get_level_values('season') == 2016]
weather_df_2016.sort_index()
weather_df_2016.to_csv('data/incidentmap/incident_df_2016.csv')

#2017
weather_df_2017 = incident_report_df_refined[incident_report_df_refined.index.get_level_values('season') == 2017]
weather_df_2017.sort_index()
weather_df_2017.to_csv('data/incidentmap/incident_df_2017.csv')

#2018
weather_df_2018 = incident_report_df_refined[incident_report_df_refined.index.get_level_values('season') == 2018]
weather_df_2018.sort_index()
weather_df_2018.to_csv('data/incidentmap/incident_df_2018.csv')

#2019
weather_df_2019 = incident_report_df_refined[incident_report_df_refined.index.get_level_values('season') == 2019]
weather_df_2019.sort_index()
weather_df_2019.to_csv('data/incidentmap/incident_df_2019.csv')

In [85]:
## creating refined driver race result data sets filtered on year for each driver
    # first refine data frame from meta data
driver_report_df =  meta_data[['season', 'date', 'race_id','location','circuitName','weather','name','team','enginemanufacturer','status', 'accidents per driver, per season','failures per driver, per season']]
driver_report_df.sort_index()
driver_report_df.to_csv('data/driverdata/driver_report_df_INCLUSIVE.csv')
# refine the driver data per driver per year
driver_report_df_refined = driver_report_df[['season', 'name','team','enginemanufacturer','accidents per driver, per season','failures per driver, per season']]
driver_report_df_refined = driver_report_df_refined.set_index(['season', 'name'])
driver_report_df_refined.sort_index()
driver_report_df_refined.to_csv('data/driverdata/driver_report_df_REFINED.csv')


#2014
driver_report_df_2014 = driver_report_df_refined[driver_report_df_refined.index.get_level_values('season') == 2014]
driver_report_df_2014.sort_index()
driver_report_df_2014.to_csv('data/driverdata/driver_report_df_2014.csv')

#2015
driver_report_df_2015 = driver_report_df_refined[driver_report_df_refined.index.get_level_values('season') == 2014]
driver_report_df_2015.sort_index()
driver_report_df_2015.to_csv('data/driverdata/driver_report_df_2015.csv')

#2016
driver_report_df_2016 = driver_report_df_refined[driver_report_df_refined.index.get_level_values('season') == 2014]
driver_report_df_2016.sort_index()
driver_report_df_2016.to_csv('data/driverdata/driver_report_df_2016.csv')

#2017
driver_report_df_2017 = driver_report_df_refined[driver_report_df_refined.index.get_level_values('season') == 2014]
driver_report_df_2017.sort_index()
driver_report_df_2017.to_csv('data/driverdata/driver_report_df_2017.csv')

#2018
driver_report_df_2018 = driver_report_df_refined[driver_report_df_refined.index.get_level_values('season') == 2014]
driver_report_df_2018.sort_index()
driver_report_df_2018.to_csv('data/driverdata/driver_report_df_2018.csv')

#2019
driver_report_df_2019 = driver_report_df_refined[driver_report_df_refined.index.get_level_values('season') == 2014]
driver_report_df_2019.sort_index()
driver_report_df_2019.to_csv('data/driverdata/driver_report_df_2019.csv')  






## creating driver report data personal to driver filtered on year (season)
driver_report_df_refined_perdriver = meta_data[['name','season','team','enginemanufacturer','accidents per driver, per season','failures per driver, per season']]
driver_report_df_refined_perdriver = driver_report_df_refined_perdriver.set_index(['name'])
    #drop season duplicates so results are per season
driver_report_df_refined_perdriver = driver_report_df_refined_perdriver.drop_duplicates(subset=['season'])
driver_report_df_refined_perdriver.sort_index()
driver_report_df_refined_perdriver.to_csv('data/driverdata/driver_report_df_REFINED_perdriver.csv')

##LewisHam
driver_report_df_Hamilton = driver_report_df_refined_perdriver[driver_report_df_refined_perdriver.index.get_level_values('name') == 'Lewis Hamilton']
driver_report_df_Hamilton.sort_index()
driver_report_df_Hamilton.to_csv('data/driverdata/driver_report_df_Hamilton.csv')

##DaniRic
driver_report_df_Ricciardo = driver_report_df_refined_perdriver[driver_report_df_refined_perdriver.index.get_level_values('name') == 'Daniel Ricciardo']
driver_report_df_Ricciardo.sort_index()
driver_report_df_Ricciardo.to_csv('data/driverdata/driver_report_df_Ricciardo.csv')

##FernAlon
driver_report_df_Alonso = driver_report_df_refined_perdriver[driver_report_df_refined_perdriver.index.get_level_values('name') == 'Fernando Alonso']
driver_report_df_Alonso.sort_index()
driver_report_df_Alonso.to_csv('data/driverdata/driver_report_df_Alonso.csv')

##Leclerc
driver_report_df_Leclerc = driver_report_df_refined_perdriver[driver_report_df_refined_perdriver.index.get_level_values('name') == 'Charles Leclerc']
driver_report_df_Leclerc.sort_index()
driver_report_df_Leclerc.to_csv('data/driverdata/driver_report_df_Leclerc.csv')

##GRussell
driver_report_df_Russell = driver_report_df_refined_perdriver[driver_report_df_refined_perdriver.index.get_level_values('name') == 'George Russell']
driver_report_df_Russell.sort_index()
driver_report_df_Russell.to_csv('data/driverdata/driver_report_df_Russell.csv')
