# Formula 1 - Data Collection

### Import Relevant Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
import requests
import bs4
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import math

In [3]:
from collections import defaultdict
import regex as re
import datetime

### Extract Individual Race Information from the API

#### Steps taken were:
##### Ergast API:

    - Use json to extract information from 2014 onwards from Ergast API, containing F1 data from 1950 to present.
    - All scraped information stored in a dictionary of lists and loaded into a dataframe.

##### Processing:

    - Alter 'race_name' variable by custom function for using in further scraping from URLs via format strings.
    - Create further functions for cycling through each race's URL and extracting weather and distance information.

In [4]:
def _race_field(record, path, cast=None):
    """Follow `path` through the nested `record`; return None when a key is
    missing or the value cannot be converted with `cast`."""
    try:
        value = record
        for key in path:
            value = value[key]
        return cast(value) if cast is not None else value
    except (KeyError, IndexError, TypeError, ValueError):
        return None


# Output column -> (path inside one race record of the API response, cast).
# Insertion order defines the column order of the resulting dataframe.
_RACE_FIELDS = {
    'season': (('season',), int),
    'round': (('round',), int),
    'race_name': (('raceName',), None),
    'circuitId': (('Circuit', 'circuitId'), None),
    'lat': (('Circuit', 'Location', 'lat'), float),
    'long': (('Circuit', 'Location', 'long'), float),
    'country': (('Circuit', 'Location', 'country'), None),
    'date': (('date',), None),
    'url': (('url',), None),
}

races = defaultdict(list)

# One request per season, from 2014 up to and including the current year.
for year in range(2014, datetime.datetime.now().date().year + 1):

    url = f'https://ergast.com/api/f1/{year}.json'
    r = requests.get(url)
    # Named `payload` rather than `json` so the stdlib module name is not shadowed.
    payload = r.json()

    for item in payload['MRData']['RaceTable']['Races']:
        # Every column receives exactly one value per race (None when the
        # field is absent) so all lists stay the same length.
        for column, (path, cast) in _RACE_FIELDS.items():
            races[column].append(_race_field(item, path, cast))

races = pd.DataFrame(races)

In [5]:
races.columns

Index(['season', 'round', 'race_name', 'circuitId', 'lat', 'long', 'country',
       'date', 'url'],
      dtype='object')

#### Transform the Data into the Required Format

In [6]:
def race_name(name):
    """Convert a race title into the lowercase, hyphen-joined key used in URLs.

    The final two words of the title are dropped (e.g. 'Abu Dhabi Grand Prix'
    becomes 'abu-dhabi'); two keys are then remapped to their adjective forms.
    """
    slug = '-'.join(name.split()[:-2]).lower()
    overrides = {'mexico-city': 'mexican', 'united-states': 'us'}
    return overrides.get(slug, slug)

races.race_name = races.race_name.apply(race_name)

In [7]:
race_name('Abu Dhabi Grand Prix')

'abu-dhabi'

In [8]:
races.race_name.unique()

array(['australian', 'malaysian', 'bahrain', 'chinese', 'spanish',
       'monaco', 'canadian', 'austrian', 'british', 'german', 'hungarian',
       'belgian', 'italian', 'singapore', 'japanese', 'russian', 'us',
       'brazilian', 'abu-dhabi', 'mexican', 'european', 'azerbaijan',
       'french', 'styrian', '70th-anniversary', 'tuscan', 'eifel',
       'portuguese', 'emilia-romagna', 'turkish', 'sakhir', 'dutch',
       'são-paulo', 'qatar', 'saudi-arabian', 'miami', 'las-vegas'],
      dtype=object)

In [9]:
races.season.unique()

array([2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024],
      dtype=int64)

In [10]:
def extract_weather(cols):
    """Return the text of the infobox row whose header mentions 'Weather'.

    `cols` is an iterable of row elements exposing `.find(tag, attrs=...)`.
    Returns '' when no such row exists or the row cannot be read, so callers
    always receive a string (the original fell off the end and returned None).
    """
    for col in cols:
        try:
            header = col.find('th', attrs={'scope': 'row'})
            if 'Weather' in str(header):
                return col.find('td').text.strip('\n')
        except (AttributeError, TypeError):
            # e.g. the Weather row has no <td> cell: give up, as before.
            return ''
    return ''

In [11]:
def extract_distance(cols):
    """Return the kilometre figure from the infobox row whose header mentions 'Distance'.

    The cell text is expected to look like '<laps>, <number> km ...'; the part
    after the first ', ' and before 'km' is returned as a string.
    Returns '' when no such row exists or the cell does not have that shape,
    so callers always receive a string (the original could return None).
    """
    for col in cols:
        try:
            header = col.find('th', attrs={'scope': 'row'})
            if 'Distance' in str(header):
                cell_text = col.find('td').text
                return cell_text.split(', ')[1].split('km')[0].strip(' ')
        except (AttributeError, IndexError, TypeError):
            # Missing <td> or no ', ' separator in the cell: give up, as before.
            return ''
    return ''

In [12]:
def weather(url):
    """Fetch `url` and return the weather text from its 'infobox' table.

    Best-effort: any failure (network error, no infobox on the page, ...)
    yields ''. `Exception` is caught instead of a bare `except:` so that
    Ctrl-C can still interrupt a long scraping run.
    """
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        infobox = soup.find('table', attrs={'class': 'infobox'})
        return extract_weather(infobox.find_all('tr'))
    except Exception:
        return ''

In [13]:
def distance(url):
    """Fetch `url` and return the race distance text from its 'infobox' table.

    Best-effort: any failure (network error, no infobox on the page, ...)
    yields ''. `Exception` is caught instead of a bare `except:` so that
    Ctrl-C can still interrupt a long scraping run.
    """
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        infobox = soup.find('table', attrs={'class': 'infobox'})
        return extract_distance(infobox.find_all('tr'))
    except Exception:
        return ''

In [14]:
# Scrape the page behind every race's `url` for its infobox weather and distance.
# NOTE(review): each URL is requested twice (once per column); a single fetch
# feeding both extractors would halve the number of requests.
races['weather'] = races.url.apply(weather)
races['distance'] = races.url.apply(distance)

### F1 Fansite

### To fill gaps in the weather data, data was taken from f1-fansite.com. 
#### Steps taken were:
##### Scraping:

    - Use selenium to initiate an undetected chromedriver instance to circumvent Cloudflare protection.
    - Iterate through the different url structures used for races during the prescribed years.
    - Use BeautifulSoup to search through the results and identify the location of the Weather Conditions info.
    - Many commonly labelled sections, so the list of html objects was converted into string format.
        - The strings were then iterated through to locate an instance containing the appropriate data.
    - All unformatted output was then stored in the 'scraped_all' dict of lists, which was converted to a DF.

##### Processing:

    - Rows containing null values were then dropped and a process of raw data refinement ensued.
    - Refinement included:
            - String-section replacement.
            - Regular expression to locate temperature ranges and take averages.
    - Since the scraping took some time, the weather data was stored in csv format in the notebook directory.
    - This weather was then combined with previously extracted weather data using the races dataframe's urls.
        - Combination was completed via string concatenation.
        - Multiple occurrences of certain words are later dealt with by Binarised Count Vectorisation.




<!--     - First search through each string typed section for the word 'Weather'.
    - Then search through the section with regular expression to separate out terms. -->

In [15]:
# URL templates tried in order for each race; the two `{}` slots are filled
# with the first two elements of a `race_dps` tuple via `str.format`.
f1_fan_urls = ['https://www.f1-fansite.com/f1-result/race-results-{}-{}-f1-grand-prix/',
               'https://www.f1-fansite.com/f1-result/race-results-{}-{}-f1-gp/',
               'https://www.f1-fansite.com/f1-result/race-result-{}-{}-f1-gp/',
               'https://www.f1-fansite.com/f1-result/{}-{}-grand-prix-race-results/',
               'https://www.f1-fansite.com/f1-result/{}-{}-grand-prix-results/']

In [16]:
# Build the list of URL "data points" used when scraping.
# Regular races are stored as (season, race_name). The '70th-anniversary' race
# is stored as ('70th', 'anniversary', season) so the two name parts fill the
# URL template slots. Seasons containing 'us' / 'mexican' additionally get a
# ('usa') / ('mexico') variant appended after that season's regular entries.

race_dps = []

for season_year in np.array(races.season.unique()):
    season_names = list(races[races.season == season_year]['race_name'])

    for gp_name in season_names:
        if gp_name == '70th-anniversary':
            name_parts = gp_name.split('-')
            race_dps.append((name_parts[0], name_parts[1], season_year))
        else:
            race_dps.append((season_year, gp_name))

    race_dps.extend((season_year, 'usa') for gp_name in season_names if gp_name == 'us')
    race_dps.extend((season_year, 'mexico') for gp_name in season_names if gp_name == 'mexican')

In [17]:
import undetected_chromedriver as uc
from selenium import webdriver


def _record_weather(store, race, value):
    """Append one (season, race_name, weather) row for `race` to `store`."""
    if race[0] == '70th':
        # 70th-anniversary tuples are laid out as (prefix, suffix, season).
        store['season'].append(race[2])
        store['race_name'].append('-'.join([race[0], race[1]]))
    else:
        store['season'].append(race[0])
        store['race_name'].append(race[1])
    store['weather'].append(value)


scraped_all = defaultdict(list)
failed_all = []


for race in race_dps:
    found_weather = False
    found_page = False

    # Try each URL template in turn until one does not land on the 404 page.
    for url in f1_fan_urls:
        # NOTE(review): undetected_chromedriver is understood to reject a
        # ChromeOptions object that was already used for another driver, so a
        # fresh one is built per attempt — confirm against the installed version.
        options = webdriver.ChromeOptions()
        options.add_argument("start-maximized")
        driver = uc.Chrome(options=options)
        try:
            driver.get(url.format(race[0], race[1]))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            if soup.find('body').get('class')[0] != 'error404':
                found_page = True
        except Exception:
            # Best-effort: a failed load or an unexpected page shape simply
            # means "try the next template".
            pass
        finally:
            # Always close the browser; previously it leaked whenever
            # driver.get() or page parsing raised.
            try:
                driver.quit()
            except Exception:
                pass
        if found_page:
            break

    if not found_page:
        if race[0] != '70th':
            print(f'page not found for {race}')
        _record_weather(scraped_all, race, np.nan)
        continue

    for section in soup.find_all('p'):
        w_section = str(section)
        if 'Weather' not in w_section:
            continue
        found_weather = True

        # The paragraph markup is cut on the literal text 'br'; keep the
        # fragment(s) that mention the weather.
        for cat in w_section.split('br'):
            if 'Weather' in cat:
                print(f"data found for {race}")
                _record_weather(scraped_all, race, cat)
                # NOTE(review): only the 70th-anniversary race stops at the
                # first hit; other races record every matching fragment, which
                # can yield several rows per race — confirm this is intended.
                if race[0] == '70th':
                    break

    if not found_weather:
        _record_weather(scraped_all, race, np.nan)


f1_fan_weather = pd.DataFrame(scraped_all)

ModuleNotFoundError: No module named 'undetected_chromedriver'

In [None]:
len(race_dps)

In [None]:
# Rows without scraped weather (evaluated for inspection only; result is discarded).
f1_fan_weather[f1_fan_weather.weather.isnull()]
# Rows for which weather text was found.
f1_fan_complete = f1_fan_weather[f1_fan_weather.weather.notnull()]
f1_fan_complete

In [None]:
# Independent copy of the scraped table with every row containing a null removed.
weather_df = f1_fan_weather.dropna()

In [None]:
# Ordered (old, new) substitutions applied to the lower-cased weather text.
# Order matters: e.g. '<' must be stripped before the '/p>' and 'p>' remnants.
_WEATHER_SUBSTITUTIONS = (
    # The text is lower-cased first, so the label must be matched in lower
    # case (the original looked for 'Weather:' and therefore never matched).
    ('weather:', ''),
    ('\xa0', ''), ('<p> ', ''), ('<', ''), ('º', '°'), ('&amp;', ''),
    (',', ''), ('/>', ''), ('/p>', ''), ('p>', ''), ('/a>', ''),
    ('☁', 'clouds'), ('☂', 'rain'),
    ('9.4.5', '9.4-9.5'), ('/', ' '),
    ('dryovercast', 'dry overcast'), ('dryclouded', 'dry clouded'),
    ('drysunny', 'dry sunny'),
    ('overcast22°c', 'overcast 22°c'), ('239', '23.9'), ('296', '29.6'),
    ('26.°c', '26°c'), ('22.3-24', '22.3-24.0'), ('20.4-°c', '20.4°c'),
    ('24.°c', '24°c'), ('clear26°c', 'clear 26°c'), ('dryclear', 'dry clear'),
    ('20.0', '20'), ('34.0', '34'), ('21.0', '21'),
    ('and', ''),
)


def text_filter(weather):
    """Lower-case `weather` and strip markup remnants / repair known typos.

    Applies the fixed substitution list above in order, then trims
    surrounding spaces. Returns the cleaned string.
    """
    weather = weather.lower()
    for old, new in _WEATHER_SUBSTITUTIONS:
        weather = weather.replace(old, new)
    return weather.strip(' ')

In [None]:
def range_filter(weather):
    """Collapse a decimal temperature range such as '22.5-24.5' to its mean.

    Only acts when the text contains '-' and at least two numbers of the form
    '<digits>.<digit>'. The mean of the first two replaces the '-'-joined run
    of all such numbers (so nothing is replaced when more than two are found
    and they do not form one contiguous range). Afterwards a decimal comma in
    the first number of the text is turned into a decimal point.
    Returns the (possibly unchanged) string.
    """
    if '-' not in weather:
        return weather

    nums = re.findall(r'[0-9]+[.][0-9]', weather)
    if len(nums) < 2:
        return weather

    # float() instead of eval(): the tokens come from scraped text.
    mean = str((float(nums[0]) + float(nums[1])) / 2)
    weather = weather.replace('-'.join(nums), mean)

    first_number = re.search(r'[0-9]+[\.\,]?[0-9]*', weather)
    if first_number:
        wrong_format = first_number.group(0)
        weather = weather.replace(wrong_format, wrong_format.replace(',', '.'))
    return weather

In [None]:
def weather_extract(sub_soup):
    """Turn one scraped weather fragment (an HTML snippet as text) into clean text.

    Removes the icon markup and the 'Weather:' label, then hands the result
    to `text_filter` and `range_filter`. Expects a string.
    """
    icon_markup = 'i class="fa fa-thermometer-half">'

    if '</i>' in sub_soup:
        # Keep whatever follows the closing icon tag(s).
        weath = sub_soup.split('</i>')[1:]
        if len(weath) == 1:
            weather = weath[0].strip(' ').strip('<').replace('\xa0', '') \
                .replace(icon_markup, '')
        else:
            weather = ' '.join(weath).replace(icon_markup, '').replace('<', '')
    else:
        # The original chain was missing a `.replace` here and tried to call a
        # string, so this branch always raised and fell into a fallback.
        # '<' is removed first so that the opening icon tag matches `icon_markup`.
        weather = sub_soup.replace('<p>Weather: ', '').replace('<', '') \
            .replace(icon_markup, '')

    weather = text_filter(weather)
    weather = range_filter(weather)

    return weather



weather_df.weather = weather_df.weather.apply(weather_extract)

In [None]:
weather_df.weather.unique()

In [None]:
# weather_df.to_csv('./CSV/weather.csv')

In [None]:
# Reload the previously saved weather table instead of re-running the scrape.
# 'Unnamed: 0' is presumably the index column written by an earlier
# `to_csv` call — verify against the saved file.
weather_df = pd.read_csv('./CSV/weather.csv').drop(['Unnamed: 0'], axis=1)

weather_df.head()

In [None]:
weather_df[weather_df.race_name.str.contains('anniversary')]

In [None]:
def race_name_paralleliser(race):
    """Map the alternative race keys 'usa' and 'mexico' back to 'us' and
    'mexican'; any other value is returned unchanged."""
    aliases = {'usa': 'us', 'mexico': 'mexican'}
    return aliases.get(race, race)
    
    
    
weather_df.race_name = weather_df.race_name.apply(race_name_paralleliser)

In [None]:
def remove_citations(data):
    """Strip every bracketed numeric citation marker such as '[1]' or '[12]'.

    The original removed only the first marker found and only single-digit
    ones. Non-string input is returned unchanged.
    """
    if not isinstance(data, str):
        return data
    return re.sub(r'\[[0-9]+\]', '', data)

In [None]:
def _average_ranges(data, pattern):
    """Replace every '<int><sep><int>' match of `pattern` in `data` by the mean
    of the two integers. `pattern` must capture the two numbers as groups 1 and 2."""
    for match in list(re.finditer(pattern, data)):
        # int() instead of eval(): eval('08') is a SyntaxError, which used to
        # abort the whole pass silently; it also avoids evaluating scraped text.
        average = (int(match.group(1)) + int(match.group(2))) / 2
        data = data.replace(match.group(0), str(average))
    return data


def range_average(data):
    """Replace integer ranges written as 'A to B' or 'A–B' (en dash) by their mean.

    The mean is inserted as a float string, e.g. '20 to 30' becomes '25.0'.
    Non-string input is returned unchanged.
    """
    if not isinstance(data, str):
        return data
    data = _average_ranges(data, r'([0-9]+)\sto\s([0-9]+)')
    data = _average_ranges(data, r'([0-9]+)[–]([0-9]+)')
    return data

In [None]:
def remove_rogue_endings(data):
    """Re-attach word endings that were split off by earlier text clean-up.

    'temperature s ' becomes 'temperatures ', and a free-standing ' ing'
    (with any trailing whitespace or punctuation from '.,:;') becomes 'ing '.
    Non-string input is returned unchanged.
    """
    repairs = (
        (r'temperature[\s]+[s]\s', 'temperatures '),
        (r'[\s]+ing[\s\.\,\:\;]+', 'ing '),
    )
    for pattern, repaired in repairs:
        try:
            fragments = re.findall(pattern, data)
        except TypeError:
            # Not text (e.g. NaN): nothing to repair for this pattern.
            continue
        for fragment in fragments:
            data = data.replace(fragment, repaired)
    return data

In [None]:
def race_weather_extract(data):
    """Normalise one raw weather value into spaced, lower-case text.

    Missing values (None, NaN, or the strings 'None' / 'nan') give ''.
    Otherwise the text is lower-cased, selected keywords and punctuation are
    padded with spaces, citation markers are removed, numeric ranges are
    averaged, split word endings are repaired, and a trailing space is added.
    """
    # `data == np.nan` is always False, so NaN must be detected explicitly;
    # a real None previously slipped through and came back as 'None '.
    is_nan = isinstance(data, float) and math.isnan(data)
    if data is None or is_nan or data in ('None', 'nan'):
        return ''

    if isinstance(data, str):
        data = data.lower()

    data = str(data).replace('\xa0', '').replace('sunny', 'sunny ') \
        .replace('temperature', 'temperature ').replace(';', ' ').replace('(', ' (') \
        .replace('cloudy', 'cloudy ').replace('clear', 'clear ').replace('later', 'later ') \
        .replace('dry', 'dry ').replace('times', 'times ').replace(':', ' ')

    data = remove_citations(data)
    data = range_average(data)
    data = remove_rogue_endings(data)

    return data + ' '

In [None]:
races.weather = races.weather.apply(race_weather_extract)

In [None]:
# races.weather.apply(race_weather_extract).unique()

In [None]:
weather_df.columns

In [None]:
# Combine both weather sources: weather_x comes from `races`, weather_y from `weather_df`.
races_plus_all_weather = pd.merge(races, weather_df, on=['race_name', 'season'], how='outer')

# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute selection, which does not reliably update the frame under pandas
# copy-on-write. Both sides are filled because the outer join can leave either
# one missing, and NaN + str would make the combined value NaN.
for weather_col in ('weather_x', 'weather_y'):
    races_plus_all_weather[weather_col] = races_plus_all_weather[weather_col].fillna('')

races_plus_all_weather['weather'] = races_plus_all_weather['weather_x'] + races_plus_all_weather['weather_y']
races_plus_all_weather = races_plus_all_weather.drop(['weather_x', 'weather_y', 'url'], axis=1)
races_plus_all_weather

### Extract Results from the API

#### Steps taken were:
##### Ergast API:

    - Use json to extract the information from the Ergast API, containing F1 data from 1950 to present.
    - All scraped information stored in a dictionary of lists and loaded into a dataframe.

##### Processing:

    - Overall race time for each driver is reported in milliseconds (values in the millions).
        - For the purpose of preliminary scaling, time variable divided by 1000 to convert to seconds.

In [None]:
def _result_field(record, path, cast=None):
    """Follow `path` through the nested `record`; return NaN when a key is
    missing or the value cannot be converted with `cast`."""
    try:
        value = record
        for key in path:
            value = value[key]
        return cast(value) if cast is not None else value
    except (KeyError, IndexError, TypeError, ValueError):
        return np.nan


# Columns read from the race record itself (identical for every driver row).
_RESULT_RACE_FIELDS = {
    'season': (('season',), int),
    'round': (('round',), int),
    'circuitId': (('Circuit', 'circuitId'), None),
}

# Columns read from one entry of the race's 'Results' list.
_RESULT_ENTRY_FIELDS = {
    'driverId': (('Driver', 'driverId'), None),
    'finish_position': (('position',), int),
    'date_of_birth': (('Driver', 'dateOfBirth'), None),
    'nationality': (('Driver', 'nationality'), None),
    'constructor': (('Constructor', 'constructorId'), None),
    'grid': (('grid',), int),
    'time': (('Time', 'millis'), int),
    'status': (('status',), None),
    # float, not int: int('12.5') raises, which used to turn fractional
    # points (e.g. half-points races) into NaN.
    'points': (('points',), float),
}

# [season, [round, round, ...]] for every season present in `races`.
rounds = [[year, list(races[races.season == year]['round'])]
          for year in np.array(races.season.unique())]

# query API

results = defaultdict(list)

for season_year, season_rounds in rounds:
    for race in season_rounds:
        url = f'https://ergast.com/api/f1/{season_year}/{race}/results.json'
        try:
            r = requests.get(url)
            race_list = r.json()['MRData']['RaceTable']['Races']
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Best-effort, but report the failure instead of hiding it.
            print(f'skipping results for {season_year} round {race}: {err!r}')
            continue

        if not race_list:
            # No race record returned for this round: nothing to add.
            continue

        item = race_list[0]
        for entry in item.get('Results', []):
            for column, (path, cast) in _RESULT_RACE_FIELDS.items():
                results[column].append(_result_field(item, path, cast))
            for column, (path, cast) in _RESULT_ENTRY_FIELDS.items():
                results[column].append(_result_field(entry, path, cast))

results = pd.DataFrame(results)

In [None]:
# `time` was read from the API's 'millis' field; convert milliseconds to seconds.
# NOTE(review): re-running this cell divides by 1000 again.
results.time = results.time/1000
results[results.season==2014].head(50)

### Extract Qualifying Data from the API

#### Steps taken were:
##### Ergast API:

    - Use json to extract the information from the Ergast API, containing F1 data from 1950 to present.
    - All scraped information stored in a dictionary of lists and loaded into a dataframe.

##### Processing:

    - Qualifying times reformatted to seconds and milliseconds using qual_time_formatter function.
    - Average and Best qualifying times then produced for each driver for each race.
        - This removes the issue of non-appearances in Q2 and Q3 for drivers qualifying below the threshold.
    - Q1, Q2 & Q3 then dropped from the dataframe.

In [None]:
def _quali_field(record, path, cast=None):
    """Follow `path` through the nested `record`; return NaN when a key is
    missing or the value cannot be converted with `cast`."""
    try:
        value = record
        for key in path:
            value = value[key]
        return cast(value) if cast is not None else value
    except (KeyError, IndexError, TypeError, ValueError):
        return np.nan


# Columns read from the race record itself (identical for every driver row).
_QUALI_RACE_FIELDS = {
    'season': (('season',), int),
    'round': (('round',), int),
    'circuitId': (('Circuit', 'circuitId'), None),
}

# Columns read from one entry of the race's 'QualifyingResults' list.
# Q1/Q2/Q3 are kept as strings here; a session missing from the entry gives NaN.
_QUALI_ENTRY_FIELDS = {
    'driverId': (('Driver', 'driverId'), None),
    'qual_position': (('position',), int),
    'constructor': (('Constructor', 'constructorId'), None),
    'q1': (('Q1',), str),
    'q2': (('Q2',), str),
    'q3': (('Q3',), str),
}

# [season, [round, round, ...]] for every season present in `races`.
rounds = [[year, list(races[races.season == year]['round'])]
          for year in np.array(races.season.unique())]

# query API

qualis = defaultdict(list)

for season_year, season_rounds in rounds:
    for race in season_rounds:
        url = f'https://ergast.com/api/f1/{season_year}/{race}/qualifying.json'
        try:
            r = requests.get(url)
            race_list = r.json()['MRData']['RaceTable']['Races']
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Best-effort, but report the failure instead of hiding it.
            print(f'skipping qualifying for {season_year} round {race}: {err!r}')
            continue

        if not race_list:
            # No race record returned for this round: nothing to add.
            continue

        item = race_list[0]
        for entry in item.get('QualifyingResults', []):
            for column, (path, cast) in _QUALI_RACE_FIELDS.items():
                qualis[column].append(_quali_field(item, path, cast))
            for column, (path, cast) in _QUALI_ENTRY_FIELDS.items():
                qualis[column].append(_quali_field(entry, path, cast))

qualifying = pd.DataFrame(qualis)

In [None]:
qualifying.head()

In [None]:
def qual_time_formatter(time):
    """Convert a lap-time string 'M:SS.mmm' (or 'SS.mmm') to seconds as a float.

    Returns NaN for non-string input or text that cannot be parsed. The
    original sliced fixed character positions and used eval(), which silently
    produced wrong values for times without a minutes part.
    """
    if not isinstance(time, str):
        return np.nan
    minutes, colon, seconds = time.strip().rpartition(':')
    try:
        return (int(minutes) * 60 if colon else 0) + float(seconds)
    except ValueError:
        return np.nan

for qual in ['q1', 'q2', 'q3']:
    qualifying[qual] = qualifying[qual].apply(qual_time_formatter)

In [None]:
def quali_best(a, b, c):
    """Return the smallest of the three times, ignoring missing values.

    Plain min() is order-dependent with NaN (min(nan, 80, 79) is nan), so
    missing values are filtered out first. Returns NaN when all are missing.
    """
    laps = [lap for lap in (a, b, c) if pd.notna(lap)]
    return min(laps) if laps else np.nan

# Select the session columns by label: integer keys on a label-indexed row
# (x[6], x[7], x[8]) rely on deprecated positional fallback.
qualifying['q_best'] = qualifying.apply(lambda row: quali_best(row['q1'], row['q2'], row['q3']), axis=1)

In [None]:
def quali_worst(a, b, c):
    """Return the largest of the three times, ignoring missing values.

    Plain max() is order-dependent with NaN (max(nan, 80, 79) is nan), so
    missing values are filtered out first. Returns NaN when all are missing.
    """
    laps = [lap for lap in (a, b, c) if pd.notna(lap)]
    return max(laps) if laps else np.nan

# Select the session columns by label: integer keys on a label-indexed row
# (x[6], x[7], x[8]) rely on deprecated positional fallback.
qualifying['q_worst'] = qualifying.apply(lambda row: quali_worst(row['q1'], row['q2'], row['q3']), axis=1)

In [None]:
def quali_average(a, b, c):
    """Return the mean of the positive values among the three times.

    NaN fails the `> 0` test and is therefore left out. Returns NaN when no
    value qualifies or when a value cannot be compared with 0.
    """
    try:
        valid = [lap for lap in (a, b, c) if lap > 0]
    except TypeError:
        # e.g. None or a string slipped in: no meaningful average.
        return np.nan
    return sum(valid) / len(valid) if valid else np.nan

# Select the session columns by label: integer keys on a label-indexed row
# (x[6], x[7], x[8]) rely on deprecated positional fallback.
qualifying['q_mean'] = qualifying.apply(lambda row: quali_average(row['q1'], row['q2'], row['q3']), axis=1)

Not all drivers make it out of q1 into q2, and then into q3 - therefore there is some null data for q2 and q3.

For this reason I've taken the overall best, mean and worst qualifying times for each driver as an indicator of their capability at that particular track on that particular weekend.

Since these are not strictly related, the three times should help identify consistency within qualifying for each driver.

In [None]:
qualifying.drop(['q1', 'q2', 'q3'], axis=1, inplace=True)

In [None]:
# Restrict to numeric columns: the frame also holds string ids
# (circuitId, driverId, constructor), which corr() cannot process.
qualifying.corr(numeric_only=True)

### Extract Driver Information from the API

#### Steps taken were:
##### Ergast API:

    - Use json to extract the information from the Ergast API, containing F1 data from 1950 to present.
    - All scraped information stored in a dictionary of lists and loaded into a dataframe.

##### Processing:

    - No processing required at this stage.
        - Further processing on this data during merge with other dataframes.

In [None]:
# query API

def _driver_field(record, key):
    """Return record[key], or NaN when the key is missing."""
    try:
        return record[key]
    except (KeyError, TypeError):
        return np.nan


drivers = defaultdict(list)

for year in results.season.unique():
    # https, matching the other API calls in this notebook.
    url = f'https://ergast.com/api/f1/{year}/drivers.json'
    r = requests.get(url)
    payload = r.json()

    try:
        items = payload['MRData']['DriverTable']['Drivers']
    except (KeyError, TypeError) as err:
        # Best-effort, but report the season that was skipped.
        print(f'skipping drivers for {year}: {err!r}')
        continue

    for driver in items:
        drivers['driverId'].append(_driver_field(driver, 'driverId'))

        # The full name needs both parts; if either is missing, store NaN.
        try:
            full_name = driver['givenName'] + ' ' + driver['familyName']
        except (KeyError, TypeError):
            full_name = np.nan
        drivers['name'].append(full_name)

        drivers['nationality'].append(_driver_field(driver, 'nationality'))
        drivers['code'].append(_driver_field(driver, 'code'))
        drivers['dateOfBirth'].append(_driver_field(driver, 'dateOfBirth'))

drivers = pd.DataFrame(drivers)
# The same driver is returned for every season they took part in.
drivers = drivers.drop_duplicates().reset_index(drop=True)
drivers.head()

In [None]:
results.status.unique()

In [None]:
drivers.nationality.unique()

### Extract Circuit Information from API

#### Steps taken were:
##### Ergast API:

    - Use json to extract information contained within the Ergast API, containing F1 data from 1950 to present.
    - All scraped information stored in a dictionary of lists and loaded into a dataframe.
    
##### Further Scraping from Wikipedia

    - I found that wikipedia held all of the more in-depth information I required for the circuits, e.g.,
        - Street or Race circuit.
        - Clockwise/Counterclockwise (perhaps affecting the tyres of certain cars more for certain setups...)
        - Individual lap length.
        - Further information including name and location that were required for creating keys for merging.
    - Functions were created for each of the above to process a BeautifulSoup object.

##### Processing:

    - No processing required at this stage.
        - Further processing on this data during merge with other dataframes.
        
# still not finished

In [None]:
# query API

def _circuit_field(record, path):
    """Follow `path` through the nested `record`; return NaN when a key is missing."""
    try:
        value = record
        for key in path:
            value = value[key]
        return value
    except (KeyError, IndexError, TypeError):
        return np.nan


# Output column -> path inside one circuit record of the API response.
# NOTE(review): lat/long are stored exactly as returned here, whereas the
# `races` cell casts them to float — align if these columns are ever compared.
_CIRCUIT_FIELDS = {
    'circuitId': ('circuitId',),
    'circuitName': ('circuitName',),
    'lat': ('Location', 'lat'),
    'long': ('Location', 'long'),
    'locality': ('Location', 'locality'),
    'country': ('Location', 'country'),
    'url': ('url',),
}

circuits = defaultdict(list)

for year in results.season.unique():
    # https, matching the other API calls in this notebook.
    url = f'https://ergast.com/api/f1/{year}/circuits.json'
    r = requests.get(url)
    payload = r.json()

    try:
        items = payload['MRData']['CircuitTable']['Circuits']
    except (KeyError, TypeError) as err:
        # Best-effort, but report the season that was skipped.
        print(f'skipping circuits for {year}: {err!r}')
        continue

    for circuit in items:
        for column, path in _CIRCUIT_FIELDS.items():
            circuits[column].append(_circuit_field(circuit, path))

circuits = pd.DataFrame(circuits)
# The same circuit is returned for every season it was used in.
circuits = circuits.drop_duplicates().reset_index(drop=True)
circuits

In [None]:
results.columns

In [None]:
circuits.circuitId

In [None]:
def extractor(line, i):
    """Return line[i], or NaN when that element does not exist.

    Only lookup failures are handled (index out of range, missing key, or an
    object that cannot be indexed); anything else propagates.
    """
    try:
        return line[i]
    except (IndexError, KeyError, TypeError):
        return np.nan

In [None]:
scraped = defaultdict(list)

url = 'https://en.wikipedia.org/wiki/List_of_Formula_One_circuits'
result = requests.get(url)
soup = BeautifulSoup(result.text, 'html.parser')
tables = soup.find_all('table', attrs={'class':'wikitable'})
# The second 'wikitable' on the page is the one that gets parsed.
table = tables[1]
body = table.find('tbody')
for row in body.find_all('tr'):
    # Non-empty cell texts of the row, without newlines and tick marks.
    line = [r.text.strip('\n').strip('✔') for r in row.find_all('td') if r.text.strip('\n')!='']

    scraped['name'].append(extractor(line, 0))

    track_type = extractor(line, 1)
    # Drop a trailing 'circuit' and keep the result. The original called
    # .strip(...) without assigning it, which left the value unchanged
    # (and str.strip removes a set of characters, not a suffix).
    # NOTE(review): this changes the stored values (e.g. 'Street circuit'
    # becomes 'Street') — check any later code that compares on `type`.
    if isinstance(track_type, str):
        if track_type.endswith('circuit'):
            track_type = track_type[:-len('circuit')]
        track_type = track_type.strip(' ')
    scraped['type'].append(track_type)

    scraped['direction'].append(extractor(line, 2))
    scraped['location'].append(extractor(line, 3))
    scraped['length'].append(extractor(line, 4))

In [None]:
# Tabulate the scraped circuit rows.
circuits_info = pd.DataFrame(scraped)
# Row 0 is presumably the table's header row (it has no <td> cells, so every
# field would be NaN) — verify against the scraped output.
circuits_info.drop([0], inplace=True)
circuits_info.reset_index(drop=True, inplace=True)
circuits_info.head()

In [None]:
circuits_info.length

In [None]:
def length_formatter(length):
    """Convert a circuit length such as '5.278\xa0km (3.280\xa0mi)' to a float of kilometres.

    Numeric input is passed through as a float (NaN stays NaN) and other
    non-string input gives NaN, so a missing scraped value no longer raises
    AttributeError. A string that does not start with a number still raises
    ValueError.
    """
    if isinstance(length, (int, float)):
        return float(length)
    if not isinstance(length, str):
        return np.nan
    # Take the text before 'km' and trim both plain and non-breaking spaces.
    return float(length.split('km')[0].strip(' \xa0'))

In [None]:
circuits_info.length = circuits_info.length.apply(length_formatter)

In [None]:
circuits_info.length.unique()

    None of the following circuits had hosted a completed race since 2014 at the time of EDA, being scheduled for later in 2021.
    
    Circuit Zandvoort
    Jeddah Street Circuit
    Hard Rock Stadium Circuit

In [None]:
circuits.shape

In [None]:
circuits.circuitName.unique()

In [None]:
# Ordered (keyword, circuitId) pairs. The first keyword found in the
# lower-cased circuit name decides the id, so the order is significant.
_CIRCUIT_KEYWORDS = [
    ('adelaide', 'adelaide'),
    ('diab', 'ain-diab'),
    ('aintree', 'aintree'),
    ('albert', 'albert_park'),
    ('americas', 'americas'),
    ('scandinavian', 'anderstorp'),
    ('avus', 'avus'),
    ('baku', 'BAK'),
    ('bahrain', 'bahrain'),
    ('boavista', 'boavista'),
    ('brands hatch', 'brands_hatch'),
    ('bremgarten', 'bremgarten'),
    ('buddh', 'buddh'),
    ('catalunya', 'catalunya'),
    ('charade', 'charade'),
    ('fair park', 'dallas'),
    ('dallas', 'dallas'),
    ('detroit', 'detroit'),
    ('dijon', 'dijon'),
    ('donington', 'donington'),
    ('essarts', 'essarts'),
    ('estoril', 'estoril'),
    ('fuji', 'fuji'),
    ('juan g', 'galvez'),
    ('prince george', 'george'),
    ('hanoi', 'hanoi'),
    ('hockenheimring', 'hockenheimring'),
    ('hungaroring', 'hungaroring'),
    ('imperial', 'port_imperial'),
    ('dino ferrari', 'imola'),
    ('indianapolis', 'indianapolis'),
    ('carlos pace', 'interlagos'),
    ('istanbul', 'istanbul'),
    ('jarama', 'jarama'),
    ('piquet', 'jacarepagua'),
    ('jeddah', 'jeddah'),
    ('jerez', 'jerez'),
    ('kyalami', 'kyalami'),
    ('las vegas', 'las_vegas'),
    ('caesars', 'las_vegas'),
    ('le mans', 'lemans'),
    ('bugatti', 'lemans'),
    ('long beach', 'long_beach'),
    ('magny', 'magny_cours'),
    ('marina bay', 'marina_bay'),
    ('monaco', 'monaco'),
    ('monsanto', 'monsanto'),
    ('montju', 'montjuic'),
    ('monza', 'monza'),
    ('mosport', 'mosport'),
    ('mugello', 'mugello'),
    ('nivelles', 'nivelles'),
    ('nürburgring', 'nurburgring'),
    ('okayama', 'okayama'),
    ('a1', 'osterreichring'),
    ('pedralbes', 'pedralbes'),
    ('pescara', 'pescara'),
    ('phoenix', 'phoenix'),
    ('algarve', 'portimao'),
    ('red bull', 'red_bull_ring'),
    ('reims', 'reims'),
    ('ricard', 'ricard'),
    ('riverside', 'riverside'),
    ('hermanos', 'rodriguez'),
    ('sebring', 'sebring'),
    ('sepang', 'sepang'),
    ('shanghai', 'shanghai'),
    ('silverstone', 'silverstone'),
    ('sochi', 'sochi'),
    ('francorchamps', 'spa'),
    ('suzuka', 'suzuka'),
    ('tremblant', 'tremblant'),
    ('valencia', 'valencia'),
    ('villeneuve', 'villeneuve'),
    ('watkins', 'watkins_glen'),
    ('yas marina', 'yas_marina'),
    ('korea', 'yeongam'),
    ('zandvoort', 'zandvoort'),
    ('zeltweg', 'zeltweg'),
    ('zolder', 'zolder'),
]


def referencer(name):
    """Map a circuit name to a circuitId via case-insensitive keyword search.

    Returns the id paired with the first keyword (in table order) that
    occurs in the lower-cased name, or ``np.nan`` when none matches.
    """
    lowered = name.lower()
    for keyword, circuit_id in _CIRCUIT_KEYWORDS:
        if keyword in lowered:
            return circuit_id
    return np.nan

In [None]:
circuits_info['circuitId'] = circuits_info.name.apply(referencer)

In [None]:
circuits_info.circuitId.unique()

In [None]:
circuits_info.shape

In [None]:
circuits_info[circuits_info.circuitId.isnull()]

    Hard Rock Stadium is a circuit that has been newly designated for the 2022 season and therefore can be dropped from the data.

In [None]:
circuits_info.dropna(inplace=True)

In [None]:
circuits_info

In [None]:
circuits_info.shape

    The names and locations of tracks are contained within both the circuits and circuits_info dataframes; therefore, they will be dropped from the circuits_info dataframe before joining on circuitId for continuity purposes.

In [None]:
# 'name' and 'location' are removed from the scraped table before it is
# inner-joined to the API circuits on circuitId.
circuits_info = circuits_info.drop(columns=['name', 'location'])

circuits_complete = pd.merge(circuits, circuits_info, how='inner', on='circuitId')

In [None]:
circuits_complete.head()

# Collecting the Comparison Data

### Constructor Standings Data for 2020

In [None]:
# Query the Ergast API for the 2020 constructor standings and store
# position, constructorId and points for each entry.

constructor_standings = defaultdict(list)

url = 'http://ergast.com/api/f1/2020/constructorStandings.json'
r = requests.get(url)
json = r.json()

try:
    items = list(json['MRData']['StandingsTable']['StandingsLists'][0]['ConstructorStandings'])
except (KeyError, IndexError, TypeError) as err:
    # Still best-effort (an empty table is written), but no longer silent.
    print(f'Constructor standings payload has an unexpected shape ({err!r}); no rows collected.')
    items = []

for item in items:
    # Each field falls back to NaN independently so the columns stay aligned.
    try:
        constructor_standings['position'].append(int(item['position']))
    except (KeyError, TypeError, ValueError):
        constructor_standings['position'].append(np.nan)

    try:
        constructor_standings['constructor'].append(item['Constructor']['constructorId'])
    except (KeyError, TypeError):
        constructor_standings['constructor'].append(np.nan)

    # NOTE(review): int() rejects fractional strings such as '12.5', which
    # then become NaN -- confirm whether float() is wanted here.
    try:
        constructor_standings['points'].append(int(item['points']))
    except (KeyError, TypeError, ValueError):
        constructor_standings['points'].append(np.nan)


constructor_standings = pd.DataFrame(constructor_standings)
constructor_standings = constructor_standings.drop_duplicates().reset_index(drop=True)
constructor_standings.to_csv('./CSV/constructor_standings.csv')
constructor_standings

### Driver Standings Data for 2020

In [None]:
# Query the Ergast API for the 2020 driver standings and store
# driverId, position and points for each entry.

driver_standings = defaultdict(list)

url = 'http://ergast.com/api/f1/2020/driverStandings.json'
r = requests.get(url)
json = r.json()

try:
    items = list(json['MRData']['StandingsTable']['StandingsLists'][0]['DriverStandings'])
except (KeyError, IndexError, TypeError) as err:
    # Still best-effort (an empty table is written), but no longer silent.
    print(f'Driver standings payload has an unexpected shape ({err!r}); no rows collected.')
    items = []

for item in items:
    # Each field falls back to NaN independently so the columns stay aligned.
    try:
        driver_standings['driverId'].append(item['Driver']['driverId'])
    except (KeyError, TypeError):
        driver_standings['driverId'].append(np.nan)

    try:
        driver_standings['position'].append(int(item['position']))
    except (KeyError, TypeError, ValueError):
        driver_standings['position'].append(np.nan)

    # NOTE(review): int() rejects fractional strings such as '12.5', which
    # then become NaN -- confirm whether float() is wanted here.
    try:
        driver_standings['points'].append(int(item['points']))
    except (KeyError, TypeError, ValueError):
        driver_standings['points'].append(np.nan)


driver_standings = pd.DataFrame(driver_standings)
driver_standings.to_csv('./CSV/driver_standings.csv')
driver_standings

# Merging the Data

In [None]:
# Outer-join results with qualifying, then attach race, driver and
# circuit details one table at a time.
res_qual = results.merge(
    qualifying,
    how='outer',
    on=['circuitId', 'season', 'round', 'driverId', 'constructor'],
)
races_final = races_plus_all_weather.drop(columns=['lat', 'long', 'country'])
race_res_qual = races_final.merge(res_qual, how='outer', on=['circuitId', 'season', 'round'])
# These two columns are removed before the drivers table is joined.
race_res_qual = race_res_qual.drop(columns=['date_of_birth', 'nationality'])
driver_race_res_qual = race_res_qual.merge(drivers, how='outer', on='driverId')
merged = driver_race_res_qual.merge(circuits_complete, how='outer', on='circuitId')
merged = merged.drop(columns=['url'])

merged.head()

In [None]:
merged.to_csv('./CSV/merged_database.csv')

In [None]:
# merged = pd.read_csv('./CSV/merged_database.csv').drop(['Unnamed: 0'], axis=1)

In [None]:
# Parse both date columns, then derive each driver's age on race day
# as a timedelta.
merged['dateOfBirth'] = pd.to_datetime(merged['dateOfBirth'])
merged['date'] = pd.to_datetime(merged['date'])
# Subtract by column name instead of row position: positional access breaks
# silently when columns are reordered, and a row-wise apply is slow.
# NOTE(review): the previous code computed x[4] - x[21]; this assumes those
# positions were `date` and `dateOfBirth` -- confirm against merged.columns.
merged['ageDuringRace'] = merged['date'] - merged['dateOfBirth']

Many constructors' names have changed over the years due to things such as sponsorship and mergers, with minimal changes to the structure of the team. One example is Aston Martin, which has changed names twice in the hybrid era: it was bought from Force India by Lawrence Stroll and rebranded to Racing Point until the end of the 2020 season, before teaming up with Aston Martin as sponsors. Such teams are therefore treated as a single constructor.

All names have been kept synonymous with the names as of the 2020 season.

In [None]:
merged.constructor.unique()

In [None]:
def constructor_combine(constructor):
    """Collapse renamed constructor ids onto one id per team.

    Ids without an entry in the alias table are returned unchanged.
    """
    aliases = {
        'alpine': 'renault',
        'lotus_f1': 'renault',
        'force_india': 'racing_point',
        'aston_martin': 'racing_point',
        'toro_rosso': 'alphatauri',
        'marussia': 'manor',
        'sauber': 'alfa',
    }
    return aliases.get(constructor, constructor)

In [None]:
merged.constructor = merged.constructor.apply(constructor_combine)

In [None]:
merged.constructor.unique()

In [None]:
merged.head()

In [None]:
merged.columns

In [None]:
# Keep only the columns carried forward from the merged table: race
# identifiers, driver/constructor, grid and qualifying figures, circuit
# attributes, weather, and the race outcome (time, finish position,
# status, points).
main_df = merged[['season', 'round', 'race_name',
                     'name', 'constructor', 'grid', 'qual_position',
                     'q_best', 'q_worst', 'q_mean', 'ageDuringRace', 'circuitId',
                     'locality', 'country', 'type', 'direction', 'length', 
                     'weather', 'time', 'finish_position', 'status', 'points']]

# Additional Feature Extraction

In [None]:
# Order rows by race and finishing position, so the row with
# finish_position == 1 comes first within each (season, round) group.
main_df = main_df.sort_values(by=['season', 'round', 'finish_position'])
main_df.reset_index(inplace=True, drop=True)

# Time of the position-1 row, missing on every other row.
winner_time = main_df['time'].where(main_df['finish_position'] == 1)

# Forward-fill carries each winner's time down to the following rows until
# the next position-1 row. Assign the result back explicitly: an in-place
# ffill on a selected column is a chained operation that pandas does not
# guarantee to write back to the frame.
main_df['min'] = winner_time.ffill()
main_df['split_times'] = main_df['time'] - main_df['min']
main_df.split_times.isnull().sum()

In [None]:
# Fill missing splits from the previous row. The result is assigned back
# explicitly because an in-place ffill on a selected column is a chained
# operation that pandas does not guarantee to write back to the frame.
main_df['split_times'] = main_df['split_times'].ffill()

def split_compute(split, status):
    """Scale a split by the number of laps in a '+N Laps' status.

    Parameters
    ----------
    split : number or timedelta
        Split value for the row.
    status : str or other
        Finishing status; only strings of the form '+N Laps' trigger scaling.

    Returns
    -------
    ``split * N`` for a '+N Laps' status, otherwise ``split`` unchanged
    (this includes '+1 Lap', 'Finished' and non-string values such as NaN).
    """
    suffix = ' Laps'
    if isinstance(status, str) and status.startswith('+') and status.endswith(suffix):
        # Parse the whole lap count (not just its first character) and do so
        # without eval, which should never be run on scraped data.
        laps = status[1:-len(suffix)]
        if laps.isdecimal():
            return split * int(laps)
    return split



# Rescale every row's split according to its status, walking both
# columns in row order.
new_splits = [
    split_compute(row_split, row_status)
    for row_split, row_status in zip(main_df.split_times, main_df.status)
]

main_df['filled_splits'] = new_splits

In [None]:
main_df.isnull().sum()

In [None]:
# Remove the helper columns used to derive filled_splits, then discard
# every row that still has a missing value.
helper_columns = ['time', 'min', 'status', 'split_times']
main_df = main_df.drop(columns=helper_columns).dropna()
main_df.head()

In [None]:
main_df.shape

In [None]:
main_df.dtypes

In [None]:
main_df.to_csv('./CSV/main_df.csv')

# Tableau CSVs

### Age vs. Pointscoring

In [None]:
# Independent copy of the three columns needed for the age/points export.
age_points = merged[['name', 'ageDuringRace', 'points']].copy()
# Missing-value counts per column; the result is not stored.
age_points.isnull().sum()
# Discard rows with any missing value.
age_points.dropna(inplace=True)
# Convert the age timedelta to a whole number of days.
age_points.ageDuringRace = age_points.ageDuringRace.dt.days

In [None]:
def age_bracket(age):
    """Convert an age in days to completed years.

    Uses the mean year length of 365.25 days; dividing by 365 ignores leap
    days and pushes a driver into the next year of age several days before
    the actual birthday.

    Parameters
    ----------
    age : int or float
        Age in days.

    Returns
    -------
    int
        Whole years, rounded down.
    """
    return int(age // 365.25)

In [None]:
age_points.ageDuringRace = age_points.ageDuringRace.apply(age_bracket)

In [None]:
age_points.to_csv('./CSV/age_points.csv')

In [None]:
age_points[age_points.ageDuringRace==38]

### Home Nation Pointscoring

In [None]:
merged.nationality.unique()

In [None]:
merged.country.unique()

In [None]:
# Nationality adjective -> country name. Nationalities without an entry
# are deliberately left unmapped and come back as NaN.
_NATIONALITY_TO_COUNTRY = {
    'German': 'Germany',
    'British': 'UK',
    'Spanish': 'Spain',
    'French': 'France',
    'Russian': 'Russia',
    'Mexican': 'Mexico',
    'Brazilian': 'Brazil',
    'Japanese': 'Japan',
    'Australian': 'Australia',
    # Previously mapped to 'Austria', which is not the Dutch home nation.
    # NOTE(review): confirm 'Netherlands' matches the spelling used in the
    # race `country` column.
    'Dutch': 'Netherlands',
    'Belgian': 'Belgium',
    'Italian': 'Italy',
    'Canadian': 'Canada',
    'Monegasque': 'Monaco',
    'American': 'USA',
}


def home_nation(nat):
    """Return the country matching a driver nationality, or NaN if unmapped.

    Parameters
    ----------
    nat : str
        Nationality adjective, e.g. 'German'.

    Returns
    -------
    str or float
        Country name, or ``np.nan`` when the nationality has no entry.
    """
    return _NATIONALITY_TO_COUNTRY.get(nat, np.nan)

In [None]:
def driver_country_filter(country):
    """Keep a race country only if it is one of the listed driver nations.

    Parameters
    ----------
    country : str
        Country in which the race was held.

    Returns
    -------
    str or float
        ``country`` unchanged when it is in the list, otherwise ``np.nan``.
    """
    # 'Netherlands' is included so that races held there are kept for the
    # Dutch drivers in the data; 'Austria' is retained so rows that were
    # kept before are still kept.
    driver_nations = ['Germany', 'Denmark', 'UK', 'Spain', 'Finland', 'France',
                      'Russia', 'Mexico', 'Venezuela', 'Sweden', 'Brazil',
                      'Japan', 'Australia', 'Austria', 'Netherlands',
                      'Indonesia', 'Belgium', 'Italy', 'Canada', 'New Zealand',
                      'Monaco', 'Thailand', 'Poland', 'USA']
    if country in driver_nations:
        return country
    return np.nan


In [None]:
home_races = merged[['country', 'nationality', 'points']].copy()

In [None]:
home_races.dropna(inplace=True)

In [None]:
home_races['scored'] = (home_races['points']>0)*1

In [None]:
home_races.drop(['points'], axis=1, inplace=True)

In [None]:
home_races['nat_country'] = home_races.nationality.apply(home_nation)

In [None]:
home_races.country = home_races.country.apply(driver_country_filter)

In [None]:
home_races.dropna(inplace=True)

In [None]:
home_races.head()

In [None]:
# Share of point-scoring entries per (race country, driver nationality).
# Only 'scored' is averaged: home_races also holds the text column
# 'nat_country', and taking the mean of a text column raises on recent
# pandas. Sorting before grouping is unnecessary because the mean does not
# depend on row order.
h = home_races.groupby(['country', 'nationality'])[['scored']].mean()

In [None]:
h.sort_values(by=['country', 'scored'], ascending=[True, False]).head(30)

In [None]:
home_races.to_csv('./CSV/home_races.csv')

### Driver & Constructor Consistency

In [None]:
# Status strings that status_fault labels 'finish' (its default `no` list).
no_fault = ['Finished', '+2 Laps', '+1 Lap','+8 Laps', '+3 Laps', '+4 Laps', '+5 Laps','+6 Laps']
 
# Status strings that status_fault labels 'driver' (its default `driver` list).
driver_fault = ['Retired', 'Withdrew', 'Collision', 'Accident', 'Disqualified', 'Damage',
                'Spun off', 'Collision damage', 'Puncture', 'Rear wing', 'Tyre', 'Front wing',
                'Excluded', 'Illness']

# Status strings regarded as car failures. status_fault does not take this
# list as an argument: it labels everything outside the two lists above as
# 'car'.
car_fault = ['Suspension', 'Wheel', 'Vibrations', 'Engine', 'ERS',
              'Power loss', 'Water leak',  'Oil pressure', 'Hydraulics',
              'Steering', 'Power Unit', 'Brakes', 'Mechanical', 'Turbo',
              'Battery', 'Electrical', 'Gearbox', 'Wheel nut', 'Technical',
              'Fuel system', 'Clutch', 'Out of fuel', 'Driveshaft',
              'Transmission', 'Fuel pressure', 'Exhaust','Oil leak', 
              'Electronics', 'Drivetrain','Overheating',  'Water pressure',
              'Radiator','Debris', 'Throttle', 'Spark plugs', 'Brake duct', 'Seat']

In [None]:
def status_fault(status, no=no_fault, driver=driver_fault):
    """Classify a finishing status as 'finish', 'driver' or 'car'.

    Parameters
    ----------
    status : str or other
        Finishing status of one entry.
    no : list of str
        Statuses counted as a finish.
    driver : list of str
        Statuses attributed to the driver.

    Returns
    -------
    str
        'finish' for a status in ``no`` or any lapped finish of the form
        '+N Lap' / '+N Laps'; 'driver' for a status in ``driver``; 'car'
        for everything else, including missing (NaN) statuses.
    """
    if status in no:
        return 'finish'
    # Recognise every lapped finish, not only the lap counts enumerated in
    # `no`; otherwise e.g. '+7 Laps' would be counted as a car failure.
    if isinstance(status, str) and status.startswith('+'):
        lap_count, _, unit = status[1:].partition(' ')
        if lap_count.isdecimal() and unit in ('Lap', 'Laps'):
            return 'finish'
    if status in driver:
        return 'driver'
    return 'car'

In [None]:
status_issues = merged[['season', 'name', 'constructor', 'status']].copy()

In [None]:
status_issues['fault'] = status_issues.status.apply(status_fault)

In [None]:
status_issues.drop(['status'], axis=1, inplace=True)

In [None]:
driver_issues = status_issues[['season', 'name', 'fault']].copy()
driver_issues = driver_issues[driver_issues.fault!='car']

In [None]:
# driver_issues.fault = (driver_issues.fault=='finish')*1

In [None]:
# driver_issues.to_csv('./CSV/driver_issues.csv')

In [None]:
driver_issues.fault = (driver_issues.fault=='driver')*1

In [None]:
driver_issues.to_csv('./CSV/driver_issues_faults.csv')

In [None]:
# Mean of the 0/1 `fault` flag per (season, driver), ordered by season and
# then fault rate, both descending. The trailing [6:] skips the first six
# rows of that ordering.
# NOTE(review): the reason for skipping six rows is not evident here -- confirm.
driver_issues_grouped = driver_issues.groupby(['season', 'name']).agg('mean').sort_values(by=['season', 'fault'], ascending=False)[6:]
driver_issues_grouped.head(20)

In [None]:
constructor_issues = status_issues[['season', 'constructor', 'fault']].copy()
constructor_issues = constructor_issues[constructor_issues.fault!='driver']

In [None]:
constructor_issues.fault = (constructor_issues.fault=='car')*1

In [None]:
constructor_issues.to_csv('./CSV/constructor_issues_faults.csv')

In [None]:
constructor_issues.fault.value_counts()

In [None]:
constructor_issues.groupby('constructor').agg('mean').sort_values(by='fault', ascending=False).head(20)