In [28]:
import os
import csv
import pandas as pd

In [29]:
pwd = os.getcwd()
file_paths = ['/scraped_data/scrape_of_rows_odd.csv', '/scraped_data/scrape_of_rows_even.csv']

In [30]:
def read_to_list(index):
    with open(pwd + file_paths[index], 'r') as f:
        reader = csv.reader(f)
        return list(reader)

In [48]:
odd_rows_raw = read_to_list(0)
even_rows_raw = read_to_list(1)

# <p align=center> Pre Processing Data </p>
## <p>Noticing that the data from the website is displayed rather annoyingly, in the form of span rows, I have decided to alternate the rows with the view of pasting together both list into a DataFrame. To acheieve this, I have to pre process the data to have it in the correct sizing, format to import into the DataFrame.  The odd rows in particular, the way the data was displayed resulted in some entries being not of the same length, thus being out of sync with most of the data pulled from this subsection of rows.  The best way I came about to solving this issue, was to isolate the rows that are not needed, in particular the empty lists, and those that took the header of the table, and remove them.  Then insert a generic entry into the lists that remain, to align the data with the rest.  I will go through and clean this data once It is active in the DataFrame</p>

In [49]:
odd_df = pd.DataFrame(odd_rows_raw, columns=["id", "track", "team", "driver_num", "driver", "model", "engine", "tyre", "grid_pos", "fin_pos", "notes"]) 
print(odd_df.shape)

odd_df = odd_df.iloc[odd_df.index[46:68]].shift(periods=2, axis="columns")

(344, 11)


In [86]:
# Cleaning the list removing unnessecary lists
def remove_unneeded_lists(list_name):
    for ls in list_name: 
        #print(ls)
        if len(ls) == 0 or ls[0] == '' or ls[0] == 'n': 
            list_name.remove(ls)

In [87]:
# Add's 'id' and 'track' column to the lists where it is missing
def add_generic_data(list_name):
    for ls in list_name: 
        if len(ls) < 11: 
            ls.insert(0, 'id')
            ls.insert(1, 'track')

In [57]:
# Adding the year to the list throughout the season results. 
def insert_years(list_name):
    races_per_season = { 
        '2015' : '19',
        '2016' : '21',
        '20116': '21',
        '2017' : '20',
        '2018' : '21',
        '2019' : '21', 
        '2020' : '17', 
        '2021' : '16'
    }

    i = 0
    for k,v in races_per_season.items(): 
        total_races = 0
        while total_races < int(v):
            try:
                list_name[i].insert(1, k)
                total_races += 1
                i += 1
            except IndexError:
                print("Race's don't add up, are you trying to pass more races than you have data?")
                break

In [90]:
remove_unneeded_lists(odd_rows_raw)
insert_years(odd_rows_raw)

In [95]:
# Copy of the uniformed list as bacup. 
odd_rows_uniformed = [row.copy() for row in odd_rows_raw]

In [96]:
odd_rows_uniformed[1]

['168',
 '2015',
 'Malaysia',
 'Scuderia Toro Rosso',
 '33',
 'VERSTAPPEN Max',
 'STR10',
 'Renault',
 'Pirelli',
 '6',
 '7',
 '']

# <p align=center>Creating the first DataFrame</p>
## Now that we have pre processed our odd data, by adding the requried indices to make it conform to the main list we can create our first DataFrame. To enable us to have something to join on easier later, I am making the id, start from 0 - this makes sense as it is designed to follow the stats from the start of Max's F1 career. 

In [84]:
# Sets the race id according to the race number in which Max Participated from 2015 onwards. 
def set_race_id(list_name, race_id_start, race_number_start, race_number_end):    
    race_id = race_id_start
    for ls in list_name[race_number_start:race_number_end]:
        try: 
            ls[0] = race_id
            race_id += 1
        except IndexError:
            print("index error, not enough races to complete.")

In [99]:
odd_df = pd.DataFrame(odd_rows_uniformed, columns=["id", "year", "track", "team", "driver_num", "driver", "model", "engine", "tyre", "grid_pos", "fin_pos", "notes"])
odd_df.head()

Unnamed: 0,id,year,track,team,driver_num,driver,model,engine,tyre,grid_pos,fin_pos,notes
0,1,2015,Australia,Scuderia Toro Rosso,33,VERSTAPPEN Max,STR10,Renault,Pirelli,11,ab,Engine
1,2,2015,Malaysia,Scuderia Toro Rosso,33,VERSTAPPEN Max,STR10,Renault,Pirelli,6,7,
2,3,2015,China,Scuderia Toro Rosso,33,VERSTAPPEN Max,STR10,Renault,Pirelli,13,17,Engine
3,4,2015,Bahrain,Scuderia Toro Rosso,33,VERSTAPPEN Max,STR10,Renault,Pirelli,15,ab,Electrics
4,5,2015,Spain,Scuderia Toro Rosso,33,VERSTAPPEN Max,STR10,Renault,Pirelli,6,11,


In [98]:
set_race_id(odd_rows_uniformed, 1, 0, 40) # Isolates Toro Rosso from 2015 - 2016
set_race_id(odd_rows_uniformed, 20, 40, -1) # RedBull from 2016 onward. 

In [100]:
odd_rows_uniformed[39:41]

[[40,
  '2016',
  'Abu Dhabi',
  'Scuderia Toro Rosso',
  '26',
  'KVYAT Daniil',
  'STR11',
  'Ferrari',
  'Pirelli',
  '17',
  'ab',
  'Gearbox'],
 [20,
  '20116',
  'Australia',
  'Red Bull Racing',
  '3',
  'RICCIARDO Daniel',
  'RB12',
  'TAG Heuer',
  'Pirelli',
  '8',
  '4',
  '']]

# <p align=center> Now for the even rows </p>
## Now we repeat the above process with the even rows

In [103]:
remove_unneeded_lists(even_rows_raw)

In [106]:
even_rows_uniformed = [row.copy() for row in even_rows_raw]

In [108]:
even_rows_uniformed[1]

['id',
 'track',
 'Scuderia Toro Rosso',
 '55',
 'SAINZ Carlos',
 'STR10',
 'Renault',
 'Pirelli',
 '15',
 '8',
 '']

In [110]:
insert_years(even_rows_uniformed)

Race's don't add up, are you trying to pass more races than you have data?


In [124]:
set_race_id(even_rows_uniformed, 1, 0, 40)
set_race_id(even_rows_uniformed, 21, 40, len(even_rows_uniformed))