In [64]:
import os

from typing import List

import numpy as np
import pandas as pd

In [61]:
# Season (value of the `year` column) that the cells below extract from the archive.
SEASON_YEAR = 2021

## Filter on the races in the season and curate the data

In [21]:
# Load the complete race list, keep the rows whose `year` equals SEASON_YEAR,
# renumber them from 0 and store the result (without the index) in ./data.
races = pd.read_csv('./archive/races.csv')

races_in_season = races.loc[races['year'] == SEASON_YEAR].reset_index(drop=True)
races_in_season.to_csv('./data/races.csv', index=False)

In [36]:
def scope_to_season(filename: str, year: int):
    """Export the rows of one archive CSV that belong to a given year.

    Reads ``./archive/<filename>``. When the file has a ``year`` column, the
    rows whose ``year`` equals ``year`` are re-indexed from 0 and written,
    without the index, to ``./data/season_<filename>``. Files without a
    ``year`` column are skipped and nothing is written for them.

    Args:
        filename: Name of the CSV file inside ``./archive/``.
        year: Value the ``year`` column must have for a row to be kept.

    Returns:
        None in every case.
    """
    archive_df = pd.read_csv('./archive/' + filename)
    if 'year' in archive_df.columns:
        season_rows = archive_df.loc[archive_df['year'] == year]
        season_df = pd.DataFrame(season_rows.reset_index(drop=True))
        season_df.to_csv(f'./data/season_{filename}', index=False)

In [24]:
# Names of every entry in the archive directory; os.listdir returns them in arbitrary order.
files = os.listdir('archive/')

In [37]:
# Run the season export for every archive entry except hidden dot-files.
for fname in files:
    if not fname.startswith('.'):
        scope_to_season(fname, SEASON_YEAR)

In [29]:
# Plain Python list with the raceId of every race in the season.
season_races_ids = races_in_season['raceId'].tolist()

In [40]:
# Keep only the qualifying rows whose raceId belongs to a race of the season
# and store them, re-indexed from 0, in ./data.
all_quali = pd.read_csv('./archive/qualifying.csv')
# Series.isin performs the membership test in one vectorised pass instead of
# scanning the id list once per row from Python.
season_quali = all_quali[all_quali['raceId'].isin(season_races_ids)]
season_quali.reset_index(drop=True).to_csv('./data/season_qualifying.csv', index=False)

In [45]:
def filter_for_ids(filename: str, id_col: str, ids: List[int]):
    """Export the rows of an archive CSV whose ``id_col`` value is in ``ids``.

    Reads ``./archive/<filename>``, keeps the rows whose ``id_col`` value is
    one of ``ids``, re-indexes them from 0 and writes them, without the index,
    to ``./data/season_<filename>`` (an existing file at that path is
    overwritten).

    Args:
        filename: Name of the CSV file inside ``./archive/``.
        id_col: Column whose values are matched against ``ids``.
        ids: Accepted identifiers; any list-like (list, NumPy array, ...) works.

    Returns:
        None in every case. When the file does not exist a message is printed
        and nothing is written; when ``id_col`` is not a column of the file
        nothing is written and nothing is printed.
    """
    try:
        all_df = pd.read_csv(f'./archive/{filename}')
    except FileNotFoundError:
        print('Supplied file does not exist in the archive')
        return

    if id_col not in all_df.columns:
        return

    # Vectorised membership test: avoids scanning `ids` once per row from
    # Python and always yields a boolean mask, even for an empty column.
    matching_df = all_df[all_df[id_col].isin(ids)]
    matching_df.reset_index(drop=True).to_csv(f'./data/season_{filename}', index=False)

In [47]:
# Export every race-keyed table restricted to the races of the season.
# Each filename needs its own comma: adjacent string literals are silently
# concatenated by Python, which would turn two entries into one bogus filename.
for fname in ['lap_times.csv', 'pit_stops.csv', 'results.csv',
              'driver_standings.csv', 'constructor_standings.csv']:
    filter_for_ids(fname, 'raceId', season_races_ids)

## Get the drivers who took part in the season

In [51]:
# Collect the distinct driverId values that appear in the results of the
# season's races. The variable name `all_resulst` (sic) is kept unchanged
# because other cells may refer to it.
all_resulst = pd.read_csv('./archive/results.csv')
# Series.isin does the membership test in one vectorised pass instead of
# scanning the id list once per row from Python.
results_in_season = all_resulst[all_resulst['raceId'].isin(season_races_ids)]
drivers_in_season = results_in_season['driverId'].unique()
len(drivers_in_season)

In [63]:
# Load the full drivers table and the full driver standings table from the archive.
all_drivers, all_driver_standings = (
    pd.read_csv('./archive/drivers.csv'),
    pd.read_csv('./archive/driver_standings.csv'),
)

In [66]:
# Export only the drivers whose driverId appears in the season's results.
filter_for_ids(filename='drivers.csv', id_col='driverId', ids=drivers_in_season)

In [67]:
# Export the driver standings rows of the season's drivers, matched on driverId.
# NOTE(review): filter_for_ids always writes to ./data/season_<filename>, so this
# replaces any raceId-filtered export of driver_standings.csv, and matching on
# driverId alone does not restrict the rows to the season's races. Confirm
# that this is intended.
filter_for_ids(filename='driver_standings.csv', id_col='driverId', ids=drivers_in_season)