In [17]:
import json 
import os

import numpy as np
import pandas as pd
import plotly as py

from collections import En

from typing import List, Optional

### What are the features I want for my model? 

1. Starting grid position relative to teammate
2. Qualifying pace relative to teammate
3. Average lap time compared to teammate
4. Median delta of season group average finish position
5. Median overtakes
6. Median penalties
7. Median spins or issues (how to get this information?)

In [2]:
# Relative directory prefixes (trailing slash included) used to build file paths.
DATA_DIR, ARCHIVE_DIR = './data/', './archive/'

## Cleaning the data (Prep)

In [8]:
# Season-level tables, read from DATA_DIR.
quali_df = pd.read_csv(f'{DATA_DIR}season_qualifying.csv')
results_df = pd.read_csv(f'{DATA_DIR}season_results.csv')
races_df = pd.read_csv(f'{DATA_DIR}season_races.csv')

# Status table, read from ARCHIVE_DIR; it is later joined on 'statusId'.
result_status_df = pd.read_csv(f'{ARCHIVE_DIR}status.csv')

In [4]:
# Preview the first five qualifying rows.
quali_df.head(n=5)

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,8735,1052,830,9,33,1,1:30.499,1:30.318,1:28.997
1,8736,1052,1,131,44,2,1:30.617,1:30.085,1:29.385
2,8737,1052,822,131,77,3,1:31.200,1:30.186,1:29.586
3,8738,1052,844,6,16,4,1:30.691,1:30.010,1:29.678
4,8739,1052,842,213,10,5,1:30.848,1:30.513,1:29.809


In [64]:
def conv_race_status(status: str, max_laps_down: int = 5) -> str:
    """
    Collapse a raw race status string into one of four categories.

    Args:
        status: Raw status text, e.g. 'Finished', 'Collision', '+2 Laps'.
        max_laps_down: Exclusive upper bound on the number of laps behind
            for a '+N Lap(s)' status to count as 'LapDown'. Defaults to 5.

    Returns:
        'Finished'    if the status is exactly 'Finished';
        'DriverError' for 'Disqualified', 'Accident', 'Collision', 'Spun off';
        'LapDown'     for '+N ...' statuses with N < max_laps_down;
        'DNF'         for everything else (including N >= max_laps_down).

    Raises:
        ValueError: if a status starts with '+' but the text between the
            '+' and the first space is not an integer.
    """
    if status == 'Finished':
        return 'Finished'
    if status in ('Disqualified', 'Accident', 'Collision', 'Spun off'):
        return 'DriverError'
    if status.startswith('+'):
        # '+2 Laps' -> 2: take the text between '+' and the first space.
        # Parse `status` itself; the previous code read an undefined name `x`.
        laps_down = int(status.split('+')[1].split(' ')[0])
        if laps_down < max_laps_down:
            return 'LapDown'
    return 'DNF'



In [50]:
# Columns that together identify one driver's entry in one race.
col_key_pair = ['raceId', 'driverId']

# One row per (raceId, driverId) group; only the two key columns are kept.
pair_counts = results_df.groupby(by=col_key_pair).count()
data_df = pair_counts.reset_index()[col_key_pair]

In [55]:
# Build the 'raceId_driverId' string key from the named columns.
# Selecting by label avoids integer-key lookups (`row[0]`) on a label-indexed
# Series, which pandas deprecates, and replaces the row-wise apply with
# vectorized string concatenation.
data_df['key'] = data_df['raceId'].astype(str) + '_' + data_df['driverId'].astype(str)

In [56]:
# Preview the first five key rows.
data_df.head(n=5)

Unnamed: 0,raceId,driverId,key
0,1051,1,1051_1
1,1051,4,1051_4
2,1051,8,1051_8
3,1051,20,1051_20
4,1051,815,1051_815


In [57]:
# Columns of results_df carried into the merged frame ('key' is the join column).
results_cols = ['key', 'constructorId', 'number', 'positionOrder',
                'milliseconds', 'rank', 'fastestLap',
                'fastestLapSpeed', 'statusId']

# Build the same 'raceId_driverId' key format used on data_df, by column name.
# The previous version read positions 1 and 2 of each row, which depends on
# the CSV column order and on deprecated positional Series access.
# NOTE(review): assumes positions 1 and 2 were raceId and driverId; confirm
# against season_results.csv.
results_df['key'] = results_df['raceId'].astype(str) + '_' + results_df['driverId'].astype(str)

In [62]:
# Inner-join the key frame with the selected result columns on 'key'.
result_subset = results_df[results_cols]
merged = pd.merge(data_df, result_subset, on='key', how='inner')

In [69]:
# Inner-join the status table onto the merged results via 'statusId'.
merged = pd.merge(merged, result_status_df, on='statusId', how='inner')

In [91]:
# Classify each raw status string with conv_race_status.
raw_status = merged['status']
merged['statusResult'] = raw_status.map(conv_race_status)

### Ignore results where driver had to retire due to technical issues 

In [95]:
# Shape (rows, columns) of the rows classified as 'DNF'.
is_dnf = merged['statusResult'] == 'DNF'
merged.loc[is_dnf].shape

(31, 13)

In [97]:
# Remove the 'DNF' rows from `merged` in place, selected by index label.
dnf_mask = merged['statusResult'] == 'DNF'
dnf_index = merged.loc[dnf_mask].index
merged.drop(index=dnf_index, inplace=True)