In [1]:
import pandas as pd
import numpy as np

In [3]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it would also hide any pandas
# deprecation or chained-assignment warnings raised by later cells — consider
# narrowing it to the specific category that is noisy.
warnings.filterwarnings('ignore')

In [4]:
# Location of the processed F1 dataset (CSV), relative to the current working directory.
file_path = './data/f1_data_processed.csv'

In [22]:
# Load the complete processed dataset; `data_df` keeps every column for inspection.
data_df = pd.read_csv(file_path)
# Keep only the columns needed for time imputation. The explicit `.copy()` makes
# `df` an independent frame, so writing to it later can neither raise
# SettingWithCopyWarning nor be confused with a write into `data_df`.
df = data_df[['DriverId', 'Year', 'GridPosition', 'TeamId', 'Time', 'Status',
              'TotalLaps', 'RaceName', 'Finished']].copy()

In [23]:
def _laps_behind(status):
    """
    Extract the lap deficit from a "+{n} Lap(s)" status value.

    Parameters:
        status: A single value taken from the 'Status' column.

    Returns:
    int or None: n for statuses such as "+1 Lap" / "+3 Laps"; None when the
    value is not a string, does not start with '+', or has no integer after it.
    """
    if not isinstance(status, str) or not status.startswith('+'):
        return None
    try:
        # First whitespace-separated token without its leading '+': "+{n} Lap(s)" -> n
        return int(status.split()[0][1:])
    except ValueError:
        # The Status does not contain a valid "+{n} Lap(s)" format
        return None


def impute_time(race):
    """
    Time imputation for finished racers.

    Rows whose 'Time' is missing and whose 'Status' has the form "+{n} Lap(s)"
    receive ``max_time / total_laps * n + min_time``, where ``max_time`` and
    ``min_time`` are the largest and smallest recorded 'Time' in the race and
    ``total_laps`` is the 'TotalLaps' value of the first row. The result is
    floored at ``max_time`` so an imputed time is never below the slowest
    recorded one. All other rows are left as they are.

    The input frame is NOT modified: the function works on a copy, because
    pandas does not support functions that mutate the group handed to them by
    ``groupby(...).apply``.

    Parameters:
        race (pd.DataFrame): The current race. Must contain the columns
            'Time', 'Status' and 'TotalLaps'.

    Returns:
    pd.DataFrame: Imputed race (a new frame with the same index and columns).
        Returned unchanged when the race is empty, has no recorded 'Time' at
        all, or has a missing / non-positive 'TotalLaps', since no meaningful
        estimate exists in those cases.
    """
    race = race.copy()
    if race.empty:
        return race

    max_time = race['Time'].max()
    min_time = race['Time'].min()
    total_laps = race['TotalLaps'].iloc[0]

    # Without a reference time or a usable lap count the formula would only
    # produce NaN / inf, so leave the missing values untouched.
    if pd.isna(max_time) or pd.isna(total_laps) or total_laps <= 0:
        return race

    # Lap deficit per row (NaN where the status is not "+{n} Lap(s)").
    # Positional (default index) on purpose, so duplicate labels in
    # `race.index` cannot cause mis-assignment.
    laps = pd.Series([_laps_behind(status) for status in race['Status']],
                     dtype='float64')
    needs_time = race['Time'].isna().to_numpy() & laps.notna().to_numpy()
    if not needs_time.any():
        return race

    # Same arithmetic order as the scalar formula, applied to all rows at once,
    # then floored at the slowest recorded time.
    imputed = (max_time / total_laps * laps[needs_time] + min_time).clip(lower=max_time)
    race.loc[needs_time, 'Time'] = imputed.to_numpy()

    return race

In [27]:
# Impute missing times one race at a time (a race is a Year/RaceName pair),
# then discard the index produced by the grouping in favour of a fresh 0..n-1 one.
df = (
    df.groupby(['Year', 'RaceName'])
    .apply(impute_time)
    .reset_index(drop=True)
)

In [28]:
# Preview the first 20 rows of `df`.
df.head(20)

Unnamed: 0,DriverId,Year,GridPosition,TeamId,Time,Status,TotalLaps,RaceName,Finished
0,hamilton,2018,1,mercedes,5980.382,Finished,55,Abu Dhabi Grand Prix,1
1,vettel,2018,3,ferrari,5982.963,Finished,55,Abu Dhabi Grand Prix,1
2,max_verstappen,2018,6,red_bull,5993.088,Finished,55,Abu Dhabi Grand Prix,1
3,ricciardo,2018,5,red_bull,5995.761,Finished,55,Abu Dhabi Grand Prix,1
4,bottas,2018,2,mercedes,6028.339,Finished,55,Abu Dhabi Grand Prix,1
5,sainz,2018,11,renault,6052.93,Finished,55,Abu Dhabi Grand Prix,1
6,leclerc,2018,8,sauber,6071.171,Finished,55,Abu Dhabi Grand Prix,1
7,perez,2018,14,force_india,6071.657,Finished,55,Abu Dhabi Grand Prix,1
8,grosjean,2018,7,haas,6090.775764,+1 Lap,55,Abu Dhabi Grand Prix,1
9,kevin_magnussen,2018,13,haas,6090.775764,+1 Lap,55,Abu Dhabi Grand Prix,1


In [10]:
# List the columns available in the full dataset.
data_df.columns

Index(['Unnamed: 0', 'DriverId', 'TeamId', 'Position_Race',
       'ClassifiedPosition', 'GridPosition', 'Time', 'Status', 'RaceDate',
       'TotalLaps', 'Position_Qual', 'Q1_Qual', 'Q2_Qual', 'Q3_Qual',
       'AirTemp', 'Humidity', 'Pressure', 'Rainfall', 'TrackTemp',
       'WindDirection', 'WindSpeed', 'Year', 'RaceName', 'TotalLength',
       'Finished'],
      dtype='object')

In [17]:
# Rows flagged as finished (Finished == 1) that nevertheless have no recorded Time.
data_df.loc[data_df['Finished'].eq(1) & data_df['Time'].isna()]

Unnamed: 0.1,Unnamed: 0,DriverId,TeamId,Position_Race,ClassifiedPosition,GridPosition,Time,Status,RaceDate,TotalLaps,...,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,Year,RaceName,TotalLength,Finished
14,14,brendon_hartley,toro_rosso,15,15,16,,+1 Lap,3/25/2018 5:10,58,...,30.915315,997.003604,0.045045,36.324324,294.126126,3.691892,2018,Australian Grand Prix,306124,1
26,26,alonso,mclaren,7,7,13,,+1 Lap,4/8/2018 15:10,57,...,47.363107,1009.494175,0.000000,32.198058,167.407767,0.958252,2018,Bahrain Grand Prix,308484,1
27,27,vandoorne,mclaren,8,8,14,,+1 Lap,4/8/2018 15:10,57,...,47.363107,1009.494175,0.000000,32.198058,167.407767,0.958252,2018,Bahrain Grand Prix,308484,1
28,28,ericsson,sauber,9,9,17,,+1 Lap,4/8/2018 15:10,57,...,47.363107,1009.494175,0.000000,32.198058,167.407767,0.958252,2018,Bahrain Grand Prix,308484,1
29,29,ocon,force_india,10,10,8,,+1 Lap,4/8/2018 15:10,57,...,47.363107,1009.494175,0.000000,32.198058,167.407767,0.958252,2018,Bahrain Grand Prix,308484,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,1333,russell,williams,14,14,15,,+1 Lap,5/23/2021 13:00,78,...,61.424444,1015.571111,0.000000,37.595556,202.081481,0.316296,2021,Monaco Grand Prix,260286,1
1334,1334,latifi,williams,15,15,18,,+1 Lap,5/23/2021 13:00,78,...,61.424444,1015.571111,0.000000,37.595556,202.081481,0.316296,2021,Monaco Grand Prix,260286,1
1335,1335,tsunoda,alphatauri,16,16,16,,+1 Lap,5/23/2021 13:00,78,...,61.424444,1015.571111,0.000000,37.595556,202.081481,0.316296,2021,Monaco Grand Prix,260286,1
1336,1336,mazepin,haas,17,17,19,,+3 Laps,5/23/2021 13:00,78,...,61.424444,1015.571111,0.000000,37.595556,202.081481,0.316296,2021,Monaco Grand Prix,260286,1


In [7]:
# Enumerate the distinct Status values, e.g. to see which ones have the "+{n} Lap(s)" form.
data_df['Status'].unique()

array(['Finished', '+1 Lap', 'Wheel', 'Engine', 'Steering', 'Brakes',
       'Puncture', 'Electrical', 'Collision damage', 'Accident',
       'Collision', '+2 Laps', '+3 Laps', 'Gearbox', 'Oil leak', 'Turbo',
       'Exhaust', 'Fuel pressure', 'Hydraulics', 'Tyre', 'Power Unit',
       'Retired', 'Power loss', 'Suspension', 'Disqualified',
       'Mechanical', 'Battery', 'Overheating', 'Damage', 'Out of fuel',
       'Transmission', 'Spun off', 'Water pressure', 'Withdrew',
       'Electronics', '+5 Laps', 'Debris', 'Radiator', 'Illness',
       'Rear wing', 'Wheel nut', 'Driveshaft'], dtype=object)

In [8]:
# NOTE(review): the recorded output of this cell is a KeyError, yet the column
# listing recorded for `data_df` elsewhere in this notebook includes 'TotalLaps'.
# The execution counts are out of order, so this output looks stale — re-run
# after the data-loading cell and confirm.
data_df['TotalLaps']

KeyError: 'TotalLaps'