In [None]:
# IMPORTS
import pandas as pd
import numpy as np

## READING SEPARATE CSV FILES

In [None]:
def _read_utf16_csv(csv_path):
    """Read one UTF-16 encoded CSV file into a DataFrame."""
    return pd.read_csv(csv_path, encoding='utf-16')


# One DataFrame per raw table, loaded in the same order as the files are listed.
laps_df = _read_utf16_csv('laps.csv')
fcyphases_df = _read_utf16_csv('fcyphases.csv')
races_df = _read_utf16_csv('races.csv')
drivers_df = _read_utf16_csv('drivers.csv')
starterfields_df = _read_utf16_csv('starterfields.csv')

## MERGING DRIVERS AND RACES TABLES TOGETHER

In [None]:
# LAPS -> race_id + driver_id
# DRIVERS -> id(driver_id)
# RACES -> id(race_id)

# # Merge the dataframes based on common columns
# merged_df = pd.merge(laps_df, drivers_df, left_on='driver_id', right_on='id')
# merged_df = pd.merge(merged_df, races_df, left_on='race_id', right_on='id')
# merged_df = merged_df.drop(['id_y'], axis=1)
# # Print the merged dataframe
# merged_df

# Save the DataFrame as a CSV file
#merged_df.to_csv('main.csv', index=False)

## NOW I HAVE MY MAIN DF -> MAIN.CSV

In [None]:
# Load the merged table from main.csv.
main_df = pd.read_csv('main.csv', encoding='utf-8')

# Expected dtype of each column.
# NOTE(review): this mapping is only defined here; it is not passed to
# read_csv in this cell, so it has no effect on how main.csv is parsed.
dtypes = {
    "id": float,
    "carno": float,
    "initials": object,
    "name": object,
    "race_id": float,
    "driver_id": float,
    "team": object,
    "teamcolor": object,
    "enginemanufacturer": object,
    "gridposition": float,
    "status": object,
    "resultposition": float,
    "completedlaps": float,
    "speedtrap": float,
    "date": object,
    "season": object,
    "location": str,
    "availablecompounds": object,
    "comment": object,
    "nolaps": float,
    "nolapsplanned": float,
    "tracklength": float,
    "position": float,
    "q1laptime": float,
    "q2laptime": float,
    "q3laptime": float,
    "lapno": float,
    "laptime": float,
    "racetime": float,
    "gap": float,
    "interval": float,
    "compound": object,
    "pitintime": object,
    "pitstopduration": object,
    "nextcompound": object,
    "startlapprog_vsc": float,
    "endlapprog_vsc": float,
    "age_vsc": float,
    "startlapprog_sc": float,
    "endlapprog_sc": float,
    "age_sc": float,
    "accidents": float,
    "failures": float,
    "startracetime": float,
    "endracetime": float,
    "startraceprog": float,
    "endraceprog": float,
    "startlap": float,
    "endlap": float,
    "type": object,
}

# Give the 'id_x' column (presumably a merge suffix - TODO confirm) its plain name.
main_df = main_df.rename(columns={'id_x': 'id'})

In [None]:
#main_df

In [None]:
# The 'comment' column is considered irrelevant for this analysis, so remove it.
main_df = main_df.drop(columns=['comment'])

In [None]:
main_df.info()

# Feature Engineering

In [None]:
# OUTPUT FEATURE - PIT (1) / NO PIT (0)
# A lap counts as a pit stop when 'pitintime' is present and not equal to 0.
main_df['pitstop'] = (
    main_df['pitintime'].notna() & main_df['pitintime'].ne(0)
).astype('int64')

In [None]:
counts = main_df['pitstop'].value_counts()
print(counts)

In [None]:
 def race_progress(main_df):
     """Add a ``race_progress`` column: race time relative to the driver's final race time.

     For every (race_id, driver_id) pair, each lap's ``racetime`` is divided by
     the largest ``racetime`` of that pair, so the value reaches 1.0 on the
     pair's last timed lap.

     Parameters
     ----------
     main_df : pandas.DataFrame
         Lap table with ``race_id``, ``driver_id`` and ``racetime`` columns.
         It is not modified.

     Returns
     -------
     pandas.DataFrame
         A new frame with the extra ``race_progress`` column. Rows keep their
         index labels and are ordered by race, then by driver (both in order
         of first appearance), then by their original order. Rows whose
         ``race_id`` or ``driver_id`` is missing are left out.
     """
     if main_df.empty:
         return main_df.assign(race_progress=np.nan)

     # factorize numbers the keys by first appearance and marks missing keys with -1.
     race_rank = pd.factorize(main_df['race_id'])[0]
     driver_rank = pd.factorize(main_df['driver_id'])[0]
     has_keys = (race_rank >= 0) & (driver_rank >= 0)

     # lexsort is stable and treats the LAST key as the primary one:
     # races first, then drivers, original row order inside each pair.
     order = np.lexsort((driver_rank, race_rank))
     order = order[has_keys[order]]
     grouped_laps = main_df.iloc[order]

     # One vectorised pass instead of a concat per (race, driver) pair.
     # The grouped max skips NaN race times, so a missing first lap time no
     # longer turns the whole stint into NaN.
     final_racetime = grouped_laps.groupby(
         ['race_id', 'driver_id'], sort=False
     )['racetime'].transform('max')

     return grouped_laps.assign(race_progress=grouped_laps['racetime'] / final_racetime)

main_df = race_progress(main_df)

In [None]:
def calculate_tyreageprogress(row):
    """Compute a tyre-age progress value for one lap row.

    Reads the module-level ``fcyphases_df`` and ``main_df``. Uses the row's
    ``race_id``, ``lapno`` and ``nextcompound`` and the ``tireage``, ``lapno``
    and ``nolaps`` columns of ``main_df``.

    Returns ``np.nan`` when ``main_df`` holds no row for the previous lap of
    the same race; otherwise a ratio clipped to [0, 1] and then divided by the
    race's ``nolaps``.

    NOTE(review): several steps below look unintended (see inline notes);
    the formula should be confirmed before this feature is relied on.
    """
    race_id = row['race_id']
    lapno = row['lapno']
    nextcompound = row['nextcompound']
    
    # All VSC / SC phases of the race, regardless of the lap they occurred on.
    vsc_phases = fcyphases_df[(fcyphases_df['race_id'] == race_id) & (fcyphases_df['type'] == 'VSC')]
    sc_phases = fcyphases_df[(fcyphases_df['race_id'] == race_id) & (fcyphases_df['type'] == 'SC')]
    
    previous_lapno = lapno - 1
    # NOTE(review): filtered by race and lap only, not by driver, so index[0]
    # below picks the first matching row of any driver - confirm intent.
    previous_lap_data = main_df[(main_df['race_id'] == race_id) & (main_df['lapno'] == previous_lapno)]
    
    if previous_lap_data.empty:
        return np.nan
    
    # NOTE(review): this writes to a filtered selection of main_df, not to main_df itself.
    previous_lap_data['tyreageprogress'] = np.nan  # Create 'tyreageprogress' column
    
    # Every selected row has lapno == previous_lapno, so the bracketed
    # difference is 0 and total_age equals the row's 'tireage'.
    total_age = previous_lap_data.at[previous_lap_data.index[0], 'tireage'] + (previous_lapno - previous_lap_data.at[previous_lap_data.index[0], 'lapno'])
    tyre_age_progress = previous_lap_data.at[previous_lap_data.index[0], 'tireage'] / total_age
    
    # Scale up when the race had any VSC / SC phase at all (not lap-specific).
    if not vsc_phases.empty:
        tyre_age_progress *= 1.5
    if not sc_phases.empty:
        tyre_age_progress *= 1.75
    
    tyre_age_progress = np.clip(tyre_age_progress, 0.0, 1.0)
    
    # NOTE(review): a missing value read from a DataFrame is usually NaN, which
    # is not None, so this branch is presumably always entered - confirm.
    if nextcompound is not None:
        # Always NaN here: the column was set to NaN a few lines above.
        previous_tyre_age = previous_lap_data.at[previous_lap_data.index[0], 'tyreageprogress']
        # NOTE(review): tyre_age_progress looks like a scalar at this point, so
        # the item assignments below would presumably raise TypeError when
        # nextcompound is 'hard', 'medium' or 'soft' - confirm.
        if nextcompound == 'hard':
            tyre_age_progress[nextcompound] = 0.388 * previous_tyre_age
        elif nextcompound == 'medium':
            tyre_age_progress[nextcompound] = 0.341 * previous_tyre_age
        elif nextcompound == 'soft':
            tyre_age_progress[nextcompound] = 0.295 * previous_tyre_age
    
    # Normalise by the 'nolaps' value of the first row of this race.
    race_nolaps = main_df.loc[main_df['race_id'] == race_id, 'nolaps'].iloc[0]
    tyre_age_progress /= race_nolaps
    
    return tyre_age_progress

main_df['tyreageprogress'] = main_df.apply(calculate_tyreageprogress, axis=1)


In [None]:
# Position Feature

# 'leader' for laps run in position 1, 'pursuer' for every other lap
# (including laps with a missing position).
main_df['is_leader'] = main_df['position'].eq(1).map({True: 'leader', False: 'pursuer'})

In [None]:
# Relative Compound Feature

# Compound code -> relative tyre type used in the race.
compound_dict = {
    "A1": "hard",
    "A2": "hard",
    "A3": "medium",
    "A4": "soft",
    "A5": "soft",
    "A6": "soft",
    "A7": "soft",
}

# Codes that are not in the mapping become NaN.
relative_compound = main_df['compound'].map(compound_dict)
main_df['relativecompound'] = relative_compound

In [None]:
main_df

In [None]:
# Race Track Category Feature

# Track category number of every race location.
location_dict = {
    "Austin": 2,
    "Baku": 2,
    "Budapest": 2,
    "Catalunya": 1,
    "Hockenheim": 2,
    "KualaLumpur": 2,
    "LeCastellet": 2,
    "Melbourne": 2,
    "MexicoCity": 2,
    "MonteCarlo": 2,
    "Montreal": 3,
    "Monza": 2,
    "Sakhir": 1,
    "SaoPaulo": 1,
    "Shanghai": 2,
    "Silverstone": 1,
    "Singapore": 3,
    "Sochi": 2,
    "Spa": 1,
    "Spielberg": 2,
    "Suzuka": 1,
    "YasMarina": 3,
}

# Look up each row's location; locations missing from the mapping become NaN.
# The column is created directly by this assignment.
main_df['racetrackcat'] = main_df['location'].map(location_dict)
#df.racetrackcat

In [None]:
counts = main_df['racetrackcat'].value_counts()
print(counts)

In [None]:
# FCY Status Feature

# creating a new column in the df to store the FCY status
main_df['fcystatus'] = 0

def fcystatus(race_id, lapno, phases_df=None):
    """Return the full-course-yellow (FCY) status code of one lap.

    Status codes:
        0 - no FCY phase covers the lap
        1 - first lap of a VSC phase
        2 - further lap of a VSC phase
        3 - first lap of an SC phase
        4 - further lap of an SC phase

    A phase covers a lap when ``startlap <= lapno <= endlap``. The bounds are
    inclusive so that the first lap of a phase, and phases that start and end
    on the same lap, are recognised. The phase kind is taken from the phase
    table's ``type`` column ('VSC' or 'SC'). When several phases cover the
    lap, the first one in table order wins.

    Parameters
    ----------
    race_id : scalar
        Race identifier to look up in the phase table.
    lapno : number
        Lap number to classify. A missing value yields 0.
    phases_df : pandas.DataFrame, optional
        Phase table with ``race_id``, ``startlap``, ``endlap`` and ``type``
        columns. Defaults to the module-level ``fcyphases_df``.
        Assumes ``startlap``/``endlap`` are whole lap numbers - TODO confirm.

    Returns
    -------
    int
        One of the status codes listed above.
    """
    if phases_df is None:
        phases_df = fcyphases_df
    if pd.isnull(lapno):
        return 0

    first_lap_code = {'VSC': 1, 'SC': 3}
    race_phases = phases_df[phases_df['race_id'] == race_id]

    for _, phase in race_phases.iterrows():
        start_lap = phase['startlap']
        end_lap = phase['endlap']

        # Phases without lap bounds cannot be matched against a lap number.
        if pd.isnull(start_lap) or pd.isnull(end_lap):
            continue
        if not start_lap <= lapno <= end_lap:
            continue

        code = first_lap_code.get(phase['type'])
        if code is None:
            # Unknown phase type: ignore it rather than guess a status.
            continue

        # The "further laps" code is always the first-lap code plus one.
        return code if lapno == start_lap else code + 1

    return 0

main_df['fcystatus'] = main_df.apply(lambda row: fcystatus(row['race_id'], row['lapno']), axis=1)

In [None]:
main_df

In [None]:
# Remaining Pit Stops Feature

# Every driver starts a race with this many pit stops available.
MAX_PIT_STOPS = 3

# Pit stops made so far by each id within a race, counted in ascending lap
# order and including the current lap. Grouping by race keeps laps of
# different races from influencing each other.
pit_stops_done = (
    main_df.assign(_pitted=main_df['pitintime'].notna().astype(int))
    .sort_values('lapno', kind='mergesort')
    .groupby(['race_id', 'id'], sort=False)['_pitted']
    .cumsum()
)

# The assignment aligns on the index, so the counts computed on the sorted
# rows land on the matching rows of main_df (this assumes unique index labels).
# The counter never drops below zero.
main_df['remaining_pit_stops'] = (MAX_PIT_STOPS - pit_stops_done).clip(lower=0)

In [None]:
main_df

In [None]:
# Tyre Change of Pursuer Feature

# Order the laps per id so that a shift inside each group yields the previous lap.
main_df = main_df.sort_values(by=['id', 'lapno'])

# Previous-lap values within each id group (NaN on the first lap of a group).
previous_lap = main_df.groupby('id')[['compound', 'tireage', 'pitstop', 'position']].shift()

pursuer_compound = previous_lap['compound']
pursuer_tyreage = previous_lap['tireage']
pursuer_pitstop = previous_lap['pitstop'].astype(float)
pursuer_position = previous_lap['position']

# Flag (1.0 / 0.0) laps where, compared with the previous lap of the same id:
# no pit stop was recorded, the compound differs, the tyre age was at least 2
# and the position differs.
no_previous_pit = pursuer_pitstop.ne(1)
compound_differs = pursuer_compound.ne(main_df['compound'])
previous_tyre_aged = pursuer_tyreage.ge(2)
position_differs = pursuer_position.ne(main_df['position'])

main_df['pursuer_tyre_change'] = (
    no_previous_pit & compound_differs & previous_tyre_aged & position_differs
).astype(float)

In [None]:
# Close Ahead Feature - 1 is True / 0 is False

# 1 when the interval is at most 1.5, otherwise 0 (a missing interval gives 0).
main_df['close_ahead'] = main_df['interval'].le(1.5).astype('int64')

In [None]:
main_df

## PREPROCESSING

In [None]:
# FILTER 1 - Removing WET races

# Keep only the rows whose "compound" is neither "W" nor "I".
# NOTE(review): this removes individual laps, not whole races - confirm that
# this matches the heading above.
on_wet_compound = main_df['compound'].isin(['W', 'I'])
main_df = main_df.loc[~on_wet_compound]

main_df

In [None]:
# FILTER 2 - Removing data related to drivers making more than 3 pit stops

MAX_PIT_STOPS_PER_RACE = 3

# Number of recorded pit stop durations per (id, race_id) pair, broadcast to
# every lap of that pair. One grouped pass replaces the per-pair rescan of the
# whole frame and no longer shadows the builtin `id`.
pit_stop_count = main_df.groupby(['id', 'race_id'])['pitstopduration'].transform('count')

# Written as "not more than" so that rows without a usable group key
# (count is NaN) are kept.
main_df = main_df[~(pit_stop_count > MAX_PIT_STOPS_PER_RACE)]

In [None]:
# FILTER 3 - Data relating to drivers making their final pit stop after a race progress of 90% are removed

# 'race_progress' is a ratio between 0 and 1 (race time / final race time), so
# the limit must be a ratio too. The previous limit of 0.9 * max(nolaps) was a
# lap count that the ratio could never reach.
FINAL_PIT_STOP_PROGRESS_LIMIT = 0.9

# Race progress on the laps where a pit stop was made; NaN on all other laps.
pit_lap_progress = main_df['race_progress'].where(main_df['pitstop'] == 1)

# The largest pit-lap progress of an (id, race_id) pair is the progress at its
# final pit stop, broadcast to every lap of the pair. Pairs without any pit
# stop get NaN and are therefore kept.
final_pit_progress = pit_lap_progress.groupby(
    [main_df['id'], main_df['race_id']]
).transform('max')

main_df = main_df[~(final_pit_progress >= FINAL_PIT_STOP_PROGRESS_LIMIT)]

In [None]:
# FILTER 4 - Data relating to drivers with a lap time above 200 s or a pit stop duration above 50 s are removed

MAX_LAPTIME_S = 200
MAX_PIT_STOP_DURATION_S = 50

# Laps that break one of the two limits.
is_outlier_lap = (
    (main_df['laptime'] > MAX_LAPTIME_S)
    | (main_df['pitstopduration'] > MAX_PIT_STOP_DURATION_S)
)

# Number of outlier laps per (id, race_id) pair, broadcast to every lap of the
# pair. One grouped pass replaces the per-pair rescan of the whole frame and
# no longer shadows the builtin `id`.
outlier_lap_count = is_outlier_lap.astype(int).groupby(
    [main_df['id'], main_df['race_id']]
).transform('sum')

# Drop every lap of a pair that has at least one outlier lap. Rows without a
# usable group key (count is NaN) are kept.
main_df = main_df[~(outlier_lap_count > 0)]

In [None]:
# FILTER 5 - Data relating to drivers with a result position greater than ten are removed.

WORST_KEPT_RESULT_POSITION = 10

# Highest lap number present for each race, broadcast to every row of the race.
race_last_lap = main_df.groupby('race_id')['lapno'].transform('max')

# Position held on the race's last lap; NaN on every other lap.
last_lap_position = main_df['position'].where(main_df['lapno'] == race_last_lap)

# Result position per (race_id, id): the position on the last lap of the race,
# broadcast to all laps of that id. It stays NaN when the id has no row for
# the race's last lap.
result_position = last_lap_position.groupby(
    [main_df['race_id'], main_df['id']]
).transform('first')

# assign() builds a new frame instead of writing into a filtered selection.
main_df = main_df.assign(result_position=result_position)

# Keep positions 1..10 inclusive: only positions greater than ten are removed.
# Rows with a NaN result position are dropped as well.
main_df = main_df[main_df['result_position'] <= WORST_KEPT_RESULT_POSITION]

## REMOVE NAN 

In [None]:
# # Encode NaN values in "laptime" as a separate category
# df['laptime_category'] = np.where(df['laptime'].isna(), 'Lap Not Completed', 'Lap Completed')