In [None]:
import pandas as pd
import ast
import pycountry_convert as pc

### Race results

In [None]:
# Load the raw results file. Each row carries a stringified list of result
# dicts in its 'Results' column.
df_race_results = pd.read_csv('../data/results_2000-2024.csv')

# ast.literal_eval only evaluates Python literals, so parsing the CSV text
# this way cannot execute arbitrary code.
df_race_results['Results'] = df_race_results['Results'].apply(ast.literal_eval)

# Expand to one row per individual result. Rows whose list is empty are
# filtered out first, because explode() would otherwise keep each of them as a
# single NaN row instead of dropping it.
has_results = df_race_results['Results'].map(len) > 0
race_results_long = df_race_results[has_results].explode('Results', ignore_index=True)

results_expanded = race_results_long['Results'].tolist()
expanded_rows_df = race_results_long.drop(columns=['Results'])
results_normalized_df = pd.json_normalize(results_expanded)

# Both frames share the same 0..n-1 index, so a column-wise concat lines each
# flattened result up with the row it was expanded from.
drivers_results_df = pd.concat([expanded_rows_df, results_normalized_df], axis=1)

# Points are made numeric but deliberately NOT cast to int64: an integer cast
# silently truncates fractional values (e.g. 12.5 -> 12). Values that cannot be
# parsed become NaN.
drivers_results_df['points'] = pd.to_numeric(drivers_results_df['points'], errors='coerce')

## Circuits
# 'Circuit' is a stringified dict as well; flatten it into its own columns.
circuits_normalized = pd.json_normalize(
    drivers_results_df['Circuit'].apply(ast.literal_eval).tolist()
)
drivers_results_df = pd.concat(
    [drivers_results_df.drop(columns=['Circuit']), circuits_normalized],
    axis=1,
)

# Remove the columns that are not written to the cleaned file. The original
# 'time' column has to be dropped before the rename below, which maps
# 'Time.time' onto the name 'time'.
drivers_results_df = drivers_results_df.drop(columns=[
    'url', 'Location.lat', 'Location.long', 'Location.locality', 'Location.country',
    'Driver.url', 'Driver.permanentNumber', 'Constructor.url', 'time',
])

# NOTE(review): 'fastesLapRank' / 'fastesLap' look like typos of 'fastest...';
# they are kept unchanged because they define the output CSV's column names.
drivers_results_df = drivers_results_df.rename(columns={
    'Driver.driverId': 'driverId',
    'Driver.code': 'driverCode',
    'Driver.givenName': 'driverName',
    'Driver.familyName': 'driverSurname',
    'Driver.dateOfBirth': 'driverBirth',
    'Driver.nationality': 'driverNationality',
    'Constructor.constructorId': 'constructorId',
    'Constructor.name': 'constructorName',
    'Constructor.nationality': 'constructorNationality',
    'Time.millis': 'timeMillis',
    'Time.time': 'time',
    'FastestLap.rank': 'fastesLapRank',
    'FastestLap.lap': 'fastesLap',
    'FastestLap.Time.time': 'fastestLapTime',
    'FastestLap.AverageSpeed.speed': 'fastestLapAverageSpeed',
    'FastestLap.AverageSpeed.units': 'fastestLapAverageSpeedUnit',
})

# These columns are cast to 64-bit integers; 'points' is left out on purpose
# (see the note where it is converted).
drivers_results_df = drivers_results_df.astype({'position': 'int64', 'grid': 'int64', 'laps': 'int64'})

# TODO: fill NA values (not implemented yet)

# Create CSV file
drivers_results_df.to_csv('../data/cleaned_results_2000-2024.csv', index=False)


### Seasons: Circuits

In [None]:
# Continent code -> display name. Built once at definition time instead of on
# every call, since the function is applied row by row.
# "AN" is the continent code for Antarctica; "AQ" is Antarctica's country
# alpha-2 code and is kept only so previously accepted keys still resolve.
# NOTE(review): confirm "AN" against pycountry_convert's continent-code table.
_CONTINENT_NAMES = {
    "NA": "North America",
    "SA": "South America",
    "AS": "Asia",
    "AF": "Africa",
    "OC": "Oceania",
    "EU": "Europe",
    "AN": "Antarctica",
    "AQ": "Antarctica",
}


def get_continent_name(country_name: str) -> str:
    """Return the continent name for a country name.

    The name is resolved to an alpha-2 country code and then to a continent
    code through pycountry_convert; the continent code is finally looked up in
    ``_CONTINENT_NAMES``.

    Args:
        country_name: Country name in a form pycountry_convert recognises.

    Returns:
        The continent's display name, e.g. ``"Europe"``.

    Raises:
        KeyError: If the continent code is missing from ``_CONTINENT_NAMES``.
            Errors raised by pycountry_convert for names or codes it does not
            recognise propagate unchanged.
    """
    country_code = pc.country_name_to_country_alpha2(country_name)
    continent_code = pc.country_alpha2_to_continent_code(country_code)
    return _CONTINENT_NAMES[continent_code]


In [None]:
# Load the seasons/circuits file; its 'Circuit' column holds a stringified dict.
df_seasons_circuits = pd.read_csv('../data/seasons_circuits_2000-2024.csv')

# Columns that are not carried into the flattened circuit table.
unused_columns = ['time', 'ThirdPractice', 'FirstPractice', 'SecondPractice', 'url', 'Sprint', 'Qualifying']
df_dropped_circuits = df_seasons_circuits.drop(columns=unused_columns)

# Turn the stringified circuit dicts into real dicts.
df_dropped_circuits['Circuit'] = df_dropped_circuits['Circuit'].map(ast.literal_eval)

# Flatten the circuit dicts into columns and place them next to the remaining
# columns of the same row.
circuit_details = pd.json_normalize(df_dropped_circuits['Circuit'])
remaining_columns = df_dropped_circuits.drop(columns=['Circuit'])
df_flat_circuits = pd.concat([remaining_columns, circuit_details], axis=1)

# Give the nested location fields flat names, then remove the 'url' column
# that came in with the flattened circuit data.
location_renames = {
    'Location.lat': 'latitude',
    'Location.long': 'longitude',
    'Location.locality': 'locality',
    'Location.country': 'country',
}
df_flat_circuits = df_flat_circuits.rename(columns=location_renames)
df_flat_circuits = df_flat_circuits.drop(columns=['url'])

# Replace some country names to make them compatible with the pycountry_convert
# library, which is used to get continent names.
country_aliases = {'UK': 'United Kingdom', 'USA': 'United States', 'UAE': 'United Arab Emirates', 'Korea': 'South Korea'}
df_flat_circuits['country'] = df_flat_circuits['country'].replace(country_aliases)

# Derive the continent of every row from its (normalised) country name.
df_flat_circuits['continent'] = df_flat_circuits['country'].map(get_continent_name)

df_flat_circuits.head()