In [None]:
import pandas as pd
import ast
import pycountry_convert as pc

#### Resultados
Unión de los conjuntos de datos de resultados de carreras y de sprints a partir de los CSV generados. El conjunto de datos resultante se exporta en formato CSV.

##### Resultados carreras

In [None]:
# Load the raw race results. The 'Results' column holds each race's result list
# serialized as a Python-literal string, so parse it back into Python objects.
raw_race_results_df = pd.read_csv('../data/results_2000-2024.csv')
raw_race_results_df['Results'] = raw_race_results_df['Results'].apply(ast.literal_eval)

# Expand to one row per (race, result entry): the race-level fields are repeated
# once for every entry in that race's result list.
rows = []
results_expanded = []
for _, race_row in raw_race_results_df.iterrows():
    race_fields = race_row.drop('Results')  # identical for every entry of this race
    for result in race_row['Results']:
        rows.append(race_fields)
        results_expanded.append(result)

expanded_rows_df = pd.DataFrame(rows)
results_normalized_df = pd.json_normalize(results_expanded)

race_results_df = pd.concat(
    [expanded_rows_df.reset_index(drop=True), results_normalized_df.reset_index(drop=True)],
    axis=1,
)

# Points are kept as floating point on purpose: Formula 1 has awarded fractional
# points (half points for shortened races, e.g. 12.5 for a win), and casting the
# column to int64 would silently truncate them.
race_results_df['points'] = pd.to_numeric(race_results_df['points'], errors='coerce')

## Circuits
# The 'Circuit' column is also a Python-literal string; flatten it into columns.
race_results_df['Circuit'] = race_results_df['Circuit'].apply(ast.literal_eval)
circuits_normalized = pd.json_normalize(race_results_df['Circuit'])

race_results_df = pd.concat([race_results_df.drop(columns=['Circuit']), circuits_normalized], axis=1)

# The race-level 'time' column is dropped here; the rename below then reuses
# the name 'time' for the per-result 'Time.time' column.
race_results_df = race_results_df.drop(columns=[
    'url', 'Location.lat', 'Location.long', 'Location.locality', 'Location.country',
    'Driver.url', 'Driver.permanentNumber', 'Constructor.url', 'time',
])

race_results_df = race_results_df.rename(columns={
    'Driver.driverId': 'driverId',
    'Driver.code': 'driverCode',
    'Driver.givenName': 'driverName',
    'Driver.familyName': 'driverSurname',
    'Driver.dateOfBirth': 'driverBirth',
    'Driver.nationality': 'driverNationality',
    'Constructor.constructorId': 'constructorId',
    'Constructor.name': 'constructorName',
    'Constructor.nationality': 'constructorNationality',
    'Time.millis': 'timeMillis',
    'Time.time': 'time',
    'FastestLap.rank': 'fastestLapRank',
    'FastestLap.lap': 'fastestLap',
    'FastestLap.Time.time': 'fastestLapTime',
    'FastestLap.AverageSpeed.speed': 'fastestLapAverageSpeed',
    'FastestLap.AverageSpeed.units': 'fastestLapAverageSpeedUnit',
})

# Only the genuinely integral columns are cast to int64; 'points' stays float (see above).
race_results_df = race_results_df.astype({'position': 'int64', 'grid': 'int64', 'laps': 'int64'})

race_results_df.head()


##### Resultados sprint races

In [None]:
# Sprint results only exist from the 2021 season onwards.
raw_sprint_results_df = pd.read_csv('../data/sprint_results_2000-2024.csv')

# 'SprintResults' is stored as a Python-literal string; parse it back into Python objects.
raw_sprint_results_df['SprintResults'] = raw_sprint_results_df['SprintResults'].apply(ast.literal_eval)

# Pair the weekend-level fields with each individual sprint result entry,
# giving one pair per (weekend, entry).
sprint_pairs = [
    (weekend.drop('SprintResults'), entry)
    for _, weekend in raw_sprint_results_df.iterrows()
    for entry in weekend['SprintResults']
]
rows = [weekend_fields for weekend_fields, _ in sprint_pairs]
results_expanded = [entry for _, entry in sprint_pairs]

expanded_rows_df = pd.DataFrame(rows)
results_normalized_df = pd.json_normalize(results_expanded)

# Put the repeated weekend fields and the flattened entries side by side.
sprint_results_df = pd.concat(
    [frame.reset_index(drop=True) for frame in (expanded_rows_df, results_normalized_df)],
    axis=1,
)

# Parse points as numbers; unparseable values become NaN.
sprint_results_df['points'] = pd.to_numeric(sprint_results_df['points'], errors='coerce')

## Circuits
# Flatten the literal-encoded 'Circuit' column into separate columns.
sprint_results_df['Circuit'] = sprint_results_df['Circuit'].apply(ast.literal_eval)
circuits_normalized = pd.json_normalize(sprint_results_df['Circuit'])
sprint_results_df = pd.concat(
    [sprint_results_df.drop(columns=['Circuit']), circuits_normalized],
    axis=1,
)

# The weekend-level 'time' column is removed here; the rename below reuses the
# name 'time' for 'Time.time'.
columns_to_discard = [
    'url', 'Location.lat', 'Location.long', 'Location.locality', 'Location.country',
    'Driver.url', 'Driver.permanentNumber', 'Constructor.url', 'time',
]
sprint_results_df = sprint_results_df.drop(columns=columns_to_discard)

# One rename pass: flatten the nested names and prefix the result columns with
# 'sprint' so they do not clash with the race columns in the later merge.
sprint_results_df = sprint_results_df.rename(columns={
    'Driver.driverId': 'driverId',
    'Driver.code': 'driverCode',
    'Driver.givenName': 'driverName',
    'Driver.familyName': 'driverSurname',
    'Driver.dateOfBirth': 'driverBirth',
    'Driver.nationality': 'driverNationality',
    'Constructor.constructorId': 'constructorId',
    'Constructor.name': 'constructorName',
    'Constructor.nationality': 'constructorNationality',
    'Time.millis': 'timeMillis',
    'Time.time': 'time',
    'FastestLap.rank': 'fastestLapRank',
    'FastestLap.lap': 'fastestLap',
    'FastestLap.Time.time': 'fastestLapTime',
    'FastestLap.AverageSpeed.speed': 'fastestLapAverageSpeed',
    'FastestLap.AverageSpeed.units': 'fastestLapAverageSpeedUnit',
    'position': 'sprintPosition',
    'points': 'sprintPoints',
    'grid': 'sprintGrid',
    'laps': 'sprintLaps',
})

# Set data types
sprint_results_df = sprint_results_df.astype({
    'sprintPosition': 'int64',
    'sprintPoints': 'int64',
    'sprintGrid': 'int64',
    'sprintLaps': 'int64',
})

sprint_results_df.head()


##### Unión de ambos conjuntos de datos

In [None]:
# Merge the sprint results into the race results.
# Every sprint row is flagged, so after the left join a non-null flag marks the
# race entries that have a matching sprint result.
sprint_results_df['sprintRace'] = True

merge_keys = ['circuitId', 'season', 'driverId']
sprint_columns = ['sprintPosition', 'sprintGrid', 'sprintLaps', 'sprintPoints']

# NOTE(review): joining on circuitId + season assumes a circuit hosts at most one
# sprint weekend per season; if the data carries a round identifier, joining on
# it would be stricter -- confirm against the CSV schema.
all_results_df = pd.merge(
    race_results_df,
    sprint_results_df[merge_keys + sprint_columns + ['sprintRace']],
    on=merge_keys,
    how='left',
)

# After the left join the flag is True or NaN. notna() yields a real boolean
# column without relying on fillna's implicit object downcasting, which is
# deprecated in recent pandas versions.
all_results_df['sprintRace'] = all_results_df['sprintRace'].notna()

# Coerce non-numeric values to NaN, treat missing values as 0 and store the
# columns as 64-bit integers (explicit width, matching the other cells).
count_columns = ['fastestLapRank', 'fastestLap'] + sprint_columns
all_results_df[count_columns] = (
    all_results_df[count_columns]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype('int64')
)

# Race points + Sprint points
all_results_df['weekendPoints'] = all_results_df['points'] + all_results_df['sprintPoints']

all_results_df.to_csv('../data/race_and_sprint_results_2000-2024.csv', index=False)

Cálculo de los puntos acumulados por piloto y temporada a partir de los puntos de cada fin de semana (carrera + sprint).

In [None]:
# Running total of weekend points per (season, driver), accumulated in the
# frame's current row order.
weekend_points_by_driver = all_results_df.groupby(['season', 'driverId'])['weekendPoints']
all_results_df['cumulative_points'] = weekend_points_by_driver.cumsum()

all_results_df.tail(25)


#### Circuitos por temporada

##### Función para obtener el continente para un país

In [None]:
def get_continent_name(country_name):
    """Return the English continent name for the given country name.

    The country name is converted to an alpha-2 code and then to a continent
    code with ``pycountry_convert``; the continent code is finally looked up
    in a local code-to-name table.

    Args:
        country_name: Country name accepted by
            ``pycountry_convert.country_name_to_country_alpha2``.

    Returns:
        The continent name, e.g. ``"Europe"``.

    Raises:
        KeyError: If the continent code is not in the local table. Errors
            raised by ``pycountry_convert`` for names or codes it does not
            recognise propagate unchanged.
    """
    continent_names = dict(
        NA="North America",
        SA="South America",
        AS="Asia",
        AF="Africa",
        OC="Oceania",
        EU="Europe",
        AQ="Antarctica",
    )
    alpha2_code = pc.country_name_to_country_alpha2(country_name)
    return continent_names[pc.country_alpha2_to_continent_code(alpha2_code)]


Limpieza del conjunto de datos de circuitos

In [None]:
# Load the per-season circuit calendar.
df_seasons_circuits = pd.read_csv('../data/seasons_circuits_2000-2024.csv')

# Drop the listed columns and parse the literal-encoded 'Circuit' column back
# into Python objects.
discarded_columns = ['time', 'ThirdPractice', 'FirstPractice', 'SecondPractice', 'url', 'Sprint', 'Qualifying']
df_dropped_circuits = (
    df_seasons_circuits
    .drop(columns=discarded_columns)
    .assign(Circuit=lambda frame: frame['Circuit'].apply(ast.literal_eval))
)

# Flatten the circuit records into columns, give the location fields short
# names and drop the circuit's own 'url' column.
circuit_details = pd.json_normalize(df_dropped_circuits['Circuit'])
df_flat_circuits = (
    pd.concat([df_dropped_circuits.drop(columns=['Circuit']), circuit_details], axis=1)
    .rename(columns={
        'Location.lat': 'latitude',
        'Location.long': 'longitude',
        'Location.locality': 'locality',
        'Location.country': 'country',
    })
    .drop(columns=['url'])
)

# Replace some country names to make them compatible with the pycountry_convert
# library, which is used to get the continent names.
country_aliases = {
    'UK': 'United Kingdom',
    'USA': 'United States',
    'UAE': 'United Arab Emirates',
    'Korea': 'South Korea',
}
df_flat_circuits['country'] = df_flat_circuits['country'].replace(country_aliases)

df_flat_circuits['continent'] = df_flat_circuits['country'].apply(get_continent_name)

# Create CSV file
df_flat_circuits.to_csv('../data/cleaned_circuits_2000-2024.csv', index=False)


#### Pitstops
Limpieza del conjunto de datos de los pitstops

In [None]:
# Load the raw pit stops; 'PitStops' is stored as a Python-literal string, so
# parse it back into Python objects.
raw_pitstops_df = pd.read_csv('../data/raw_pitstops.csv')
raw_pitstops_df['PitStops'] = raw_pitstops_df['PitStops'].apply(ast.literal_eval)

# Expand to one row per pit stop: the race-level fields are repeated once for
# every stop in that race's list.
rows = []
pitstops_expanded = []
for _, race_row in raw_pitstops_df.iterrows():
    race_stops = race_row['PitStops']
    rows.extend(race_row.drop('PitStops') for _ in race_stops)
    pitstops_expanded.extend(race_stops)

expanded_rows_df = pd.DataFrame(rows)
pitstops_normalized_df = pd.json_normalize(pitstops_expanded)

pitstops_rows_df = pd.concat(
    [frame.reset_index(drop=True) for frame in (expanded_rows_df, pitstops_normalized_df)],
    axis=1,
)

# Only the circuit identifier is kept from the literal-encoded 'Circuit' column.
circuit_records = pitstops_rows_df['Circuit'].apply(ast.literal_eval)
pitstops_rows_df['circuitId'] = circuit_records.apply(lambda circuit: circuit.get("circuitId"))
pitstops_rows_df = pitstops_rows_df.drop(columns=['url', 'Circuit', 'time'])

# Look up each driver's name and constructor from the exported results, taking
# the first value found per (season, driver).
# NOTE(review): a driver who changed constructor mid-season would be given a
# single constructor for the whole season -- confirm this is acceptable.
team_columns = ['season', 'driverId', 'driverName', 'driverSurname', 'constructorId', 'constructorName']
results_df = pd.read_csv('../data/race_and_sprint_results_2000-2024.csv')
teams_from_results_df = (
    results_df[team_columns]
    .groupby(['season', 'driverId'])
    .first()
    .reset_index()
)

drivers_pitstops_df = pd.merge(
    pitstops_rows_df,
    teams_from_results_df[team_columns],
    on=['season', 'driverId'],
    how='left',
)
given_names = drivers_pitstops_df['driverName']
family_names = drivers_pitstops_df['driverSurname']
drivers_pitstops_df['driverFullName'] = given_names + ' ' + family_names

# Create CSV file
drivers_pitstops_df.to_csv('../data/cleaned_pitstops.csv', index=False)