# Notebook machine learning
Entrenamiento de modelos de Regresión Lineal y Random Forest para predecir la velocidad media de la vuelta rápida a partir del tiempo de vuelta

Importamos bibliotecas necesarias


In [None]:
import requests
import csv
import pandas as pd
import json
import ast
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Scikit-Learn 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics


Función para convertir los tiempos de vuelta con formato m:ss.fff (minutos:segundos.milésimas) a segundos

In [None]:

def duration_to_seconds(duration):
    """Convert a lap time to seconds.

    Accepts either a clock-style value such as ``'1:23.456'`` (minutes,
    seconds and an optional fractional part of any length; an optional
    leading hours field is also supported) or a value that is already a
    plain number of seconds (``'83.456'``, ``83.456``).

    Parameters
    ----------
    duration : str or number
        The lap time. It is converted with ``str()`` first, so NaN is
        returned unchanged as ``nan``.

    Returns
    -------
    float
        The duration expressed in seconds.

    Raises
    ------
    ValueError
        If any field cannot be parsed as a number.
    """
    duration_str = str(duration).strip()
    if ':' not in duration_str:
        return float(duration_str)

    # Fold the colon-separated fields from the most significant one down.
    # The last field is parsed as a float so the fractional part keeps its
    # real weight ('.5' is half a second, not 5 ms) and may be absent.
    total_seconds = 0.0
    for field in duration_str.split(':'):
        total_seconds = total_seconds * 60 + float(field)
    return total_seconds

#### Carga de datos y limpieza

In [None]:
# Load the raw race results from CSV; the path is relative to the notebook's
# working directory (the file name suggests seasons 2000-2024 - confirm).
raw_races_df = pd.read_csv('../data/results_2000-2024.csv')
# Preview the first rows of the raw frame.
raw_races_df.head()

Una vez cargado el CSV en un dataframe, se procede a convertir las columnas Results y Circuit a estructuras de Python. <br />
Primero, se expande la columna Results, creando una lista llamada rows que contiene los datos generales de las carreras por cada participante, sin la columna Results, y otra lista results_expanded con la información de cada participante, es decir, el contenido de la columna Results. El objetivo es tener una fila por cada participante. <br />
Una vez terminados los bucles, se convierten las listas a dataframes, aplanando las estructuras anidadas en results_expanded mediante json_normalize. <br />
Se genera un nuevo dataframe a partir de la concatenación de los dos dataframes anteriores, obteniendo una fila por cada participante con su información así como información de la carrera. <br />
Por último, se normaliza la columna Circuit que también contiene una estructura anidada.


In [None]:
def _parse_literal(cell):
    """Parse a stringified Python literal; return already-parsed values as-is."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# Turn the stringified 'Results' (list of dicts) and 'Circuit' (dict) columns
# into real Python objects. Only strings are parsed, so the cell can be re-run:
# ast.literal_eval raises ValueError when handed an already-parsed list/dict.
raw_races_df['Results'] = raw_races_df['Results'].apply(_parse_literal)
raw_races_df['Circuit'] = raw_races_df['Circuit'].apply(_parse_literal)

# One row per participant: explode() repeats the race-level columns for every
# entry in the race's 'Results' list, replacing the row-by-row Python loop.
exploded_races_df = raw_races_df.explode('Results', ignore_index=True)
# A race with an empty result list explodes into a single NaN row; drop it so
# that, as before, such races contribute no participant rows.
exploded_races_df = exploded_races_df[exploded_races_df['Results'].notna()].reset_index(drop=True)

# Race-level information (without 'Results') and the per-participant records.
expanded_rows_df = exploded_races_df.drop(columns=['Results'])
results_expanded = exploded_races_df['Results'].tolist()

# Flatten the nested participant dicts into dotted columns ('Driver.code', ...).
results_normalized_df = pd.json_normalize(results_expanded)
# Both frames share the same 0..n-1 index, so a column-wise concat lines up
# each participant with its race.
race_results_df = pd.concat([expanded_rows_df, results_normalized_df], axis=1)

# Flatten the nested 'Circuit' dict the same way and swap it in for the raw column.
circuits_normalized = pd.json_normalize(race_results_df['Circuit'].tolist())
race_results_df = pd.concat([race_results_df.drop(columns=['Circuit']), circuits_normalized], axis=1)

race_results_df.head()

In [None]:
# List every column available after flattening, to decide which ones to drop.
race_results_df.columns

Borramos columnas innecesarias

In [None]:
# Columns that are not used by any model further down (URLs, identifiers,
# location details, nationality / birth data, race time fields, ...).
columns_to_drop = [
    'url',
    'time',
    'Location.country',
    'Location.lat',
    'Location.long',
    'Driver.permanentNumber',
    'Constructor.nationality',
    'Constructor.constructorId',
    'Constructor.url',
    'circuitId',
    'Location.locality',
    'positionText',
    'points',
    'Driver.nationality',
    'Driver.dateOfBirth',
    'Time.time',
    'Time.millis',
    'Driver.url',
    'Driver.driverId',
    'Driver.code',
]
modified_race_results_df = race_results_df.drop(columns=columns_to_drop)

# Show the last rows of the 2024 season as a sanity check.
is_2024 = modified_race_results_df['season'] == 2024
modified_race_results_df[is_2024].tail()

Renombramos las variables que vamos a usar en el modelo y creamos una nueva columna que contenga el tiempo de vuelta en segundos mediante la función duration_to_seconds que se creó al principio del notebook.

In [None]:
# Shorter names for the two fastest-lap columns used by the models.
column_aliases = {
    'FastestLap.Time.time': 'FastestLapTime',
    'FastestLap.AverageSpeed.speed': 'FastestLapAvgSpeed',
}
renamed_race_results_df = modified_race_results_df.rename(columns=column_aliases)

# Numeric version of the fastest-lap time, computed by duration_to_seconds.
fastest_lap_seconds = renamed_race_results_df['FastestLapTime'].apply(duration_to_seconds)
renamed_race_results_df['FastestLapTimeSeconds'] = fastest_lap_seconds

renamed_race_results_df.head()

Comprobamos cuántos valores NaN contiene el dataframe

In [None]:
# Report the number of missing values per column.
print("Valores NA:")
print(renamed_race_results_df.isna().sum())

Convertimos columnas a variables numéricas

In [None]:
# Average speed: parse as a number; anything unparseable becomes NaN.
avg_speed = pd.to_numeric(renamed_race_results_df['FastestLapAvgSpeed'], errors='coerce')
renamed_race_results_df['FastestLapAvgSpeed'] = avg_speed

# Grid and finishing position: go through str first, then cast to int.
for int_column in ('grid', 'position'):
    as_text = renamed_race_results_df[int_column].astype(str)
    renamed_race_results_df[int_column] = as_text.astype(int)

# Display the resulting dtypes to confirm the conversion.
renamed_race_results_df.dtypes

Borramos registros con valores NaN

In [None]:
# Keep only rows without missing values (dropna() with its defaults removes a
# row as soon as any column is NaN).
# NOTE(review): the name is reassigned, so the rows removed here can only be
# recovered by re-running the earlier cells.
renamed_race_results_df = renamed_race_results_df.dropna()

Pintamos la matriz de correlación para conocer las relaciones entre las distintas columnas numéricas de las que disponemos en el dataframe

In [None]:
# Figure that will hold the heatmap.
fig, ax = plt.subplots(figsize=(12, 10))

# Pairwise correlation between the numeric columns only.
speed_corr = renamed_race_results_df.corr(numeric_only=True)

# Annotated heatmap (two decimals per cell), drawn on the axes created above.
sns.heatmap(speed_corr, annot=True, fmt=".2f", ax=ax)

#### Fastest Lap vs Fastest Lap Avg Speed - Linear Regression

In [None]:
# Temporal split: fit on seasons 2014-2021, evaluate on seasons 2022-2023.
train = renamed_race_results_df[(renamed_race_results_df['season'] >= 2014) & (renamed_race_results_df['season'] <= 2021)]
test = renamed_race_results_df[(renamed_race_results_df['season'] >= 2022) & (renamed_race_results_df['season'] <= 2023)]

# Feature X: fastest-lap time in seconds. Target y: fastest-lap average speed.
X_train = train[['FastestLapTimeSeconds']].values
y_train = train['FastestLapAvgSpeed'].values
X_test = test[['FastestLapTimeSeconds']].values
y_test = test['FastestLapAvgSpeed'].values

# No scaling is applied; the *_s names are plain aliases of the raw arrays.
x_tr_s = X_train
x_test_s = X_test

y_tr_s = y_train
y_test_s = y_test

# Fit the linear regression on the training seasons.
regr = LinearRegression()
regr.fit(x_tr_s, y_tr_s)

# Predict on the held-out seasons.
y_pred = regr.predict(x_test_s)

print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')
# The metric may come back as a plain Python float, which has no .round()
# method; the built-in round() works for both float and NumPy scalars.
print(f'RMSE: {round(root_mean_squared_error(y_test, y_pred), 2)}')

fig, ax = plt.subplots(1, 2, figsize=(10, 10), sharey=True)

# Left panel: training data with the fitted line.
ax[0].scatter(x_tr_s, y_tr_s)
ax[0].plot(x_tr_s, regr.predict(x_tr_s), c='g')
ax[0].set_title('Fastest Lap vs Fastest Speed (Train)')
ax[0].set_xlabel('FastestLapTimeSeconds')
ax[0].set_ylabel('FastestLapAvgSpeed')

# Right panel: test data with the predictions made for the test inputs (the
# line is drawn over the test x-range, not the training one).
ax[1].scatter(x_test_s, y_test_s)
ax[1].plot(x_test_s, y_pred, c='g')
ax[1].set_title('Fastest Lap vs Fastest Speed (Test)')
ax[1].set_xlabel('FastestLapTimeSeconds')
ax[1].set_ylabel('FastestLapAvgSpeed')
plt.suptitle('Linear Regression Model')

# TODO: plot y_pred against y_test_s



Borrado de columnas innecesarias

In [None]:
# Descriptive / identifying columns that the models below do not use.
unused_model_columns = [
    'raceName',
    'round',
    'date',
    'number',
    'laps',
    'status',
    'Driver.givenName',
    'Driver.familyName',
    'Constructor.name',
    'FastestLap.rank',
    'FastestLap.lap',
    'FastestLapTime',
    'FastestLap.AverageSpeed.units',
]
tmp_renamed_race_results_df = renamed_race_results_df.drop(columns=unused_model_columns)

# Show the last rows of the reduced frame.
tmp_renamed_race_results_df.tail()

Comprobamos que no haya valores nulos

In [None]:
# Missing-value count per column of the reduced frame.
print(tmp_renamed_race_results_df.isna().sum())


Generamos una copia del dataframe modificado, y este será el dataframe a usar en los modelos

In [None]:
# Work on a copy so the model cells do not touch tmp_renamed_race_results_df.
final_race_results_df = tmp_renamed_race_results_df.copy()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

# One-hot encode the circuit name so it can be used as a numeric feature.
enc = OneHotEncoder(sparse_output=False)
one_hot_encoded = enc.fit_transform(final_race_results_df[['circuitName']])
# Reuse the source frame's index so the column-wise concat aligns row by row.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=enc.get_feature_names_out(['circuitName']),
    index=final_race_results_df.index,
)
encoded_renamed_race_results_df = pd.concat([final_race_results_df, one_hot_df], axis=1)

# Temporal split: fit on seasons 2014-2021, evaluate on seasons 2022-2023.
train_2014_2021 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2014) & (encoded_renamed_race_results_df['season'] <= 2021)]
test_2022_2023 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2022) & (encoded_renamed_race_results_df['season'] <= 2023)]

# The raw circuit name is replaced by its one-hot columns.
train = train_2014_2021.drop(columns=['circuitName'])
test = test_2022_2023.drop(columns=['circuitName'])

# Target: fastest-lap average speed. Features: every remaining column.
X_train = train.drop(columns=['FastestLapAvgSpeed']).values
y_train = train['FastestLapAvgSpeed'].values
X_test = test.drop(columns=['FastestLapAvgSpeed']).values
y_test = test['FastestLapAvgSpeed'].values

# No scaling is applied; the *_scaled names are plain aliases.
X_train_scaled = X_train
X_test_scaled = X_test

regr = LinearRegression()
regr.fit(X_train_scaled, y_train)

# Predict on the held-out seasons.
y_pred = regr.predict(X_test_scaled)

print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# Pick the plotted series by column name rather than by position in X_test:
# positional indices silently point at the wrong feature whenever the column
# order changes, and the colour must really be the grid position the colourbar
# label announces.
lap_time_test = test['FastestLapTimeSeconds'].values
grid_test = test['grid'].values

plt.scatter(lap_time_test, y_test, c=grid_test, cmap='viridis', label='Test Data')
plt.colorbar(label="Grid Position")
plt.scatter(lap_time_test, y_pred, c='red', alpha=0.6, label='Predictions')
plt.xlabel("FastestLapTimeSeconds")
plt.ylabel("FastestLapAvgSpeed")
plt.title("Predictions vs Test Data (Color: Grid Position)")
plt.legend()
plt.show()




In [None]:
# Predicted (x) against actual (y) values; a perfect model lies on y = x.
ax = sns.scatterplot(x=y_pred, y=y_test)
ax.set_xlabel('Predicted FastestLapAvgSpeed')
ax.set_ylabel('Actual FastestLapAvgSpeed')
# axline() adds its anchor point to the data limits, so anchoring the y = x
# reference at (0, 0) stretches both axes down to the origin and squeezes the
# points into a corner. Anchor it at the smallest plotted value instead.
ref_anchor = min(y_pred.min(), y_test.min())
out = ax.axline((ref_anchor, ref_anchor), slope=1, linewidth=.5, color='r', linestyle='dashed')

In [None]:



# One-hot encode the circuit name (same preparation as for the all-circuit model).
enc = OneHotEncoder(sparse_output=False)
one_hot_encoded = enc.fit_transform(final_race_results_df[['circuitName']])
# Reuse the source frame's index so the column-wise concat aligns row by row.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=enc.get_feature_names_out(['circuitName']),
    index=final_race_results_df.index,
)

encoded_renamed_race_results_df = pd.concat([final_race_results_df, one_hot_df], axis=1)

# Temporal split: fit on seasons 2014-2021, evaluate on seasons 2022-2023.
train_2014_2021 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2014) & (encoded_renamed_race_results_df['season'] <= 2021)]
test_2022_2023 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2022) & (encoded_renamed_race_results_df['season'] <= 2023)]

# Restrict both splits to a single circuit.
# NOTE(review): after this filter every one-hot circuit column is constant
# within the split, so those columns carry no information for this model.
train_2014_2021 = train_2014_2021[(train_2014_2021['circuitName'] == 'Hungaroring')]
test_2022_2023 = test_2022_2023[(test_2022_2023['circuitName'] == 'Hungaroring')]

# Season and raw circuit name are not used as features here.
train = train_2014_2021.drop(columns=['season', 'circuitName'])
test = test_2022_2023.drop(columns=['season', 'circuitName'])

# Target: fastest-lap average speed. Features: every remaining column.
X_train = train.drop(columns=['FastestLapAvgSpeed']).values
y_train = train['FastestLapAvgSpeed'].values
X_test = test.drop(columns=['FastestLapAvgSpeed']).values
y_test = test['FastestLapAvgSpeed'].values

# No scaling is applied; the *_scaled names are plain aliases.
X_train_scaled = X_train
X_test_scaled = X_test

regr = LinearRegression()
regr.fit(X_train_scaled, y_train)

# Predict on the held-out seasons.
y_pred = regr.predict(X_test_scaled)

print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# Pick the plotted series by column name rather than by position in X_test:
# positional indices silently point at the wrong feature whenever the column
# order changes, and the colour must really be the grid position the colourbar
# label announces.
lap_time_test = test['FastestLapTimeSeconds'].values
grid_test = test['grid'].values

plt.scatter(lap_time_test, y_test, c=grid_test, cmap='viridis', label='Test Data')
plt.colorbar(label="Grid Position")
plt.scatter(lap_time_test, y_pred, c='red', alpha=0.6, label='Predictions')
plt.xlabel("FastestLapTimeSeconds")
plt.ylabel("FastestLapAvgSpeed")
plt.title("Predictions vs Test Data (Color: Grid Position)")
plt.legend()
plt.show()




In [None]:
# Predicted (x) against actual (y) values; a perfect model lies on y = x.
ax = sns.scatterplot(x=y_pred, y=y_test)
ax.set_xlabel('Predicted FastestLapAvgSpeed')
ax.set_ylabel('Actual FastestLapAvgSpeed')
# axline() adds its anchor point to the data limits, so anchoring the y = x
# reference at (0, 0) stretches both axes down to the origin and squeezes the
# points into a corner. Anchor it at the smallest plotted value instead.
ref_anchor = min(y_pred.min(), y_test.min())
out = ax.axline((ref_anchor, ref_anchor), slope=1, linewidth=.5, color='r', linestyle='dashed')

##### Grid vs Position

In [None]:
# Temporal split: fit on seasons 2014-2021, evaluate on seasons 2022-2023.
train = renamed_race_results_df[(renamed_race_results_df['season'] >= 2014) & (renamed_race_results_df['season'] <= 2021)]
test = renamed_race_results_df[(renamed_race_results_df['season'] >= 2022) & (renamed_race_results_df['season'] <= 2023)]

# Feature X: starting grid slot. Target y: finishing position.
X_train = train[['grid']].values
y_train = train['position'].values
X_test = test[['grid']].values
y_test = test['position'].values

# No scaling is applied; the *_s names are plain aliases of the raw arrays.
x_tr_s = X_train
x_test_s = X_test

y_tr_s = y_train
y_test_s = y_test

# Fit the linear regression on the training seasons.
regr = LinearRegression()
regr.fit(x_tr_s, y_tr_s)

# Predict on the held-out seasons.
y_pred = regr.predict(x_test_s)

print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')
# The metric may come back as a plain Python float, which has no .round()
# method; the built-in round() works for both float and NumPy scalars.
print(f'RMSE: {round(root_mean_squared_error(y_test, y_pred), 2)}')

fig, ax = plt.subplots(1, 2, figsize=(10, 10), sharey=True)

# Left panel: training data with the fitted line. The titles name the
# variables actually plotted in this cell (grid and position).
ax[0].scatter(x_tr_s, y_tr_s)
ax[0].plot(x_tr_s, regr.predict(x_tr_s), c='g')
ax[0].set_title('Grid vs Position (Train)')
ax[0].set_xlabel('Grid')
ax[0].set_ylabel('Position')

# Right panel: test data with the predictions made for the test inputs (the
# line is drawn over the test x-range, not the training one).
ax[1].scatter(x_test_s, y_test_s)
ax[1].plot(x_test_s, y_pred, c='g')
ax[1].set_title('Grid vs Position (Test)')
ax[1].set_xlabel('Grid')
ax[1].set_ylabel('Position')
plt.suptitle('Linear Regression Model')



### Random forest

In [None]:
# Input frame: final_race_results_df

# One-hot encode the circuit name so it can be used as a numeric feature.
enc = OneHotEncoder(sparse_output=False)
one_hot_encoded = enc.fit_transform(final_race_results_df[['circuitName']])
# Reuse the source frame's index so the column-wise concat aligns row by row.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=enc.get_feature_names_out(['circuitName']),
    index=final_race_results_df.index,
)
encoded_renamed_race_results_df = pd.concat([final_race_results_df, one_hot_df], axis=1)

# Temporal split: fit on seasons 2014-2021, evaluate on seasons 2022-2023.
train_2014_2021 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2014) & (encoded_renamed_race_results_df['season'] <= 2021)]
test_2022_2023 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2022) & (encoded_renamed_race_results_df['season'] <= 2023)]

# The raw circuit name is replaced by its one-hot columns.
train = train_2014_2021.drop(columns=['circuitName'])
test = test_2022_2023.drop(columns=['circuitName'])

# Target: fastest-lap average speed. Features: every remaining column.
X_train = train.drop(columns=['FastestLapAvgSpeed']).values
y_train = train['FastestLapAvgSpeed'].values
X_test = test.drop(columns=['FastestLapAvgSpeed']).values
y_test = test['FastestLapAvgSpeed'].values

# Fix the seed: a random forest bootstraps rows and samples features at
# random, so without random_state the reported RMSE changes on every run.
RF_RANDOM_STATE = 42
rf = RandomForestRegressor(random_state=RF_RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits (the training predictions are kept for inspection).
y_train_pred = rf.predict(X_train)
y_pred = rf.predict(X_test)

# Root mean squared error on the held-out seasons.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', round(rmse, 2))

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the forest on the training data. The scorer is the
# negated RMSE, so the sign is flipped before reporting the mean.
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
print("RMSE promedio:", -scores.mean())

In [None]:
# Per-row comparison of actual and predicted values; Error = actual - predicted.
diff = pd.DataFrame({'Real': y_test, 'Predicción': y_pred, 'Error': y_test - y_pred})
# Largest positive errors (model under-predicts) first.
print(diff.sort_values(by='Error', ascending=False))

In [None]:
# Predicted (x) against actual (y) values; a perfect model lies on y = x.
ax = sns.scatterplot(x=y_pred, y=y_test)
ax.set_xlabel('Predicted FastestLapAvgSpeed')
ax.set_ylabel('Actual FastestLapAvgSpeed')
# axline() adds its anchor point to the data limits, so anchoring the y = x
# reference at (0, 0) stretches both axes down to the origin and squeezes the
# points into a corner. Anchor it at the smallest plotted value instead.
ref_anchor = min(y_pred.min(), y_test.min())
out = ax.axline((ref_anchor, ref_anchor), slope=1, linewidth=.5, color='r', linestyle='dashed')

In [None]:
# Distinct circuit names present in the modelling frame.
final_race_results_df['circuitName'].unique()

In [None]:
# Input frame: final_race_results_df

# One-hot encode the circuit name (same preparation as for the all-circuit model).
enc = OneHotEncoder(sparse_output=False)
one_hot_encoded = enc.fit_transform(final_race_results_df[['circuitName']])
# Reuse the source frame's index so the column-wise concat aligns row by row.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=enc.get_feature_names_out(['circuitName']),
    index=final_race_results_df.index,
)
encoded_renamed_race_results_df = pd.concat([final_race_results_df, one_hot_df], axis=1)

# Temporal split: fit on seasons 2014-2021, evaluate on seasons 2022-2023.
train_2014_2021 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2014) & (encoded_renamed_race_results_df['season'] <= 2021)]
test_2022_2023 = encoded_renamed_race_results_df[(encoded_renamed_race_results_df['season'] >= 2022) & (encoded_renamed_race_results_df['season'] <= 2023)]

# Restrict both splits to a single circuit.
# NOTE(review): after this filter every one-hot circuit column is constant
# within the split, so those columns carry no information for this model.
train_2014_2021 = train_2014_2021[(train_2014_2021['circuitName'] == 'Bahrain International Circuit')]
test_2022_2023 = test_2022_2023[(test_2022_2023['circuitName'] == 'Bahrain International Circuit')]

# The raw circuit name is not used as a feature.
train = train_2014_2021.drop(columns=['circuitName'])
test = test_2022_2023.drop(columns=['circuitName'])

# Target: fastest-lap average speed. Features: every remaining column.
X_train = train.drop(columns=['FastestLapAvgSpeed']).values
y_train = train['FastestLapAvgSpeed'].values
X_test = test.drop(columns=['FastestLapAvgSpeed']).values
y_test = test['FastestLapAvgSpeed'].values

# Fix the seed: a random forest bootstraps rows and samples features at
# random, so without random_state the reported RMSE changes on every run.
RF_RANDOM_STATE = 42
rf = RandomForestRegressor(random_state=RF_RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits (the training predictions are kept for inspection).
y_train_pred = rf.predict(X_train)
y_pred = rf.predict(X_test)

# Root mean squared error on the held-out seasons.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', round(rmse, 2))

In [None]:
# Per-row comparison of actual and predicted values; Error = actual - predicted.
diff = pd.DataFrame({'Real': y_test, 'Predicción': y_pred, 'Error': y_test - y_pred})
# Largest positive errors (model under-predicts) first.
print(diff.sort_values(by='Error', ascending=False))

In [None]:
# All rows (every season) recorded at the Bahrain International Circuit.
problematic_circuit_data = final_race_results_df[final_race_results_df['circuitName'] == 'Bahrain International Circuit']

# Summary statistics for this circuit, intended for comparison with other circuits.
# NOTE(review): only this circuit's statistics are printed; nothing from the
# other circuits is shown alongside them.
print(problematic_circuit_data.describe())