In [None]:
import requests
import csv
import pandas as pd
import json
import ast
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Convert time to seconds
def duration_to_seconds(duration):
    """Convert a lap-time value to a number of seconds.

    Args:
        duration: Either a clock-style string such as ``"1:23.456"``
            (``minutes:seconds.fraction``, optionally prefixed with hours as
            ``"h:mm:ss.fff"``) or anything ``float()`` can parse once it has
            been converted with ``str()`` (numbers, numeric strings, NaN).

    Returns:
        float: The duration in seconds. NaN input yields NaN.

    Raises:
        ValueError: If the text is neither clock-style nor a valid float.
    """
    duration_str = str(duration).strip()
    if ':' not in duration_str:
        return float(duration_str)

    *leading_fields, last_field = duration_str.split(':')
    # The fractional part is optional ("1:23" is a valid time).
    seconds_str, _, fraction_str = last_field.partition('.')

    # Fold hours/minutes/seconds into whole seconds, most significant first.
    whole_seconds = 0
    for field in (*leading_fields, seconds_str):
        whole_seconds = whole_seconds * 60 + int(field)

    if not fraction_str:
        return float(whole_seconds)

    # Scale by the number of digits actually present so ".5" means 0.5 s
    # rather than 5 ms; three digits still divide by 1000 as before.
    return whole_seconds + int(fraction_str) / 10 ** len(fraction_str)

#### Race results

In [None]:
# Load the race-results CSV and preview its first rows.
results_csv_path = '../data/results_2000-2024.csv'
raw_races_df = pd.read_csv(results_csv_path)
raw_races_df.head()

In [None]:
# Drop the columns that are not used later. A non-mutating drop with
# errors='ignore' keeps this cell re-runnable: an in-place drop raises KeyError
# the second time because the columns are already gone.
raw_races_df = raw_races_df.drop(columns=['time', 'url'], errors='ignore')

# 'Results' is parsed from its string form into Python objects. Values that are
# no longer strings (cell already executed) are left untouched, because
# ast.literal_eval rejects non-string, non-AST input.
raw_races_df['Results'] = raw_races_df['Results'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:

# Flatten the nested race data into one row per (race, result) pair.
#
# DataFrame.explode repeats the race-level columns once per entry of the
# 'Results' list, which replaces a Python-level iterrows() loop that built a
# new Series for every single result.
exploded_df = raw_races_df.explode('Results', ignore_index=True)
# explode() emits a NaN placeholder for an empty list; a plain loop over the
# list would contribute no rows for such a race, so those placeholders are dropped.
exploded_df = exploded_df[exploded_df['Results'].notna()].reset_index(drop=True)

# Race-level columns, repeated once per result.
expanded_rows_df = exploded_df.drop(columns=['Results'])

# Each result is a (possibly nested) dict; json_normalize turns nested keys
# into dotted column names.
results_expanded = exploded_df['Results'].tolist()
results_normalized_df = pd.json_normalize(results_expanded)

# Both frames share the same 0..n-1 index, so a column-wise concat lines them up.
race_results_df = pd.concat([expanded_rows_df, results_normalized_df], axis=1)

# 'Circuit' is still a string at this point: parse it and flatten it the same way.
circuits_normalized = pd.json_normalize(
    race_results_df['Circuit'].apply(ast.literal_eval).tolist()
)
race_results_df = pd.concat(
    [race_results_df.drop(columns=['Circuit']), circuits_normalized],
    axis=1,
)

race_results_df.head()

In [None]:
# List every column of the flattened frame (race, result and circuit fields).
race_results_df.columns

In [None]:
# Columns removed from the flattened frame before the analysis.
unused_columns = [
    'Location.country',
    'Location.lat',
    'Location.long',
    'url',
    'Driver.permanentNumber',
    'Constructor.nationality',
    'Constructor.constructorId',
    'Constructor.url',
    'circuitId',
    'Location.locality',
    'positionText',
    'points',
    'Driver.nationality',
    'Driver.dateOfBirth',
    'Time.time',
    'Time.millis',
    'Driver.url',
    'Driver.driverId',
    'Driver.code',
]
modified_race_results_df = race_results_df.drop(columns=unused_columns)

# Preview the last rows of the 2024 season.
is_season_2024 = modified_race_results_df['season'] == 2024
modified_race_results_df.loc[is_season_2024].tail()

In [None]:
# Give the dotted fastest-lap columns shorter names.
column_renames = {
    'FastestLap.Time.time': 'FastestLapTime',
    'FastestLap.AverageSpeed.speed': 'FastestLapAvgSpeed',
}
renamed_race_results_df = modified_race_results_df.rename(columns=column_renames)
renamed_race_results_df.head()

In [None]:
# Add the fastest-lap time converted to seconds as a new column.
fastest_lap_times = renamed_race_results_df['FastestLapTime']
renamed_race_results_df['FastestLapTimeSeconds'] = fastest_lap_times.apply(duration_to_seconds)

In [None]:
# Count the missing values in each column.
na_counts = renamed_race_results_df.isna().sum()
print("Valores NA:", na_counts, sep="\n")

In [None]:
# Average speed: unparseable values become NaN instead of raising.
renamed_race_results_df['FastestLapAvgSpeed'] = pd.to_numeric(
    renamed_race_results_df['FastestLapAvgSpeed'],
    errors='coerce',
)

# Grid and finishing position are cast to integers via their string form.
for int_column in ('grid', 'position'):
    renamed_race_results_df[int_column] = (
        renamed_race_results_df[int_column].astype(str).astype(int)
    )

In [None]:
# Show the dtype of every column after the numeric conversions.
renamed_race_results_df.dtypes

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
# (rows with missing values are still present at this point).
fig, ax = plt.subplots(figsize=(12, 10))
speed_corr = renamed_race_results_df.corr(numeric_only=True)
sns.heatmap(speed_corr, annot=True, fmt=".2f", ax=ax)

In [None]:
# Discard every row that has at least one missing value.
# (Disabled alternative: linear interpolation of 'FastestLapTimeSeconds' and
# 'FastestLapAvgSpeed' via Series.interpolate(method='linear').)
renamed_race_results_df = renamed_race_results_df.dropna(axis=0, how='any')

In [None]:
# Correlation heatmap of the numeric columns, recomputed on the frame as it
# stands after the rows with missing values were dropped.
fig, ax = plt.subplots(figsize=(12, 10))
speed_corr = renamed_race_results_df.corr(numeric_only=True)
sns.heatmap(speed_corr, annot=True, fmt=".2f", ax=ax)

#### Fastest Lap vs Fastest Lap Avg Speed - Linear Regression

In [None]:
# Train on seasons 2014-2021 and evaluate on seasons 2022-2023 (bounds inclusive).
train = renamed_race_results_df[renamed_race_results_df['season'].between(2014, 2021)]
test = renamed_race_results_df[renamed_race_results_df['season'].between(2022, 2023)]

# X is the fastest-lap time in seconds, y is the fastest-lap average speed.
X_train = train[['FastestLapTimeSeconds']].values
y_train = train['FastestLapAvgSpeed'].values
X_test = test[['FastestLapTimeSeconds']].values
y_test = test['FastestLapAvgSpeed'].values

# No scaling is applied; the *_s names are plain aliases of the raw arrays.
x_tr_s = X_train
x_test_s = X_test
y_tr_s = y_train
y_test_s = y_test

# Fit the linear regression and predict on the test set.
regr = LinearRegression()
regr.fit(x_tr_s, y_tr_s)
y_pred = regr.predict(x_test_s)

rmse = root_mean_squared_error(y_test, y_pred)
print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')
# Calling .round() on the metric result raised an error, so the rounding is
# done with a format spec instead.
print(f'RMSE: {rmse:.2f}')

fig, ax = plt.subplots(1, 2, figsize=(10, 10), sharey=True)
ax[0].scatter(x_tr_s, y_tr_s)
ax[0].plot(x_tr_s, regr.predict(x_tr_s), c='g')
ax[0].set_title('Fastest Lap vs Fastest Speed (Train)')
ax[0].set_xlabel('FastestLapTimeSeconds')
ax[0].set_ylabel('FastestLapAvgSpeed')

ax[1].scatter(x_test_s, y_test_s)
# Draw the fitted line over the test inputs so it spans the data shown in this
# panel (previously it was drawn over the training inputs).
ax[1].plot(x_test_s, y_pred, c='g')
ax[1].set_title('Fastest Lap vs Fastest Speed (Test)')
ax[1].set_xlabel('FastestLapTimeSeconds')
ax[1].set_ylabel('FastestLapAvgSpeed')
plt.suptitle('Linear Regression Model')

# TODO: plot y_pred vs y_test_s



In [None]:
# List the remaining columns of the cleaned frame.
renamed_race_results_df.columns

In [None]:
# Columns removed before building the multi-feature regression input.
non_feature_columns = [
    'raceName',
    'round',
    'date',
    'number',
    'laps',
    'status',
    'Driver.givenName',
    'Driver.familyName',
    'Constructor.name',
    'FastestLap.rank',
    'FastestLap.lap',
    'FastestLapTime',
    'FastestLap.AverageSpeed.units',
]
tmp_renamed_race_results_df = renamed_race_results_df.drop(columns=non_feature_columns)
tmp_renamed_race_results_df.tail()

In [None]:
# Missing values per column in the reduced frame.
tmp_na_counts = tmp_renamed_race_results_df.isna().sum()
print(tmp_na_counts)


In [None]:
# Preview the last rows of the reduced frame.
tmp_renamed_race_results_df.tail()

In [None]:
# LinearRegression, OneHotEncoder, root_mean_squared_error, numpy and
# matplotlib are all imported in the notebook's first cell, so nothing is
# re-imported here.

# One-hot encode the circuit name so it can be used as a regression feature.
enc = OneHotEncoder(sparse_output=False)
one_hot_encoded = enc.fit_transform(tmp_renamed_race_results_df[['circuitName']])

# Build the encoded frame on the source index so the column-wise concat below
# aligns row-for-row.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=enc.get_feature_names_out(['circuitName']),
    index=tmp_renamed_race_results_df.index,
)

encoded_renamed_race_results_df = pd.concat([tmp_renamed_race_results_df, one_hot_df], axis=1)

# Train on seasons 2014-2021 and evaluate on seasons 2022-2023 (bounds inclusive).
season_values = encoded_renamed_race_results_df['season']
train_2014_2021 = encoded_renamed_race_results_df[season_values.between(2014, 2021)]
test_2022_2023 = encoded_renamed_race_results_df[season_values.between(2022, 2023)]

# The raw circuit name is replaced by its one-hot columns.
train = train_2014_2021.drop(columns=['circuitName'])
test = test_2022_2023.drop(columns=['circuitName'])

# Keep the feature frames so plot inputs can be looked up by column name.
train_features = train.drop(columns=['FastestLapAvgSpeed'])
test_features = test.drop(columns=['FastestLapAvgSpeed'])

X_train = train_features.values
y_train = train['FastestLapAvgSpeed'].values
X_test = test_features.values
y_test = test['FastestLapAvgSpeed'].values

# No scaling is applied; the *_scaled names are plain aliases of the raw arrays.
X_train_scaled = X_train
X_test_scaled = X_test

regr = LinearRegression()
regr.fit(X_train_scaled, y_train)

# Predict on the test set.
y_pred = regr.predict(X_test_scaled)

# Print the intercept and the coefficients.
print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')

# RMSE, using the same metric helper as the other regression cells.
rmse = root_mean_squared_error(y_test, y_pred)
print(f'RMSE: {rmse}')

# Select the plotted features by name rather than by position in X_test, so
# the data always matches the axis and colorbar labels whatever the column order.
lap_time_test = test_features['FastestLapTimeSeconds'].values
grid_test = test_features['grid'].values

plt.scatter(lap_time_test, y_test, c=grid_test, cmap='viridis', label='Test Data')
plt.colorbar(label="Grid Position")
plt.scatter(lap_time_test, y_pred, c='red', alpha=0.6, label='Predictions')
plt.xlabel("FastestLapTimeSeconds")
plt.ylabel("FastestLapAvgSpeed")
plt.title("Predictions vs Test Data (Color: Grid Position)")
plt.legend()
plt.show()




In [None]:
# Predicted vs. actual target values; points on the dashed y = x line are
# exact predictions.
ax = sns.scatterplot(x=y_pred, y=y_test)
# Label the axes so the figure can be read without looking at the code.
ax.set(xlabel='Predicted FastestLapAvgSpeed', ylabel='Actual FastestLapAvgSpeed')
out = ax.axline((0, 0), slope=1, linewidth=.5, color='r', linestyle='dashed')

In [None]:
# LinearRegression, OneHotEncoder, root_mean_squared_error, numpy and
# matplotlib are all imported in the notebook's first cell, so nothing is
# re-imported here.

# Circuit this regression is restricted to.
circuit_name = 'Hungaroring'

# One-hot encode the circuit name.
enc = OneHotEncoder(sparse_output=False)
one_hot_encoded = enc.fit_transform(tmp_renamed_race_results_df[['circuitName']])

# Build the encoded frame on the source index so the column-wise concat below
# aligns row-for-row.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=enc.get_feature_names_out(['circuitName']),
    index=tmp_renamed_race_results_df.index,
)

encoded_renamed_race_results_df = pd.concat([tmp_renamed_race_results_df, one_hot_df], axis=1)

# Train on seasons 2014-2021 and evaluate on seasons 2022-2023 (bounds inclusive).
season_values = encoded_renamed_race_results_df['season']
train_2014_2021 = encoded_renamed_race_results_df[season_values.between(2014, 2021)]
test_2022_2023 = encoded_renamed_race_results_df[season_values.between(2022, 2023)]

# Keep only the selected circuit.
# NOTE: after this filter every one-hot circuit column is constant, so those
# columns carry no information for the fit.
train_2014_2021 = train_2014_2021[train_2014_2021['circuitName'] == circuit_name]
test_2022_2023 = test_2022_2023[test_2022_2023['circuitName'] == circuit_name]

train = train_2014_2021.drop(columns=['season', 'circuitName'])
test = test_2022_2023.drop(columns=['season', 'circuitName'])

# Keep the feature frames so plot inputs can be looked up by column name.
train_features = train.drop(columns=['FastestLapAvgSpeed'])
test_features = test.drop(columns=['FastestLapAvgSpeed'])

X_train = train_features.values
y_train = train['FastestLapAvgSpeed'].values
X_test = test_features.values
y_test = test['FastestLapAvgSpeed'].values

# No scaling is applied; the *_scaled names are plain aliases of the raw arrays.
X_train_scaled = X_train
X_test_scaled = X_test

regr = LinearRegression()
regr.fit(X_train_scaled, y_train)

# Predict on the test set.
y_pred = regr.predict(X_test_scaled)

# Print the intercept and the coefficients.
print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')

# RMSE, using the same metric helper as the other regression cells.
rmse = root_mean_squared_error(y_test, y_pred)
print(f'RMSE: {rmse}')

# Select the plotted features by name rather than by position in X_test, so
# the data always matches the axis and colorbar labels whatever the column order.
lap_time_test = test_features['FastestLapTimeSeconds'].values
grid_test = test_features['grid'].values

plt.scatter(lap_time_test, y_test, c=grid_test, cmap='viridis', label='Test Data')
plt.colorbar(label="Grid Position")
plt.scatter(lap_time_test, y_pred, c='red', alpha=0.6, label='Predictions')
plt.xlabel("FastestLapTimeSeconds")
plt.ylabel("FastestLapAvgSpeed")
plt.title("Predictions vs Test Data (Color: Grid Position)")
plt.legend()
plt.show()




In [None]:
# Predicted vs. actual target values; points on the dashed y = x line are
# exact predictions.
ax = sns.scatterplot(x=y_pred, y=y_test)
# Label the axes so the figure can be read without looking at the code.
ax.set(xlabel='Predicted FastestLapAvgSpeed', ylabel='Actual FastestLapAvgSpeed')
out = ax.axline((0, 0), slope=1, linewidth=.5, color='r', linestyle='dashed')

##### Grid vs Position

In [None]:
# Train on seasons 2014-2021 and evaluate on seasons 2022-2023 (bounds inclusive).
train = renamed_race_results_df[renamed_race_results_df['season'].between(2014, 2021)]
test = renamed_race_results_df[renamed_race_results_df['season'].between(2022, 2023)]

# X is the starting grid slot, y is the finishing position.
X_train = train[['grid']].values
y_train = train['position'].values
X_test = test[['grid']].values
y_test = test['position'].values

# No scaling is applied; the *_s names are plain aliases of the raw arrays.
x_tr_s = X_train
x_test_s = X_test
y_tr_s = y_train
y_test_s = y_test

# Fit the linear regression and predict on the test set.
regr = LinearRegression()
regr.fit(x_tr_s, y_tr_s)
y_pred = regr.predict(x_test_s)

rmse = root_mean_squared_error(y_test, y_pred)
print(f'Intercept: {regr.intercept_.round(2)}')
print(f'Coef: {regr.coef_.round(2)}')
# Calling .round() on the metric result raised an error, so the rounding is
# done with a format spec instead.
print(f'RMSE: {rmse:.2f}')

fig, ax = plt.subplots(1, 2, figsize=(10, 10), sharey=True)
ax[0].scatter(x_tr_s, y_tr_s)
ax[0].plot(x_tr_s, regr.predict(x_tr_s), c='g')
# Titles now name the variables actually plotted (grid vs. position).
ax[0].set_title('Grid vs Position (Train)')
ax[0].set_xlabel('Grid')
ax[0].set_ylabel('Position')

ax[1].scatter(x_test_s, y_test_s)
# Draw the fitted line over the test inputs so it spans the data shown in this
# panel (previously it was drawn over the training inputs).
ax[1].plot(x_test_s, y_pred, c='g')
ax[1].set_title('Grid vs Position (Test)')
ax[1].set_xlabel('Grid')
ax[1].set_ylabel('Position')
plt.suptitle('Linear Regression Model')

