In [194]:
#imports 
import copy
import os
from pathlib import Path
import numpy as np 
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [185]:
'''
Gather data together 
'''


save_directory = '/Users/owainthorp/Documents/Coding/Project/Data/'
file_name = 'giro-d-italia 2000-2024.csv'
data_path = os.path.join(save_directory, file_name)

df = pd.read_csv(data_path)
X_giro = df.iloc[:, 3:7].values  # Features
y_giro = df.iloc[:, -1].values

save_directory = '/Users/owainthorp/Documents/Coding/Project/Data/'
file_name = 'tour-de-france 2000-2023.csv'
data_path = os.path.join(save_directory, file_name)

df = pd.read_csv(data_path)
X_tour = df.iloc[:, 3:7].values  # Features
y_tour = df.iloc[:, -1].values

save_directory = '/Users/owainthorp/Documents/Coding/Project/Data/'
file_name = 'vuelta-a-espana 1994-2023.csv'
data_path = os.path.join(save_directory, file_name)

df = pd.read_csv(data_path)
X_vuelta = df.iloc[:, 3:7].values  # Features
y_vuelta = df.iloc[:, -1].values

# Stack all X arrays vertically
X = np.vstack([X_giro, X_tour, X_vuelta])

# Stack all y arrays vertically
y = np.concatenate([y_giro, y_tour, y_vuelta])

In [186]:
#remove unprocessable data instances 
def data_clean(data, results):
    

    row_hit_list = []

    for r in range(len(data)):
        
        if 'Error' in data[r, :4]:
            row_hit_list.append(r)
            
        if results[r] == 'Error':
            row_hit_list.append(r)
    
    rows_hit_list = list(set(row_hit_list))#simple way to remove duplicates 
    
    
    if rows_hit_list:
        data = np.delete(data, rows_hit_list, axis=0)
        results = np.delete(results, rows_hit_list, axis=0) 

        
    return data, results


In [245]:
# Stack all X arrays vertically
X = np.vstack([X_giro, X_tour, X_vuelta])

# Stack all y arrays vertically
y = np.concatenate([y_giro, y_tour, y_vuelta])

X, y = data_clean(X,y)


X = X.astype(np.float32)
y = y.astype(np.float32)


#normalise the data 
scaler = StandardScaler()
scaler.fit(X)
X_normalised = scaler.transform(X)

np.random.seed(42)

#turn into a tensorflow object and shuffle 
indices = np.arange(len(X_normalised))
np.random.shuffle(indices)

X_normalised, y = X_normalised[indices], y[indices]

#seperate into training, validation and test
X_train, X_test, y_train, y_test = train_test_split(X_normalised, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [246]:
l1_lambda = 0.0001
reg =tf.keras.regularizers.l1(l1_lambda)


model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(100, activation='relu', input_shape = (4,), kernel_regularizer = reg),
    tf.keras.layers.Dense(100, activation='relu',  kernel_regularizer = reg),
    tf.keras.layers.Dense(100, activation='relu',  kernel_regularizer = reg),
    tf.keras.layers.Dense(100, activation='relu',  kernel_regularizer = reg),
    tf.keras.layers.Dense(1)
])

#early stopping
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience = 10, restore_best_weights = True)

#optimizer 
optimizer = tf.keras.optimizers.AdamW(learning_rate = 1e-5)

#compile the model 
model.compile(loss='mean_squared_error',  
              metrics=['mean_absolute_error'], 
              optimizer = optimizer)



history = model.fit(X_train, y_train, epochs=500,
                   validation_data = (X_val, y_val),
                   callbacks = [early_stopping_cb])

Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1553.2169 - mean_absolute_error: 16.1868 - val_loss: 1566.4324 - val_mean_absolute_error: 16.6859
Epoch 2/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1442.5444 - mean_absolute_error: 15.7564 - val_loss: 1565.5060 - val_mean_absolute_error: 16.6842
Epoch 3/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1674.5521 - mean_absolute_error: 17.1807 - val_loss: 1564.5723 - val_mean_absolute_error: 16.6828
Epoch 4/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1345.0659 - mean_absolute_error: 16.2029 - val_loss: 1563.5961 - val_mean_absolute_error: 16.6810
Epoch 5/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1563.5530 - mean_absolute_error: 16.6140 - val_loss: 1562.5984 - val_mean_absolute_error: 16.6793
Epoch 6/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

Epoch 44/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1393.1252 - mean_absolute_error: 17.3411 - val_loss: 1407.3114 - val_mean_absolute_error: 16.7530
Epoch 45/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1380.1547 - mean_absolute_error: 16.7458 - val_loss: 1398.6198 - val_mean_absolute_error: 16.7716
Epoch 46/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1637.0402 - mean_absolute_error: 17.9165 - val_loss: 1390.4913 - val_mean_absolute_error: 16.7911
Epoch 47/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1268.8802 - mean_absolute_error: 15.7508 - val_loss: 1382.5085 - val_mean_absolute_error: 16.8078
Epoch 48/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1517.1055 - mean_absolute_error: 17.8521 - val_loss: 1374.3695 - val_mean_absolute_error: 16.8281
Epoch 49/500
[1m33/33[0m [32m━━━━━━━━

Epoch 87/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1271.9626 - mean_absolute_error: 19.1463 - val_loss: 1123.8789 - val_mean_absolute_error: 18.0747
Epoch 88/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1093.2693 - mean_absolute_error: 18.5355 - val_loss: 1121.2302 - val_mean_absolute_error: 18.0939
Epoch 89/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1365.0917 - mean_absolute_error: 18.4302 - val_loss: 1119.4618 - val_mean_absolute_error: 18.0923
Epoch 90/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 992.8283 - mean_absolute_error: 18.2041 - val_loss: 1117.5315 - val_mean_absolute_error: 18.0983
Epoch 91/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 872.3719 - mean_absolute_error: 17.5277 - val_loss: 1115.3865 - val_mean_absolute_error: 18.1114
Epoch 92/500
[1m33/33[0m [32m━━━━━━━━━━

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 874.7874 - mean_absolute_error: 17.2328 - val_loss: 1090.4049 - val_mean_absolute_error: 17.9029
Epoch 130/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1546.0294 - mean_absolute_error: 19.3314 - val_loss: 1090.2677 - val_mean_absolute_error: 17.8972
Epoch 131/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1145.9391 - mean_absolute_error: 18.2800 - val_loss: 1090.1121 - val_mean_absolute_error: 17.8910
Epoch 132/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1021.2026 - mean_absolute_error: 17.8053 - val_loss: 1090.1360 - val_mean_absolute_error: 17.8668
Epoch 133/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1011.1458 - mean_absolute_error: 17.8964 - val_loss: 1090.1881 - val_mean_absolute_error: 17.8419
Epoch 134/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━

In [247]:
model.evaluate(X_test, y_test)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 663us/step - loss: 9165.9355 - mean_absolute_error: 23.6918


[8831.6787109375, 24.361371994018555]