# task no. 4

This is the notebook taking care of the task no.4 : feature prediction.

In our analysis we have decided very different models to see how they differ from eachother in term of performance and also their explainability to see if they make sense.

For this notebook we decided to explore the nature of NeuralNetworks on the dataset.

First a preparation is due to be used.

## dataset preparation



In [1]:
import pandas as pd
import os

races_final_path = os.path.join('..','dataset', 'engineered_races.csv')
cyclists_final_path = os.path.join('..','dataset', 'cyclists_final_enhanced.csv')


cyclists_data = pd.read_csv(cyclists_final_path)
races_data = pd.read_csv(races_final_path)




First we binarize the columns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

cyclists_data.rename(columns={'name': 'cyclist'}, inplace=True)


merged_data = races_data.merge(cyclists_data, left_on='cyclist', right_on='_url', how='inner')

merged_data['top_20'] = merged_data['position'].apply(lambda x: 1 if x <= 20 else 0)


merged_data['date'] = pd.to_datetime(merged_data['date'])

columns_to_keep = [

    'bmi','career_points','career_duration(days)','debut_year', # cyclists features
    'points','difficulty_score','competitive_age','climbing_efficiency', # races features
    'top_20'# target feature
]

train_set = merged_data[merged_data['date'] < '2022-01-01']
test_set = merged_data[merged_data['date'] >= '2022-01-01']
std_scaler= StandardScaler()



test_set = test_set[columns_to_keep]
train_set = train_set[columns_to_keep]

X_dev = train_set.drop(columns=['top_20'])

X_test = test_set.drop(columns=['top_20'])
y_test = test_set['top_20']

X_test= std_scaler.fit_transform(X_test)
X_dev= std_scaler.fit_transform(X_dev)

y_dev = train_set['top_20']
X_train,X_val,Y_train,Y_val=train_test_split(
    X_dev,y_dev,
    test_size=0.2,
    random_state=42,
    stratify=y_dev
    )

first a stratification can only help the generization capabilities.

Now we have to setup the task, for this kind of setting the binary cross entropy is the most appropriate given we just want to classify stuff and we are not doing any regression whatsoever.

A first test using a simple NN might be usefull in this case to see the most basic algorithm.

In [None]:
import tensorflow as tf

from keras import layers, models, initializers
from keras.optimizers import Adam, SGD
import itertools as it
from keras.callbacks import EarlyStopping
from keras.initializers import GlorotUniform, GlorotNormal,HeNormal,HeUniform
initializer=initializers.HeNormal()


def get_device_auto():
    gpus_list=tf.config.list_physical_devices('GPU')
    device = None
    if len(gpus_list) != 0:
        device=gpus_list[0]
    else:
        device=tf.config.list_physical_devices('CPU')[0]
    return device

def create_ff_nn(
        optimizer=Adam(),
        num_layers=2,
        num_units=64,
        input_dim=256,
        hidden_activation='relu',
        output_activation='sigmoid',
        loss_function='binary_crossentropy',
        metrics=['accuracy','f1_score','binary_crossentropy'],
        learning_rate=0.001
        ):
    model=models.Sequential()
    optimizer = Adam()
    model.add(layers.Dense(num_units,input_dim=input_dim,activation=hidden_activation))
    for _ in range(num_layers -1):
        model.add(layers.Dense(
            num_units,
            activation=hidden_activation,
            kernel_initializer=HeNormal()
            ))
    model.add(layers.Dense(
        1,
        activation=output_activation,
        kernel_initializer=GlorotNormal()
        ))
    optimizer.learning_rate=learning_rate
    model.compile(
        optimizer=optimizer,
        loss=loss_function,
        metrics=metrics
    )
    return model

def hyperparams_iterator(hyperparams):
    return map(
        lambda comb:  {k:v for k,v in zip(hyperparams.keys(),comb)},
        it.product(*hyperparams.values())
    )

early_stopping=EarlyStopping(
    monitor='f1_score',
    patience=5,
    verbose=1,
    restore_best_weights=True
)
hyperparams={
    'num_layers':[5,10,15,20],
    'learning_rate':[0.001,0.0001],
    'num_units':[1024,2048,4096]
}

device=get_device_auto()
batch_size=1024
tf.random.set_seed(42)
best_val=float('-inf')

2024-12-10 23:55:26.829562: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-10 23:55:26.838358: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-10 23:55:26.845097: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-10 23:55:26.846984: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-10 23:55:26.852384: I tensorflow/core/platform/cpu_feature_guar

In [4]:
results=[]

with tf.device(device.device_type):
    for params in hyperparams_iterator(hyperparams):
        model=create_ff_nn(**params,input_dim=X_train.shape[1])
        model.fit(
            X_train,Y_train,
            batch_size=batch_size,
            validation_data=(X_val,Y_val),
            callbacks=[early_stopping]
            )
        new_row=params
        
        eval_results=model.evaluate(X_train,Y_train,batch_size=batch_size,return_dict=True)
        f1_score,accuracy,bin_cross_ent=eval_results['f1_score'],eval_results['accuracy'],eval_results['binary_crossentropy']
        new_row|={
            'f1_score_train':f1_score,
            'accuracy_train':accuracy,
            'bin_cross_ent_train':bin_cross_ent,
            }
        eval_results=model.evaluate(X_val,Y_val,batch_size=batch_size,return_dict=True)
        f1_score,accuracy,bin_cross_ent=eval_results['f1_score'],eval_results['accuracy'],eval_results['binary_crossentropy']
        new_row|={
            'f1_score_val':f1_score,
            'accuracy_val':accuracy,
            'bin_cross_ent_val':bin_cross_ent,
            }
        if bin_cross_ent < best_val:
            best_val = bin_cross_ent
            model.save('weights/best_ff_nn.h5')
        print(new_row)
        results.append(new_row)
pd_results=pd.DataFrame(results)

pd_results.sort_values(by='bin_cross_ent_val')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1733871329.613099   11143 service.cc:146] XLA service 0x7f3b7401a670 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733871329.613135   11143 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Ti, Compute Capability 8.9
2024-12-10 23:55:29.632070: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-10 23:55:29.741586: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 90101












[1m 51/434[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 3ms/step - accuracy: 0.7866 - binary_crossentropy: 0.4739 - f1_score: 0.2841 - loss: 0.4725

I0000 00:00:1733871332.264390   11143 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m421/434[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.8319 - binary_crossentropy: 0.4218 - f1_score: 0.2889 - loss: 0.4216









[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8323 - binary_crossentropy: 0.4213 - f1_score: 0.2889 - loss: 0.4213








[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.8323 - binary_crossentropy: 0.4212 - f1_score: 0.2889 - loss: 0.4212 - val_accuracy: 0.8447 - val_binary_crossentropy: 0.3988 - val_f1_score: 0.2894 - val_loss: 0.3988
Restoring model weights from the end of the best epoch: 1.
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8455 - binary_crossentropy: 0.3952 - f1_score: 0.2883 - loss: 0.3952
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8445 - binary_crossentropy: 0.3986 - f1_score: 0.2898 - loss: 0.3986
{'num_layers': 5, 'learning_rate': 0.001, 'num_units': 1024, 'f1_score_train': 0.289430171251297, 'accuracy_train': 0.8448644876480103, 'bin_cross_ent_train': 0.39629536867141724, 'f1_score_val': 0.28942960500717163, 'accuracy_val': 0.8447408080101013, 'bin_cross_ent_val': 0.3987657427787781}


ValueError: Unknown variable: <KerasVariable shape=(8, 2048), dtype=float32, path=sequential_1/dense_6/kernel>. This optimizer can only be called for the variables it was originally built with. When working with a new set of variables, you should recreate a new optimizer instance.