# task no. 4

This is the notebook taking care of the task no.4 : feature prediction.

In our analysis we have decided very different models to see how they differ from eachother in term of performance and also their explainability to see if they make sense.

For this notebook we decided to explore the nature of NeuralNetworks on the dataset.

First a preparation is due to be used.

## dataset preparation



In [4]:
!git clone https://mirdan08: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@github.com/DadeOrsu/dm_project24_group_6

Cloning into 'dm_project24_group_6'...
remote: Enumerating objects: 1064, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 1064 (delta 53), reused 61 (delta 27), pack-reused 971 (from 1)[K
Receiving objects: 100% (1064/1064), 47.34 MiB | 702.00 KiB/s, done.
Resolving deltas: 100% (702/702), done.


In [8]:
cd dm_project24_group_6/src/task4_prediction/

/content/dm_project24_group_6/src/task4_prediction


In [9]:
import pandas as pd
import os

races_final_path = os.path.join('..','dataset', 'engineered_races.csv')
cyclists_final_path = os.path.join('..','dataset', 'cyclists_final_enhanced.csv')


cyclists_data = pd.read_csv(cyclists_final_path)
races_data = pd.read_csv(races_final_path)




First we binarize the columns

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

cyclists_data.rename(columns={'name': 'cyclist'}, inplace=True)


merged_data = races_data.merge(cyclists_data, left_on='cyclist', right_on='_url', how='inner')

merged_data['top_20'] = merged_data['position'].apply(lambda x: 1 if x <= 20 else 0)


merged_data['date'] = pd.to_datetime(merged_data['date'])

columns_to_keep = [

    'bmi','career_points','career_duration(days)','debut_year', # cyclists features
    'points','difficulty_score','competitive_age','climbing_efficiency', # races features
    'top_20'# target feature
]

train_set = merged_data[merged_data['date'] < '2022-01-01']
test_set = merged_data[merged_data['date'] >= '2022-01-01']
std_scaler= StandardScaler()



test_set = test_set[columns_to_keep]
train_set = train_set[columns_to_keep]

X_dev = train_set.drop(columns=['top_20'])

X_test = test_set.drop(columns=['top_20'])
y_test = test_set['top_20']

X_test= std_scaler.fit_transform(X_test)
X_dev= std_scaler.fit_transform(X_dev)

y_dev = train_set['top_20']
X_train,X_val,Y_train,Y_val=train_test_split(
    X_dev,y_dev,
    test_size=0.2,
    random_state=42,
    stratify=y_dev
    )

first a stratification can only help the generization capabilities.

Now we have to setup the task, for this kind of setting the binary cross entropy is the most appropriate given we just want to classify stuff and we are not doing any regression whatsoever.

A first test using a simple NN might be usefull in this case to see the most basic algorithm.

In [11]:
import tensorflow as tf

from keras import layers, models, initializers
from keras.optimizers import Adam, SGD
import itertools as it
from keras.callbacks import EarlyStopping
from keras.initializers import GlorotUniform, GlorotNormal,HeNormal,HeUniform
initializer=initializers.HeNormal()


def get_device_auto():
    gpus_list=tf.config.list_physical_devices('GPU')
    device = None
    if len(gpus_list) != 0:
        device=gpus_list[0]
    else:
        device=tf.config.list_physical_devices('CPU')[0]
    return device

def create_ff_nn(
        optimizer=Adam(),
        num_layers=2,
        num_units=64,
        input_dim=256,
        hidden_activation='relu',
        output_activation='sigmoid',
        loss_function='binary_crossentropy',
        metrics=['accuracy','f1_score','binary_crossentropy'],
        learning_rate=0.001
        ):
    model=models.Sequential()
    optimizer = Adam()
    model.add(layers.Dense(num_units,input_dim=input_dim,activation=hidden_activation))
    for _ in range(num_layers -1):
        model.add(layers.Dense(
            num_units,
            activation=hidden_activation,
            kernel_initializer=HeNormal()
            ))
    model.add(layers.Dense(
        1,
        activation=output_activation,
        kernel_initializer=GlorotNormal()
        ))
    optimizer.learning_rate=learning_rate
    model.compile(
        optimizer=optimizer,
        loss=loss_function,
        metrics=metrics
    )
    return model

def hyperparams_iterator(hyperparams):
    return map(
        lambda comb:  {k:v for k,v in zip(hyperparams.keys(),comb)},
        it.product(*hyperparams.values())
    )

early_stopping=EarlyStopping(
    monitor='f1_score',
    patience=5,
    verbose=1,
    restore_best_weights=True
)
hyperparams={
    'num_layers':[10,15,20,30],
    'learning_rate':[0.001,0.0001,0.00001],
    'num_units':[1024]
}

device=get_device_auto()
batch_size=1024
tf.random.set_seed(42)
best_val=float('-inf')

In [12]:
results=[]

with tf.device(device.device_type):
    for params in hyperparams_iterator(hyperparams):
        model=create_ff_nn(**params,input_dim=X_train.shape[1])
        model.fit(
            X_train,Y_train,
            batch_size=batch_size,
            validation_data=(X_val,Y_val),
            callbacks=[early_stopping]
            )
        new_row=params

        eval_results=model.evaluate(X_train,Y_train,batch_size=batch_size,return_dict=True)
        f1_score,accuracy,bin_cross_ent=eval_results['f1_score'],eval_results['accuracy'],eval_results['binary_crossentropy']
        new_row|={
            'f1_score_train':f1_score,
            'accuracy_train':accuracy,
            'bin_cross_ent_train':bin_cross_ent,
            }
        eval_results=model.evaluate(X_val,Y_val,batch_size=batch_size,return_dict=True)
        f1_score,accuracy,bin_cross_ent=eval_results['f1_score'],eval_results['accuracy'],eval_results['binary_crossentropy']
        new_row|={
            'f1_score_val':f1_score,
            'accuracy_val':accuracy,
            'bin_cross_ent_val':bin_cross_ent,
            }
        if bin_cross_ent < best_val:
            best_val = bin_cross_ent
            model.save('weights/best_ff_nn.h5')
        print(new_row)
        results.append(new_row)
pd_results=pd.DataFrame(results)

pd_results.sort_values(by='bin_cross_ent_val')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 2s/step - accuracy: 0.8089 - binary_crossentropy: 0.4637 - f1_score: 0.2894 - loss: 0.4637 - val_accuracy: 0.8433 - val_binary_crossentropy: 0.4042 - val_f1_score: 0.2904 - val_loss: 0.4042
Restoring model weights from the end of the best epoch: 1.
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 417ms/step - accuracy: 0.8422 - binary_crossentropy: 0.4054 - f1_score: 0.2920 - loss: 0.4054
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 401ms/step - accuracy: 0.8444 - binary_crossentropy: 0.4030 - f1_score: 0.2911 - loss: 0.4030
{'num_layers': 10, 'learning_rate': 0.001, 'num_units': 1024, 'f1_score_train': 0.29039040207862854, 'accuracy_train': 0.8430845141410828, 'bin_cross_ent_train': 0.4029702842235565, 'f1_score_val': 0.2903754711151123, 'accuracy_val': 0.8432633876800537, 'bin_cross_ent_val': 0.4042336344718933}
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 

Unnamed: 0,num_layers,learning_rate,num_units,f1_score_train,accuracy_train,bin_cross_ent_train,f1_score_val,accuracy_val,bin_cross_ent_val
7,20,0.0001,1024,0.29039,0.845558,0.398041,0.290375,0.844345,0.402232
3,15,0.001,1024,0.29039,0.844389,0.401512,0.290375,0.84329,0.403358
0,10,0.001,1024,0.29039,0.843085,0.40297,0.290375,0.843263,0.404234
10,30,0.0001,1024,0.29039,0.843835,0.402839,0.290375,0.843101,0.406893
1,10,0.0001,1024,0.29039,0.845369,0.403917,0.290375,0.844588,0.407851
11,30,1e-05,1024,0.29039,0.842929,0.403783,0.290375,0.841804,0.408799
4,15,0.0001,1024,0.29039,0.846085,0.404292,0.290375,0.844858,0.40933
8,20,1e-05,1024,0.29039,0.839989,0.410571,0.290375,0.838992,0.413673
6,20,0.001,1024,0.29039,0.830142,0.414197,0.290375,0.830152,0.414013
5,15,1e-05,1024,0.29039,0.83615,0.417958,0.290375,0.835775,0.419747
