# Bagging and Random Forests

In this notebook, we use the Random Forest algorithm to predict whether a cyclist will finish in the top 20 of a race.

We start by loading the data and preparing the train set and the test set.

In [1]:
import pandas as pd
from os import path
import numpy as np

# Both cleaned CSV files live in the sibling `dataset` directory.
dataset_dir = path.join('..', 'dataset')
races_final_path = path.join(dataset_dir, 'races_cleaned.csv')
cyclists_final_path = path.join(dataset_dir, 'cyclists_cleaned.csv')

# Read each table into its own DataFrame.
cyclists_data = pd.read_csv(cyclists_final_path)
races_data = pd.read_csv(races_final_path)

In [2]:

# Name of the binary target and the first day that belongs to the test period.
TARGET = 'top_20'
SPLIT_DATE = pd.Timestamp('2022-01-01')

# Rename the cyclists' `name` column to `cyclist`; rebinding (instead of
# `inplace=True`) avoids mutating the frame loaded by the previous cell.
cyclists_data = cyclists_data.rename(columns={'name': 'cyclist'})

# Attach cyclist attributes to every race result. The inner join drops
# results whose `cyclist` value has no matching `_url` in the cyclists table.
merged_data = races_data.merge(cyclists_data, left_on='cyclist', right_on='_url', how='inner')

# Binary target: 1 when the recorded position is <= 20, otherwise 0.
# Vectorized comparison; missing positions compare False and therefore map to 0.
merged_data[TARGET] = (merged_data['position'] <= 20).astype(int)

# Parse the dates so the chronological split below compares timestamps.
merged_data['date'] = pd.to_datetime(merged_data['date'])

# NOTE(review): `delta` (and possibly `points`) look like values that are only
# known after the race has finished; if so they leak the target — confirm
# before trusting the reported scores.
columns_to_keep = [
    'points', 'length', 'profile', 'startlist_quality', 'cyclist_age',
    'is_tarmac', 'delta', TARGET, 'weight', 'height'
]

# Chronological hold-out: rows dated before SPLIT_DATE train the model, rows
# on or after it are used for testing. Rows with a missing date satisfy
# neither comparison and end up in neither set.
train_set = merged_data.loc[merged_data['date'] < SPLIT_DATE, columns_to_keep]
test_set = merged_data.loc[merged_data['date'] >= SPLIT_DATE, columns_to_keep]

# Separate the feature matrix from the target for both partitions.
X_train = train_set.drop(columns=[TARGET])
y_train = train_set[TARGET]

X_test = test_set.drop(columns=[TARGET])
y_test = test_set[TARGET]


Now that the data has been split, we tune a random forest on the train set with a randomized hyperparameter search (3-fold cross-validation, scored by weighted F1) and keep the best model.

Finally, we will evaluate the model using the test set.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


# One seed for both the forest and the search sampler, so that re-running the
# notebook from a fresh kernel selects and fits the same model.
RANDOM_STATE = 42

# Hyperparameter search space
param_dist = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 2, 3],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'bootstrap': [True],
    'class_weight': [None, 'balanced']
}


random_search = RandomizedSearchCV(
    # Seeding the estimator matters: without it every fit draws different
    # bootstrap samples and feature subsets, so scores change between runs.
    estimator=RandomForestClassifier(random_state=RANDOM_STATE),
    param_distributions=param_dist,
    n_iter=20,  # Limits the number of sampled combinations
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=RANDOM_STATE
)

# Run the search on the training data
random_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = random_search.best_params_

# Best model, refitted on the whole training set
best_model = random_search.best_estimator_

# TODO: write down the selected hyperparameters


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=20, max_features=2, min_samples_leaf=5, min_samples_split=2, n_estimators=50; total time=  36.3s
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=20, max_features=2, min_samples_leaf=5, min_samples_split=2, n_estimators=50; total time=  40.7s
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=10, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=10, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100; total time= 1.2min
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=20, max_features=2, min_samples_leaf=5, min_samples_split=2, n_estimators=50; total time=  37.1s
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=10, max_f

KeyboardInterrupt: 

In [None]:
# Show the hyperparameter combination selected by the randomized search.
print(best_params)

In [None]:
# Predict the top-20 label for every test-set row with the tuned forest.
test_pred_rf = best_model.predict(X_test)

In [None]:
#metrics computed on the test set
from sklearn.metrics import classification_report
def report_scores(test_label, test_pred):
    """Print a per-class classification report for binary 0/1 labels.

    Args:
        test_label: Ground-truth labels (0 or 1).
        test_pred: Predicted labels, aligned element-wise with ``test_label``.

    Returns:
        None. The report is written to standard output.
    """
    # Pin the label set explicitly: without `labels`, the report infers the
    # classes from the data and raises ValueError when only one class is
    # present, because two target names are supplied.
    print(classification_report(test_label,
                                test_pred,
                                labels=[0, 1],
                                target_names=['0', '1']))

In [None]:


# Print per-class precision, recall and F1 of the tuned forest on the test set.
report_scores(y_test, test_pred_rf)



              precision    recall  f1-score   support

           0       0.94      0.88      0.91     30219
           1       0.47      0.65      0.55      5187

    accuracy                           0.84     35406
   macro avg       0.70      0.76      0.73     35406
weighted avg       0.87      0.84      0.85     35406

