# Training

Per ogni dataset preprocessato viene allenato un classificatore XGBoost.
Per trovare il modello migliore tra le possibili configurazioni di iperparametri, viene eseguita una ricerca casuale (random search) con 10 iterazioni nello spazio degli iperparametri.
Ogni configurazione viene valutata con cross validation a 5 fold, per un totale di 50 modelli allenati per dataset.

In [1]:
import numpy as np 
import os
import pandas as pd
import pickle
from modules.utils import load_train_ds
from modules.utils import custom_flatten

In [2]:
import xgboost as xgb 
from sklearn.model_selection import RandomizedSearchCV

def xgb_cross_val(X, Y, iter, cv, njobs):
    """Run a randomized hyperparameter search over an XGBoost classifier.

    Parameters
    ----------
    X
        Training features, forwarded unchanged to ``RandomizedSearchCV.fit``.
    Y
        Training labels, forwarded unchanged to ``RandomizedSearchCV.fit``.
    iter
        Number of parameter settings to sample (passed as ``n_iter``).
    cv
        Cross-validation setting (passed as ``cv``).
    njobs
        Number of parallel jobs (passed as ``n_jobs``).

    Returns
    -------
    Whatever ``RandomizedSearchCV.fit`` returns for the configured search.
    """
    classifier = xgb.XGBClassifier()
    param_grid = {
        # Fixed settings: single-element lists, so every sampled candidate shares them.
        'objective': ['multi:softmax'],
        'eval_metric': ['mlogloss'],
        # NOTE(review): XGBoost documents this parameter as `num_class`; the
        # `num_classes` spelling is kept as-is here — confirm it is not silently ignored.
        'num_classes': [4],
        'random_state': [42],
        'verbosity': [0],
        # Values actually explored by the random search.
        'max_depth': [25, 50, 75, 100],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'n_estimators': [100, 500, 1000],
    }

    rs_clf = RandomizedSearchCV(
        classifier,
        param_grid,
        n_iter=iter,
        scoring='accuracy',
        n_jobs=njobs,
        verbose=5,
        cv=cv,
        random_state=42,  # fixed seed: the same candidates are sampled on every run
    )

    # Fit on the arguments, not on notebook globals, so the function works
    # regardless of which variables happen to exist in the kernel.
    results = rs_clf.fit(X, Y)
    return results


## block4_pool

In [3]:
# Load the training data stored under data/block4_pool and run the
# features through custom_flatten before handing them to the classifier.
dataset_path = os.path.join(os.getcwd(), 'data', 'block4_pool')
x_train, y_train = load_train_ds(dataset_path)
x_train = custom_flatten(x_train)

In [4]:
# 10 sampled configurations x 5 folds, evaluated with 5 parallel workers.
results = xgb_cross_val(x_train, y_train, iter=10, cv=5, njobs=5)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed: 16.3min
[Parallel(n_jobs=5)]: Done  50 out of  50 | elapsed: 160.4min finished


In [5]:
# Save the random search results.
# refit is left at its default (True) by xgb_cross_val, so the pickled
# search object also contains the best estimator.
results_dir = os.path.join(os.getcwd(), 'results')
# Create the output folder first: a missing directory would otherwise throw
# away the outcome of a multi-hour search.
os.makedirs(results_dir, exist_ok=True)
with open(os.path.join(results_dir, 'results_block4_pool'), 'wb') as f:
    pickle.dump(results, f)

In [29]:
# One row per sampled configuration: its hyperparameters next to the
# mean cross-validated accuracy.
pd.concat(
    [
        pd.DataFrame(results.cv_results_["params"]),
        pd.DataFrame(results.cv_results_["mean_test_score"], columns=["Accuracy"]),
    ],
    axis=1,
)

Unnamed: 0,verbosity,subsample,reg_lambda,random_state,objective,num_classes,n_estimators,min_child_weight,max_depth,learning_rate,gamma,eval_metric,colsample_bytree,colsample_bylevel,Accuracy
0,0,0.9,100.0,42,multi:softmax,4,100,5.0,25,0.01,1.0,mlogloss,0.4,1.0,0.920124
1,0,0.5,0.1,42,multi:softmax,4,1000,10.0,100,0.3,1.0,mlogloss,0.4,1.0,0.949377
2,0,0.9,10.0,42,multi:softmax,4,1000,1.0,25,0.3,0.25,mlogloss,0.7,1.0,0.948071
3,0,1.0,10.0,42,multi:softmax,4,500,1.0,25,0.2,0.5,mlogloss,0.8,0.8,0.954728
4,0,0.5,0.1,42,multi:softmax,4,100,1.0,75,0.1,0.0,mlogloss,0.6,0.4,0.964053
5,0,0.5,5.0,42,multi:softmax,4,500,10.0,50,0.1,0.0,mlogloss,0.4,0.6,0.961377
6,0,1.0,0.1,42,multi:softmax,4,1000,1.0,25,0.1,0.0,mlogloss,0.5,1.0,0.950728
7,0,0.9,1.0,42,multi:softmax,4,100,10.0,75,0.001,0.25,mlogloss,0.6,0.8,0.932088
8,0,0.5,10.0,42,multi:softmax,4,1000,3.0,100,0.1,0.5,mlogloss,0.7,0.9,0.961377
9,0,0.9,10.0,42,multi:softmax,4,100,1.0,50,0.2,1.0,mlogloss,0.4,0.7,0.948071


## block5_pool

In [5]:
# Load the training data stored under data/block5_pool and run the
# features through custom_flatten before handing them to the classifier.
dataset_path = os.path.join(os.getcwd(), 'data', 'block5_pool')
x_train, y_train = load_train_ds(dataset_path)
x_train = custom_flatten(x_train)

In [4]:
# 10 sampled configurations x 5 folds, evaluated with 10 parallel workers.
results = xgb_cross_val(x_train, y_train, iter=10, cv=5, njobs=10)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  42 out of  50 | elapsed: 49.5min remaining:  9.4min
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 55.7min finished


In [None]:
# Save the random search results under results/, the same folder used for
# the other datasets (this cell previously wrote to the working directory).
results_dir = os.path.join(os.getcwd(), 'results')
# Create the output folder first so the dump cannot fail on a missing directory.
os.makedirs(results_dir, exist_ok=True)
with open(os.path.join(results_dir, 'results_block5_pool'), 'wb') as f:
    pickle.dump(results, f)

In [11]:
# One row per sampled configuration: its hyperparameters next to the
# mean cross-validated accuracy.
pd.concat(
    [
        pd.DataFrame(results.cv_results_["params"]),
        pd.DataFrame(results.cv_results_["mean_test_score"], columns=["Accuracy"]),
    ],
    axis=1,
)

Unnamed: 0,verbosity,subsample,reg_lambda,random_state,objective,num_classes,n_estimators,min_child_weight,max_depth,learning_rate,gamma,eval_metric,colsample_bytree,colsample_bylevel,Accuracy
0,0,0.9,100.0,42,multi:softmax,4,100,5.0,25,0.01,1.0,mlogloss,0.4,1.0,0.906799
1,0,0.5,0.1,42,multi:softmax,4,1000,10.0,100,0.3,1.0,mlogloss,0.4,1.0,0.942737
2,0,0.9,10.0,42,multi:softmax,4,1000,1.0,25,0.3,0.25,mlogloss,0.7,1.0,0.941413
3,0,1.0,10.0,42,multi:softmax,4,500,1.0,25,0.2,0.5,mlogloss,0.8,0.8,0.937404
4,0,0.5,0.1,42,multi:softmax,4,100,1.0,75,0.1,0.0,mlogloss,0.6,0.4,0.956044
5,0,0.5,5.0,42,multi:softmax,4,500,10.0,50,0.1,0.0,mlogloss,0.4,0.6,0.936088
6,0,1.0,0.1,42,multi:softmax,4,1000,1.0,25,0.1,0.0,mlogloss,0.5,1.0,0.941413
7,0,0.9,1.0,42,multi:softmax,4,100,10.0,75,0.001,0.25,mlogloss,0.6,0.8,0.909475
8,0,0.5,10.0,42,multi:softmax,4,1000,3.0,100,0.1,0.5,mlogloss,0.7,0.9,0.946746
9,0,0.9,10.0,42,multi:softmax,4,100,1.0,50,0.2,1.0,mlogloss,0.4,0.7,0.946728


## block2_pool

In [3]:
# Load the training data stored under data/block2_pool and run the
# features through custom_flatten before handing them to the classifier.
dataset_path = os.path.join(os.getcwd(), 'data', 'block2_pool')
x_train, y_train = load_train_ds(dataset_path)
x_train = custom_flatten(x_train)

In [4]:
# 10 sampled configurations x 5 folds, evaluated sequentially (single worker).
results = xgb_cross_val(x_train, y_train, iter=10, cv=5, njobs=1)

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 1014.5min finished


In [5]:
# Save the random search results under results/.
results_dir = os.path.join(os.getcwd(), 'results')
# Create the output folder first: a missing directory would otherwise throw
# away the outcome of a multi-hour search.
os.makedirs(results_dir, exist_ok=True)
with open(os.path.join(results_dir, 'results_block2_pool'), 'wb') as f:
    pickle.dump(results, f)

In [6]:
# One row per sampled configuration: its hyperparameters next to the
# mean cross-validated accuracy.
pd.concat(
    [
        pd.DataFrame(results.cv_results_["params"]),
        pd.DataFrame(results.cv_results_["mean_test_score"], columns=["Accuracy"]),
    ],
    axis=1,
)

Unnamed: 0,verbosity,subsample,reg_lambda,random_state,objective,num_classes,n_estimators,min_child_weight,max_depth,learning_rate,gamma,eval_metric,colsample_bytree,colsample_bylevel,Accuracy
0,0,0.9,100.0,42,multi:softmax,4,100,5.0,25,0.01,1.0,mlogloss,0.4,1.0,0.90015
1,0,0.5,0.1,42,multi:softmax,4,1000,10.0,100,0.3,1.0,mlogloss,0.4,1.0,0.925448
2,0,0.9,10.0,42,multi:softmax,4,1000,1.0,25,0.3,0.25,mlogloss,0.7,1.0,0.928088
3,0,1.0,10.0,42,multi:softmax,4,500,1.0,25,0.2,0.5,mlogloss,0.8,0.8,0.91879
4,0,0.5,0.1,42,multi:softmax,4,100,1.0,75,0.1,0.0,mlogloss,0.6,0.4,0.938764
5,0,0.5,5.0,42,multi:softmax,4,500,10.0,50,0.1,0.0,mlogloss,0.4,0.6,0.942755
6,0,1.0,0.1,42,multi:softmax,4,1000,1.0,25,0.1,0.0,mlogloss,0.5,1.0,0.921466
7,0,0.9,1.0,42,multi:softmax,4,100,10.0,75,0.001,0.25,mlogloss,0.6,0.8,0.904168
8,0,0.5,10.0,42,multi:softmax,4,1000,3.0,100,0.1,0.5,mlogloss,0.7,0.9,0.934773
9,0,0.9,10.0,42,multi:softmax,4,100,1.0,50,0.2,1.0,mlogloss,0.4,0.7,0.922781
