# Gradient Boosting basat en histogrames - Cerca d'hiperparàmetres

In [1]:
import pandas as pd
import numpy as np
import h5py
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import random

import pickle
import time



## Càrrega de dades

In [2]:
# Load the pre-computed arrays from the Kaggle input dataset.
# Presumably: X_data is the (already min-max scaled) feature matrix, Y_data the
# matrix of regression targets (one column per target), and param_names the
# names of those target columns -- inferred from the file names, TODO confirm.
X_data = np.load('/kaggle/input/tfm2a-preparar-dades/X_minMaxScaled_opt.npy')
Y_data = np.load('/kaggle/input/tfm2a-preparar-dades/Y_FM.npy')
param_names = np.load('/kaggle/input/tfm2a-preparar-dades/Y_FM_names.npy')


In [3]:
# Split into train and test sets (25% held out for test); the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train_all, y_test_all = train_test_split(X_data, Y_data, test_size = 0.25, random_state = 42)

# Check the dimensions of the train and test sets
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train_all.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test_all.shape)

Training Features Shape: (68544, 52)
Training Labels Shape: (68544, 6)
Testing Features Shape: (22848, 52)
Testing Labels Shape: (22848, 6)


# Cerca en malla dels hiperparàmetres

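Utilitzem la validació creuada (4 particions) per seleccionar els hiperparàmetres òptims. La cerca es fa de manera seqüencial: a cada etapa s'optimitza un sol hiperparàmetre (`max_iter`, `max_bins`, `max_depth`, `min_samples_leaf` i `learning_rate`, en aquest ordre) i es mantenen fixos els valors escollits a les etapes anteriors.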

In [4]:
def histGB_GridSearch(nParam):
    """Stage 1 of the sequential search: tune ``max_iter`` for one target column.

    Fits a ``GridSearchCV`` (``cv=4``) around a ``HistGradientBoostingRegressor``
    on the module-level ``X_train`` and column ``nParam`` of ``y_train_all``.
    Only ``max_iter`` takes several values; every other hyperparameter in the
    grid is pinned to a single value.

    Side effects
    ------------
    * Prints a start timestamp, the best score / parameters / estimator and
      the elapsed wall-clock time.
    * Pickles the results table to the relative path ``"grid_results_<nParam>"``.
      The file is opened in ``"wb"`` mode, so an existing file with that name
      is overwritten.

    Parameters
    ----------
    nParam : int
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per grid candidate (searched parameters, mean/std test score,
        rank, plus an ``nParam`` column), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target vector: a single column of the training targets.
    y_train = y_train_all[:, nParam]

    # Fresh, seeded estimator for this search.
    model = HistGradientBoostingRegressor(random_state = 24, verbose = 0)

    # Candidate values for the hyperparameters to optimise.
    parameters = {
        'max_iter':[100, 500], # equivalent to n_estimators
        'max_bins': [255],
        'max_depth': [3],
        'min_samples_leaf': [20],
        'learning_rate': [0.1]
    }

    # Cross-validated grid search over the candidates above.
    grid = GridSearchCV(model, param_grid=parameters, cv=4)
    grid.fit(X_train, y_train)

    # Report the best parameter combination.
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest and tag every row with the target index.
    # ``assign`` returns a new frame instead of mutating a column-subset selection.
    result_columns = [
        'param_max_iter',
        'param_max_bins',
        'param_max_depth',
        'param_min_samples_leaf',
        'param_learning_rate',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']
    grid_results = pd.DataFrame(grid.cv_results_)[result_columns].assign(nParam=nParam)

    # The context manager guarantees the handle is flushed and closed,
    # even if pickling raises.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [5]:
histGB_GridSearch(0)

2023-06-18 10:50:17.468039  - Calculant grid search CV del paràmetre  0
Millor score:  0.9629065199130232
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 22.15360975265503 segons, o bé  0.36922682921091715  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,3,20,0.1,0.962907,0.003347,1,0
0,100,255,3,20,0.1,0.922422,0.004261,2,0


In [6]:
histGB_GridSearch(1)

2023-06-18 10:50:39.688257  - Calculant grid search CV del paràmetre  1
Millor score:  0.903583628678957
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 20.851549863815308 segons, o bé  0.3475258310635885  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,3,20,0.1,0.903584,0.000653,1,1
0,100,255,3,20,0.1,0.844922,0.001202,2,1


In [7]:
histGB_GridSearch(2)

2023-06-18 10:51:00.591162  - Calculant grid search CV del paràmetre  2
Millor score:  0.9783430659874578
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 21.76564121246338 segons, o bé  0.36276068687438967  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,3,20,0.1,0.978343,0.000301,1,2
0,100,255,3,20,0.1,0.953289,0.000452,2,2


In [8]:
histGB_GridSearch(3)

2023-06-18 10:51:22.407329  - Calculant grid search CV del paràmetre  3
Millor score:  0.9623964189356435
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 21.74407148361206 segons, o bé  0.3624011913935343  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,3,20,0.1,0.962396,0.000803,1,3
0,100,255,3,20,0.1,0.939334,0.001547,2,3


In [9]:
histGB_GridSearch(4)

2023-06-18 10:51:44.202117  - Calculant grid search CV del paràmetre  4
Millor score:  0.8271297839590444
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 22.63961124420166 segons, o bé  0.3773268540700277  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,3,20,0.1,0.82713,0.001717,1,4
0,100,255,3,20,0.1,0.639228,0.003315,2,4


In [10]:
histGB_GridSearch(5)

2023-06-18 10:52:06.896114  - Calculant grid search CV del paràmetre  5
Millor score:  0.8565864149124027
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 20.62448501586914 segons, o bé  0.34374141693115234  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,3,20,0.1,0.856586,0.000528,1,5
0,100,255,3,20,0.1,0.78481,0.001134,2,5


In [11]:
# Best max_iter for each atmospheric parameter
def get_best_max_iter(nParam):
    """Return the ``max_iter`` value fixed for target index ``nParam``.

    Indices 0-5 map to a hard-coded value; any other input yields ``None``.

    NOTE(review): the stage-1 grid in this notebook only tries 100 and 500,
    so the 1500 / 2500 entries were presumably chosen from a run with a
    wider grid -- confirm.
    """
    # The position in the tuple is the target index.
    chosen_values = (500, 1500, 500, 500, 2500, 1500)
    for target_idx, value in enumerate(chosen_values):
        if nParam == target_idx:
            return value
    return None

In [12]:
def histGB_GridSearch2(nParam):
    """Stage 2 of the sequential search: tune ``max_bins`` for one target column.

    Fits a ``GridSearchCV`` (``cv=4``) around a ``HistGradientBoostingRegressor``
    on the module-level ``X_train`` and column ``nParam`` of ``y_train_all``.
    ``max_iter`` is pinned to ``get_best_max_iter(nParam)``; only ``max_bins``
    takes several values.

    Side effects
    ------------
    * Prints a start timestamp, the best score / parameters / estimator and
      the elapsed wall-clock time.
    * Pickles the results table to the relative path ``"grid_results_<nParam>"``.
      The file is opened in ``"wb"`` mode, so an existing file with that name
      is overwritten.

    Parameters
    ----------
    nParam : int
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per grid candidate (searched parameters, mean/std test score,
        rank, plus an ``nParam`` column), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target vector: a single column of the training targets.
    y_train = y_train_all[:, nParam]

    # Fresh, seeded estimator for this search.
    model = HistGradientBoostingRegressor(random_state = 24, verbose = 0)

    # Candidate values for the hyperparameters to optimise.
    parameters = {
        'max_iter':[get_best_max_iter(nParam)],
        'max_bins': [100, 150, 200, 255],
        'max_depth': [3],
        'min_samples_leaf': [20],
        'learning_rate': [0.1]
    }

    # Cross-validated grid search over the candidates above.
    grid = GridSearchCV(model, param_grid=parameters, cv=4)
    grid.fit(X_train, y_train)

    # Report the best parameter combination.
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest and tag every row with the target index.
    # ``assign`` returns a new frame instead of mutating a column-subset selection.
    result_columns = [
        'param_max_iter',
        'param_max_bins',
        'param_max_depth',
        'param_min_samples_leaf',
        'param_learning_rate',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']
    grid_results = pd.DataFrame(grid.cv_results_)[result_columns].assign(nParam=nParam)

    # The context manager guarantees the handle is flushed and closed,
    # even if pickling raises.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [13]:
histGB_GridSearch2(0)

2023-06-18 10:52:27.666717  - Calculant grid search CV del paràmetre  0
Millor score:  0.963066761382986
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 150, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=150, max_depth=3, max_iter=500,
                              random_state=24)
El temps total de l'execució és: 55.92433309555054 segons, o bé  0.9320722182591756  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,150,3,20,0.1,0.963067,0.003205,1,0
3,500,255,3,20,0.1,0.962907,0.003347,2,0
2,500,200,3,20,0.1,0.962731,0.002815,3,0
0,500,100,3,20,0.1,0.962635,0.002603,4,0


In [14]:
histGB_GridSearch2(1)

2023-06-18 10:53:23.661348  - Calculant grid search CV del paràmetre  1
Millor score:  0.9298757050006776
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 1500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=1500, random_state=24)
El temps total de l'execució és: 148.6332929134369 segons, o bé  2.4772215485572815  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
3,1500,255,3,20,0.1,0.929876,0.000356,1,1
1,1500,150,3,20,0.1,0.929685,0.000889,2,1
2,1500,200,3,20,0.1,0.929579,0.000802,3,1
0,1500,100,3,20,0.1,0.929283,0.000516,4,1


In [15]:
histGB_GridSearch2(2)

2023-06-18 10:55:52.372115  - Calculant grid search CV del paràmetre  2
Millor score:  0.9783430659874578
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 56.28755164146423 segons, o bé  0.9381258606910705  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
3,500,255,3,20,0.1,0.978343,0.000301,1,2
1,500,150,3,20,0.1,0.978222,0.00038,2,2
2,500,200,3,20,0.1,0.978221,0.0003,3,2
0,500,100,3,20,0.1,0.97783,0.00018,4,2


In [16]:
histGB_GridSearch2(3)

2023-06-18 10:56:48.735406  - Calculant grid search CV del paràmetre  3
Millor score:  0.9623964189356435
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=500, random_state=24)
El temps total de l'execució és: 56.603164196014404 segons, o bé  0.9433860699335734  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
3,500,255,3,20,0.1,0.962396,0.000803,1,3
1,500,150,3,20,0.1,0.962354,0.000877,2,3
2,500,200,3,20,0.1,0.962345,0.001011,3,3
0,500,100,3,20,0.1,0.962159,0.001218,4,3


In [17]:
histGB_GridSearch2(4)

2023-06-18 10:57:45.413867  - Calculant grid search CV del paràmetre  4
Millor score:  0.9193361809374604
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 2500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=3, max_iter=2500, random_state=24)
El temps total de l'execució és: 247.20948123931885 segons, o bé  4.120158020655314  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
3,2500,255,3,20,0.1,0.919336,0.000889,1,4
2,2500,200,3,20,0.1,0.919015,0.000869,2,4
1,2500,150,3,20,0.1,0.918646,0.001322,3,4
0,2500,100,3,20,0.1,0.91748,0.000889,4,4


In [18]:
histGB_GridSearch2(5)

2023-06-18 11:01:52.702645  - Calculant grid search CV del paràmetre  5
Millor score:  0.8848407482247926
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 200, 'max_depth': 3, 'max_iter': 1500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=200, max_depth=3, max_iter=1500,
                              random_state=24)
El temps total de l'execució és: 141.89011430740356 segons, o bé  2.364835238456726  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,1500,200,3,20,0.1,0.884841,0.000435,1,5
1,1500,150,3,20,0.1,0.884328,0.00051,2,5
3,1500,255,3,20,0.1,0.882438,0.003668,3,5
0,1500,100,3,20,0.1,0.882334,0.00186,4,5


In [19]:

def get_best_max_bins(nParam):
    """Return the ``max_bins`` value fixed for target index ``nParam``.

    Indices 0-5 map to a hard-coded value; any other input yields ``None``.
    """
    # The position in the tuple is the target index.
    chosen_values = (150, 255, 255, 255, 255, 200)
    for target_idx, value in enumerate(chosen_values):
        if nParam == target_idx:
            return value
    return None

In [20]:
def histGB_GridSearch3(nParam):
    """Stage 3 of the sequential search: tune ``max_depth`` for one target column.

    Fits a ``GridSearchCV`` (``cv=4``) around a ``HistGradientBoostingRegressor``
    on the module-level ``X_train`` and column ``nParam`` of ``y_train_all``.
    ``max_iter`` and ``max_bins`` are pinned to ``get_best_max_iter(nParam)``
    and ``get_best_max_bins(nParam)``; only ``max_depth`` takes several values.

    Side effects
    ------------
    * Prints a start timestamp, the best score / parameters / estimator and
      the elapsed wall-clock time.
    * Pickles the results table to the relative path ``"grid_results_<nParam>"``.
      The file is opened in ``"wb"`` mode, so an existing file with that name
      is overwritten.

    Parameters
    ----------
    nParam : int
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per grid candidate (searched parameters, mean/std test score,
        rank, plus an ``nParam`` column), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target vector: a single column of the training targets.
    y_train = y_train_all[:, nParam]

    # Fresh, seeded estimator for this search.
    model = HistGradientBoostingRegressor(random_state = 24, verbose = 0)

    # Candidate values for the hyperparameters to optimise.
    parameters = {
        'max_iter': [get_best_max_iter(nParam)],
        'max_bins': [get_best_max_bins(nParam)],
        'max_depth': [3, 5, 7, 10, 20],
        'min_samples_leaf': [20],
        'learning_rate': [0.1]
    }

    # Cross-validated grid search over the candidates above.
    grid = GridSearchCV(model, param_grid=parameters, cv=4)
    grid.fit(X_train, y_train)

    # Report the best parameter combination.
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest and tag every row with the target index.
    # ``assign`` returns a new frame instead of mutating a column-subset selection.
    result_columns = [
        'param_max_iter',
        'param_max_bins',
        'param_max_depth',
        'param_min_samples_leaf',
        'param_learning_rate',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']
    grid_results = pd.DataFrame(grid.cv_results_)[result_columns].assign(nParam=nParam)

    # The context manager guarantees the handle is flushed and closed,
    # even if pickling raises.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [21]:
histGB_GridSearch3(0)

2023-06-18 11:04:14.783813  - Calculant grid search CV del paràmetre  0
Millor score:  0.9757157009584457
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 150, 'max_depth': 10, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=150, max_depth=10, max_iter=500,
                              random_state=24)
El temps total de l'execució és: 112.65667009353638 segons, o bé  1.8776111682256063  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
3,500,150,10,20,0.1,0.975716,0.001,1,0
2,500,150,7,20,0.1,0.975548,0.000945,2,0
4,500,150,20,20,0.1,0.975452,0.001383,3,0
1,500,150,5,20,0.1,0.973955,0.001324,4,0
0,500,150,3,20,0.1,0.963067,0.003205,5,0


In [22]:
histGB_GridSearch3(1)

2023-06-18 11:06:07.522160  - Calculant grid search CV del paràmetre  1
Millor score:  0.9442445736525626
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 7, 'max_iter': 1500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=7, max_iter=1500, random_state=24)
El temps total de l'execució és: 246.2798900604248 segons, o bé  4.104664834340413  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,1500,255,7,20,0.1,0.944245,0.000881,1,1
3,1500,255,10,20,0.1,0.943839,0.000887,2,1
4,1500,255,20,20,0.1,0.943323,0.001454,3,1
1,1500,255,5,20,0.1,0.942024,0.001697,4,1
0,1500,255,3,20,0.1,0.929876,0.000356,5,1


In [23]:
histGB_GridSearch3(2)

2023-06-18 11:10:13.885243  - Calculant grid search CV del paràmetre  2
Millor score:  0.9886768243160515
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 20, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=20, max_iter=500, random_state=24)
El temps total de l'execució és: 127.88309645652771 segons, o bé  2.1313849409421284  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
4,500,255,20,20,0.1,0.988677,0.000361,1,2
3,500,255,10,20,0.1,0.988605,0.000334,2,2
2,500,255,7,20,0.1,0.988338,0.000336,3,2
1,500,255,5,20,0.1,0.987207,0.000317,4,2
0,500,255,3,20,0.1,0.978343,0.000301,5,2


In [24]:
histGB_GridSearch3(3)

2023-06-18 11:12:21.851743  - Calculant grid search CV del paràmetre  3
Millor score:  0.9752349279027073
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 20, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=20, max_iter=500, random_state=24)
El temps total de l'execució és: 124.93114399909973 segons, o bé  2.0821857333183287  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
4,500,255,20,20,0.1,0.975235,0.000565,1,3
3,500,255,10,20,0.1,0.975147,0.000708,2,3
2,500,255,7,20,0.1,0.974524,0.000922,3,3
1,500,255,5,20,0.1,0.972922,0.000749,4,3
0,500,255,3,20,0.1,0.962396,0.000803,5,3


In [25]:
histGB_GridSearch3(4)

2023-06-18 11:14:26.869008  - Calculant grid search CV del paràmetre  4
Millor score:  0.9434692986356349
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 7, 'max_iter': 2500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=7, max_iter=2500, random_state=24)
El temps total de l'execució és: 500.45862078666687 segons, o bé  8.340977013111115  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,2500,255,7,20,0.1,0.943469,0.000792,1,4
1,2500,255,5,20,0.1,0.943278,0.000952,2,4
3,2500,255,10,20,0.1,0.942894,0.000749,3,4
4,2500,255,20,20,0.1,0.942427,0.001916,4,4
0,2500,255,3,20,0.1,0.919336,0.000889,5,4


In [26]:
histGB_GridSearch3(5)

2023-06-18 11:22:47.417499  - Calculant grid search CV del paràmetre  5
Millor score:  0.8927851095702826
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 200, 'max_depth': 7, 'max_iter': 1500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=200, max_depth=7, max_iter=1500,
                              random_state=24)
El temps total de l'execució és: 169.12774229049683 segons, o bé  2.8187957048416137  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,1500,200,7,20,0.1,0.892785,0.002106,1,5
4,1500,200,20,20,0.1,0.891728,0.004178,2,5
1,1500,200,5,20,0.1,0.890511,0.00166,3,5
3,1500,200,10,20,0.1,0.889959,0.002258,4,5
0,1500,200,3,20,0.1,0.884841,0.000435,5,5


In [27]:
def get_best_max_depth(nParam):
    """Return the ``max_depth`` value fixed for target index ``nParam``.

    Indices 0-5 map to a hard-coded value; any other input yields ``None``.
    """
    # The position in the tuple is the target index.
    chosen_values = (10, 7, 20, 20, 7, 7)
    for target_idx, value in enumerate(chosen_values):
        if nParam == target_idx:
            return value
    return None

In [28]:
def histGB_GridSearch4(nParam):
    """Stage 4 of the sequential search: tune ``min_samples_leaf`` for one target column.

    Fits a ``GridSearchCV`` (``cv=4``) around a ``HistGradientBoostingRegressor``
    on the module-level ``X_train`` and column ``nParam`` of ``y_train_all``.
    ``max_iter``, ``max_bins`` and ``max_depth`` are pinned to the values
    returned by ``get_best_max_iter``, ``get_best_max_bins`` and
    ``get_best_max_depth``; only ``min_samples_leaf`` takes several values.

    Side effects
    ------------
    * Prints a start timestamp, the best score / parameters / estimator and
      the elapsed wall-clock time.
    * Pickles the results table to the relative path ``"grid_results_<nParam>"``.
      The file is opened in ``"wb"`` mode, so an existing file with that name
      is overwritten.

    Parameters
    ----------
    nParam : int
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per grid candidate (searched parameters, mean/std test score,
        rank, plus an ``nParam`` column), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target vector: a single column of the training targets.
    y_train = y_train_all[:, nParam]

    # Fresh, seeded estimator for this search.
    model = HistGradientBoostingRegressor(random_state = 24, verbose = 0)

    # Candidate values for the hyperparameters to optimise.
    parameters = {
        'max_iter': [get_best_max_iter(nParam)],
        'max_bins': [get_best_max_bins(nParam)],
        'max_depth': [get_best_max_depth(nParam)],
        'min_samples_leaf': [10, 20, 50],
        'learning_rate': [0.1]
    }

    # Cross-validated grid search over the candidates above.
    grid = GridSearchCV(model, param_grid=parameters, cv=4)
    grid.fit(X_train, y_train)

    # Report the best parameter combination.
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest and tag every row with the target index.
    # ``assign`` returns a new frame instead of mutating a column-subset selection.
    result_columns = [
        'param_max_iter',
        'param_max_bins',
        'param_max_depth',
        'param_min_samples_leaf',
        'param_learning_rate',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']
    grid_results = pd.DataFrame(grid.cv_results_)[result_columns].assign(nParam=nParam)

    # The context manager guarantees the handle is flushed and closed,
    # even if pickling raises.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [29]:
histGB_GridSearch4(0)

2023-06-18 11:25:36.749568  - Calculant grid search CV del paràmetre  0
Millor score:  0.9757157009584457
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 150, 'max_depth': 10, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=150, max_depth=10, max_iter=500,
                              random_state=24)
El temps total de l'execució és: 73.89102673530579 segons, o bé  1.2315171122550965  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,150,10,20,0.1,0.975716,0.001,1,0
0,500,150,10,10,0.1,0.975006,0.00138,2,0
2,500,150,10,50,0.1,0.974687,0.001816,3,0


In [30]:
histGB_GridSearch4(1)

2023-06-18 11:26:50.728986  - Calculant grid search CV del paràmetre  1
Millor score:  0.9443687106311321
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 7, 'max_iter': 1500, 'min_samples_leaf': 50}
Millor model:  HistGradientBoostingRegressor(max_depth=7, max_iter=1500, min_samples_leaf=50,
                              random_state=24)
El temps total de l'execució és: 176.56256771087646 segons, o bé  2.942709461847941  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,1500,255,7,50,0.1,0.944369,0.000807,1,1
1,1500,255,7,20,0.1,0.944245,0.000881,2,1
0,1500,255,7,10,0.1,0.943434,0.001082,3,1


In [31]:
histGB_GridSearch4(2)

2023-06-18 11:29:47.382218  - Calculant grid search CV del paràmetre  2
Millor score:  0.9886967092196808
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 20, 'max_iter': 500, 'min_samples_leaf': 50}
Millor model:  HistGradientBoostingRegressor(max_depth=20, max_iter=500, min_samples_leaf=50,
                              random_state=24)
El temps total de l'execució és: 94.62968587875366 segons, o bé  1.577161431312561  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,500,255,20,50,0.1,0.988697,0.000278,1,2
1,500,255,20,20,0.1,0.988677,0.000361,2,2
0,500,255,20,10,0.1,0.988508,0.000301,3,2


In [32]:
histGB_GridSearch4(3)

2023-06-18 11:31:22.103684  - Calculant grid search CV del paràmetre  3
Millor score:  0.9755698402212994
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 20, 'max_iter': 500, 'min_samples_leaf': 50}
Millor model:  HistGradientBoostingRegressor(max_depth=20, max_iter=500, min_samples_leaf=50,
                              random_state=24)
El temps total de l'execució és: 87.37906908988953 segons, o bé  1.4563178181648255  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,500,255,20,50,0.1,0.97557,0.000483,1,3
1,500,255,20,20,0.1,0.975235,0.000565,2,3
0,500,255,20,10,0.1,0.974724,0.000493,3,3


In [33]:
histGB_GridSearch4(4)

2023-06-18 11:32:49.576433  - Calculant grid search CV del paràmetre  4
Millor score:  0.9452102903531757
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 7, 'max_iter': 2500, 'min_samples_leaf': 50}
Millor model:  HistGradientBoostingRegressor(max_depth=7, max_iter=2500, min_samples_leaf=50,
                              random_state=24)
El temps total de l'execució és: 352.22010231018066 segons, o bé  5.870335038503011  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
2,2500,255,7,50,0.1,0.94521,0.001164,1,4
1,2500,255,7,20,0.1,0.943469,0.000792,2,4
0,2500,255,7,10,0.1,0.942632,0.000764,3,4


In [34]:
histGB_GridSearch4(5)

2023-06-18 11:38:41.893301  - Calculant grid search CV del paràmetre  5
Millor score:  0.8927851095702826
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 200, 'max_depth': 7, 'max_iter': 1500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=200, max_depth=7, max_iter=1500,
                              random_state=24)
El temps total de l'execució és: 109.55295372009277 segons, o bé  1.8258825620015462  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,1500,200,7,20,0.1,0.892785,0.002106,1,5
2,1500,200,7,50,0.1,0.892063,0.000615,2,5
0,1500,200,7,10,0.1,0.891778,0.003086,3,5


In [35]:
def get_best_min_samples_leaf(nParam):
    """Return the ``min_samples_leaf`` value fixed for target index ``nParam``.

    Indices 0-5 map to a hard-coded value; any other input yields ``None``.

    NOTE(review): the recorded stage-4 output for target 2 ranks
    ``min_samples_leaf=50`` first (mean score 0.988697 vs 0.988677 for 20),
    yet 20 is returned here. The gap is tiny, so this may be deliberate --
    confirm.
    """
    # The position in the tuple is the target index.
    chosen_values = (20, 50, 20, 50, 50, 20)
    for target_idx, value in enumerate(chosen_values):
        if nParam == target_idx:
            return value
    return None

In [36]:
def histGB_GridSearch5(nParam):
    """Stage 5 of the sequential search: tune ``learning_rate`` for one target column.

    Fits a ``GridSearchCV`` (``cv=4``) around a ``HistGradientBoostingRegressor``
    on the module-level ``X_train`` and column ``nParam`` of ``y_train_all``.
    ``max_iter``, ``max_bins``, ``max_depth`` and ``min_samples_leaf`` are
    pinned to the values returned by the ``get_best_*`` helpers; only
    ``learning_rate`` takes several values.

    Side effects
    ------------
    * Prints a start timestamp, the best score / parameters / estimator and
      the elapsed wall-clock time.
    * Pickles the results table to the relative path ``"grid_results_<nParam>"``.
      The file is opened in ``"wb"`` mode, so an existing file with that name
      is overwritten.

    Parameters
    ----------
    nParam : int
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per grid candidate (searched parameters, mean/std test score,
        rank, plus an ``nParam`` column), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target vector: a single column of the training targets.
    y_train = y_train_all[:, nParam]

    # Fresh, seeded estimator for this search.
    model = HistGradientBoostingRegressor(random_state = 24, verbose = 0)

    # Candidate values for the hyperparameters to optimise.
    parameters = {
        'max_iter': [get_best_max_iter(nParam)],
        'max_bins': [get_best_max_bins(nParam)],
        'max_depth': [get_best_max_depth(nParam)],
        'min_samples_leaf': [get_best_min_samples_leaf(nParam)],
        'learning_rate': [0.5, 0.1, 0.01]
    }

    # Cross-validated grid search over the candidates above.
    grid = GridSearchCV(model, param_grid=parameters, cv=4)
    grid.fit(X_train, y_train)

    # Report the best parameter combination.
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest and tag every row with the target index.
    # ``assign`` returns a new frame instead of mutating a column-subset selection.
    result_columns = [
        'param_max_iter',
        'param_max_bins',
        'param_max_depth',
        'param_min_samples_leaf',
        'param_learning_rate',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']
    grid_results = pd.DataFrame(grid.cv_results_)[result_columns].assign(nParam=nParam)

    # The context manager guarantees the handle is flushed and closed,
    # even if pickling raises.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [37]:
histGB_GridSearch5(0)

2023-06-18 11:40:31.669714  - Calculant grid search CV del paràmetre  0
Millor score:  0.9757157009584457
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 150, 'max_depth': 10, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=150, max_depth=10, max_iter=500,
                              random_state=24)
El temps total de l'execució és: 71.61140823364258 segons, o bé  1.1935234705607096  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,150,10,20,0.1,0.975716,0.001,1,0
0,500,150,10,20,0.5,0.963229,0.001659,2,0
2,500,150,10,20,0.01,0.959012,0.002645,3,0


In [38]:
histGB_GridSearch5(1)

2023-06-18 11:41:43.447777  - Calculant grid search CV del paràmetre  1
Millor score:  0.9443687106311321
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 7, 'max_iter': 1500, 'min_samples_leaf': 50}
Millor model:  HistGradientBoostingRegressor(max_depth=7, max_iter=1500, min_samples_leaf=50,
                              random_state=24)
El temps total de l'execució és: 156.8754599094391 segons, o bé  2.6145909984906512  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,1500,255,7,50,0.1,0.944369,0.000807,1,1
0,1500,255,7,50,0.5,0.926801,0.000246,2,1
2,1500,255,7,50,0.01,0.915968,0.000179,3,1


In [39]:
histGB_GridSearch5(2)

2023-06-18 11:44:20.422105  - Calculant grid search CV del paràmetre  2
Millor score:  0.9886768243160515
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 20, 'max_iter': 500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_depth=20, max_iter=500, random_state=24)
El temps total de l'execució és: 86.16947960853577 segons, o bé  1.4361579934755961  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,20,20,0.1,0.988677,0.000361,1,2
0,500,255,20,20,0.5,0.982357,0.000902,2,2
2,500,255,20,20,0.01,0.971024,0.000571,3,2


In [40]:
histGB_GridSearch5(3)

2023-06-18 11:45:46.690642  - Calculant grid search CV del paràmetre  3
Millor score:  0.9755698402212994
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 20, 'max_iter': 500, 'min_samples_leaf': 50}
Millor model:  HistGradientBoostingRegressor(max_depth=20, max_iter=500, min_samples_leaf=50,
                              random_state=24)
El temps total de l'execució és: 79.19067430496216 segons, o bé  1.3198445717493692  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,500,255,20,50,0.1,0.97557,0.000483,1,3
0,500,255,20,50,0.5,0.969453,0.000666,2,3
2,500,255,20,50,0.01,0.958704,0.001128,3,3


In [41]:
histGB_GridSearch5(4)

2023-06-18 11:47:05.981550  - Calculant grid search CV del paràmetre  4
Millor score:  0.9452102903531757
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 7, 'max_iter': 2500, 'min_samples_leaf': 50}
Millor model:  HistGradientBoostingRegressor(max_depth=7, max_iter=2500, min_samples_leaf=50,
                              random_state=24)
El temps total de l'execució és: 297.5770490169525 segons, o bé  4.959617483615875  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,2500,255,7,50,0.1,0.94521,0.001164,1,4
0,2500,255,7,50,0.5,0.905021,0.000291,2,4
2,2500,255,7,50,0.01,0.889327,0.001048,3,4


In [42]:
histGB_GridSearch5(5)

2023-06-18 11:52:03.662276  - Calculant grid search CV del paràmetre  5
Millor score:  0.8927851095702826
Millors paràmetres:  {'learning_rate': 0.1, 'max_bins': 200, 'max_depth': 7, 'max_iter': 1500, 'min_samples_leaf': 20}
Millor model:  HistGradientBoostingRegressor(max_bins=200, max_depth=7, max_iter=1500,
                              random_state=24)
El temps total de l'execució és: 129.6427001953125 segons, o bé  2.160711669921875  minuts.


Unnamed: 0,param_max_iter,param_max_bins,param_max_depth,param_min_samples_leaf,param_learning_rate,mean_test_score,std_test_score,rank_test_score,nParam
1,1500,200,7,20,0.1,0.892785,0.002106,1,5
0,1500,200,7,20,0.5,0.868967,0.001771,2,5
2,1500,200,7,20,0.01,0.868047,0.000851,3,5
