# Random Forest - Cerca d'hiperparàmetres

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, median_absolute_error
import time
import datetime
import random
import pickle

In [2]:
# Perquè funcioni, la GPU ha d'estar engegada
#!pip install cuml

from cuml.ensemble import RandomForestRegressor as cuRF

## Càrrega de dades

In [3]:
# Load the pre-processed inputs: scaled feature matrix, target matrix,
# target names and the PCA labels produced by earlier notebooks.
X_data = np.load('/kaggle/input/tfm2a-preparar-dades/X_minMaxScaled_opt.npy')
Y_data = np.load('/kaggle/input/tfm2a-preparar-dades/Y_FM.npy')
param_names = np.load('/kaggle/input/tfm2a-preparar-dades/Y_FM_names.npy')
pca_labels = np.load('/kaggle/input/tfm1b-espectres/PCA_labels.npy')

# Train/test split (75% / 25%).
# random_state = 42 so that every analysis uses exactly the same partition.
(
    X_train,
    X_test,
    y_train_all,
    y_test_all,
    pca_labels_train,
    pca_labels_test,
) = train_test_split(
    X_data,
    Y_data,
    pca_labels,
    test_size=0.25,
    random_state=42,
)

# cuML only accepts float32/float64 input
# ("Expected input to be of type in [dtype('float32'), dtype('float64')]"),
# so cast every feature/target array to float32 in one go.
X_train, X_test, y_train_all, y_test_all = (
    arr.astype(np.float32)
    for arr in (X_train, X_test, y_train_all, y_test_all)
)


# Sanity check: dimensions of the train and test partitions
print('Training Features shape:', X_train.shape)
print('Training Labels shape:', y_train_all.shape)
print('Testing Features shape:', X_test.shape)
print('Testing Labels shape:', y_test_all.shape)

Training Features shape: (68544, 52)
Training Labels shape: (68544, 6)
Testing Features shape: (22848, 52)
Testing Labels shape: (22848, 6)


## Cerca en malla dels hiperparàmetres

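Utilitzem la validació creuada (4 plecs) per seleccionar els hiperparàmetres òptims. Cada funció `GridSearchN` explora un sol hiperparàmetre, mantenint fixos els altres, i s'executa per a cadascuna de les 6 variables objectiu.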

In [4]:
def GridSearch1(nParam):
    """Grid-search ``n_estimators`` of a cuML random forest for one target column.

    Fits a 4-fold cross-validated ``GridSearchCV`` on the module-level
    ``X_train`` against column ``nParam`` of ``y_train_all``. Only
    ``n_estimators`` takes more than one value; every other hyperparameter in
    the grid is pinned. The best score, best parameters, best estimator and
    the elapsed wall-clock time are printed, and the result table is pickled
    to ``grid_results_<nParam>`` in the current working directory.

    Parameters
    ----------
    nParam
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (searched parameters, mean/std test
        score, rank and ``nParam``), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target column for this run (the held-out test targets are not used here)
    y_train = y_train_all[:, nParam]

    # Fresh model instance
    model = cuRF(random_state=24, verbose=0, n_streams=1)

    # Value ranges of the hyperparameters to optimise
    parameters = {
        'n_estimators': [500, 1000],  # values between 50 and 2000 were tried
        'max_depth': [3],
        'min_samples_split': [5],
        'min_samples_leaf': [10],
        'max_features': ['sqrt'],
        'n_bins': [255]
    }

    # GridSearchCV instance with 4 cross-validation folds
    grid = GridSearchCV(model, param_grid=parameters, cv=4)

    # Run the search
    grid.fit(X_train, y_train)

    # Best parameter combination found
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest from the CV results
    grid_results = pd.DataFrame(grid.cv_results_)[[
        'param_n_estimators',
        'param_max_depth',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'param_max_features',
        'param_n_bins',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']]
    grid_results['nParam'] = nParam

    # Persist the table; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    # NOTE(review): GridSearch1..GridSearch5 all write to this same file name,
    # so a later search for the same nParam overwrites this result.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [5]:
GridSearch1(0)

2023-06-18 10:22:38.534301  - Calculant grid search CV del paràmetre  0
Millor score:  0.5301084965467453
Millors paràmetres:  {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 1000}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 59.008352756500244 segons, o bé  0.9834725459416708  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
1,1000,3,5,10,sqrt,255,0.530108,0.002841,1,0
0,500,3,5,10,sqrt,255,0.526185,0.002998,2,0


In [6]:
GridSearch1(1)

2023-06-18 10:23:37.602182  - Calculant grid search CV del paràmetre  1
Millor score:  0.5858713239431381
Millors paràmetres:  {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 500}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 40.57721924781799 segons, o bé  0.6762869874636332  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,500,3,5,10,sqrt,255,0.585871,0.001923,1,1
1,1000,3,5,10,sqrt,255,0.583043,0.001933,2,1


In [7]:
GridSearch1(2)

2023-06-18 10:24:18.232976  - Calculant grid search CV del paràmetre  2
Millor score:  0.5913606435060501
Millors paràmetres:  {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 500}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 40.944403886795044 segons, o bé  0.682406731446584  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,500,3,5,10,sqrt,255,0.591361,0.003435,1,2
1,1000,3,5,10,sqrt,255,0.590321,0.003626,2,2


In [8]:
GridSearch1(3)

2023-06-18 10:24:59.234497  - Calculant grid search CV del paràmetre  3
Millor score:  0.8235933184623718
Millors paràmetres:  {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 1000}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 41.247554779052734 segons, o bé  0.6874592463175456  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
1,1000,3,5,10,sqrt,255,0.823593,0.002235,1,3
0,500,3,5,10,sqrt,255,0.823141,0.002319,2,3


In [9]:
GridSearch1(4)

2023-06-18 10:25:40.538658  - Calculant grid search CV del paràmetre  4
Millor score:  0.18305933475494385
Millors paràmetres:  {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 1000}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 51.88378977775574 segons, o bé  0.8647298296292623  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
1,1000,3,5,10,sqrt,255,0.183059,0.000798,1,4
0,500,3,5,10,sqrt,255,0.182615,0.000638,2,4


In [10]:
GridSearch1(5)

2023-06-18 10:26:32.479083  - Calculant grid search CV del paràmetre  5
Millor score:  0.5018392503261566
Millors paràmetres:  {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 1000}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 42.55720067024231 segons, o bé  0.7092866778373719  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
1,1000,3,5,10,sqrt,255,0.501839,0.002429,1,5
0,500,3,5,10,sqrt,255,0.500395,0.002766,2,5


In [11]:
def GridSearch2(nParam):
    """Grid-search ``max_depth`` of a cuML random forest for one target column.

    Fits a 4-fold cross-validated ``GridSearchCV`` on the module-level
    ``X_train`` against column ``nParam`` of ``y_train_all``. Only
    ``max_depth`` takes more than one value; every other hyperparameter in the
    grid is pinned. The best score, best parameters, best estimator and the
    elapsed wall-clock time are printed, and the result table is pickled to
    ``grid_results_<nParam>`` in the current working directory.

    Parameters
    ----------
    nParam
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (searched parameters, mean/std test
        score, rank and ``nParam``), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target column for this run (the held-out test targets are not used here)
    y_train = y_train_all[:, nParam]

    # Fresh model instance
    model = cuRF(random_state=24, verbose=0, n_streams=1)

    # Value ranges of the hyperparameters to optimise
    parameters = {
        'n_estimators': [100],
        'max_depth': [3, 5, 7, 10, 20],
        'min_samples_split': [5],
        'min_samples_leaf': [10],
        'max_features': ['sqrt'],
        'n_bins': [255]
    }

    # GridSearchCV instance with 4 cross-validation folds
    grid = GridSearchCV(model, param_grid=parameters, cv=4)

    # Run the search
    grid.fit(X_train, y_train)

    # Best parameter combination found
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest from the CV results
    grid_results = pd.DataFrame(grid.cv_results_)[[
        'param_n_estimators',
        'param_max_depth',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'param_max_features',
        'param_n_bins',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']]
    grid_results['nParam'] = nParam

    # Persist the table; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    # NOTE(review): GridSearch1..GridSearch5 all write to this same file name,
    # so searches for the same nParam overwrite each other's results.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [12]:
GridSearch2(0)

2023-06-18 10:27:15.142518  - Calculant grid search CV del paràmetre  0
Millor score:  0.9613674581050873
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 16.687191486358643 segons, o bé  0.2781198581059774  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
4,100,20,5,10,sqrt,255,0.961367,0.002634,1,0
3,100,10,5,10,sqrt,255,0.922021,0.003626,2,0
2,100,7,5,10,sqrt,255,0.842379,0.003392,3,0
1,100,5,5,10,sqrt,255,0.734075,0.001759,4,0
0,100,3,5,10,sqrt,255,0.522199,0.00227,5,0


In [13]:
GridSearch2(1)

2023-06-18 10:27:31.882527  - Calculant grid search CV del paràmetre  1
Millor score:  0.9082280695438385
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 16.11856198310852 segons, o bé  0.2686426997184753  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
4,100,20,5,10,sqrt,255,0.908228,0.000743,1,1
3,100,10,5,10,sqrt,255,0.85551,0.001054,2,1
2,100,7,5,10,sqrt,255,0.788676,0.001298,3,1
1,100,5,5,10,sqrt,255,0.714677,0.001442,4,1
0,100,3,5,10,sqrt,255,0.590864,0.001554,5,1


In [14]:
GridSearch2(2)

2023-06-18 10:27:48.050976  - Calculant grid search CV del paràmetre  2
Millor score:  0.9607708156108856
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 15.803075313568115 segons, o bé  0.2633845885594686  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
4,100,20,5,10,sqrt,255,0.960771,0.000758,1,2
3,100,10,5,10,sqrt,255,0.918653,0.000691,2,2
2,100,7,5,10,sqrt,255,0.843108,0.001619,3,2
1,100,5,5,10,sqrt,255,0.741581,0.002231,4,2
0,100,3,5,10,sqrt,255,0.579724,0.00496,5,2


In [15]:
GridSearch2(3)

2023-06-18 10:28:03.902356  - Calculant grid search CV del paràmetre  3
Millor score:  0.9676719158887863
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 15.726967573165894 segons, o bé  0.26211612621943153  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
4,100,20,5,10,sqrt,255,0.967672,0.000917,1,3
3,100,10,5,10,sqrt,255,0.950096,0.000985,2,3
2,100,7,5,10,sqrt,255,0.924411,0.001239,3,3
1,100,5,5,10,sqrt,255,0.891343,0.001739,4,3
0,100,3,5,10,sqrt,255,0.820436,0.0026,5,3


In [16]:
GridSearch2(4)

2023-06-18 10:28:19.679977  - Calculant grid search CV del paràmetre  4
Millor score:  0.7739312201738358
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 18.746976613998413 segons, o bé  0.31244961023330686  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
4,100,20,5,10,sqrt,255,0.773931,0.002256,1,4
3,100,10,5,10,sqrt,255,0.565187,0.003354,2,4
2,100,7,5,10,sqrt,255,0.415914,0.002625,3,4
1,100,5,5,10,sqrt,255,0.299798,0.002495,4,4
0,100,3,5,10,sqrt,255,0.178687,0.001236,5,4


In [17]:
GridSearch2(5)

2023-06-18 10:28:38.478387  - Calculant grid search CV del paràmetre  5
Millor score:  0.8576576411724091
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 16.93026614189148 segons, o bé  0.282171102364858  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
4,100,20,5,10,sqrt,255,0.857658,0.000333,1,5
3,100,10,5,10,sqrt,255,0.804885,0.001062,2,5
2,100,7,5,10,sqrt,255,0.728687,0.001087,3,5
1,100,5,5,10,sqrt,255,0.642388,0.001644,4,5
0,100,3,5,10,sqrt,255,0.502637,0.003287,5,5


In [18]:
def GridSearch3(nParam):
    """Grid-search ``min_samples_split`` of a cuML random forest for one target column.

    Fits a 4-fold cross-validated ``GridSearchCV`` on the module-level
    ``X_train`` against column ``nParam`` of ``y_train_all``. Only
    ``min_samples_split`` takes more than one value; every other
    hyperparameter in the grid is pinned. The best score, best parameters,
    best estimator and the elapsed wall-clock time are printed, and the result
    table is pickled to ``grid_results_<nParam>`` in the current working
    directory.

    Parameters
    ----------
    nParam
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (searched parameters, mean/std test
        score, rank and ``nParam``), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target column for this run (the held-out test targets are not used here)
    y_train = y_train_all[:, nParam]

    # Fresh model instance
    model = cuRF(random_state=24, verbose=0, n_streams=1)

    # Value ranges of the hyperparameters to optimise
    parameters = {
        'n_estimators': [100],
        'max_depth': [20],
        'min_samples_split': [30, 40],  # values tried: 2, 5, 10, 20, 30, 40, 50, 100
        'min_samples_leaf': [10],
        'max_features': ['sqrt'],
        'n_bins': [255]
    }

    # GridSearchCV instance with 4 cross-validation folds
    grid = GridSearchCV(model, param_grid=parameters, cv=4)

    # Run the search
    grid.fit(X_train, y_train)

    # Best parameter combination found
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest from the CV results
    grid_results = pd.DataFrame(grid.cv_results_)[[
        'param_n_estimators',
        'param_max_depth',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'param_max_features',
        'param_n_bins',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']]
    grid_results['nParam'] = nParam

    # Persist the table; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    # NOTE(review): GridSearch1..GridSearch5 all write to this same file name,
    # so searches for the same nParam overwrite each other's results.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [19]:
GridSearch3(0)

2023-06-18 10:28:55.557958  - Calculant grid search CV del paràmetre  0
Millor score:  0.9596232920885086
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 30, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 8.738945960998535 segons, o bé  0.14564909934997558  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,30,10,sqrt,255,0.959623,0.002913,1,0
1,100,20,40,10,sqrt,255,0.957688,0.003069,2,0


In [20]:
GridSearch3(1)

2023-06-18 10:29:04.349335  - Calculant grid search CV del paràmetre  1
Millor score:  0.9055185467004776
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 30, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 8.624292850494385 segons, o bé  0.14373821417490643  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,30,10,sqrt,255,0.905519,0.000975,1,1
1,100,20,40,10,sqrt,255,0.902693,0.00098,2,1


In [21]:
GridSearch3(2)

2023-06-18 10:29:13.027398  - Calculant grid search CV del paràmetre  2
Millor score:  0.9586894065141678
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 30, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 8.842724561691284 segons, o bé  0.14737874269485474  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,30,10,sqrt,255,0.958689,0.000632,1,2
1,100,20,40,10,sqrt,255,0.956562,0.000918,2,2


In [22]:
GridSearch3(3)

2023-06-18 10:29:21.926996  - Calculant grid search CV del paràmetre  3
Millor score:  0.9666586220264435
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 30, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 8.460443496704102 segons, o bé  0.14100739161173503  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,30,10,sqrt,255,0.966659,0.000943,1,3
1,100,20,40,10,sqrt,255,0.965617,0.00095,2,3


In [23]:
GridSearch3(4)

2023-06-18 10:29:30.441583  - Calculant grid search CV del paràmetre  4
Millor score:  0.7656807154417038
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 30, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 9.231200695037842 segons, o bé  0.15385334491729735  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,30,10,sqrt,255,0.765681,0.002408,1,4
1,100,20,40,10,sqrt,255,0.757999,0.00252,2,4


In [24]:
GridSearch3(5)

2023-06-18 10:29:39.727360  - Calculant grid search CV del paràmetre  5
Millor score:  0.8545900136232376
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 30, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 8.488367557525635 segons, o bé  0.14147279262542725  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,30,10,sqrt,255,0.85459,0.000244,1,5
1,100,20,40,10,sqrt,255,0.851659,0.000357,2,5


In [25]:
def GridSearch4(nParam):
    """Grid-search ``min_samples_leaf`` of a cuML random forest for one target column.

    Fits a 4-fold cross-validated ``GridSearchCV`` on the module-level
    ``X_train`` against column ``nParam`` of ``y_train_all``. Only
    ``min_samples_leaf`` takes more than one value; every other hyperparameter
    in the grid is pinned. The best score, best parameters, best estimator and
    the elapsed wall-clock time are printed, and the result table is pickled
    to ``grid_results_<nParam>`` in the current working directory.

    Parameters
    ----------
    nParam
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (searched parameters, mean/std test
        score, rank and ``nParam``), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target column for this run (the held-out test targets are not used here)
    y_train = y_train_all[:, nParam]

    # Fresh model instance
    model = cuRF(random_state=24, verbose=0, n_streams=1)

    # Value ranges of the hyperparameters to optimise
    parameters = {
        'n_estimators': [100],
        'max_depth': [20],
        'min_samples_split': [20],
        'min_samples_leaf': [1, 5, 10],
        'max_features': ['sqrt'],
        'n_bins': [255]
    }

    # GridSearchCV instance with 4 cross-validation folds
    grid = GridSearchCV(model, param_grid=parameters, cv=4)

    # Run the search
    grid.fit(X_train, y_train)

    # Best parameter combination found
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest from the CV results
    grid_results = pd.DataFrame(grid.cv_results_)[[
        'param_n_estimators',
        'param_max_depth',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'param_max_features',
        'param_n_bins',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']]
    grid_results['nParam'] = nParam

    # Persist the table; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    # NOTE(review): GridSearch1..GridSearch5 all write to this same file name,
    # so searches for the same nParam overwrite each other's results.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [26]:
GridSearch4(0)

2023-06-18 10:29:48.319098  - Calculant grid search CV del paràmetre  0
Millor score:  0.9651349186897278
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 14.286207914352417 segons, o bé  0.23810346523920695  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,sqrt,255,0.965135,0.0024,1,0
1,100,20,20,5,sqrt,255,0.963577,0.002472,2,0
2,100,20,20,10,sqrt,255,0.961367,0.002634,3,0


In [27]:
GridSearch4(1)

2023-06-18 10:30:02.661131  - Calculant grid search CV del paràmetre  1
Millor score:  0.9111064523458481
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 13.826866626739502 segons, o bé  0.23044777711232503  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,sqrt,255,0.911106,0.000756,1,1
1,100,20,20,5,sqrt,255,0.910433,0.000966,2,1
2,100,20,20,10,sqrt,255,0.908228,0.000743,3,1


In [28]:
GridSearch4(2)

2023-06-18 10:30:16.545470  - Calculant grid search CV del paràmetre  2
Millor score:  0.9635592103004456
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 13.578085899353027 segons, o bé  0.2263014316558838  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,sqrt,255,0.963559,0.000587,1,2
1,100,20,20,5,sqrt,255,0.962812,0.000835,2,2
2,100,20,20,10,sqrt,255,0.960771,0.000758,3,2


In [29]:
GridSearch4(3)

2023-06-18 10:30:30.181814  - Calculant grid search CV del paràmetre  3
Millor score:  0.9689160585403442
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 13.422322034835815 segons, o bé  0.2237053672472636  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,sqrt,255,0.968916,0.000824,1,3
1,100,20,20,5,sqrt,255,0.968697,0.000818,2,3
2,100,20,20,10,sqrt,255,0.967672,0.000917,3,3


In [30]:
GridSearch4(4)

2023-06-18 10:30:43.662270  - Calculant grid search CV del paràmetre  4
Millor score:  0.7815339863300323
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 14.680387735366821 segons, o bé  0.24467312892278034  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,sqrt,255,0.781534,0.002292,1,4
1,100,20,20,5,sqrt,255,0.780277,0.001234,2,4
2,100,20,20,10,sqrt,255,0.773931,0.002256,3,4


In [31]:
GridSearch4(5)

2023-06-18 10:30:58.400479  - Calculant grid search CV del paràmetre  5
Millor score:  0.8608437031507492
Millors paràmetres:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 13.683892488479614 segons, o bé  0.22806487480799356  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,sqrt,255,0.860844,0.000365,1,5
1,100,20,20,5,sqrt,255,0.860114,0.00051,2,5
2,100,20,20,10,sqrt,255,0.857658,0.000333,3,5


In [32]:
def GridSearch5(nParam):
    """Grid-search ``max_features`` of a cuML random forest for one target column.

    Fits a 4-fold cross-validated ``GridSearchCV`` on the module-level
    ``X_train`` against column ``nParam`` of ``y_train_all``. Only
    ``max_features`` takes more than one value; every other hyperparameter in
    the grid is pinned. The best score, best parameters, best estimator and
    the elapsed wall-clock time are printed, and the result table is pickled
    to ``grid_results_<nParam>`` in the current working directory.

    Parameters
    ----------
    nParam
        Column index into ``y_train_all`` selecting the target to fit.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (searched parameters, mean/std test
        score, rank and ``nParam``), sorted by ``rank_test_score``.
    """
    start_time = time.time()
    print(datetime.datetime.now(), " - Calculant grid search CV del paràmetre ", nParam)

    # Target column for this run (the held-out test targets are not used here)
    y_train = y_train_all[:, nParam]

    # Fresh model instance
    model = cuRF(random_state=24, verbose=0, n_streams=1)

    # Value ranges of the hyperparameters to optimise
    # NOTE(review): 'auto' presumably means "use all features" in the cuML
    # version this was run with — confirm against the cuML docs before
    # upgrading, as its meaning/availability may differ between releases.
    parameters = {
        'n_estimators': [100],
        'max_depth': [20],
        'min_samples_split': [20],
        'min_samples_leaf': [1],
        'max_features': ['auto', 'sqrt', 'log2'],
        'n_bins': [255]
    }

    # GridSearchCV instance with 4 cross-validation folds
    grid = GridSearchCV(model, param_grid=parameters, cv=4)

    # Run the search
    grid.fit(X_train, y_train)

    # Best parameter combination found
    print("Millor score: ", grid.best_score_)
    print("Millors paràmetres: ", grid.best_params_)
    print("Millor model: ", grid.best_estimator_)

    # Keep only the columns of interest from the CV results
    grid_results = pd.DataFrame(grid.cv_results_)[[
        'param_n_estimators',
        'param_max_depth',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'param_max_features',
        'param_n_bins',
        'mean_test_score',
        'std_test_score',
        'rank_test_score']]
    grid_results['nParam'] = nParam

    # Persist the table; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    # NOTE(review): GridSearch1..GridSearch5 all write to this same file name,
    # so this search overwrites any earlier result stored for the same nParam.
    with open("grid_results_"+str(nParam), "wb") as results_file:
        pickle.dump(grid_results, results_file)

    end_time = time.time()
    total_time = end_time - start_time
    print("El temps total de l'execució és:", total_time, "segons, o bé ", total_time/60, " minuts.")

    return grid_results.sort_values(by=['rank_test_score'])

In [33]:
GridSearch5(0)

2023-06-18 10:31:12.199228  - Calculant grid search CV del paràmetre  0
Millor score:  0.9687335789203644
Millors paràmetres:  {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 24.532139778137207 segons, o bé  0.4088689963022868  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,auto,255,0.968734,0.001665,1,0
1,100,20,20,1,sqrt,255,0.965135,0.0024,2,0
2,100,20,20,1,log2,255,0.962345,0.002303,3,0


In [34]:
GridSearch5(1)

2023-06-18 10:31:36.791210  - Calculant grid search CV del paràmetre  1
Millor score:  0.927852138876915
Millors paràmetres:  {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 23.933977842330933 segons, o bé  0.39889963070551554  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,auto,255,0.927852,0.000302,1,1
1,100,20,20,1,sqrt,255,0.911106,0.000756,2,1
2,100,20,20,1,log2,255,0.903784,0.000623,3,1


In [35]:
GridSearch5(2)

2023-06-18 10:32:00.787229  - Calculant grid search CV del paràmetre  2
Millor score:  0.9836549311876297
Millors paràmetres:  {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 23.65038013458252 segons, o bé  0.39417300224304197  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,auto,255,0.983655,0.000237,1,2
1,100,20,20,1,sqrt,255,0.963559,0.000587,2,2
2,100,20,20,1,log2,255,0.951725,0.000791,3,2


In [36]:
GridSearch5(3)

2023-06-18 10:32:24.500153  - Calculant grid search CV del paràmetre  3
Millor score:  0.9734764844179153
Millors paràmetres:  {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 23.522082090377808 segons, o bé  0.39203470150629677  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,auto,255,0.973476,0.000844,1,3
1,100,20,20,1,sqrt,255,0.968916,0.000824,2,3
2,100,20,20,1,log2,255,0.966204,0.00085,3,3


In [37]:
GridSearch5(4)

2023-06-18 10:32:48.086280  - Calculant grid search CV del paràmetre  4
Millor score:  0.8445550948381424
Millors paràmetres:  {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 24.80012273788452 segons, o bé  0.41333537896474204  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,auto,255,0.844555,0.001811,1,4
1,100,20,20,1,sqrt,255,0.781534,0.002292,2,4
2,100,20,20,1,log2,255,0.753273,0.003848,3,4


In [38]:
GridSearch5(5)

2023-06-18 10:33:12.947274  - Calculant grid search CV del paràmetre  5
Millor score:  0.8804817795753479
Millors paràmetres:  {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_bins': 255, 'n_estimators': 100}
Millor model:  RandomForestRegressor()
El temps total de l'execució és: 24.105966567993164 segons, o bé  0.4017661094665527  minuts.


Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_max_features,param_n_bins,mean_test_score,std_test_score,rank_test_score,nParam
0,100,20,20,1,auto,255,0.880482,0.00048,1,5
1,100,20,20,1,sqrt,255,0.860844,0.000365,2,5
2,100,20,20,1,log2,255,0.85204,0.000338,3,5
