## Grid Search CV

GridSearchCV es una herramienta de la libreria Sklearn.

Esta herramienta hace una busqueda exhaustiva (o por fuerza bruta) de los mejores parámetros de un modelo siguiendo alguna métrica en particular.

_**Documentacion:** https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html_

In [1]:
import numpy as np
import pandas as pd

import matplotlib # Para ver la versión
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn # Para ver la versión

# Normalizacion
from sklearn.preprocessing import MinMaxScaler

# Train, Test
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Versiones

print(f"numpy=={np.__version__}")
print(f"pandas=={pd.__version__}")
print(f"matplotlib=={matplotlib.__version__}")
print(f"seaborn=={sns.__version__}")
print(f"scikit-learn=={sklearn.__version__}")

numpy==1.20.3
pandas==1.2.4
matplotlib==3.4.2
seaborn==0.11.1
scikit-learn==1.5.1


In [3]:
# Dataset del titanic preprocesado en clase

titanic = pd.read_csv("../Data/titanic_preprocesamiento.csv")

X = titanic.drop(["Survived"], axis = 1)
y = titanic["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape},  y_test: {y_test.shape}")

X_train: (613, 11), y_train: (613,)
X_test: (263, 11),  y_test: (263,)


### Preprocesamiento

In [4]:
# Escalado para KNeighborsClassifier

X_scaler = MinMaxScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.transform(X_test)

### Modelo

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)

### Predicciones

In [7]:
yhat = model.predict(X_test)

yhat

array([1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1.,
       0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
       1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1.

In [8]:
print("Jaccard Index:", jaccard_score(y_test, yhat, average = "macro"))
print("Accuracy:"     , accuracy_score(y_test, yhat))
print("Precisión:"    , precision_score(y_test, yhat, average = "macro"))
print("Sensibilidad:" , recall_score(y_test, yhat, average = "macro"))
print("F1-score:"     , f1_score(y_test, yhat, average = "macro"))
print("ROC AUC:"      , roc_auc_score(y_test, yhat))

Jaccard Index: 0.6521702199949035
Accuracy: 0.8022813688212928
Precisión: 0.7867390219560878
Sensibilidad: 0.7867390219560878
F1-score: 0.7867390219560878
ROC AUC: 0.7867390219560877


### GridSearchCV

In [10]:
from sklearn.model_selection import GridSearchCV

In [20]:
%%time

# Modelo
model = KNeighborsClassifier()

# Parametros a iterar
params = {"n_neighbors" : [3, 4, 5, 6, 7, 8]}

# Metricas
scorers = ["f1_macro", "accuracy", "recall_macro"]

#GridSearchCV
grid_solver = GridSearchCV(estimator  = model,
                           param_grid = params, 
                           scoring    = scorers,
                           cv         = 5,
                           refit      = "accuracy",
                           n_jobs     = -1,
                           verbose    = 99)

# Resultados
model_result = grid_solver.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 50.9 ms, sys: 9.9 ms, total: 60.8 ms
Wall time: 515 ms


In [21]:
# model_result.best_estimator es el mejor modelo que obtuvimos al iterar sobre todos los parámetros

model_result.best_estimator_.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [22]:
yhat = model_result.best_estimator_.predict(X_test)

print("Jaccard Index:", jaccard_score(y_test, yhat, average = "macro"))
print("Accuracy:"     , accuracy_score(y_test, yhat))
print("Precisión:"    , precision_score(y_test, yhat, average = "macro"))
print("Sensibilidad:" , recall_score(y_test, yhat, average = "macro"))
print("F1-score:"     , f1_score(y_test, yhat, average = "macro"))
print("ROC AUC:"      , roc_auc_score(y_test, yhat))

Jaccard Index: 0.6521702199949035
Accuracy: 0.8022813688212928
Precisión: 0.7867390219560878
Sensibilidad: 0.7867390219560878
F1-score: 0.7867390219560878
ROC AUC: 0.7867390219560877


In [None]:
from pprint import pprint
pprint(model_result.cv_results_)

In [None]:
print(model_result.cv_results_["mean_test_recall_macro"].mean())
print(model_result.cv_results_["mean_test_f1_macro"].mean())
print(model_result.cv_results_["mean_test_accuracy"].mean())

print(model_result.best_score_)
print(model_result.best_params_)

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Max depth
from collections import Counter

print(Counter([x.get_depth() for x in model.estimators_]))


import matplotlib.pyplot as plt
import seaborn as sns

sns.histplot([x.get_depth() for x in model.estimators_])
plt.show()

# N Leaves
from collections import Counter

print(Counter([x.get_n_leaves() for x in model.estimators_]))


import matplotlib.pyplot as plt
import seaborn as sns

sns.histplot([x.get_n_leaves() for x in model.estimators_])
plt.show()

In [None]:
3*2*5*2*6*4*4

In [25]:
%%time

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

params = {"n_estimators"           : [100, 150, 200], # Numero de arboles
          "criterion"              : ["gini", "entropy"], # Es la función para medir la calidad de una división/split.
          "max_features"           : ["sqrt", "log2"], # El número de características (atributos) a considerar en cada split
          "min_samples_split"      : [2, 5, 10, 15, 20], # El número mínimo de muestras requeridas para llegar a nodo hoja.
          "random_state"           : [42]} 

scorers = ["f1_macro", "accuracy", "recall_macro"]

grid_solver = GridSearchCV(estimator  = model     , 
                           param_grid = params    , 
                           scoring    = scorers   ,
                           cv         = 5         ,
                           refit      = "accuracy",
                           n_jobs     = -1        ,
                           verbose    = 3)

model_result = grid_solver.fit(X, y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
CPU times: user 745 ms, sys: 142 ms, total: 887 ms
Wall time: 30.8 s


In [26]:
print(model_result.cv_results_["mean_test_recall_macro"].mean())
print(model_result.cv_results_["mean_test_f1_macro"].mean())
print(model_result.cv_results_["mean_test_accuracy"].mean())

print("*"*100)

print(model_result.best_score_)
print(model_result.best_params_)

0.8030129331293671
0.8088862965378985
0.8228311688311688
****************************************************************************************************
0.8333311688311689
{'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 100, 'random_state': 42}


In [None]:
resultados = {'criterion': 'entropy',
 'max_depth': 18,
 'max_features': None,
 'max_leaf_nodes': 90,
 'min_impurity_decrease': 0.0,
 'min_samples_split': 20,
 'n_estimators': 150,
 'random_state': 42}

In [None]:
import pickle

mejor_modelo = RandomForestClassifier(**resultados)
mejor_modelo.fit(X, y)

with open("mejor_modelo.pkl", "bw") as file:
    pickle.dump(mejor_modelo, file)

In [None]:
################################################################################################################################