#Busqueda de hiperparametros

Vamos a utliziar el mismo conjunto de datos de airlines pero ahora haremos un busqueda de hiperparametros utilizando el Random forest como algoritmo dado que rapido de entrenar 



En este ejemplo utilizaremos el GridSearch con crossvalidation

Aqui lo que se hara es entrenar con las combinaciones de los diferentes parametros determinados y en base a esas combinaciones seleccionar con crossvalidation el mejor modelo en base a scoring , que en este caso sera el accuracy

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder


Proceso inicial de:

* Cargar Datos
* Seleccionar variables 
* Splitear en input(X) y output(y)
* Splitear train y test
* Transformar el train

In [None]:
#load datasets
air = pd.read_csv('airlines_sample.csv')

#select columns
cat_columns = air.select_dtypes(include='object').columns
numeric_columns = air.select_dtypes(include=np.number).columns

#split data in X e y
X = air.drop('Delay', axis=1)
y = air['Delay']

#split data in train y test
X_train , X_test, y_train, y_test = train_test_split(X,y , test_size=0.2, random_state=10, stratify=y)



#generacion del encoder 
le = OrdinalEncoder(handle_unknown='use_encoded_value' , #use cuando tenes valores desconocidos 
                    unknown_value=-1) # asignacion a valores desconocidos 

# generacion de numeros por cada categorico 
le.fit(X_train[cat_columns])

#transformacion de las categoricas en train dataset
X_train[cat_columns] = le.transform(X_train[cat_columns])




Cuando ya tenemos todo el dataset armado y pronto , lo que vamos hacer es selccionar un algoritmo y un conjunto de parametros donde se van a combinar

In [None]:
#seleccipon del clasificador
rfc = RandomForestClassifier(random_state=42)

#conjunto de parametros
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [None]:
2*3*5*2

60

Entrenamiento del train pero seleccionado el algoritmo y el conjunto de parametros a combinar

In [None]:
#seleccion del calsificador general
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3)

#entrenamiento 
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

Seleccion del mejor modelo con sus informacion

In [None]:
#dataframe con toda la infor de cada corrida de entrenamiento
results = pd.DataFrame(CV_rfc.cv_results_)

#info del mejor modelo
CV_rfc.best_estimator_


RandomForestClassifier(criterion='entropy', max_depth=4, n_estimators=200,
                       random_state=42)

In [None]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.519006,0.225422,0.037222,0.005582,gini,4,auto,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.631944,0.604167,0.634146,0.623419,0.013643,17
1,0.972389,0.133298,0.141842,0.086528,gini,4,auto,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.638889,0.59375,0.634146,0.622262,0.020254,19
2,0.449679,0.12523,0.03672,0.005675,gini,4,sqrt,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.631944,0.604167,0.634146,0.623419,0.013643,17
3,0.885334,0.003273,0.078725,0.004134,gini,4,sqrt,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.638889,0.59375,0.634146,0.622262,0.020254,19
4,0.370431,0.000687,0.033199,0.000942,gini,4,log2,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.618056,0.600694,0.623693,0.614148,0.009787,45
5,0.94084,0.006031,0.080029,0.001683,gini,4,log2,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.635417,0.59375,0.627178,0.618781,0.018017,30
6,0.359887,0.004598,0.03456,0.000817,gini,5,auto,200,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.628472,0.621528,0.606272,0.618757,0.009273,33
7,0.906799,0.007581,0.0805,0.001924,gini,5,auto,500,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.631944,0.607639,0.623693,0.621092,0.010092,22
8,0.364748,0.000664,0.034134,0.001461,gini,5,sqrt,200,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.628472,0.621528,0.606272,0.618757,0.009273,33
9,0.936271,0.010643,0.086702,0.002055,gini,5,sqrt,500,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.631944,0.607639,0.623693,0.621092,0.010092,22


In [None]:
CV_rfc.best_score_

0.6315371338237191

In [None]:
results.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
30,0.381906,0.013669,0.032163,0.00048,entropy,4,auto,200,"{'criterion': 'entropy', 'max_depth': 4, 'max_...",0.642361,0.604167,0.648084,0.631537,0.019494,1
32,0.387782,0.014056,0.032084,0.000294,entropy,4,sqrt,200,"{'criterion': 'entropy', 'max_depth': 4, 'max_...",0.642361,0.604167,0.648084,0.631537,0.019494,1
33,0.962445,0.024998,0.081358,0.001313,entropy,4,sqrt,500,"{'criterion': 'entropy', 'max_depth': 4, 'max_...",0.649306,0.604167,0.637631,0.630368,0.01913,3
31,0.965323,0.002908,0.084313,0.008543,entropy,4,auto,500,"{'criterion': 'entropy', 'max_depth': 4, 'max_...",0.649306,0.604167,0.637631,0.630368,0.01913,3
37,1.311601,0.202403,0.081001,0.002196,entropy,5,auto,500,"{'criterion': 'entropy', 'max_depth': 5, 'max_...",0.652778,0.607639,0.630662,0.63036,0.018429,5


#Predicciones y metricas


In [None]:
#trasnformo el test
X_test[cat_columns] = le.transform(X_test[cat_columns])

#prediccion
pred = CV_rfc.predict(X_test)


In [None]:
#metrica

accuracy_score(y_test, pred)

0.6527777777777778