# Otimização de Hiperparâmetros

## Carregando os dados

In [None]:
# carregando os pacotes
import os
import numpy as np
import pandas as pd

In [None]:
# Load the churn base table from the Excel workbook and preview the first rows.
df_abt = pd.read_excel(io='churn_data.xlsx')
df_abt.head(5)

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer,42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Parse TotalCharges as numbers; errors='coerce' turns unparseable entries into NaN
# instead of raising.
total_charges_numeric = pd.to_numeric(df_abt["TotalCharges"], errors='coerce')
df_abt["TotalCharges"] = total_charges_numeric

In [None]:
# Discard every row that has at least one missing value in any column.
df_abt = df_abt.dropna(axis=0, how='any')

In [None]:
# Feature columns used for modeling, grouped by type.
cat_vars = [
    'PhoneService',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]
num_vars = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]
# Categorical columns come first, followed by the numeric ones.
modeling_vars = [*cat_vars, *num_vars]

In [None]:
# Feature matrix: only the modeling columns, copied so df_abt is left untouched.
X = df_abt.filter(items=modeling_vars).copy()
# Target: the Churn column, kept as a one-column DataFrame.
y = df_abt['Churn'].copy().to_frame()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the Churn labels with the integer codes learned by the encoder.
le = LabelEncoder()
y['Churn'] = le.fit_transform(y['Churn'])

In [None]:
# One-hot encode the categorical feature columns (pandas defaults).
X = pd.get_dummies(data=X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target so both
# partitions keep the same churn proportion. random_state pins the shuffle so
# the split (and every score computed from it) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Grid Search

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Base estimator for the grid search. random_state is fixed so repeated runs
# build the same trees and therefore report the same cross-validation scores.
arvore_decisao = DecisionTreeClassifier(random_state=42)

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Dictionary with the hyperparameter values the model should be trained with:
# tree depths 1 through 10.
parameters = {'max_depth': list(range(1, 11))}

# Build the grid search from the model, the parameter grid and the metric to
# optimize (ROC AUC), using 5-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=arvore_decisao,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)



In [None]:
# Run the cross-validated search over the decision-tree grid on the training split.
grid_search.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='roc_auc')

In [None]:
# Build a table from the grid-search cross-validation results
results = pd.DataFrame(data=grid_search.cv_results_)

# Show it ordered by rank (best candidate first)
results.sort_values('rank_test_score', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.034243,0.005055,0.013474,0.003326,4,{'max_depth': 4},0.815025,0.829992,0.83,0.823281,0.826981,0.825056,0.005591,1
4,0.030468,0.007918,0.010892,0.005696,5,{'max_depth': 5},0.817179,0.82664,0.833729,0.821734,0.822465,0.824349,0.005568,2
2,0.036157,0.0045,0.016492,0.00408,3,{'max_depth': 3},0.805941,0.817207,0.821046,0.815282,0.810875,0.81407,0.00522,3
5,0.025006,0.004972,0.007223,0.000502,6,{'max_depth': 6},0.818015,0.80935,0.808694,0.802433,0.804236,0.808546,0.005409,4
6,0.027627,0.00575,0.010256,0.005439,7,{'max_depth': 7},0.812007,0.789771,0.794464,0.786139,0.787921,0.794061,0.009392,5
1,0.019419,0.003012,0.013997,0.004548,2,{'max_depth': 2},0.755245,0.785522,0.779483,0.786445,0.776895,0.776718,0.011321,6
7,0.024895,0.002048,0.007609,0.00073,8,{'max_depth': 8},0.802534,0.763198,0.768291,0.762598,0.777724,0.774869,0.014856,7
8,0.02421,0.000617,0.011041,0.003378,9,{'max_depth': 9},0.784453,0.738363,0.758479,0.751553,0.773779,0.761325,0.016253,8
9,0.031108,0.003966,0.006621,0.000952,10,{'max_depth': 10},0.769338,0.727425,0.736594,0.744872,0.757262,0.747098,0.014834,9
0,0.017539,0.008884,0.01296,0.004488,1,{'max_depth': 1},0.730069,0.712166,0.734768,0.735691,0.737566,0.730052,0.009278,10


Vamos realizar um GridSearch com uma Random Forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator for the random-forest grid search. random_state is fixed so
# the bootstrap samples and feature draws are the same on every run, which
# makes the cross-validation ranking reproducible.
rf = RandomForestClassifier(random_state=42)


In [None]:
# Grid for the random forest: depths 1 through 10 crossed with three forest sizes.
parameters = {
    'max_depth': list(range(1, 11)),
    'n_estimators': [100, 300, 500],
}

# 5-fold cross-validated grid search scored by ROC AUC, using all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

In [None]:
# y_train is a one-column DataFrame; the random forest expects a 1-D target and
# otherwise emits a DataConversionWarning on every fit. np.ravel flattens it to
# shape (n_samples,) without changing the labels.
grid_search.fit(X_train, np.ravel(y_train))

  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [100, 300, 500]},
             scoring='roc_auc')

In [None]:
# Tabulate the random-forest search results and show the five best-ranked candidates.
results = pd.DataFrame(data=grid_search.cv_results_)
results.sort_values('rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,2.000001,0.538886,0.170651,0.081969,7,300,"{'max_depth': 7, 'n_estimators': 300}",0.822321,0.832059,0.841495,0.838449,0.83971,0.834807,0.007007,1
20,2.582024,0.017244,0.189433,0.00447,7,500,"{'max_depth': 7, 'n_estimators': 500}",0.823186,0.832254,0.842121,0.83823,0.838162,0.83479,0.006604,2
17,2.393746,0.041878,0.1832,0.00717,6,500,"{'max_depth': 6, 'n_estimators': 500}",0.821848,0.832177,0.841587,0.838214,0.838976,0.83456,0.007064,3
16,1.452359,0.022089,0.109406,0.00124,6,300,"{'max_depth': 6, 'n_estimators': 300}",0.821842,0.831276,0.841538,0.83807,0.838893,0.834324,0.0071,4
23,2.734541,0.02641,0.197511,0.007775,8,500,"{'max_depth': 8, 'n_estimators': 500}",0.823386,0.829608,0.842672,0.836041,0.836915,0.833724,0.006625,5


In [None]:
# Inspect the full hyperparameter set of the estimator refit with the best candidate.
best_model = grid_search.best_estimator_
best_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 7,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Randomized Search

In [None]:
from lightgbm import LGBMClassifier

# Base estimator for the randomized searches.
# - random_state makes the boosting runs repeatable.
# - subsample_freq=1 enables row bagging at every iteration; LightGBM ignores
#   `subsample` while subsample_freq is 0 (its default), so without it the
#   `subsample` values explored by the searches would have no effect. With the
#   default subsample=1.0 this setting does not change the model.
lgbm = LGBMClassifier(n_jobs=-1, random_state=42, subsample_freq=1)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space. Each entry is a plain list, so RandomizedSearchCV picks
# uniformly among the listed values only (two discrete options per
# hyperparameter); it does not sample from the interval between them.
parameters = {'learning_rate': [0.001, 0.01],
              'num_leaves': [2, 128],
              'min_child_samples': [1, 100],
              'subsample': [0.05, 1.0],
              'colsample_bytree': [0.1, 1.0]}

# Evaluate 5 random candidates with 5-fold CV, scored by ROC AUC.
# random_state fixes which candidates are drawn so the run is reproducible.
random_search = RandomizedSearchCV(lgbm, parameters, scoring='roc_auc', cv=5,
                                   n_iter=5, n_jobs=-1, random_state=42)

# y_train is a one-column DataFrame; flatten it to 1-D to avoid the
# "column-vector y was passed" conversion warning raised during fitting.
random_search.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=5, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.1, 1.0],
                                        'learning_rate': [0.001, 0.01],
                                        'min_child_samples': [1, 100],
                                        'num_leaves': [2, 128],
                                        'subsample': [0.05, 1.0]},
                   scoring='roc_auc')

In [None]:
# Tabulate the randomized-search results, best-ranked candidate first.
results = pd.DataFrame(data=random_search.cv_results_)
results.sort_values('rank_test_score', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_num_leaves,param_min_child_samples,param_learning_rate,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.049667,0.002011,0.010947,0.000956,1.0,2,1,0.01,0.1,"{'subsample': 1.0, 'num_leaves': 2, 'min_child...",0.806599,0.8128,0.818179,0.824237,0.827488,0.817861,0.007561,1
0,0.0988,0.006244,0.014331,0.002299,0.05,128,1,0.01,0.1,"{'subsample': 0.05, 'num_leaves': 128, 'min_ch...",0.805435,0.809142,0.820649,0.824467,0.827672,0.817473,0.008687,2
4,0.047975,0.008232,0.012138,0.004643,0.05,2,1,0.001,0.1,"{'subsample': 0.05, 'num_leaves': 2, 'min_chil...",0.804591,0.806301,0.817912,0.823888,0.826399,0.815818,0.008923,3
3,0.06487,0.00382,0.01092,0.003045,1.0,2,1,0.01,1.0,"{'subsample': 1.0, 'num_leaves': 2, 'min_child...",0.76588,0.783038,0.812804,0.806844,0.819147,0.797543,0.019995,4
2,0.072386,0.00689,0.013507,0.009145,0.05,2,1,0.001,1.0,"{'subsample': 0.05, 'num_leaves': 2, 'min_chil...",0.730069,0.712166,0.734768,0.735691,0.737566,0.730052,0.009278,5


In [None]:
# loguniform is provided by scipy.stats; the copy under sklearn.utils.fixes was
# only a private backport and is not available in current scikit-learn releases.
from scipy.stats import loguniform

# learning_rate is now drawn from a continuous log-uniform distribution over
# [1e-3, 1e-1]. The remaining entries are still two-element lists, i.e. discrete
# choices between the listed values (scipy.stats.randint / uniform could be
# used to turn them into ranges as well).
parameters = {'learning_rate': loguniform(1e-3, 1e-1),
              'num_leaves': [2, 128],
              'min_child_samples': [1, 100],
              'subsample': [0.05, 1.0],
              'colsample_bytree': [0.1, 1.0]}

# Evaluate 30 random candidates with 5-fold CV, scored by ROC AUC.
# random_state seeds both the list choices and the distribution draws, so the
# same candidates are sampled on every run.
random_search = RandomizedSearchCV(lgbm, parameters, scoring='roc_auc', cv=5,
                                   n_iter=30, n_jobs=-1, random_state=42)

In [None]:
# y_train is a one-column DataFrame; flatten it to 1-D so the estimator does not
# emit the "column-vector y was passed" conversion warning on each fit.
random_search.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=30, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.1, 1.0],
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f998584afd0>,
                                        'min_child_samples': [1, 100],
                                        'num_leaves': [2, 128],
                                        'subsample': [0.05, 1.0]},
                   scoring='roc_auc')

In [None]:
# Tabulate the results of the log-uniform randomized search, best-ranked first.
results = pd.DataFrame(data=random_search.cv_results_)
results.sort_values('rank_test_score', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_min_child_samples,param_num_leaves,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.183109,0.008525,0.011982,0.000113,1.0,0.001618,100,128,1.0,"{'colsample_bytree': 1.0, 'learning_rate': 0.0...",0.814956,0.831126,0.835102,0.828107,0.826551,0.827169,0.006768,1
5,0.056417,0.012295,0.010324,0.000698,0.1,0.084491,1,2,1.0,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.81308,0.820206,0.823793,0.829516,0.834162,0.824151,0.007311,2
17,0.048732,0.00036,0.01044,0.000527,0.1,0.079125,100,2,0.05,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.812999,0.819878,0.823206,0.829571,0.834018,0.823934,0.007349,3
20,0.050471,0.003512,0.009887,0.000248,0.1,0.074733,1,2,1.0,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.812472,0.819299,0.822955,0.829241,0.83384,0.823561,0.007476,4
4,0.048715,0.003267,0.010274,0.00063,0.1,0.063749,100,2,0.05,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.811814,0.817789,0.82206,0.827966,0.832883,0.822502,0.007406,5
23,0.048829,0.003432,0.010569,0.001273,0.1,0.062854,1,2,1.0,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.811717,0.817724,0.822437,0.827929,0.832612,0.822484,0.00736,6
7,0.071304,0.010563,0.014703,0.005054,0.1,0.01675,100,128,1.0,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.813116,0.814227,0.821702,0.826755,0.833554,0.821871,0.007687,7
6,0.066456,0.002214,0.01169,0.000166,0.1,0.004143,100,128,0.05,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.811225,0.812577,0.822396,0.827166,0.833445,0.821362,0.008493,8
14,0.051781,0.006491,0.010005,0.000144,0.1,0.033845,1,2,0.05,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.809579,0.814296,0.820864,0.825889,0.829975,0.82012,0.007429,9
0,0.051633,0.003599,0.010042,0.00037,0.1,0.028155,1,2,1.0,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",0.809045,0.814628,0.820767,0.824631,0.831094,0.820033,0.007668,10


**[Responda]** Faça a mesma busca de hiperparâmetros usando Random Search, mas agora com o XGBoost.