# Otimização de Hiperparâmetros

## Carregando os dados

In [64]:
# carregando os pacotes
import os
import numpy as np
import pandas as pd

In [65]:
df_abt = pd.read_excel('/content/Bank_Personal_Loan_Modelling.xlsx')
df_abt.head()

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,4,1.0,2,0,0,0,0,0,1


In [66]:
df_abt = df_abt.dropna()

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
X = df_abt[["Age", "Experience", "Income", "Family", "CCAvg", "Education", "Mortgage", "CreditCard", "Securities_Account", "CD_Account", "Online"]]
y = df_abt['Personal_Loan'].copy()
y = pd.DataFrame(y)

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

## Grid Search

In [70]:
from sklearn.tree import DecisionTreeClassifier

arvore_decisao = DecisionTreeClassifier()

In [71]:
arvore_decisao.fit(X_train, y_train)

DecisionTreeClassifier()

In [72]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print(f"Acurácia (Treino): {accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia (Teste): {accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Acurácia Balanceada (Treino): {balanced_accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia Balanceada (Teste): {balanced_accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Precision (Treino): {precision_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Precision (Teste): {precision_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Recall (Treino): {recall_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Recall (Teste): {recall_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"F1-Score (Treino): {f1_score(y_train, arvore_decisao.predict(X_train))}")
print(f"F1-Score (Teste): {f1_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"ROCAUC (Treino): {roc_auc_score(y_train, arvore_decisao.predict_proba(X_train)[:,1])}")
print(f"ROCAUC (Teste): {roc_auc_score(y_test, arvore_decisao.predict_proba(X_test)[:,1])}")

Acurácia (Treino): 1.0
Acurácia (Teste): 0.981
Acurácia Balanceada (Treino): 1.0
Acurácia Balanceada (Teste): 0.9336283185840708
Precision (Treino): 1.0
Precision (Teste): 0.9230769230769231
Recall (Treino): 1.0
Recall (Teste): 0.875
F1-Score (Treino): 1.0
F1-Score (Teste): 0.8983957219251337
ROCAUC (Treino): 1.0
ROCAUC (Teste): 0.9336283185840708


In [73]:
# Importamos GridSearchCV
from sklearn.model_selection import GridSearchCV

# Criamos um dicionário que os hiperparâmetros que queremos treinar o modelo
parameters = {
  'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Criamos um GridSearch passando o modelo, os parâmetros, a métrica que queremos otimizar. 
grid_search = GridSearchCV(arvore_decisao, parameters, scoring='roc_auc', cv=5, n_jobs=-1)



In [74]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='roc_auc')

In [75]:
grid_search.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [76]:
arvore_decisao = DecisionTreeClassifier(ccp_alpha = 0.0,
 class_weight= None,
 criterion = 'gini',
 max_depth = 3,
 max_features = None,
 max_leaf_nodes = None,
 min_impurity_decrease = 0.0,
 min_samples_leaf = 1,
 min_samples_split = 2,
 min_weight_fraction_leaf = 0.0,
 random_state = None,
 splitter = 'best')

In [77]:
arvore_decisao.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [78]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print(f"Acurácia (Treino): {accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia (Teste): {accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Acurácia Balanceada (Treino): {balanced_accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia Balanceada (Teste): {balanced_accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Precision (Treino): {precision_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Precision (Teste): {precision_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Recall (Treino): {recall_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Recall (Teste): {recall_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"F1-Score (Treino): {f1_score(y_train, arvore_decisao.predict(X_train))}")
print(f"F1-Score (Teste): {f1_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"ROCAUC (Treino): {roc_auc_score(y_train, arvore_decisao.predict_proba(X_train)[:,1])}")
print(f"ROCAUC (Teste): {roc_auc_score(y_test, arvore_decisao.predict_proba(X_test)[:,1])}")

Acurácia (Treino): 0.98525
Acurácia (Teste): 0.972
Acurácia Balanceada (Treino): 0.9383066002949852
Acurácia Balanceada (Teste): 0.9007190265486726
Precision (Treino): 0.9629629629629629
Precision (Teste): 0.8863636363636364
Recall (Treino): 0.8802083333333334
Recall (Teste): 0.8125
F1-Score (Treino): 0.9197278911564626
F1-Score (Teste): 0.8478260869565218
ROCAUC (Treino): 0.995778311670354
ROCAUC (Teste): 0.981914869100295


In [79]:
# criando uma tabela com os resultados do grid_search
results = pd.DataFrame(grid_search.cv_results_)

# rankeando esses resultados
results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.013623,0.003836,0.00876,0.002469,3,{'max_depth': 3},0.993612,0.996883,0.993066,0.995446,0.995967,0.994995,0.001439,1
3,0.015346,0.004381,0.006574,0.000361,4,{'max_depth': 4},0.995202,0.991297,0.983672,0.990507,0.996695,0.991475,0.004539,2
4,0.011686,0.002441,0.008971,0.003361,5,{'max_depth': 5},0.981181,0.984822,0.983663,0.988836,0.988675,0.985435,0.002956,3
5,0.012379,0.003122,0.008216,0.002674,6,{'max_depth': 6},0.975574,0.972481,0.971852,0.989483,0.975212,0.97692,0.006449,4
6,0.012518,0.003279,0.006515,0.000218,7,{'max_depth': 7},0.968777,0.964955,0.95759,0.982953,0.975705,0.969996,0.008733,5
1,0.010912,0.00237,0.008447,0.002469,2,{'max_depth': 2},0.960417,0.97337,0.956387,0.956207,0.962269,0.96173,0.00627,6
8,0.010707,0.000721,0.007926,0.002133,9,{'max_depth': 9},0.955001,0.958093,0.96394,0.964515,0.959728,0.960255,0.003585,7
9,0.012517,0.002176,0.006287,0.000361,10,{'max_depth': 10},0.954311,0.958219,0.958273,0.964497,0.962692,0.959598,0.003611,8
7,0.011716,0.002478,0.007141,0.001252,8,{'max_depth': 8},0.954729,0.951258,0.950154,0.970739,0.968718,0.95912,0.008816,9
0,0.008119,0.000239,0.008097,0.002364,1,{'max_depth': 1},0.835345,0.866331,0.847389,0.861068,0.867983,0.855623,0.012459,10


Vamos realizar um GridSearch com uma Random Forest.

In [80]:
from sklearn.ensemble import RandomForestClassifier

rf =  RandomForestClassifier()


In [81]:
parameters = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'n_estimators': [100, 300, 500]}

grid_search = GridSearchCV(rf, parameters, scoring='roc_auc', cv=5, n_jobs=-1)

In [82]:
grid_search.fit(X_train, y_train)

  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [100, 300, 500]},
             scoring='roc_auc')

In [83]:
results = pd.DataFrame(grid_search.cv_results_)
results.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,1.314753,0.021169,0.094446,0.004915,10,300,"{'max_depth': 10, 'n_estimators': 300}",0.998037,0.997252,0.99788,0.999497,0.99806,0.998145,0.000737,1
22,1.330478,0.022246,0.098794,0.007701,8,300,"{'max_depth': 8, 'n_estimators': 300}",0.997819,0.996749,0.997827,0.998922,0.997862,0.997836,0.000687,2
23,2.179431,0.019702,0.149816,0.004626,8,500,"{'max_depth': 8, 'n_estimators': 500}",0.998001,0.996605,0.99788,0.999192,0.997359,0.997807,0.000849,3
25,1.316411,0.013005,0.09615,0.005095,9,300,"{'max_depth': 9, 'n_estimators': 300}",0.998146,0.996515,0.997647,0.999066,0.997521,0.997779,0.000833,4
26,2.20746,0.018059,0.149347,0.003941,9,500,"{'max_depth': 9, 'n_estimators': 500}",0.99791,0.996569,0.997216,0.999317,0.997827,0.997768,0.000913,5


In [84]:
grid_search.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [85]:
rf =  RandomForestClassifier(bootstrap = True,
 ccp_alpha= 0.0,
 class_weight= None,
 criterion= 'gini',
 max_depth = 9,
 max_features= 'auto',
 max_leaf_nodes = None,
 max_samples = None,
 min_impurity_decrease = 0.0,
 min_samples_leaf = 1,
 min_samples_split = 2,
 min_weight_fraction_leaf = 0.0,
 n_estimators = 500,
 n_jobs = None,
 oob_score = False,
 random_state = None,
 verbose = 0,
 warm_start = False)

In [86]:
rf.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(max_depth=9, n_estimators=500)

In [87]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print(f"Acurácia (Treino): {accuracy_score(y_train, rf.predict(X_train))}")
print(f"Acurácia (Teste): {accuracy_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"Acurácia Balanceada (Treino): {balanced_accuracy_score(y_train, rf.predict(X_train))}")
print(f"Acurácia Balanceada (Teste): {balanced_accuracy_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"Precision (Treino): {precision_score(y_train, rf.predict(X_train))}")
print(f"Precision (Teste): {precision_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"Recall (Treino): {recall_score(y_train, rf.predict(X_train))}")
print(f"Recall (Teste): {recall_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"F1-Score (Treino): {f1_score(y_train, rf.predict(X_train))}")
print(f"F1-Score (Teste): {f1_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"ROCAUC (Treino): {roc_auc_score(y_train, rf.predict_proba(X_train)[:,1])}")
print(f"ROCAUC (Teste): {roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])}")

Acurácia (Treino): 0.99725
Acurácia (Teste): 0.979
Acurácia Balanceada (Treino): 0.9856770833333333
Acurácia Balanceada (Teste): 0.9092459439528023
Precision (Treino): 1.0
Precision (Teste): 0.9518072289156626
Recall (Treino): 0.9713541666666666
Recall (Teste): 0.8229166666666666
F1-Score (Treino): 0.9854689564068693
F1-Score (Teste): 0.88268156424581
ROCAUC (Treino): 0.9999963991058259
ROCAUC (Teste): 0.9931093289085545
