## Criar o treinamento com GridSearch em DecisionTreeRegressor e RandomForestClassifier

In [None]:
import pandas as pd

In [2]:
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/Classificação/Performance/Bank_Personal_Loan_Modelling.xlsx')

In [3]:
df.head()

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,4,1.0,2,0,0,0,0,0,1


### Tratando os dados

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   Family              5000 non-null   int64  
 5   CCAvg               5000 non-null   float64
 6   Education           5000 non-null   int64  
 7   Mortgage            5000 non-null   int64  
 8   Personal_Loan       5000 non-null   int64  
 9   Securities_Account  5000 non-null   int64  
 10  CD_Account          5000 non-null   int64  
 11  Online              5000 non-null   int64  
 12  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(12)
memory usage: 507.9 KB


In [7]:
df.isnull().sum()
# Sem nulos na nossa analise

ID                    0
Age                   0
Experience            0
Income                0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal_Loan         0
Securities_Account    0
CD_Account            0
Online                0
CreditCard            0
dtype: int64

In [8]:
df.head(3)

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0


In [18]:
# Distribuindo as bases
x = df.copy()
x = df.drop(labels=['Personal_Loan', 'ID'], axis=1)
x.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities_Account,CD_Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,1,0,0,0
1,45,19,34,3,1.5,1,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,1


In [19]:
y = df[['Personal_Loan']]
y.head()

Unnamed: 0,Personal_Loan
0,0
1,0
2,0
3,0
4,0


# Treino

In [20]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, stratify=y)

# Decision tree

In [101]:
from sklearn.tree  import DecisionTreeClassifier

dtr =  DecisionTreeClassifier()


In [102]:
# Sem o gridSearch
dtr.fit(x_train, y_train)

In [103]:
# Verificando algumas metricas
from sklearn.metrics import accuracy_score

print(f'Acurácia treino: {accuracy_score(y_train, dtr.predict(x_train))}')
print(f'Acurácia test: {accuracy_score(y_test, dtr.predict(x_test))}')

# Uma acurácia clara de overfit, quer dizer que o modelo decorou o modelo, com isso vamos usar o hiperparametros para retirar esse overfit

Acurácia treino: 1.0
Acurácia test: 0.984


## GridSearch

In [104]:
# Com GridSearch
from sklearn.model_selection import GridSearchCV


parameters = {
          "max_depth": [1,2,3,4,5,6,7,8,9,10]
}

# Nesse caso queremos melhorar a acurácia
grid_search = GridSearchCV(estimator=dtr,param_grid= parameters, scoring = 'accuracy', n_jobs=-1, cv=5 )

In [105]:
grid_search.fit(x_train,y_train)

In [106]:
# Pegando os melhores parametros
grid_search.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 8,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [107]:
# Arvore tunada
dtr_tunned = DecisionTreeClassifier(
    ccp_alpha= 0.0,
 class_weight= None,
 criterion= 'gini',
 max_depth= 7,
 max_features= None,
 max_leaf_nodes= None,
 min_impurity_decrease= 0.0,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 random_state= None,
 splitter= 'best'
)


In [108]:
# Treinando a arvore tunada
dtr_tunned.fit(x_train, y_train)

In [110]:
# Verificando como ficou com os melhores parametros

print(f'Acurácia treino: {accuracy_score(y_train, dtr_tunned.predict(x_train))}')
print(f'Acurácia test: {accuracy_score(y_test, dtr_tunned.predict(x_test))}')

# Vemos que os resultados alcançados são bem melhores e mostram o quanto de melhoria tivemos usando o GridSearch

Acurácia treino: 0.9935
Acurácia test: 0.986


# RandomForestRegressor

In [124]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

In [125]:
rf.fit(x_train, y_train)

  rf.fit(x_train, y_train)


In [126]:
# Verificando a acuracia do nosso modelo sem o GridSearch
print(f'Acurácia treino: {accuracy_score(y_train, rf.predict(x_train))}')
print(f'Acurácia test: {accuracy_score(y_test, rf.predict(x_test))}')

# Novamente temos que o modelo decorou aquilo que era para ser predito

Acurácia treino: 1.0
Acurácia test: 0.991


## GridSearch

In [127]:
# Usando o GridSearch
parameters ={
    'n_estimators':[100,200, 300],
    'max_depth':[1,2,3,4,5,6,7,8,9,10]
}

grid_seach  = GridSearchCV(estimator=rf, param_grid = parameters, scoring='accuracy', cv=5, n_jobs=-1)

In [128]:
grid_search.fit(x_train, y_train)

In [129]:
grid_search.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 8,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [130]:
rf_tunned = RandomForestClassifier(
    ccp_alpha= 0.0,
 class_weight= None,
 criterion= 'gini',
 max_depth= 8,
 max_features= None,
 max_leaf_nodes= None,
 min_impurity_decrease= 0.0,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 random_state= None,
)

In [131]:
rf_tunned.fit(x_train, y_train)

  rf_tunned.fit(x_train, y_train)


In [132]:
# Verificando se mudou a acurácia do modelo

print(f'Acurácia treino: {accuracy_score(y_train, rf_tunned.predict(x_train))}')
print(f'Acurácia test: {accuracy_score(y_test, rf_tunned.predict(x_test))}')


Acurácia treino: 0.998
Acurácia test: 0.987
