<a href="https://colab.research.google.com/github/Cosamores/cdsi/blob/main/AvaliacaoAlgoritmos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [99]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from google.colab import drive

In [50]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
caminho = '/content/drive/MyDrive/IFSP/CDSI/'

In [53]:
with open(caminho+'credit_.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [55]:
X_credit_teste.shape, y_credit_teste.shape

((1500,), (500,))

In [57]:
X_credit_treinamento.shape, y_credit_treinamento.shape

((1500, 3), (500, 3))

In [62]:
X_credit = np.concatenate((X_credit_treinamento, y_credit_treinamento), axis = 0)
X_credit

array([[-1.3754462 ,  0.50128394,  0.10980934],
       [ 1.45826409, -1.59502559, -1.21501497],
       [-0.79356829,  0.2279696 , -0.43370226],
       ...,
       [ 1.37445674, -1.01972449, -1.12564819],
       [-1.57087737, -0.60869993, -0.36981671],
       [-1.03572293, -0.90526119,  0.04244312]])

In [75]:
y_credit = np.concatenate((X_credit_teste, y_credit_teste), axis = 0)
y_credit.shape

# Check for NaNs in X_credit
print(f"Number of NaNs in X_credit: {np.isnan(X_credit).sum()}")

Number of NaNs in X_credit: 3


In [76]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_credit = imputer.fit_transform(X_credit)

print(f"Number of NaNs in X_credit after imputation: {np.isnan(X_credit).sum()}")

Number of NaNs in X_credit after imputation: 0


# Decision Tree Classifier


In [77]:
parametros = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10]
              }

In [78]:
gri_search = GridSearchCV(estimator=DecisionTreeClassifier(),
                         param_grid=parametros,
                        )
gri_search.fit(X_credit, y_credit)
melhores_parametros = gri_search.best_params_
melhor_resultado = gri_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}
0.9845


# Random Forest

In [95]:
parametros = {'criterion': ['gini', 'entropy'],
              'n_estimators': [10, 40, 100, 150],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10]
              }

In [96]:
gri_search = GridSearchCV(estimator=RandomForestClassifier(),
                         param_grid=parametros,
                        )
gri_search.fit(X_credit, y_credit)
melhores_parametros = gri_search.best_params_
melhor_resultado = gri_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
0.9860000000000001


# KNN

In [81]:
parametros = {'n_neighbors': [3, 5, 10, 20],
              'p': [1, 2]
              }

In [82]:
gri_search = GridSearchCV(estimator=KNeighborsClassifier(),
                         param_grid=parametros,
                         )
gri_search.fit(X_credit, y_credit)
melhores_parametros = gri_search.best_params_
melhor_resultado = gri_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'n_neighbors': 5, 'p': 1}
0.9789999999999999


# Logistic Regression

In [83]:
parametros = {'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
              'C': [1.0, 1.5, 2.0],
              'tol': [1e-5, 1e-4, 1e-3]
              }

In [84]:
gri_search = GridSearchCV(estimator=LogisticRegression(),
                         param_grid=parametros,
                        )
gri_search.fit(X_credit, y_credit)
melhores_parametros = gri_search.best_params_
melhor_resultado = gri_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'C': 1.5, 'solver': 'newton-cg', 'tol': 1e-05}
0.9460000000000001


# SVM (Support Vector Machine): SVC (Support Vector Classifier)

In [85]:
parametros = {'C': [1.0, 1.5, 2.0],
              'tol': [1e-4, 1e-3, 1e-2],
              'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
              }

In [86]:
gri_search = GridSearchCV(estimator=SVC(),
                         param_grid=parametros,
                        )
gri_search.fit(X_credit, y_credit)
melhores_parametros = gri_search.best_params_
melhor_resultado = gri_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'C': 2.0, 'kernel': 'rbf', 'tol': 0.0001}
0.9835


# Redes neurais

In [97]:
parametros = { 'activation': ['relu', 'logistic', 'tanh'],
              'solver': ['adam', 'sgd'],
               'batch_size': [10, 56],
              'max_iter': [500, 1000]
              }

In [98]:
grid_search = GridSearchCV(estimator=MLPClassifier(),
                         param_grid=parametros,
                        )
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)



{'activation': 'relu', 'batch_size': 56, 'max_iter': 500, 'solver': 'adam'}
0.9960000000000001


# Validação Cruzada


In [100]:
resultados_arvore = []
resultados_random_forest = []
resultados_knn = []
resultados_regressao_logistica = []
resultados_svm = []
resultado_rede_neural = []

for i in range(30):
  print(f"Rodada {i}")
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)


  # Decision tree

  arvore = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, splitter='best')
  scores_arvore = cross_val_score(arvore, X_credit, y_credit, cv=kfold)
  resultados_arvore.append(scores_arvore.mean())


  # Random Forest

  random_forest = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=150)
  scores_random_forest = cross_val_score(random_forest, X_credit, y_credit, cv=kfold)
  resultados_random_forest.append(scores_random_forest.mean())

  # KNN

  knn = KNeighborsClassifier(n_neighbors=5, p=2)
  scores_knn = cross_val_score(knn, X_credit, y_credit, cv=kfold)
  resultados_knn.append(scores_knn.mean())

  # Logistic Regression

  regressao_logistica = LogisticRegression(C=2.0, solver='newton-cg', tol=0.00001)
  scores_regressao_logistica = cross_val_score(regressao_logistica, X_credit, y_credit, cv=kfold)
  resultados_regressao_logistica.append(scores_regressao_logistica.mean())

  # SVM (SVC)

  svm = SVC(C=2.0, kernel='rbf', tol=0.0001)
  scores_svm = cross_val_score(svm, X_credit, y_credit, cv=kfold)
  resultados_svm.append(scores_svm.mean())

  # Redes Neurais

  rede_neural = MLPClassifier(activation='relu', batch_size=10, max_iter=500, solver='adam')
  scores_rede_neural = cross_val_score(rede_neural, X_credit, y_credit, cv=kfold)
  resultado_rede_neural.append(scores_rede_neural.mean())


Rodada 0
Rodada 1
Rodada 2
Rodada 3
Rodada 4
Rodada 5
Rodada 6
Rodada 7
Rodada 8
Rodada 9
Rodada 10
Rodada 11
Rodada 12
Rodada 13
Rodada 14
Rodada 15
Rodada 16
Rodada 17
Rodada 18
Rodada 19
Rodada 20
Rodada 21
Rodada 22
Rodada 23
Rodada 24
Rodada 25
Rodada 26
Rodada 27
Rodada 28
Rodada 29


In [101]:
resultados = pd.DataFrame({'Decision Tree': resultados_arvore,
                          'Random Forest': resultados_random_forest,
                           'KNN': resultados_knn,
                           'Regressão Logística': resultados_regressao_logistica,
                           'SVM': resultados_svm,
                           'Rede Neural': resultado_rede_neural
                           })

resultados

Unnamed: 0,Decision Tree,Random Forest,KNN,Regressão Logística,SVM,Rede Neural
0,0.9845,0.986,0.9805,0.946,0.9845,0.996
1,0.984,0.987,0.98,0.946,0.9855,0.996
2,0.9905,0.9895,0.9785,0.945,0.9855,0.996
3,0.986,0.986,0.9775,0.945,0.985,0.9965
4,0.988,0.9875,0.982,0.9455,0.9855,0.997
5,0.9865,0.9865,0.977,0.9435,0.9845,0.996
6,0.9875,0.989,0.9785,0.9455,0.986,0.996
7,0.9865,0.9875,0.98,0.9455,0.9855,0.996
8,0.985,0.9875,0.979,0.9445,0.985,0.9965
9,0.9845,0.9865,0.981,0.9475,0.9845,0.9965


In [102]:
resultados.describe()

Unnamed: 0,Decision Tree,Random Forest,KNN,Regressão Logística,SVM,Rede Neural
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.9861,0.986767,0.9792,0.945267,0.985133,0.9957
std,0.001798,0.001264,0.001648,0.000944,0.001245,0.000761
min,0.983,0.985,0.976,0.9435,0.982,0.994
25%,0.985,0.986,0.978125,0.9445,0.9845,0.995125
50%,0.986,0.9865,0.97925,0.9455,0.985,0.996
75%,0.986875,0.9875,0.980375,0.945875,0.9855,0.996
max,0.9905,0.9895,0.982,0.9475,0.988,0.997


In [103]:
resultados.var()

Unnamed: 0,0
Decision Tree,3.231034e-06
Random Forest,1.598851e-06
KNN,2.717241e-06
Regressão Logística,8.91954e-07
SVM,1.550575e-06
Rede Neural,5.793103e-07


In [104]:
pickle.dump(rede_neural, open(caminho+'rede_neural_finalizado.sav', 'wb'))

In [None]:
rede_neural_carregada = pickle.load(open(caminho+'rede_neural_finalizado.sav', 'rb'))