# Testing models precision

In [40]:
import pickle
with open('customer.pkl', 'rb') as f:
    X_customer_balanced, Y_customer_balanced = pickle.load(f)

# *SVM - 74,17%(Normal) 74,96%(Boosted)*

In [41]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

In [42]:
X_customer_balanced.shape, Y_customer_balanced.shape

((3918, 10), (3918,))

In [43]:
model = SVC(kernel='rbf', random_state=42, C=2.0)

In [44]:
model_boosted = SVC(kernel='rbf', random_state=42, C=10.0, tol=0.001, gamma=0.01)

In [45]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [46]:
scores = cross_val_score(model, X_customer_balanced, Y_customer_balanced, cv=kf, scoring='accuracy')

In [47]:
scores_boosted = cross_val_score(model_boosted, X_customer_balanced, Y_customer_balanced, cv=kf, scoring='accuracy')

In [48]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores}")
print(f"Score médio: {np.mean(scores)}")
print(f"Desvio padrão: {np.std(scores)}")

Scores de cada fold: [0.7372449  0.75       0.75255102 0.73469388 0.72704082 0.69897959
 0.73469388 0.78316327 0.72890026 0.76982097]
Score médio: 0.7417088574560259
Desvio padrão: 0.022401792602462955


In [49]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_boosted}")
print(f"Score médio: {np.mean(scores_boosted)}")
print(f"Desvio padrão: {np.std(scores_boosted)}")

Scores de cada fold: [0.73979592 0.75       0.76785714 0.75255102 0.71938776 0.70918367
 0.75       0.78316327 0.75191816 0.77237852]
Score médio: 0.749623545070202
Desvio padrão: 0.02153010197017537


In [50]:
parametros = {
    'C': [0.1, 1.0, 10.0],  # Regularização
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],  # Kernels mais comuns
    'gamma': ['scale', 'auto', 0.01, 0.001],  # Para 'rbf' kernel
    'tol': [0.001, 0.0001]  # Tolerância de otimização
}

In [51]:
grid_search = GridSearchCV(estimator=SVC(), param_grid=parametros)
grid_search.fit(X_customer_balanced, Y_customer_balanced)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'C': 10.0, 'gamma': 0.01, 'kernel': 'rbf', 'tol': 0.001}
0.7468110615893867


# ANN - Artificial Neural Network - 74,5%

In [52]:
from sklearn.neural_network import MLPClassifier

In [53]:
# model_ann_boosted = MLPClassifier(max_iter=1500, verbose=True, solver='adam', activation='relu', hidden_layer_sizes=(50, 50), batch_size=10)

In [54]:
model_ann = MLPClassifier(max_iter=1500, verbose=True, tol=0.000000, solver='adam', activation='relu', hidden_layer_sizes=(10,10))

In [55]:
kf_ann = KFold(n_splits=10, shuffle=True, random_state=42)

In [56]:
scores_ann = cross_val_score(model_ann, X_customer_balanced, Y_customer_balanced, cv=kf_ann, scoring='accuracy')

Iteration 1, loss = 0.76138989
Iteration 2, loss = 0.72384907
Iteration 3, loss = 0.69953451
Iteration 4, loss = 0.68094347
Iteration 5, loss = 0.66722715
Iteration 6, loss = 0.65546919
Iteration 7, loss = 0.64430400
Iteration 8, loss = 0.63371888
Iteration 9, loss = 0.62314497
Iteration 10, loss = 0.61348529
Iteration 11, loss = 0.60379113
Iteration 12, loss = 0.59475241
Iteration 13, loss = 0.58646577
Iteration 14, loss = 0.57899789
Iteration 15, loss = 0.57291362
Iteration 16, loss = 0.56742829
Iteration 17, loss = 0.56301630
Iteration 18, loss = 0.55968415
Iteration 19, loss = 0.55656447
Iteration 20, loss = 0.55409431
Iteration 21, loss = 0.55190973
Iteration 22, loss = 0.54995516
Iteration 23, loss = 0.54800247
Iteration 24, loss = 0.54624929
Iteration 25, loss = 0.54432915
Iteration 26, loss = 0.54255074
Iteration 27, loss = 0.54070145
Iteration 28, loss = 0.53922931
Iteration 29, loss = 0.53776417
Iteration 30, loss = 0.53611454
Iteration 31, loss = 0.53437457
Iteration 32, los

In [57]:
# scores_ann_boosted = cross_val_score(model_ann_boosted, X_customer_balanced, Y_customer_balanced, cv=kf_ann, scoring='accuracy')

In [58]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_ann}")
print(f"Score médio: {np.mean(scores_ann)}")
print(f"Desvio padrão: {np.std(scores_ann)}")

Scores de cada fold: [0.71938776 0.71683673 0.76530612 0.7372449  0.70663265 0.70408163
 0.73469388 0.78061224 0.73913043 0.77493606]
Score médio: 0.7378862414531029
Desvio padrão: 0.026204990025326523


In [59]:
# # Exibindo os resultados
# print(f"Scores de cada fold: {scores_ann_boosted}")
# print(f"Score médio: {np.mean(scores_ann_boosted)}")
# print(f"Desvio padrão: {np.std(scores_ann_boosted)}")

In [60]:
# parametros = {
#     'activation': ['relu', 'logistic', 'tanh'],  # Funções de ativação
#     'solver': ['adam', 'sgd'],  # Otimizadores
#     'batch_size': [32, 64],  # Tamanhos de lote comuns
#     'learning_rate': ['constant', 'adaptive'],  # Taxa de aprendizado
#     'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Tamanho das camadas ocultas
#     'alpha': [0.0001, 0.001]  # Parâmetro de regularização L2
# }

In [61]:
# grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=parametros)
# grid_search.fit(X_customer_balanced, Y_customer_balanced)
# melhores_parametros = grid_search.best_params_
# melhor_resultado = grid_search.best_score_
# print(melhores_parametros)
# print(melhor_resultado)

# Decision Tree - 66%(Normal) 72%(Boosted)

In [62]:
from sklearn.tree import DecisionTreeClassifier

In [63]:
# Normal
model_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)

In [64]:
# Boosted
model_tree_boosted = DecisionTreeClassifier(criterion='gini', splitter='random', min_samples_leaf=10, min_samples_split=2, random_state=42)

In [65]:
kf_tree = KFold(n_splits=10, shuffle=True, random_state=42)

In [66]:
scores_tree = cross_val_score(model_tree, X_customer_balanced, Y_customer_balanced, cv=kf_tree, scoring='accuracy')

In [67]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_tree}")
print(f"Score médio: {np.mean(scores_tree)}")
print(f"Desvio padrão: {np.std(scores_tree)}")

Scores de cada fold: [0.6505102  0.65306122 0.66071429 0.67091837 0.62755102 0.63010204
 0.68367347 0.69642857 0.69309463 0.68030691]
Score médio: 0.6646360718200324
Desvio padrão: 0.023205420177937815


In [68]:
scores_tree_boosted = cross_val_score(model_tree_boosted, X_customer_balanced, Y_customer_balanced, cv=kf_tree, scoring='accuracy')

In [69]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_tree_boosted}")
print(f"Score médio: {np.mean(scores_tree_boosted)}")
print(f"Desvio padrão: {np.std(scores_tree_boosted)}")

Scores de cada fold: [0.70918367 0.70153061 0.73979592 0.73469388 0.72193878 0.69897959
 0.70918367 0.7627551  0.73401535 0.73657289]
Score médio: 0.7248649459783915
Desvio padrão: 0.019211770606043753


In [70]:
# Params
parametros = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

In [71]:
# Finding Best Params
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parametros)
grid_search.fit(X_customer_balanced, Y_customer_balanced)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'entropy', 'min_samples_leaf': 10, 'min_samples_split': 5, 'splitter': 'random'}
0.7309846352334037


In [72]:
# # Calculando desvio padrão
# std_dev = np.std(scores_tree_boosted)

# # Plotando o desvio padrão
# plt.bar('Desvio Padrão', std_dev)
# plt.ylabel('Valor')
# plt.title('Desvio Padrão dos Scores')

# plt.show()

# KNN - 70% Boosted(72%)

In [73]:
from sklearn.neighbors import KNeighborsClassifier

In [74]:
model_knn = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p = 2)

In [75]:
model_knn_boosted = KNeighborsClassifier(n_neighbors=25, metric='minkowski', p = 1)

In [76]:
kf_knn = KFold(n_splits=10, shuffle=True, random_state=42)

In [77]:
scores_knn = cross_val_score(model_knn, X_customer_balanced, Y_customer_balanced, cv=kf_knn, scoring='accuracy')

In [78]:
scores_knn_boosted = cross_val_score(model_knn_boosted, X_customer_balanced, Y_customer_balanced, cv=kf_knn, scoring='accuracy')

In [79]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_knn}")
print(f"Score médio: {np.mean(scores_knn)}")
print(f"Desvio padrão: {np.std(scores_knn)}")

Scores de cada fold: [0.70153061 0.70408163 0.72704082 0.69132653 0.68877551 0.66071429
 0.68367347 0.75510204 0.70588235 0.71611253]
Score médio: 0.703423978286967
Desvio padrão: 0.02444292091659416


In [80]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_knn_boosted}")
print(f"Score médio: {np.mean(scores_knn_boosted)}")
print(f"Desvio padrão: {np.std(scores_knn_boosted)}")

Scores de cada fold: [0.74489796 0.72704082 0.75255102 0.74234694 0.68877551 0.70918367
 0.71428571 0.75765306 0.72122762 0.72890026]
Score médio: 0.7286862571115402
Desvio padrão: 0.020241083994342945


In [81]:
# Define the parameter grid
parametros = {
    'n_neighbors': range(1, 31),  # Test a range of neighbor values
    'metric': ['minkowski', 'euclidean', 'manhattan'],  # Different distance metrics
    'p': [1, 2]  # Minkowski parameter (1 for manhattan, 2 for euclidean)
}

In [82]:
# Finding Best Params
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parametros)
grid_search.fit(X_customer_balanced, Y_customer_balanced)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'metric': 'minkowski', 'n_neighbors': 25, 'p': 1}
0.7302258451273229


# Logistic Regression - 70%

In [83]:
from sklearn.linear_model import LogisticRegression

In [84]:
model_logistic = LogisticRegression(random_state=42, max_iter=150)

In [85]:
model_logistic_boosted = LogisticRegression(random_state=0, max_iter=100, C=1.0, solver='newton-cg', tol=0.0001)

In [86]:
kf_logistic = KFold(n_splits=10, shuffle=True, random_state=42)

In [87]:
scores_logistic = cross_val_score(model_logistic, X_customer_balanced, Y_customer_balanced, cv=kf_logistic, scoring='accuracy')

In [88]:
scores_logistic_boosted = cross_val_score(model_logistic_boosted, X_customer_balanced, Y_customer_balanced, cv=kf_logistic, scoring='accuracy')

In [89]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_logistic}")
print(f"Score médio: {np.mean(scores_logistic)}")
print(f"Desvio padrão: {np.std(scores_logistic)}")

Scores de cada fold: [0.67602041 0.70663265 0.71938776 0.71683673 0.66326531 0.67091837
 0.72959184 0.75255102 0.71355499 0.74680307]
Score médio: 0.7095562137898638
Desvio padrão: 0.029277793572685756


In [90]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_logistic_boosted}")
print(f"Score médio: {np.mean(scores_logistic_boosted)}")
print(f"Desvio padrão: {np.std(scores_logistic_boosted)}")

Scores de cada fold: [0.67602041 0.70663265 0.71938776 0.71683673 0.66326531 0.67091837
 0.72959184 0.75255102 0.71355499 0.74680307]
Score médio: 0.7095562137898638
Desvio padrão: 0.029277793572685756


In [91]:
# Define the parameter grid
parametros = {
    'C': [1.0, 1.5, 2.0],  # Regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # Solvers
    'max_iter': [100, 200, 300],
    'tol': [0.0001, 0.00001, 0.000001]# Number of iterations
}

In [92]:
# Finding Best Params
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parametros)
grid_search.fit(X_customer_balanced, Y_customer_balanced)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'C': 1.0, 'max_iter': 100, 'solver': 'newton-cg', 'tol': 0.0001}
0.7064893007011234


# Naive Bayes - 72%

In [93]:
from sklearn.naive_bayes import GaussianNB

In [94]:
model_naive = GaussianNB()
kf_naive = KFold(n_splits=10, shuffle=True, random_state=42)

In [95]:
scores_naive = cross_val_score(model_naive, X_customer_balanced, Y_customer_balanced, cv=kf_naive, scoring='accuracy')

In [96]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_naive}")
print(f"Score médio: {np.mean(scores_naive)}")
print(f"Desvio padrão: {np.std(scores_naive)}")

Scores de cada fold: [0.68877551 0.7244898  0.70918367 0.73214286 0.68877551 0.68112245
 0.72193878 0.77295918 0.72634271 0.75959079]
Score médio: 0.7205321258938358
Desvio padrão: 0.028564427059807447


# Random Forest - 75% - 76% Boosted

In [97]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
model_forest = RandomForestClassifier(n_estimators=80, criterion='entropy', random_state=42)

In [99]:
model_forest_boosted = RandomForestClassifier(n_estimators=150, criterion='gini', random_state=42, min_samples_leaf=10, min_samples_split=10)

In [100]:
kf_forest = KFold(n_splits=10, shuffle=True, random_state=42)

In [101]:
scores_forest_boosted = cross_val_score(model_forest_boosted, X_customer_balanced, Y_customer_balanced, cv=kf_forest, scoring='accuracy')

In [102]:
scores_forest = cross_val_score(model_forest, X_customer_balanced, Y_customer_balanced, cv=kf_forest, scoring='accuracy')

In [103]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_forest}")
print(f"Score médio: {np.mean(scores_forest)}")
print(f"Desvio padrão: {np.std(scores_forest)}")

Scores de cada fold: [0.75       0.75765306 0.7627551  0.75       0.7372449  0.72193878
 0.73469388 0.78316327 0.76470588 0.76982097]
Score médio: 0.7531975833811785
Desvio padrão: 0.01735613718267566


In [104]:
# Exibindo os resultados
print(f"Scores de cada fold: {scores_forest_boosted}")
print(f"Score médio: {np.mean(scores_forest_boosted)}")
print(f"Desvio padrão: {np.std(scores_forest_boosted)}")

Scores de cada fold: [0.74234694 0.78571429 0.75510204 0.76020408 0.7372449  0.70153061
 0.76020408 0.78571429 0.76726343 0.78772379]
Score médio: 0.7583048436766011
Desvio padrão: 0.025273003212994954


In [105]:
parametros = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 40, 100, 150],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

In [106]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parametros)
grid_search.fit(X_customer_balanced, Y_customer_balanced)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'gini', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 150}
0.7611049208955613
