In [33]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Configuração inicial
np.random.seed(42)

In [34]:
# Função para calcular métricas
def calculate_metrics(y_true, y_pred, average='macro'):
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average=average),
        'recall': recall_score(y_true, y_pred, average=average),
        'f1': f1_score(y_true, y_pred, average=average)
    }

    print(f"Acurácia: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1: {metrics['f1']:.4f}")
    
    return metrics

In [35]:

# Função genérica para busca de hiperparâmetros e avaliação
def optimize_and_evaluate(model, param_dist, X, y, n_iter=20, cv_splits=5, scoring='f1_macro', model_name="Modelo"):
    # Validação cruzada estratificada
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    
    # Configuração do RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    
    # Executar a busca
    random_search.fit(X, y)
    
    y_pred = random_search.predict(X)
    calculate_metrics(y, y_pred)
    
    # Armazenar histórico de desempenho
    cv_results = pd.DataFrame(random_search.cv_results_)
    history = {
        'best_params': random_search.best_params_,
        'mean_test_score': cv_results['mean_test_score'].tolist(),
        'std_test_score': cv_results['std_test_score'].tolist(),
        'mean_train_score': cv_results['mean_train_score'].tolist(),
        'std_train_score': cv_results['std_train_score'].tolist(),
        'acuracia: ': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred, average='macro'),
        'recall': recall_score(y, y_pred, average='macro'),
        'f1': f1_score(y, y_pred, average='macro')
    }
    
    # Melhor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_
    
    print(f"\n{model_name} - Melhores hiperparâmetros: {best_params}")
    print(f"{model_name} - Melhor modelo {best_model}")
    print(f"{model_name} - Melhor {scoring}: {best_score:.4f}")    
    return best_model, history

In [None]:
from sklearn.preprocessing import StandardScaler

# extraindo os dados separados de treino e teste

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

y_train = np.ravel(y_train) # transformando em array
y_test = np.ravel(y_test) # transformando em array

# realizando a normalização dos dados
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [37]:
params_knn = {
    'n_neighbors': range(2, 21),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

params_decision_tree = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': list(range(3, 21, 2)),
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

params_random_forest = {
    'n_estimators': range(1, 100, 5),
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(3, 30, 3))
}

params_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'max_iter': [1000, 2500, 5000]
}

params_mlp = {
    'hidden_layer_sizes': [(10,), (20,), (50,), (100,), (50, 50), (100, 50), (50, 25)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

params_xgboost = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': range(3, 11),
    'subsample': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 1]
}

params_lightgbm = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': range(3, 11),
    'num_leaves': [15, 31, 63],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

In [None]:
model = DecisionTreeClassifier()
model_name = 'Decision Tree'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_decision_tree,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# trasforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)



Treinando Decision Tree ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Acurácia: 0.5958
Precision: 0.5847
Recall: 0.4370
F1: 0.4290

Decision Tree - Melhores hiperparâmetros: {'splitter': 'best', 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_depth': 9, 'criterion': 'gini'}
Decision Tree - Melhor modelo DecisionTreeClassifier(max_depth=9, min_samples_leaf=4, min_samples_split=20)
Decision Tree - Melhor f1_macro: 0.3946


In [None]:
model = RandomForestClassifier()
model_name = 'Random Forest'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_random_forest,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# trasforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)



Treinando Random Forest ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Acurácia: 0.9974
Precision: 0.9983
Recall: 0.9937
F1: 0.9960

Random Forest - Melhores hiperparâmetros: {'n_estimators': 71, 'max_depth': 27, 'criterion': 'entropy'}
Random Forest - Melhor modelo RandomForestClassifier(criterion='entropy', max_depth=27, n_estimators=71)
Random Forest - Melhor f1_macro: 0.3874


In [None]:
model = XGBClassifier()
model_name = 'XGBoost'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_xgboost,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# trasforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)



Treinando XGBoost ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Acurácia: 0.6828
Precision: 0.7383
Recall: 0.5462
F1: 0.5686

XGBoost - Melhores hiperparâmetros: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.5, 'gamma': 0.1}
XGBoost - Melhor modelo XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.5, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
           

In [None]:
model = LGBMClassifier()
model_name = 'LightGBM'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_lightgbm,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# trasforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)



Treinando LightGBM ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 410
[LightGBM] [Info] Number of data points in the train set: 24265, number of used features: 42
[LightGBM] [Info] Start training from score -0.647197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 415
[LightGBM] [Info] Start training from score -1.012580
[LightGBM] [Info] Start training from score -2.178526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024872 seconds.
You can set `force_col_wise=true` to 

In [None]:
model = knn = KNeighborsClassifier()
model_name = 'KNN'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_knn,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# trasforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)


Treinando KNN ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Acurácia: 1.0000
Precision: 1.0000
Recall: 1.0000
F1: 1.0000

KNN - Melhores hiperparâmetros: {'weights': 'distance', 'n_neighbors': 4, 'metric': 'euclidean'}
KNN - Melhor modelo KNeighborsClassifier(metric='euclidean', n_neighbors=4, weights='distance')
KNN - Melhor f1_macro: 0.3756


In [None]:
model = SVC()
model_name = 'SVM'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_svm,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# trasforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)


Treinando SVM ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits




Acurácia: 0.4545
Precision: 0.3732
Recall: 0.3711
F1: 0.3597

SVM - Melhores hiperparâmetros: {'max_iter': 1000, 'kernel': 'sigmoid', 'gamma': 'scale', 'C': 100}
SVM - Melhor modelo SVC(C=100, kernel='sigmoid', max_iter=1000)
SVM - Melhor f1_macro: 0.3696


In [None]:
model = MLPClassifier()
model_name = 'MLP'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_mlp,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# trasforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)



Treinando MLP ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Acurácia: 0.6516
Precision: 0.6157
Recall: 0.5102
F1: 0.5191

MLP - Melhores hiperparâmetros: {'solver': 'lbfgs', 'learning_rate': 'invscaling', 'hidden_layer_sizes': (100,), 'alpha': 0.01, 'activation': 'tanh'}
MLP - Melhor modelo MLPClassifier(activation='tanh', alpha=0.01, learning_rate='invscaling',
              solver='lbfgs')
MLP - Melhor f1_macro: 0.3915


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [45]:
from sklearn.ensemble import StackingClassifier

# Parâmetros para Stacking de MLPs
params_stacking_mlp = {
    'final_estimator__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'final_estimator__activation': ['relu', 'tanh'],
    'final_estimator__alpha': [0.0001, 0.001, 0.01],
    'stack_method': ['auto', 'predict_proba']
}

# Parâmetros para Stacking genérico
params_stacking_generic = {
    'final_estimator__C': [0.1, 1, 10],
    'final_estimator__kernel': ['linear', 'rbf'],
    'stack_method': ['auto', 'predict_proba']
}

# Definição dos modelos base para Stacking de MLPs
base_estimators_mlp = [
    ('mlp1', MLPClassifier()),
    ('mlp2', MLPClassifier()),
    ('mlp3', MLPClassifier())]

# Definição dos modelos base para Stacking Heterogêneo
base_estimators_generic = [
    ('rf', RandomForestClassifier()),
    ('knn', KNeighborsClassifier()),
    ('dt', DecisionTreeClassifier())]

In [None]:

# Adicionar Stacking de MLPs
model = StackingClassifier(
    estimators=base_estimators_mlp,
    final_estimator=MLPClassifier(max_iter=1000),
    cv=5
)
model_name = 'Stacking MLP'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_stacking_mlp,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# Transforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)


Treinando Stacking MLP ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits




Acurácia: 0.6324
Precision: 0.7401
Recall: 0.4506
F1: 0.4270

Stacking MLP - Melhores hiperparâmetros: {'stack_method': 'predict_proba', 'final_estimator__hidden_layer_sizes': (50, 50), 'final_estimator__alpha': 0.01, 'final_estimator__activation': 'tanh'}
Stacking MLP - Melhor modelo StackingClassifier(cv=5,
                   estimators=[('mlp1', MLPClassifier()),
                               ('mlp2', MLPClassifier()),
                               ('mlp3', MLPClassifier())],
                   final_estimator=MLPClassifier(activation='tanh', alpha=0.01,
                                                 hidden_layer_sizes=(50, 50),
                                                 max_iter=1000),
                   stack_method='predict_proba')
Stacking MLP - Melhor f1_macro: 0.3630


In [None]:
# Adicionar Stacking Genérico
model = StackingClassifier(
    estimators=base_estimators_generic,
    final_estimator=SVC(probability=True),
    cv=5
)
model_name = 'Stacking Generic'
history_dict = {}

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_stacking_generic,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='accuracy',
    model_name=model_name
)

# Armazenar histórico e melhor modelo
history_dict[model_name] = history

# Transforma o resultado e cria um arquivo json
result = json.dumps(history_dict)

with open(f'{model_name} - busca_hiperparametros_.json', 'w') as f:
    f.write(result)


Treinando Stacking Generic ...
Fitting 5 folds for each of 12 candidates, totalling 60 fits




Acurácia: 0.8867
Precision: 0.5874
Recall: 0.6666
F1: 0.6217

Stacking Generic - Melhores hiperparâmetros: {'stack_method': 'auto', 'final_estimator__kernel': 'linear', 'final_estimator__C': 10}
Stacking Generic - Melhor modelo StackingClassifier(cv=5,
                   estimators=[('rf', RandomForestClassifier()),
                               ('knn', KNeighborsClassifier()),
                               ('dt', DecisionTreeClassifier())],
                   final_estimator=SVC(C=10, kernel='linear', probability=True))
Stacking Generic - Melhor accuracy: 0.5712


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
