In [57]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform, randint
import json

# Initial configuration: seed NumPy's legacy global random number generator
np.random.seed(seed=42)

In [66]:
# Compute and print the classification metrics
def calculate_metrics(y_true, y_pred, average='macro'):
    """Compute accuracy, precision, recall and F1, print them and return them.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    average : str
        Averaging mode forwarded to precision/recall/F1 (accuracy ignores it).

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall' and 'f1', in that order.
    """
    # Precision, recall and F1 share the same keyword arguments;
    # zero_division=0 is forwarded to each of them.
    shared_kwargs = {'average': average, 'zero_division': 0}

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    for key, scorer in (('precision', precision_score),
                        ('recall', recall_score),
                        ('f1', f1_score)):
        metrics[key] = scorer(y_true, y_pred, **shared_kwargs)

    # Report every metric with four decimal places
    for label, key in (('Acurácia', 'accuracy'),
                       ('Precision', 'precision'),
                       ('Recall', 'recall'),
                       ('F1', 'f1')):
        print(f"{label}: {metrics[key]:.4f}")

    return metrics

# Generic helper: randomized hyperparameter search plus a short report
def optimize_and_evaluate(model, param_dist, X, y, n_iter=20, cv_splits=5, scoring='f1_macro', model_name="Modelo"):
    """Run a RandomizedSearchCV over `param_dist` using stratified K-fold CV.

    Parameters
    ----------
    model : estimator
        Estimator handed to the search.
    param_dist : dict
        Parameter distributions / candidate values to sample from.
    X, y : array-like
        Data the search is fitted on.
    n_iter : int
        Number of sampled candidates.
    cv_splits : int
        Number of stratified folds (shuffled, random_state=42).
    scoring : str
        Scoring name given to the search and echoed in the report.
    model_name : str
        Label used only in the printed report.

    Returns
    -------
    tuple
        (search.best_estimator_, history), where history maps
        'mean_test_score', 'std_test_score' and 'params' to plain lists
        with one entry per sampled candidate.
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        # Stratified, shuffled folds with a fixed seed
        cv=StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42),
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )
    search.fit(X, y)

    # Keep only the per-candidate columns needed later, as plain lists
    results_table = pd.DataFrame(search.cv_results_)
    history = {
        column: results_table[column].tolist()
        for column in ('mean_test_score', 'std_test_score', 'params')
    }

    print(f"\n{model_name} - Melhores hiperparâmetros: {search.best_params_}")
    print(f"{model_name} - Melhor {scoring}: {search.best_score_:.4f}")

    return search.best_estimator_, history

# Plot how the cross-validated score varied over the sampled candidates
def plot_search_results(history_dict, metric='f1_macro'):
    """Plot the mean CV score, with a +/- one-std band, for each model.

    Parameters
    ----------
    history_dict : dict
        Maps a model name to a dict holding 'mean_test_score' and
        'std_test_score' sequences (lists or arrays) of equal length, one
        entry per candidate evaluated during the search.
    metric : str
        Text used in the title and the y-axis label; it does not select data.

    Returns
    -------
    None
        The figure is shown with plt.show().
    """
    plt.figure(figsize=(10, 6))
    for model_name, history in history_dict.items():
        # The scores may be plain Python lists, and `list - list` raises
        # TypeError; convert to arrays so the band is computed element-wise.
        mean_scores = np.asarray(history['mean_test_score'], dtype=float)
        std_scores = np.asarray(history['std_test_score'], dtype=float)
        iterations = np.arange(len(mean_scores))

        plt.plot(iterations, mean_scores, label=f'{model_name} (média)')
        plt.fill_between(iterations,
                         mean_scores - std_scores,
                         mean_scores + std_scores,
                         alpha=0.2)
    plt.title(f'Evolução do {metric} durante a busca de hiperparâmetros')
    plt.xlabel('Iteração')
    plt.ylabel(f'{metric} (média)')
    plt.legend()
    plt.grid(True)
    plt.show()

# Box plot of the collected metrics
def plot_metrics_boxplot(metrics_dict):
    """Draw one box per metric from a {model_name: {metric_name: value}} dict.

    The dict is turned into a DataFrame and transposed, so each row is a
    model and each column is a metric; seaborn then draws one box per column.
    """
    per_model_metrics = pd.DataFrame(metrics_dict).transpose()

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=per_model_metrics)

    # Titles and cosmetics
    plt.title('Comparação de Métricas entre Modelos')
    plt.ylabel('Valor da Métrica')
    plt.xticks(rotation=45)
    plt.grid(True)

    plt.show()

In [50]:
from sklearn.preprocessing import StandardScaler

# Load the pre-split train/test features and targets

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
# Targets are flattened to 1-D arrays as soon as they are read
y_train = np.ravel(pd.read_csv('data/y_train.csv'))
y_test = np.ravel(pd.read_csv('data/y_test.csv'))

# Standardise the features: the scaler is fitted on the training split only,
# then that fitted transform is applied to both splits
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [51]:
# Dictionaries that collect the results; the training cells key them by model name
history_dict, metrics_dict, best_models = {}, {}, {}

In [None]:
# Hyperparameter search spaces, one dict per model family.
# Each key is an estimator parameter name and each value holds the candidate
# settings to sample from.

# K-nearest neighbours
params_knn = dict(
    n_neighbors=range(2, 21),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Decision tree
params_decision_tree = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=list(range(3, 21, 2)),
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
)

# Random forest
params_random_forest = dict(
    n_estimators=range(1, 100, 5),
    criterion=['gini', 'entropy'],
    max_depth=list(range(3, 30, 3)),
)

# Support vector machine
params_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto', 0.1, 1],
    max_iter=[1000, 2500, 5000],
)

# Multi-layer perceptron
params_mlp = dict(
    hidden_layer_sizes=[(10,), (20,), (50,), (100,), (50, 50), (100, 50), (50, 25)],
    activation=['logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[0.0001, 0.001, 0.01, 0.1],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)

# XGBoost
params_xgboost = dict(
    n_estimators=[50, 100, 150, 200],
    learning_rate=[0.01, 0.1, 0.3, 0.5],
    max_depth=range(3, 11),
    subsample=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 1],
)

# LightGBM
params_lightgbm = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=range(3, 11),
    num_leaves=[15, 31, 63],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

In [None]:
# Decision Tree: hyperparameter search, metrics and JSON export of the history
model_name = 'Decision Tree'
# Pin the estimator's seed: with random_state=None the tree draws from NumPy's
# global RNG, so the search results would depend on that ambient state.
model = DecisionTreeClassifier(random_state=42)

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_decision_tree,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Store the search history and the best model under this model's name
history_dict[model_name] = history
best_models[model_name] = best_model

# NOTE(review): these metrics are computed on X_train, the same data the search
# was fitted on; X_test / y_test are not used here. Confirm this is intended.
y_pred = best_model.predict(X_train)

metrics = calculate_metrics(y_train, y_pred, average='macro')
metrics_dict[model_name] = metrics

# Serialise only this model's history. The file is named after the model, so
# dumping the whole history_dict would leak other models' entries into it
# whenever this cell is re-run after they have been trained.
result = json.dumps({model_name: history})

with open(f'historico_busca_hiperparametros_{model_name}.json', 'w', encoding='utf-8') as f:
    f.write(result)



Treinando Decision Tree ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Decision Tree - Melhores hiperparâmetros: {'splitter': 'best', 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_depth': 9, 'criterion': 'gini'}
Decision Tree - Melhor f1_macro: 0.4008
Acurácia: 0.5894
Precision: 0.5591
Recall: 0.4318
F1: 0.4227
{'Decision Tree': {'mean_test_score': [0.39337197584237715, 0.4007625883773893, 0.37617969157242703, 0.39448744031769395, 0.3920826078462809, 0.3919780550408446, 0.39332952952476286, 0.3116133088305769, 0.38618303342569443, 0.3850422792984215, 0.3970059886252619, 0.38388297383980385, 0.3902047236294478, 0.3847099866489866, 0.39360891109099855, 0.3909722722892146, 0.3899648163102377, 0.31628968922940526, 0.3951737868695605, 0.37888876829318524], 'std_test_score': [0.0019505280708409303, 0.003039586784805576, 0.01053005974557641, 0.005237597153986394, 0.008921573475589847, 0.008910765459401714, 0.0039904392430780726, 0.025236653362989983, 0.006310672317

In [None]:
# KNN: hyperparameter search and metrics
model = KNeighborsClassifier()
model_name = 'KNN'

print(f"\nTreinando {model_name} ...")

best_model, history = optimize_and_evaluate(
    model=model,
    param_dist=params_knn,
    X=X_train,
    y=y_train,
    n_iter=20,
    cv_splits=5,
    scoring='f1_macro',
    model_name=model_name
)

# Store the search history and the best model under this model's name
history_dict[model_name] = history
best_models[model_name] = best_model

# NOTE(review): these metrics are computed on X_train, the same data the search
# was fitted on; X_test / y_test are not used here. Confirm this is intended.
y_pred = best_model.predict(X_train)
# Class probabilities are computed when the estimator exposes predict_proba,
# but calculate_metrics() only takes (y_true, y_pred, average). Passing y_prob
# as a third positional argument collided with average= and raised TypeError.
y_prob = best_model.predict_proba(X_train) if hasattr(best_model, 'predict_proba') else None
metrics = calculate_metrics(y_train, y_pred, average='macro')
metrics_dict[model_name] = metrics

In [None]:

# Charts: score evolution during the searches and per-metric box plot
plot_search_results(history_dict, metric='F1-Score Macro')
plot_metrics_boxplot(metrics_dict)

# Print the final metrics of every model, four decimal places each
print("\nResultados Finais:")
for model_name in metrics_dict:
    metrics = metrics_dict[model_name]
    print(f"\n{model_name}:")
    for metric_name in metrics:
        value = metrics[metric_name]
        print(f"{metric_name}: {value:.4f}")