# 03 — Modelagem e comparação de modelos
Objetivo:

- Treinar e comparar múltiplos modelos (SVM, KNN, MLP)
- Utilizar validação cruzada estratificada
- Ajustar hiperparâmetros via GridSearch
- Comparar métricas (accuracy, precision, recall, F1)
- Selecionar modelo candidato para Deploy

In [7]:
import numpy as np
import pandas as pd
import sys
from pathlib import Path
from imblearn.over_sampling import SMOTE
sys.path.append(str(Path("..").resolve()))
from src import train 
from src.models import save_model
from src.config import MODELS_CONFIG

from src.visual import plot_confusion_matrix, plot_roc_curve

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Single seed reused by every stochastic component in this notebook.
RANDOM_STATE = 42

# Seed NumPy's legacy global generator.
np.random.seed(RANDOM_STATE)

# Module-level oversampler; the training loops in this notebook rebind
# `smote` for each model they process.
smote = SMOTE(random_state=RANDOM_STATE)

## Dataset de Treino e Pré-processamento

In [8]:
# Location of the processed training set and the name of the label column.
data_path = "../data/processed/train_dataset.csv"
TARGET_COL = "Depression"

# The loader's result is unpacked as (full frame, features X, target y).
df, X, y = train.load_data(data_path, target_col=TARGET_COL)

# A bare expression is only rendered when it is the last line of a cell, so the
# preview has to go through display() to actually appear in the output.
display(df.head())

print("Shape de X:", X.shape)
print("Distribuição da variável alvo:")
# Last expression of the cell: relative class frequencies of the target.
y.value_counts(normalize=True)

Shape de X: (22285, 11)
Distribuição da variável alvo:


Depression
1    0.585685
0    0.414315
Name: proportion, dtype: float64

## Treinamento dos modelos

In [3]:
# Baseline comparison: cross-validate every entry of MODELS_CONFIG and keep the
# mean of each reported metric in `results` (one dict per model).
results = []

for model_key, cfg in MODELS_CONFIG.items():
    print("=" * 60)
    print(f"Modelo: {cfg['display_name']}")

    # Only configurations flagged with use_smote receive an oversampler.
    if cfg["use_smote"]:
        smote = SMOTE(random_state=RANDOM_STATE)
    else:
        smote = None

    pipeline = train.build_pipeline(
        model=cfg["model"],
        use_scaler=cfg["use_scaler"],
        use_smote=cfg["use_smote"],
        smote=smote,
    )

    metrics = train.cross_validate(
        pipeline=pipeline,
        X=X,
        y=y,
        threshold=0.5,
        n_splits=10,
        random_state=RANDOM_STATE,
        verbose=True,
    )

    train.summarize_cv_results(metrics)

    # Key order matters for the column order of the DataFrame built later:
    # model_key, model, accuracy_mean, precision_mean, recall_mean, f1_mean.
    results.append({
        "model_key": model_key,
        "model": cfg["display_name"],
        **{
            f"{name}_mean": np.mean(metrics[name])
            for name in ("accuracy", "precision", "recall", "f1")
        },
    })

Modelo: K-Nearest Neighbors

FOLD 1


Acurácia: 0.8095
Precisão: 0.8474
Recall:   0.8227
F1-score: 0.8349

FOLD 2
Acurácia: 0.8342
Precisão: 0.8725
Recall:   0.8397
F1-score: 0.8558

FOLD 3
Acurácia: 0.8351
Precisão: 0.8663
Recall:   0.8497
F1-score: 0.8579

FOLD 4
Acurácia: 0.8360
Precisão: 0.8741
Recall:   0.8413
F1-score: 0.8574

FOLD 5
Acurácia: 0.8293
Precisão: 0.8632
Recall:   0.8420
F1-score: 0.8525

FOLD 6
Acurácia: 0.8302
Precisão: 0.8651
Recall:   0.8413
F1-score: 0.8530

FOLD 7
Acurácia: 0.8329
Precisão: 0.8641
Recall:   0.8482
F1-score: 0.8560

FOLD 8
Acurácia: 0.8189
Precisão: 0.8594
Recall:   0.8258
F1-score: 0.8423

FOLD 9
Acurácia: 0.8279
Precisão: 0.8622
Recall:   0.8404
F1-score: 0.8511

FOLD 10
Acurácia: 0.8252
Precisão: 0.8703
Recall:   0.8243
F1-score: 0.8467

MÉDIAS E DESVIOS-PADRÃO
Accuracy  : 0.8279 | DP: 0.0079
Precision : 0.8645 | DP: 0.0072
Recall    : 0.8375 | DP: 0.0092
F1        : 0.8508 | DP: 0.0071
Modelo: Support Vector Machine

FOLD 1
Acurácia: 0.8230
Precisão: 0.8582
Recall:   0.8358
F1-s

## Resultados Agrupados

In [4]:
# Rank the collected per-model means by mean F1, best model first.
results_df = pd.DataFrame(results).sort_values("f1_mean", ascending=False)

results_df

Unnamed: 0,model_key,model,accuracy_mean,precision_mean,recall_mean,f1_mean
1,svm,Support Vector Machine,0.840994,0.871597,0.854402,0.862886
2,mlp,MLP (Neural Net),0.837849,0.871745,0.848418,0.859699
0,knn,K-Nearest Neighbors,0.827919,0.864473,0.837524,0.850762


## Melhores Modelos para busca

In [5]:
# Keys of the three highest-ranked rows of results_df, in ranking order.
TOP_MODELS = results_df["model_key"].head(3).tolist()
TOP_MODELS

['svm', 'mlp', 'knn']

In [None]:
# Hyperparameter search for every shortlisted model.
#
# The metric handed to the search is kept in one place so the printed label
# cannot drift from what is actually optimised (the score used to be printed
# as "Best F1" while the search was scored with "recall").
# NOTE(review): model ranking and threshold selection elsewhere in this
# notebook use F1 — confirm that recall is the intended search objective.
GRID_SCORING = "recall"

grid_results = {}

for model_key in TOP_MODELS:
    cfg = MODELS_CONFIG[model_key]
    print("="*60)
    print(f"GridSearch para: {cfg['display_name']}")

    # Only configurations flagged with use_smote receive an oversampler.
    smote = SMOTE(random_state=RANDOM_STATE) if cfg["use_smote"] else None

    pipeline = train.build_pipeline(
        model=cfg["model"],
        use_scaler=cfg["use_scaler"],
        use_smote=cfg["use_smote"],
        smote=smote
    )

    grid = train.run_gridsearch(
        pipeline=pipeline,
        param_grid=cfg["param_grid"],
        X=X,
        y=y,
        scoring=GRID_SCORING,
        n_splits=5
    )

    # Keep the whole search result; later cells read "best_estimator" from it.
    grid_results[model_key] = grid

    print("Best params:", grid["best_params"])
    # Presumably best_score is expressed in the metric passed as `scoring`.
    print(f"Best {GRID_SCORING}:", grid["best_score"])

GridSearch para: Support Vector Machine
Fitting 5 folds for each of 12 candidates, totalling 60 fits


KeyboardInterrupt: 

## Melhor Limiar

In [None]:
# Take one tuned estimator from the grid-search results and search for the
# decision threshold; the helper returns the threshold and the F1 reached there.
# NOTE(review): TOP_MODELS is ordered by mean F1 (best first), so index 1 is the
# second-ranked model even though the variable is called "best" — confirm
# whether index 0 was intended.
best_model_key = TOP_MODELS[1]
best_pipeline = grid_results[best_model_key]["best_estimator"]

# NOTE(review): the threshold is searched on the same X / y that were passed to
# the grid search, so the reported F1 is presumably optimistic — consider
# tuning it on a held-out validation split instead.
best_threshold, best_f1 = train.find_best_threshold(
    model=best_pipeline,
    X_val=X,
    y_val=y
)

print("Melhor threshold:", best_threshold)
print("F1 nesse threshold:", best_f1)

Melhor threshold: 0.33
F1 nesse threshold: 0.8740481655554341


## Dicionário das Novas Métricas

In [9]:

# Revised model configurations used for the retraining step.
# Each entry carries: a display name, the estimator instance, whether the
# pipeline should scale / oversample, and a parameter grid whose keys use the
# "model__" prefix.
# Every estimator that accepts a seed takes it from RANDOM_STATE so that a
# fresh "Restart & Run All" reproduces the same numbers.
MODELS = {
    "knn": {
        "display_name": "K-Nearest Neighbors",
        "model": KNeighborsClassifier(
            n_neighbors=15,
            weights="distance",
            metric="minkowski",
            p=2,
        ),
        "use_scaler": True,
        "use_smote": True,
        "param_grid": {
            "model__n_neighbors": [11, 15, 19],         # corrected
            "model__weights": ["uniform", "distance"],  # corrected
            "model__metric": ["minkowski"],             # corrected
            "model__p": [1, 2],                         # corrected
        },
    },

    "mlp": {
        "display_name": "MLP (Neural Net)",
        "model": MLPClassifier(
            hidden_layer_sizes=(128, 64, 32),
            activation='logistic',
            solver='adam',
            max_iter=800,
            alpha=0.01,
            learning_rate_init=0.001,
            early_stopping=True,
            n_iter_no_change=20,
            # Same value as the former literal 42, now tied to the shared seed.
            random_state=RANDOM_STATE,
        ),
        "use_scaler": True,
        "use_smote": True,
        "param_grid": {
            "model__hidden_layer_sizes": [(128, 64, 32), (64, 32), (32, 16)],  # corrected
            "model__activation": ["relu", "logistic", "tanh"],                # corrected
            "model__alpha": [0.0001, 0.01],                                   # corrected
            "model__learning_rate_init": [0.001, 0.0001],                     # corrected
            "model__solver": ["adam"],                                        # corrected
        },
    },

    "svm": {
        "display_name": "Support Vector Machine",
        # probability=True makes SVC shuffle the data internally to calibrate
        # probabilities; without a seed the predicted probabilities (and any
        # threshold-based metric) can change from run to run.
        # NOTE(review): the estimator uses kernel='poly', which is not among
        # the kernels listed in param_grid — confirm this is intentional.
        "model": SVC(
            kernel='poly',
            probability=True,
            C=0.001,
            random_state=RANDOM_STATE,
        ),
        "use_scaler": True,
        "use_smote": True,
        "param_grid": {
            "model__C": [0.001, 1],
            "model__kernel": ["rbf", "linear"],
        },
    },
}


## Novo treino + Salvar Modelos

In [None]:
# Re-run the 10-fold evaluation with the MODELS settings, then call
# train_final_model on the full X / y for each entry and persist the result
# together with its cross-validation means.
results = []
trained_models = {}

for model_key, cfg in MODELS.items():
    print("=" * 60)
    print(f"Modelo: {cfg['display_name']}")

    # Only configurations flagged with use_smote receive an oversampler.
    if cfg["use_smote"]:
        smote = SMOTE(random_state=RANDOM_STATE)
    else:
        smote = None

    pipeline = train.build_pipeline(
        model=cfg["model"],
        use_scaler=cfg["use_scaler"],
        use_smote=cfg["use_smote"],
        smote=smote,
    )

    metrics = train.cross_validate(
        pipeline=pipeline,
        X=X,
        y=y,
        threshold=0.5,
        n_splits=10,
        random_state=RANDOM_STATE,
        verbose=True,
    )

    train.summarize_cv_results(metrics)

    # Average each metric once; the same values feed both the comparison row
    # and the metadata stored next to the saved model.
    cv_means = {
        name: np.mean(metrics[name])
        for name in ("accuracy", "precision", "recall", "f1")
    }

    results.append({
        "model_key": model_key,
        "model": cfg["display_name"],
        **{f"{name}_mean": value for name, value in cv_means.items()},
    })

    final_model = train.train_final_model(
        model=cfg["model"],
        X=X,
        y=y,
        use_scaler=cfg["use_scaler"],
        use_smote=cfg["use_smote"],
        smote=smote,
    )

    trained_models[model_key] = {
        "model": final_model,
        "metrics": metrics,
    }

    # overwrite=True is passed, so an earlier "<model_key>_1.joblib" is
    # presumably replaced — verify against save_model.
    save_model(
        model=final_model,
        model_name=f"{model_key}_1.joblib",
        metadata={
            "cv_accuracy": cv_means["accuracy"],
            "cv_f1": cv_means["f1"],
            "cv_precision": cv_means["precision"],
            "cv_recall": cv_means["recall"],
        },
        overwrite=True,
    )

    

Modelo: K-Nearest Neighbors

FOLD 1
Acurácia: 0.8201
Precisão: 0.8610
Recall:   0.8261
F1-score: 0.8432

FOLD 2
Acurácia: 0.8165
Precisão: 0.8567
Recall:   0.8245
F1-score: 0.8403

FOLD 3
Acurácia: 0.8394
Precisão: 0.8714
Recall:   0.8513
F1-score: 0.8612

FOLD 4
Acurácia: 0.8259
Precisão: 0.8620
Recall:   0.8369
F1-score: 0.8493

FOLD 5
Acurácia: 0.8255
Precisão: 0.8613
Recall:   0.8369
F1-score: 0.8489

FOLD 6
Acurácia: 0.8128
Precisão: 0.8474
Recall:   0.8299
F1-score: 0.8386

FOLD 7
Acurácia: 0.8353
Precisão: 0.8788
Recall:   0.8337
F1-score: 0.8557

FOLD 8
Acurácia: 0.8182
Precisão: 0.8505
Recall:   0.8368
F1-score: 0.8436

FOLD 9
Acurácia: 0.8348
Precisão: 0.8657
Recall:   0.8498
F1-score: 0.8577

FOLD 10
Acurácia: 0.8250
Precisão: 0.8663
Recall:   0.8291
F1-score: 0.8473

MÉDIAS E DESVIOS-PADRÃO
Accuracy  : 0.8254 | DP: 0.0084
Precision : 0.8621 | DP: 0.0088
Recall    : 0.8355 | DP: 0.0086
F1        : 0.8486 | DP: 0.0072
Salvando o Modelo: knn_1.joblib 
knn_1.joblib Salvo Com Su