Imports

In [1]:
import os
import sys
import numpy as np
import pandas as pd

# Visualização
import matplotlib.pyplot as plt
import seaborn as sns

# Project root: the parent of the current working directory. It is appended to
# sys.path (only if absent) so project modules such as `src.train`, imported
# below, can be found.
PROJECT_ROOT = os.path.abspath(os.pardir)
if not any(entry == PROJECT_ROOT for entry in sys.path):
    sys.path.append(PROJECT_ROOT)

# Sklearn / Imblearn
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Métricas
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    RocCurveDisplay,
    PrecisionRecallDisplay
)

# Funções do projeto
from src.train import (
    load_train_test,
    build_pipeline,
    cross_validate,
    summarize_cv_results,
    run_gridsearch,
    find_best_threshold,
    evaluate_on_test
)

# Seed shared by the stochastic steps in this notebook: both LogisticRegression
# models, SMOTE, and the train/validation split used for threshold tuning.
RANDOM_STATE = 42


Carregar Dados

In [2]:
# Folder holding the processed train/test CSV files, relative to the project root.
DATA_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
train_csv = os.path.join(DATA_DIR, "train_dataset.csv")
test_csv = os.path.join(DATA_DIR, "test_dataset.csv")

# load_train_test hands back features and target for each split; the target
# column is "Depression".
X_train, y_train, X_test, y_test = load_train_test(
    train_path=train_csv,
    test_path=test_csv,
    target_col="Depression",
)

# Quick sanity check on the size of each split.
print("Train:", X_train.shape, y_train.shape)
print("Test :", X_test.shape, y_test.shape)

# Class proportions of the training target (shown as the cell's output).
y_train.value_counts(normalize=True)


Train: (22257, 11) (22257,)
Test : (5565, 11) (5565,)


Depression
1    0.585703
0    0.414297
Name: proportion, dtype: float64

Baseline Simples

In [None]:
# Baseline: logistic regression with class_weight="balanced", built into a
# pipeline with use_scaler=True and use_smote=False.
baseline_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=RANDOM_STATE,
)
baseline_pipeline = build_pipeline(
    model=baseline_model, use_scaler=True, use_smote=False
)

# Cross-validate on the training data: 10 splits, decision threshold of 0.5.
baseline_cv_options = {"threshold": 0.5, "n_splits": 10}
metrics_baseline = cross_validate(
    baseline_pipeline, X_train, y_train, **baseline_cv_options
)

summarize_cv_results(metrics_baseline)



FOLD 1
Acurácia: 0.8293
Precisão: 0.8642
Recall:   0.8404
F1-score: 0.8521

FOLD 2
Acurácia: 0.8378
Precisão: 0.8805
Recall:   0.8367
F1-score: 0.8580

FOLD 3
Acurácia: 0.8549
Precisão: 0.8877
Recall:   0.8612
F1-score: 0.8743

FOLD 4
Acurácia: 0.8482
Precisão: 0.8833
Recall:   0.8535
F1-score: 0.8682

FOLD 5
Acurácia: 0.8455
Precisão: 0.8721
Recall:   0.8627
F1-score: 0.8674

FOLD 6
Acurácia: 0.8432
Precisão: 0.8835
Recall:   0.8436
F1-score: 0.8631

FOLD 7
Acurácia: 0.8419
Precisão: 0.8754
Recall:   0.8512
F1-score: 0.8631

FOLD 8
Acurácia: 0.8431
Precisão: 0.8816
Recall:   0.8457
F1-score: 0.8633

FOLD 9
Acurácia: 0.8409
Precisão: 0.8751
Recall:   0.8496
F1-score: 0.8621

FOLD 10
Acurácia: 0.8409
Precisão: 0.8836
Recall:   0.8388
F1-score: 0.8606

MÉDIAS E DESVIOS-PADRÃO
Accuracy  : 0.8426 | DP: 0.0063
Precision : 0.8787 | DP: 0.0066
Recall    : 0.8483 | DP: 0.0085
F1        : 0.8632 | DP: 0.0057


Modelo com SMOTE

In [5]:
# Oversampler: a float sampling_strategy is the desired minority/majority ratio
# after resampling (0.8 here).
smote = SMOTE(random_state=RANDOM_STATE, sampling_strategy=0.8)

# Plain logistic regression (no class_weight); SMOTE is enabled in the
# pipeline below instead.
model = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)

pipeline_smote = build_pipeline(
    model=model, use_scaler=True, use_smote=True, smote=smote
)

# Same protocol as the baseline: 10 splits, decision threshold of 0.5.
smote_cv_options = {"threshold": 0.5, "n_splits": 10}
metrics_smote = cross_validate(
    pipeline_smote, X_train, y_train, **smote_cv_options
)

summarize_cv_results(metrics_smote)



FOLD 1


Acurácia: 0.8333
Precisão: 0.8520
Recall:   0.8657
F1-score: 0.8588

FOLD 2
Acurácia: 0.8459
Precisão: 0.8693
Recall:   0.8673
F1-score: 0.8683

FOLD 3
Acurácia: 0.8549
Precisão: 0.8707
Recall:   0.8834
F1-score: 0.8770

FOLD 4
Acurácia: 0.8477
Precisão: 0.8653
Recall:   0.8765
F1-score: 0.8709

FOLD 5
Acurácia: 0.8504
Precisão: 0.8610
Recall:   0.8880
F1-score: 0.8743

FOLD 6
Acurácia: 0.8441
Precisão: 0.8639
Recall:   0.8712
F1-score: 0.8675

FOLD 7
Acurácia: 0.8473
Precisão: 0.8608
Recall:   0.8819
F1-score: 0.8712

FOLD 8
Acurácia: 0.8467
Precisão: 0.8689
Recall:   0.8695
F1-score: 0.8692

FOLD 9
Acurácia: 0.8463
Precisão: 0.8599
Recall:   0.8810
F1-score: 0.8704

FOLD 10
Acurácia: 0.8409
Precisão: 0.8670
Recall:   0.8603
F1-score: 0.8636

MÉDIAS E DESVIOS-PADRÃO
Accuracy  : 0.8458 | DP: 0.0054
Precision : 0.8639 | DP: 0.0054
Recall    : 0.8745 | DP: 0.0085
F1        : 0.8691 | DP: 0.0049


Melhor Configuração

In [7]:
# Search space: four values of C with an L2 penalty. The "model__" prefix
# presumably addresses the pipeline step named "model" -- confirm against
# build_pipeline.
param_grid = dict(
    model__C=[0.01, 0.1, 1, 10],
    model__penalty=["l2"],
)

# Grid search over the SMOTE pipeline on the training data, scored by F1 with
# 5 splits.
grid_results = run_gridsearch(
    pipeline=pipeline_smote, param_grid=param_grid,
    X=X_train, y=y_train,
    scoring="f1", n_splits=5,
)

# Winning hyper-parameters (shown as the cell's output).
grid_results["best_params"]


Fitting 5 folds for each of 4 candidates, totalling 20 fits


{'model__C': 0.01, 'model__penalty': 'l2'}

Threshold ótimo

In [8]:
# Hold out a stratified 20% of the training data for threshold tuning, so the
# test set plays no part in choosing the decision threshold.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=RANDOM_STATE
)

# Fit an independent copy of the winning configuration. Calling .fit() on
# grid_results["best_estimator"] itself would refit that stored object in
# place, silently replacing it with a model trained on the X_tr subset.
# clone() copies the hyper-parameters only, leaving grid_results untouched.
best_model = clone(grid_results["best_estimator"])
best_model.fit(X_tr, y_tr)

# NOTE(review): best_model is trained on X_tr only (80% of the training data),
# and it is this model that the test-set evaluation uses. Consider refitting
# on the full X_train once the threshold is fixed.
best_threshold, best_f1 = find_best_threshold(
    best_model,
    X_val,
    y_val
)

# Chosen threshold and the F1 reported by find_best_threshold on the validation split.
best_threshold, best_f1


(np.float64(0.36000000000000004), 0.8759255914755283)

Avaliação FINAL no teste

In [9]:
# Final evaluation on the held-out test set, applying the tuned decision
# threshold rather than the 0.5 used during cross-validation.
evaluate_on_test(best_model, X_test, y_test, threshold=best_threshold)



AVALIAÇÃO FINAL NO TESTE
Acurácia:  0.8451
Precisão:  0.8204
Recall:    0.9417
F1-score:  0.8769
