In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder

In [8]:
# Processed datasets are read from ../data/processed, resolved relative to the
# current working directory of the kernel.
PROCESSED_DIR = Path.cwd().parent.joinpath("data", "processed")

# Table used as the modelling dataset for every classifier in this notebook.
df_final_cluster = pd.read_csv(PROCESSED_DIR.joinpath("df_cluster_centroids.csv"))

In [None]:
# Feature columns, grouped by the kind of preprocessing each group receives in
# the ColumnTransformer definitions further down.
num_cols = [
    'IDADE',
    'DIAS_ENTRE_CONSULTA_DIAGNOSTICO',
    'DIAS_ENTRE_CONSULTA_TRATAMENTO',
    'DIAS_ENTRE_DIAGNOSTICO_TRATAMENTO',
]
bin_cols = ['SEM_RECIDIVA']
cat_cols = [
    'CATEGORIA_ATENDIMENTO',
    'DIAGNOSTICO_E_TRATAMENTO_ANTERIOR',
    'CODIGO_BASE_DIAGNOSTICO',
    'ESTADIO_CLINICO',
    'TNM_T',
    'TNM_N',
    'TNM_M',
]
ord_cols = ['PSA', 'GLEASON']

# Feature matrix (column order: numeric, ordinal, binary, categorical) and the
# target column to predict.
X = df_final_cluster[[*num_cols, *ord_cols, *bin_cols, *cat_cols]]
y = df_final_cluster["TRATAMENTO"]

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Per-group preprocessing for the KNN pipeline: scale numeric columns,
# integer-encode ordinal columns (unseen values become -1), pass the binary
# flag through untouched and one-hot encode the categorical columns (unseen
# categories are ignored at transform time).
# NOTE(review): OrdinalEncoder is given no explicit `categories`, so the
# encoding order is inferred from the data — confirm it matches the intended
# ordering of PSA / GLEASON.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("ord", OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"), ord_cols),
    ("bin", "passthrough", bin_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

In [None]:
# Stratified 80/20 hold-out split on the raw (string) treatment labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Keep the 20 preprocessed features with the highest mutual information with
# the target, then classify with a distance-weighted 9-nearest-neighbours vote.
feature_selector = SelectKBest(mutual_info_classif, k=20)
knnModel = KNeighborsClassifier(weights='distance', n_neighbors=9)

knn_pipeline = Pipeline([
    ('preprocessamento', preprocessor),
    ('feature_selection', feature_selector),
    ('classificador', knnModel),
])
knn_pipeline.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cell below.
y_pred = knn_pipeline.predict(X_test)

In [None]:
# Accuracy on the held-out split. Both lines report the same metric:
# Pipeline.score delegates to the classifier's default score (mean accuracy).
print("A acurácia do classificador é: ", accuracy_score(y_test, y_pred))
print("KNN - Acurácia no teste:", knn_pipeline.score(X_test, y_test))

# The KNN pipeline was fitted on the raw treatment labels, so the class names
# come from the fitted pipeline itself. (The previous code referenced
# `label_encoder`, which is only created later in the notebook and therefore
# raised NameError on a fresh "Restart & Run All".)
knn_labels = knn_pipeline.classes_

# Pass the same label list to both the matrix and the display so rows/columns
# and tick labels are guaranteed to be in the same order.
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, y_pred, labels=knn_labels),
    display_labels=knn_labels,
)
disp.plot()
plt.title('Matriz de Confusão KNN')
plt.show()

### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Encode the treatment labels as integer codes; `label_encoder.classes_` keeps
# the original names for use in reports and plots.
label_encoder = LabelEncoder().fit(y)
y_enc = label_encoder.transform(y)

In [None]:
# Stratified hold-out split on the encoded target. Note the 30% test share
# here, versus 20% in the KNN and XGBoost sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Random Forest pipeline: preprocessing -> univariate feature selection -> forest.
#
# The preprocessing step is required for two reasons:
#   * X still holds the raw columns (including the categorical ones that the
#     other models one-hot encode); SelectKBest / RandomForest need a numeric
#     matrix.
#   * the grid below searches k in {10, 20, 30}, which only makes sense on the
#     expanded one-hot feature space (the raw frame has 14 columns).
# `clone` gives this pipeline its own unfitted copy of the ColumnTransformer so
# fitting it never touches the transformer instance used by another pipeline.
pipeline = Pipeline([
    ("prep",  clone(preprocessor)),
    ("kbest", SelectKBest(score_func=mutual_info_classif)),
    ("rf",    RandomForestClassifier(random_state=42, n_jobs=-1)),
])

In [None]:
# 5-fold stratified CV with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive grid: 4 x 3 x 3 x 3 = 108 candidates, i.e. 540 fits with 5 folds.
# Keys use the "<step name>__<parameter>" convention of the pipeline above.
param_grid = dict(
    kbest__k=[10, 20, 30, "all"],
    rf__n_estimators=[200, 400, 600],
    rf__max_depth=[None, 10, 20],
    rf__min_samples_leaf=[1, 2, 4],
)

# NOTE(review): both the search and the forest request n_jobs=-1; confirm this
# does not oversubscribe the machine.
gsearch = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the full grid search on the training split.
gsearch.fit(X_train, y_train)

# Best pipeline found by the search, kept for the evaluation cell below.
best_model = gsearch.best_estimator_

# Cross-validated accuracy of the winning candidate and its hyper-parameters.
print("Melhor acurácia (CV):", gsearch.best_score_)
print("Melhores parâmetros :", gsearch.best_params_)

In [None]:
# Evaluate the tuned Random Forest pipeline on the held-out split.
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1, labelled with the original class names.
print("\nRelatório de classificação \n")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=label_encoder.classes_,
    )
)

# Confusion matrix computed directly from the fitted estimator.
cm = ConfusionMatrixDisplay.from_estimator(
    estimator=best_model,
    X=X_test,
    y=y_test,
    xticks_rotation=45,
    display_labels=label_encoder.classes_,
)
plt.title("Matriz de Confusão – RandomForest + SelectKBest")
plt.show()

### XGBoost

In [None]:
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
from xgboost import XGBClassifier
from xgboost import plot_importance

In [None]:
# Integer-encode the target for XGBoost; `le.classes_` retains the original
# label names for reporting.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [None]:
# Preprocessing for the XGBoost pipeline. Same column handling as the earlier
# `preprocessor`, except the one-hot encoder emits a dense array
# (sparse_output=False). This rebinds the name `preprocessor`.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("ord", OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"), ord_cols),
    ("bin", "passthrough", bin_cols),
    ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_cols),
])

In [None]:
# Stratified 80/20 hold-out split on the integer-encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [None]:
# Multi-class XGBoost classifier with fixed hyper-parameters.
# NOTE(review): device='cuda' is hard-coded, so this cell presumably needs a
# CUDA-enabled XGBoost build and a visible GPU — confirm, or switch to 'cpu'.
xgb_model = XGBClassifier(
    # learning task
    objective='multi:softprob',
    eval_metric='mlogloss',
    # boosting schedule and tree shape
    n_estimators=300,
    learning_rate=0.1,
    max_depth=10,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=1,
    # L1 / L2 regularisation
    reg_alpha=0.5,
    reg_lambda=3,
    # execution and reproducibility
    device='cuda',
    random_state=42,
)

In [None]:
# Preprocessing followed by the XGBoost classifier. This rebinds the name
# `pipeline`, which previously referred to the Random Forest pipeline.
pipeline = Pipeline([
    ('preprocessamento', preprocessor),
    ('classificador', xgb_model),
])

# Train on the training split, then predict the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [None]:
# Classification report for the XGBoost pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


# One row of three panels: confusion matrix, ROC curve, precision-recall curve.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
cm_ax, roc_ax, pr_ax = axes[0], axes[1], axes[2]

ConfusionMatrixDisplay.from_estimator(
    estimator=pipeline,
    X=X_test,
    y=y_test,
    ax=cm_ax,
    cmap="Blues",
    display_labels=le.classes_,
    xticks_rotation=45,
)
cm_ax.set_title("Matriz de Confusão – XGBoost")

if len(le.classes_) != 2:
    # ROC / PR curves are only drawn for a two-class target; otherwise hide the
    # two extra panels and leave an explanatory message in the middle one.
    roc_ax.axis("off")
    pr_ax.axis("off")
    roc_ax.text(
        0.5,
        0.5,
        "ROC / PR disponíveis\napenas para problema binário",
        ha="center",
        va="center",
    )
else:
    # Score of the second class (column index 1 of predict_proba).
    y_score = pipeline.predict_proba(X_test)[:, 1]

    RocCurveDisplay.from_predictions(y_test, y_score, ax=roc_ax)
    roc_ax.set_title("ROC – XGBoost")

    PrecisionRecallDisplay.from_predictions(y_test, y_score, ax=pr_ax)
    pr_ax.set_title("PR Curve – XGBoost")

plt.tight_layout()
plt.show()

In [None]:
# Plot the 20 most important features of the fitted XGBoost model.
importances_fig, ax = plt.subplots(figsize=(8, 10))

booster = pipeline.named_steps["classificador"].get_booster()

# The title advertises gain-based importance, but plot_importance defaults to
# importance_type="weight" (split counts). Request "gain" explicitly so the
# plot matches its title, and label the x axis accordingly.
# NOTE(review): features will presumably be shown with positional names
# (f0, f1, ...) because the model is fed the ColumnTransformer's array output —
# confirm, and map to real names if the plot is meant for readers.
plot_importance(
    booster,
    ax=ax,
    importance_type="gain",
    max_num_features=20,
    height=0.4,
    xlabel="Ganho médio",
)
ax.set_title("Top‐20 Features – ganho (XGBoost)")
plt.show()