# Modelo de clasificación

## Ajustes iniciales

In [None]:
# Librerías
# Estándar
import time
import os 
import warnings
# Set LOKY_MAX_CPU_COUNT to "8" in the process environment.
# NOTE(review): presumably read by loky/joblib to cap the number of CPUs it
# uses, and placed here so it is set before those libraries load — confirm.
os.environ["LOKY_MAX_CPU_COUNT"] = "8" 
# Create the output folder for results/plots; exist_ok=True makes this a
# no-op when the folder already exists.
os.makedirs("resultados/graficas", exist_ok=True)

# Visualizacion
import matplotlib.pyplot as plt
import seaborn as sns

# Procesamiento
import numpy as np
import pandas as pd

# Adicionales
import time
import lime
import joblib

In [None]:
# Librerias de modelado
# Escalado de numericas
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import(
    classification_report,
    roc_auc_score, roc_curve,
    confusion_matrix, ConfusionMatrixDisplay,
    recall_score, precision_recall_curve,
    average_precision_score
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


In [None]:
# Load the dengue case records.
df = pd.read_csv("data/dengue.csv")

# Categorical encoding.
# Insurer type (TIP_SS) and event (EVENTO) are one-hot encoded; drop_first=True
# drops the first level of each so no redundant dummy column is produced.
df = pd.get_dummies(df, columns=['TIP_SS', 'EVENTO'], drop_first=True)

# Country, department and municipality are frequency-encoded: each value is
# replaced by its relative frequency in the column.
# NOTE(review): these frequencies (and the scaler below) are fitted on the
# whole DataFrame, so rows later assigned to the test split influence the
# encoding — consider fitting on the training split only.
for col in ['PAIS_OCU', 'DPTO_OCU', 'MUN_OCU']:
    freq = df[col].value_counts(normalize=True).to_dict()
    encoded = df[col].map(freq)
    # Values missing from the mapping (e.g. NaN) become frequency 0.
    df[col] = encoded.fillna(0).astype('float64')

# Standardise the numeric age column (zero mean, unit variance).
scaler = StandardScaler()
df['EDAD_AJUSTADA'] = scaler.fit_transform(df[['EDAD_AJUSTADA']])

In [None]:
# Train/test split

# Target is the 'confirmados' column; every remaining column is a predictor.
y = df['confirmados']
X = df.drop(columns=['confirmados'])

# Stratified hold-out split: test_size=0.4 yields 60% train / 40% test.
# NOTE(review): no separate validation set is created by this call.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Sanity-check the shapes of the resulting partitions.
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train: {y_train.shape}, Test: {y_test.shape}")

## Funciones

In [None]:
def plot_precision_recall_curve(y_test, y_proba, model_name):
    """Plot the precision-recall curve of a binary classifier.

    Args:
        y_test: True binary labels.
        y_proba: Scores for the positive class (probabilities or decision
            values) aligned with ``y_test``.
        model_name: Label appended to the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # The curve function has to be *called*; unpacking the function object
    # itself raises TypeError.
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    ap_score = average_precision_score(y_test, y_proba)

    plt.figure(figsize=(6, 5))
    plt.plot(recall, precision, label=f'AP = {ap_score:.2f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve - {model_name}')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Plot the ROC curve
def plot_roc_curve(y_test, y_proba, model_name):
    """Draw the ROC curve for a binary classifier and show it.

    Args:
        y_test: True binary labels.
        y_proba: Scores for the positive class aligned with ``y_test``.
        model_name: Label appended to the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    roc_auc = roc_auc_score(y_test, y_proba)
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_proba)

    _, ax = plt.subplots(figsize=(6, 4))
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='blue',
        lw=2,
        label=f'ROC curve (AUC = {roc_auc:.4f})',
    )
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"Curva ROC - {model_name}")
    ax.legend(loc="lower right")
    ax.grid()
    plt.show()

In [None]:
# Confusion matrix
def matriz_confusion(y_true, y_pred, model_name="Modelo"):
    """Plot the confusion matrix of a set of predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted labels aligned with ``y_true``.
        model_name: Label appended to the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)

    # Draw on an explicitly created axes. Calling plt.figure() and then
    # disp.plot() without ``ax`` leaves an empty extra figure behind, because
    # the display creates its own figure and ignores the requested figsize.
    _, ax = plt.subplots(figsize=(6, 4))
    disp.plot(cmap="Blues", ax=ax)
    ax.set_title(f"Matriz de Confusión - {model_name}")
    ax.grid(False)
    plt.show()


In [None]:
# Fit and evaluate a hyper-parameter search (cross-validation + hold-out test)
def evaluar(modelo, X_train, y_train, X_test, y_test, model_name="Modelo", guardar=True):
    """Fit a hyper-parameter search object and report CV and test metrics.

    Args:
        modelo: Unfitted search object. After ``fit`` it must expose
            ``best_params_``, ``best_score_`` and ``predict``. Scores for
            AUC and the curves come from ``predict_proba`` or, failing that,
            ``decision_function``; if neither exists those outputs are skipped.
        X_train: Training features passed to ``modelo.fit``.
        y_train: Training labels passed to ``modelo.fit``.
        X_test: Hold-out features used for the test metrics.
        y_test: Hold-out labels used for the test metrics.
        model_name: Label used in printed messages and plot titles.
        guardar: When True, write the test classification report to
            ``class_report.txt`` in the current working directory.

    Returns:
        None. Results are printed, plotted and optionally saved.

    Note:
        ``best_score_`` is printed as recall, which is only accurate when the
        search was configured with ``scoring='recall'``.
    """
    print(f"Entrenando {model_name} ====")

    # Training (the search runs its own internal cross-validation).
    start_time = time.perf_counter()
    modelo.fit(X_train, y_train)
    elapsed_time = time.perf_counter() - start_time
    print(f"⏱️ Tiempo de entrenamiento: {elapsed_time:.2f} segundos")

    # Best configuration found during cross-validation (training data).
    print(f"\n Evaluación Cross-Validation de {model_name}")
    print("📍Mejores parámetros encontrados:")
    print(modelo.best_params_)

    recall_cv = modelo.best_score_
    print(f"\n Mejor Recall promedio en CV: {recall_cv:.4f}")

    # Hold-out evaluation; predictions are computed once and reused below.
    print(f"\n Evaluación en Test de {model_name}")
    y_pred = modelo.predict(X_test)

    # Continuous scores for AUC and the curves, when the model provides them.
    y_proba = None
    if hasattr(modelo, "predict_proba"):
        y_proba = modelo.predict_proba(X_test)[:, 1]
    elif hasattr(modelo, "decision_function"):
        y_proba = modelo.decision_function(X_test)

    # Classification report (printed, and saved when requested).
    print("\n Classification Report (Test):")
    report = classification_report(y_test, y_pred)
    print(report)
    if guardar:
        # NOTE: the file name is fixed, so each call overwrites the last report.
        with open("class_report.txt", "w", encoding="utf-8") as f:
            f.write(report)

    # Metrics
    recall_t = recall_score(y_test, y_pred)
    print(f" Recall (Test): {recall_t:.4f}")
    if y_proba is not None:
        auc_t = roc_auc_score(y_test, y_proba)
        print(f" ROC AUC (Test): {auc_t:.4f}")
        plot_roc_curve(y_test, y_proba, model_name)
        plot_precision_recall_curve(y_test, y_proba, model_name)
    else:
        print("El modelo no soporta calculo de AUC")

    # CV vs. test comparison: a large gap hints at over-fitting.
    print(f"- Recall en Train: {recall_cv:.5f}")
    print(f"- Recall en Test:  {recall_t:.5f}")
    print(f"- Diferencia Recall {abs(recall_cv-recall_t):.4f}")

    # Confusion matrix
    print("\n Matriz de Confusión")
    matriz_confusion(y_test, y_pred, model_name)
    print("\n --------------------------------")
    
    

In [None]:
# Random Forest
# SMOTE is placed inside the imblearn Pipeline so that it is applied during
# fit only (i.e. to the training folds of each CV split).
# NOTE(review): SMOTE and class_weight='balanced' both compensate for class
# imbalance; confirm that applying both is intended.
pipeline_rf = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(
        random_state=42,
        class_weight='balanced'
    ))
])
param_dist = {
    'clf__n_estimators': [100, 200, 300],   # Trees in the forest
    'clf__max_depth': [10, 20, None],       # Maximum tree depth (None = unlimited)
    'clf__min_samples_split': [2, 5, 10],   # Minimum samples required to split a node
}
# Randomized search over 10 of the 27 possible combinations, optimising recall
# with 3-fold cross-validation. random_state fixes which combinations are
# sampled so that runs are reproducible, like every other seeded component.
modelo_rf = RandomizedSearchCV(
    pipeline_rf,
    param_distributions=param_dist,
    n_iter=10,
    scoring='recall',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# Fit the Random Forest search and report its metrics.
# The evaluation helper defined in this file is `evaluar`; the previous name
# `evaluar_gridsearch` is not defined and raised NameError.
evaluar(modelo_rf,
        X_train, y_train,
        X_test, y_test,
        model_name="RandomForest")