# Modelo de clasificación

## Ajustes iniciales

In [1]:
# Librerias
# Estandar
import time
import os 
import warnings
os.environ["LOKY_MAX_CPU_COUNT"] = "8" 

# Visualizacion
import matplotlib.pyplot as plt
import seaborn as sns

# Procesamiento
import numpy as np
import pandas as pd

# Adicionales
import time
import joblib

In [2]:
# Librerías de modelado
# Preprocesamiento, partición de datos, modelos, métricas, balanceo y explicabilidad
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import(
    classification_report,
    roc_auc_score, roc_curve,
    confusion_matrix, ConfusionMatrixDisplay,
    recall_score, precision_recall_curve,
    average_precision_score
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from lime.lime_tabular import LimeTabularExplainer

In [15]:
# Load the data
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run
# on another machine as-is; consider a relative or configurable path.
df = pd.read_csv(r"C:\Users\Hp\VISUAL\entrega_3\data\dengue.csv") 

# Data-type adjustments
# Insurer type and event: one-hot encoding (the first level of each column is dropped)
df = pd.get_dummies(df, columns=['TIP_SS', 'EVENTO'], drop_first=True)

# Country, department and municipality: frequency encoding.
# Each category is replaced by its relative frequency over the whole dataset;
# values that do not map (presumably only missing values, which value_counts
# skips by default) become 0.
for col in ['PAIS_OCU', 'DPTO_OCU', 'MUN_OCU']:
    freq = df[col].value_counts(normalize=True).to_dict()
    df[col] = df[col].map(freq).fillna(0).astype('float64')

scaler = StandardScaler()
# Standardise the numeric variable.
# NOTE(review): the frequency tables above and this scaler are fitted on the full
# dataset, before the train/test split performed later in the notebook, so test
# rows influence the training features — confirm whether this leakage is acceptable.
df['EDAD_AJUSTADA'] = scaler.fit_transform(df[['EDAD_AJUSTADA']])

In [17]:
# Dataset split

# Separate the target column from the predictors
y = df['CONFIRMADOS']
X = df.drop(columns=['CONFIRMADOS'])

# Stratified hold-out split: 60% train / 40% test.
# No separate validation set is produced by this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# Sanity check of the resulting shapes
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train: {y_train.shape}, Test: {y_test.shape}")

Train: (568368, 20), Test: (378912, 20)
Train: (568368,), Test: (378912,)


## Funciones

In [18]:
def plot_precision_recall_curve(y_test, y_proba, model_name, guardar=False):
    """Draw the precision-recall curve and show it.

    Args:
        y_test: True labels, forwarded to scikit-learn's
            ``precision_recall_curve`` and ``average_precision_score``.
        y_proba: Scores for the positive class, forwarded to the same functions.
        model_name: Label appended to the plot title.
        guardar: When true, the figure is also written to
            ``resultados/graficas/press_recall.png`` (the directory must exist).
    """
    avg_precision = average_precision_score(y_test, y_proba)
    precisions, recalls, _thresholds = precision_recall_curve(y_test, y_proba)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(recalls, precisions, label=f'AP = {avg_precision:.2f}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'Precision-Recall Curve - {model_name}')
    ax.legend()
    ax.grid(True)

    if guardar:
        fig.savefig("resultados/graficas/press_recall.png", bbox_inches='tight')
    plt.show()


In [19]:
# Helper that plots the ROC curve
def plot_roc_curve(y_test, y_proba, model_name, guardar=False):
    """Draw the ROC curve with its AUC in the legend and show it.

    Args:
        y_test: True labels, forwarded to scikit-learn's ``roc_curve`` and
            ``roc_auc_score``.
        y_proba: Scores for the positive class, forwarded to the same functions.
        model_name: Label appended to the plot title.
        guardar: When true, the figure is also written to
            ``resultados/graficas/ROC_AUC.png`` (the directory must exist).
    """
    area = roc_auc_score(y_test, y_proba)
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_proba)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='blue',
        lw=2,
        label=f'ROC curve (AUC = {area:.4f})',
    )
    # Diagonal reference line from (0, 0) to (1, 1)
    ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"Curva ROC - {model_name}")
    ax.legend(loc="lower right")
    ax.grid()

    if guardar:
        fig.savefig("resultados/graficas/ROC_AUC.png", bbox_inches='tight')
    plt.show()

In [20]:
# Confusion matrix
def matriz_confusion(y_true, y_pred, model_name="Modelo", guardar=False):
    """Plot the confusion matrix of a set of predictions and show it.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        model_name: Label appended to the plot title.
        guardar: When true, the figure is also written to
            ``resultados/graficas/matriz_confusion.png`` (the directory must
            exist).
    """
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)

    # ConfusionMatrixDisplay.plot() creates its own figure unless it is given an
    # axes. Creating one with plt.figure() beforehand therefore left a stray
    # empty figure and the requested figsize was never applied. Passing `ax`
    # draws the matrix on the figure we size, title and save.
    fig, ax = plt.subplots(figsize=(6, 4))
    disp.plot(cmap="Blues", ax=ax)
    ax.set_title(f"Matriz de Confusión - {model_name}")
    ax.grid(False)

    if guardar:
        ruta = "resultados/graficas/matriz_confusion.png"
        fig.savefig(ruta, bbox_inches='tight')
    plt.show()


In [21]:
def plot_feature_importance(model, X_train, model_name="Modelo", guardar=False):
    """Plot the feature importances of a fitted model as a horizontal bar chart.

    Args:
        model: Either a fitted pipeline exposing ``named_steps`` with a ``'clf'``
            step, or a fitted estimator that exposes ``feature_importances_``
            directly.
        X_train: DataFrame whose columns name the features; its column count
            must match the number of importances.
        model_name: Label appended to the plot title.
        guardar: When true, the figure is also written to
            ``resultados/graficas/feat_importance.png`` (the directory must
            exist).
    """
    # Pipelines keep the classifier under the 'clf' step; a bare estimator is
    # used as-is so the helper also works outside a pipeline.
    steps = getattr(model, 'named_steps', None)
    estimator = model if steps is None else steps['clf']

    feat_imp_df = pd.DataFrame({
        'feature': X_train.columns,
        'importance': estimator.feature_importances_,
    })
    # Most important feature first, so it is drawn at the top of the chart
    feat_imp_df = feat_imp_df.sort_values(by='importance', ascending=False)

    plt.figure(figsize=(6, 4))
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue`; adjust once the minimum seaborn version is confirmed.
    sns.barplot(data=feat_imp_df, x='importance', y='feature', palette='viridis')
    plt.title(f'Importancia de Variables - {model_name}')
    plt.tight_layout()
    if guardar:
        ruta = "resultados/graficas/feat_importance.png"
        plt.savefig(ruta, bbox_inches='tight')
    plt.show()

In [22]:
def lime_explanation(model, X_train, X_test, model_name="Modelo", guardar=False, index=0):
    """Explain one test observation with LIME and show the explanation plot.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_train: Training DataFrame, used as LIME's background data and as the
            source of feature names. All columns must be convertible to float.
        X_test: DataFrame containing the observation to explain.
        model_name: Label included in the figure title.
        guardar: When true, the figure is also written to
            ``resultados/graficas/lime_explanation.png`` (the directory must
            exist).
        index: Positional index (``iloc``) of the row of ``X_test`` to explain.
    """
    feature_names = X_train.columns.tolist()

    # A frame mixing bool dummies and float columns yields an object array via
    # `.values`; cast explicitly so LIME receives a plain numeric matrix.
    explainer = LimeTabularExplainer(
        training_data=X_train.to_numpy(dtype=float),
        feature_names=feature_names,
        class_names=["No", "Sí"],
        mode="classification"
    )

    def predict_fn(samples):
        # LIME hands over bare arrays; restore the column names so the model
        # is queried with the same feature layout as X_train.
        # Assumes the model was fitted on a DataFrame with these columns.
        return model.predict_proba(pd.DataFrame(samples, columns=feature_names))

    instance = X_test.iloc[index].to_numpy(dtype=float)
    exp = explainer.explain_instance(instance, predict_fn, num_features=10)
    fig = exp.as_pyplot_figure()
    fig.suptitle(f"LIME Explanation - {model_name} - Obs {index}", fontsize=12)
    if guardar:
        ruta = "resultados/graficas/lime_explanation.png"
        fig.savefig(ruta, bbox_inches='tight')
    plt.show()

In [23]:
# Train a model and evaluate it on the hold-out test set
def evaluar(modelo, X_train, y_train, X_test, y_test, model_name= "Modelo", guardar=True):
    """Fit ``modelo``, print its test metrics, draw the diagnostic plots and save it.

    The model is fitted in place on the training data with a single ``fit``
    call; no hyper-parameter search is performed here. The test set is then used
    for the classification report, recall, ROC AUC and every plot.

    Args:
        modelo: Unfitted estimator or pipeline exposing ``fit``, ``predict`` and
            ``predict_proba``. It must be compatible with the plotting helpers
            (feature importances and LIME).
        X_train: Training features (DataFrame).
        y_train: Training target.
        X_test: Test features (DataFrame).
        y_test: Test target. Recall and ROC AUC are computed with scikit-learn
            defaults, so a binary target is assumed.
        model_name: Label used in printed messages and plot titles.
        guardar: When true, creates ``resultados/graficas`` and
            ``resultados/modelos``, saves the plots, writes ``class_report.txt``
            and dumps the fitted model to
            ``resultados/modelos/random_forest.pkl``.
    """
    print(f"Entrenando {model_name} ====")

    if guardar:
        os.makedirs("resultados/graficas", exist_ok=True)
        os.makedirs("resultados/modelos", exist_ok=True)

    # Training. perf_counter is a monotonic clock, so the measured interval is
    # not affected by system clock adjustments (unlike time.time()).
    start_time = time.perf_counter()
    modelo.fit(X_train, y_train)
    end_time = time.perf_counter()
    print(f"⏱️ Tiempo de entrenamiento: {end_time - start_time:.2f} segundos")

    y_pred = modelo.predict(X_test)
    # Second column of predict_proba: score of the positive class
    y_proba = modelo.predict_proba(X_test)[:, 1]

    # ==== TEXT REPORT ====
    print("\n Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)
    if guardar:
        # NOTE(review): the report is written to the working directory, not
        # under resultados/ like the other artefacts; path kept as-is so
        # existing consumers are not broken.
        with open("class_report.txt", "w", encoding="utf-8") as f:
            f.write(report)

    # ==== METRICS ====
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"Recall:{recall:.4f}")
    print(f"ROC AUC: {auc:.4f}")

    # ==== PLOTS ====
    plot_roc_curve(y_test, y_proba, model_name, guardar=guardar)
    plot_precision_recall_curve(y_test, y_proba, model_name, guardar=guardar)
    matriz_confusion(y_test, y_pred, model_name, guardar=guardar)
    plot_feature_importance(modelo, X_train, model_name, guardar=guardar)
    lime_explanation(modelo, X_train, X_test, model_name, guardar=guardar)

    # === Persist the fitted model ===
    if guardar:
        # The file name is fixed regardless of model_name
        path = "resultados/modelos/random_forest.pkl"
        joblib.dump(modelo, path)
        print(f"\n📁 Modelo guardado como: {path}")
    print("\n --------------------------------")
    
    

## Modelo

In [10]:
# Random Forest
# Pipeline with two steps: SMOTE oversampling ('smote') followed by the
# Random Forest classifier ('clf').
pipeline_rf = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=42)),
])

# Candidate hyper-parameter values for the 'clf' step.
# This grid is only defined here; the cells shown do not pass it to any search.
param_dist = dict(
    clf__n_estimators=[100, 200, 300],    # number of trees in the forest
    clf__max_depth=[10, 20, None],        # maximum tree depth
    clf__min_samples_split=[2, 5, 10],    # minimum samples required to split a node
)

In [11]:
# Fit the Random Forest pipeline on the training split and evaluate it on the test split
evaluar(
    modelo=pipeline_rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="Random Forest",
)


NameError: name 'X_train' is not defined