# Modelo de clasificación

## Ajustes iniciales

In [15]:
# Librerias
# Estandar
import time
import os 
import warnings
os.environ["LOKY_MAX_CPU_COUNT"] = "8" 

# Visualizacion
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px

# Procesamiento
import numpy as np
import pandas as pd

# Adicionales
import time
import joblib
import json

In [16]:
# Librerias de modelado
# Escalado de numericas
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import(
    classification_report,
    roc_auc_score, roc_curve,
    confusion_matrix, ConfusionMatrixDisplay,
    recall_score, precision_recall_curve,
    average_precision_score
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from lime.lime_tabular import LimeTabularExplainer

In [17]:
# Data loading
# The CSV location can be overridden with the DENGUE_CSV_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
DATA_PATH = os.environ.get(
    "DENGUE_CSV_PATH",
    r"C:\Users\Hp\VISUAL\entrega_3\data\dengue.csv",
)
df = pd.read_csv(DATA_PATH)

# Data type adjustments
# Insurance type and event: one-hot encoding (first level of each dropped)
df = pd.get_dummies(df, columns=['TIP_SS', 'EVENTO'], drop_first=True)

# Country, department and municipality: frequency encoding.
# Each category is replaced by its relative frequency in the dataset; values
# that cannot be mapped (e.g. missing) become 0.
# NOTE(review): frequencies are computed on the full dataset, before the
# train/test split, so test rows contribute to the encoding — confirm this is
# acceptable or compute the frequencies on the training rows only.
for col in ['PAIS_OCU', 'DPTO_OCU', 'MUN_OCU']:
    freq = df[col].value_counts(normalize=True).to_dict()
    df[col] = df[col].map(freq).fillna(0).astype('float64')

scaler = StandardScaler()
# Scaling of the numeric variable
# NOTE(review): the scaler is also fitted on all rows before the split —
# same caveat as the frequency encoding above.
df['EDAD_AJUSTADA'] = scaler.fit_transform(df[['EDAD_AJUSTADA']])

In [18]:
# Dataset split

# Define the predictors and the target
X = df.drop(columns=['CONFIRMADOS'])
y = df['CONFIRMADOS']

# Stratified split into train and test: 60% train, 40% test.
# No separate validation set is created here; test_size=0.4 is a single
# two-way split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# Check the split
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train: {y_train.shape}, Test: {y_test.shape}")

Train: (568368, 20), Test: (378912, 20)
Train: (568368,), Test: (378912,)


## Funciones

In [19]:
def plot_precision_recall_curve(y_test, y_proba, model_name="modelo", guardar=False):
    """Plot the precision-recall curve of a binary classifier with Plotly.

    Parameters
    ----------
    y_test : array-like
        True binary labels; the positive class is expected to be labelled 1.
    y_proba : array-like
        Scores or probabilities for the positive class.
    model_name : str
        Currently unused; kept for signature parity with the other plot helpers.
    guardar : bool
        When True, the figure is also written to
        ``resultados/graficas/precision_recall_curve.json`` (the directory
        must already exist).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure that was shown.
    """
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    ap_score = average_precision_score(y_test, y_proba)

    # No-skill baseline: a classifier that scores at random has a precision
    # equal to the share of positives at every recall level, so the reference
    # is a horizontal line at the prevalence, not a diagonal.
    prevalence = float(np.mean(np.asarray(y_test) == 1))

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=recall, y=precision,
        mode='lines',
        name=f'AP = {ap_score:.2f}',
        line=dict(color='dodgerblue')))
    fig.add_trace(go.Scatter(
        x=[0, 1], y=[prevalence, prevalence],
        mode='lines',
        name='Random',
        line=dict(dash='dash', color='gray')
    ))
    fig.update_layout(
        xaxis_title='Recall',
        yaxis_title='Precision',
        template='plotly_white',
        legend_title="Score",
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True)
    )
    fig.show()
    if guardar:
        fig.write_json("resultados/graficas/precision_recall_curve.json")

    # Returned for consistency with the other plotting helpers in this notebook.
    return fig

    


In [20]:
def plot_roc_curve(y_test, y_proba, model_name, guardar=False):
    """Draw the ROC curve of a binary classifier and return the Plotly figure.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_proba : array-like
        Scores or probabilities for the positive class.
    model_name : str
        Not used inside the function.
    guardar : bool
        When True, the figure is also written to
        ``resultados/graficas/ROC_AUC.json``.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure that was shown.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_proba)
    area = roc_auc_score(y_test, y_proba)

    # Curve of the evaluated model, labelled with its AUC.
    model_trace = go.Scatter(
        x=false_pos_rate,
        y=true_pos_rate,
        mode='lines',
        name=f'AUC = {area:.4f}',
        line={'color': 'dodgerblue'},
    )
    # Diagonal reference line of a random classifier.
    chance_trace = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random',
        line={'dash': 'dash', 'color': 'gray'},
    )

    fig = go.Figure(data=[model_trace, chance_trace])
    fig.update_layout(
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
        template="plotly_white",
    )
    fig.show()

    if guardar:
        fig.write_json("resultados/graficas/ROC_AUC.json")

    return fig


In [21]:
def matriz_confusion(y_true, y_pred, model_name="Modelo", guardar=False):
    """Plot an annotated confusion matrix (counts and row percentages).

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    model_name : str
        Not used inside the function.
    guardar : bool
        When True, the figure is also written to
        ``resultados/graficas/matriz_confusion.json``.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure that was shown.

    Notes
    -----
    The TN/FP/FN/TP annotations assume a binary 2x2 matrix with the negative
    class in the first row/column.
    """
    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Row-wise percentages. A row with no true samples would divide by zero
    # and produce NaN (plus a RuntimeWarning); such rows are reported as 0%.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )
    cm_percent = np.round(cm_percent * 100, 2)

    labels = np.array([['TN', 'FP'], ['FN', 'TP']])

    # Build the cell text: label, count and row percentage
    z_text = [[f"{labels[i][j]}<br>{int(cm[i][j])} ({cm_percent[i][j]}%)" for j in range(cm.shape[1])]
              for i in range(cm.shape[0])]

    # Reverse the rows so that TP ends up in the bottom-right cell
    z_flipped = cm[::-1]
    z_text_flipped = z_text[::-1]

    # Create the Plotly figure
    fig = ff.create_annotated_heatmap(
        z=z_flipped,
        annotation_text=z_text_flipped,
        colorscale="Blues",
        showscale=True,
        hoverinfo="z"
    )

    fig.update_layout(
        xaxis_title="Predicción",
        yaxis_title="Real",
        template="plotly_white"
    )
    fig.show()

    if guardar:
        fig.write_json("resultados/graficas/matriz_confusion.json")

    return fig


In [22]:
def plot_feature_importance(model, X_train, model_name="Modelo", guardar=True):
    """Show the ten most important features of the pipeline's classifier.

    Parameters
    ----------
    model : pipeline
        Fitted pipeline exposing ``named_steps['clf']`` with a
        ``feature_importances_`` attribute.
    X_train : pandas.DataFrame
        Training features; its column names label the bars.
        Assumes the column order matches the fitted features — TODO confirm.
    model_name : str
        Not used inside the function.
    guardar : bool
        When True (the default), the figure is also written to
        ``resultados/graficas/feat_importance.json``.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure that was shown.
    """
    classifier = model.named_steps['clf']

    # Pair each column with its importance and keep the ten largest.
    top_features = (
        pd.DataFrame({
            'feature': X_train.columns,
            'importance': classifier.feature_importances_,
        })
        .sort_values(by='importance', ascending=False)
        .head(10)
    )

    # Horizontal bar chart; the y axis is reversed so the top feature is first.
    fig = px.bar(
        top_features,
        x='importance',
        y='feature',
        orientation='h',
        color_discrete_sequence=['dodgerblue'],
    )
    fig.update_layout(yaxis=dict(autorange="reversed"), template="plotly_white")
    fig.show()

    if guardar:
        fig.write_json("resultados/graficas/feat_importance.json")

    return fig


In [23]:
def lime_explanation(model, X_train, X_test, model_name="Modelo", index=0, guardar=False):
    """Explain one test-set prediction with LIME and plot the feature weights.

    Parameters
    ----------
    model : estimator or pipeline
        Fitted model exposing ``predict_proba``.
    X_train : pandas.DataFrame
        Training features used to build the LIME explainer.
    X_test : pandas.DataFrame
        Test features; the row at position ``index`` is explained.
    model_name : str
        Not used inside the function.
    index : int
        Positional index (``iloc``) of the test row to explain.
    guardar : bool
        When True, the figure is also written to
        ``resultados/graficas/lime.json``.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure that was shown.
    """
    feature_names = X_train.columns.tolist()

    def predict_proba_named(rows):
        # LIME calls the prediction function with bare NumPy arrays. Restoring
        # the column names avoids the "X does not have valid feature names"
        # warning raised by estimators fitted on a DataFrame.
        return model.predict_proba(pd.DataFrame(rows, columns=feature_names))

    explainer = LimeTabularExplainer(
        training_data=X_train.values,
        feature_names=feature_names,
        class_names=["No", "Sí"],
        mode="classification",
        # LIME samples random perturbations; a fixed seed makes the
        # explanation reproducible between runs.
        random_state=42
    )
    exp = explainer.explain_instance(
        X_test.iloc[index].values,
        predict_proba_named,
        num_features=10
    )
    exp_list = exp.as_list()
    features = [feat for feat, _ in exp_list]
    weights = [val for _, val in exp_list]
    # Blue bars for non-negative contributions, gray bars for negative ones
    colors = ['dodgerblue' if w >= 0 else 'lightgray' for w in weights]
    fig = go.Figure(go.Bar(
        x=weights,
        y=features,
        orientation='h',
        marker_color=colors
    ))
    fig.update_layout(
        xaxis_title="Contribución a la predicción",
        yaxis_title="Características",
        template="plotly_white",
        margin=dict(l=100, r=40, t=60, b=40),
        height=400
    )
    fig.show()
    if guardar:
        fig.write_json("resultados/graficas/lime.json")

    return fig


In [24]:
# Train a model, evaluate it on the test set and optionally save the results
def evaluar(modelo, X_train, y_train, X_test, y_test, model_name= "Modelo", guardar=True):
    """Fit ``modelo``, report test metrics, draw diagnostic plots and save results.

    Parameters
    ----------
    modelo : pipeline
        Unfitted model with ``fit``, ``predict`` and ``predict_proba``. It must
        also expose ``named_steps['clf']``, because the feature-importance
        plot reads it.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    model_name : str
        Name used in the progress message.
    guardar : bool
        When True (the default), creates ``resultados/graficas`` and
        ``resultados/modelos`` and writes the classification report, the
        figures and the fitted model there. The model is always saved as
        ``random_forest.pkl``, whatever ``model_name`` is.

    Returns
    -------
    None
    """
    print(f"Entrenando {model_name} ====")

    if guardar:
        os.makedirs("resultados/graficas", exist_ok=True)
        os.makedirs("resultados/modelos", exist_ok=True)

    # Training: a single fit on the training set (no search or cross-validation
    # happens here). perf_counter is a monotonic clock meant for measuring
    # elapsed time, unlike time.time().
    start_time = time.perf_counter()
    modelo.fit(X_train, y_train)
    elapsed = time.perf_counter() - start_time
    print(f"⏱️ Tiempo de entrenamiento: {elapsed:.2f} segundos")

    y_pred = modelo.predict(X_test)
    # Column 1 holds the positive-class probability
    y_proba = modelo.predict_proba(X_test)[:, 1]

    # ==== TEXT REPORT ====
    print("\n Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)
    if guardar:
        # Explicit encoding so the file does not depend on the platform default
        with open("resultados/class_report.txt", "w", encoding="utf-8") as f:
            f.write(report)

    # ==== METRICS ====
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"Recall:{recall:.4f}")
    print(f"ROC AUC: {auc:.4f}")

    # ==== PLOTS ====
    plot_roc_curve(y_test, y_proba, model_name, guardar=guardar)
    plot_precision_recall_curve(y_test, y_proba, model_name, guardar=guardar)
    matriz_confusion(y_test, y_pred, model_name, guardar=guardar)
    plot_feature_importance(modelo, X_train, model_name, guardar=guardar)
    lime_explanation(modelo, X_train, X_test, model_name, guardar=guardar)

    # === Save the fitted model ===
    if guardar:
        path = "resultados/modelos/random_forest.pkl"
        joblib.dump(modelo, path)
        print(f"\n📁 Modelo guardado como: {path}")
    print("\n --------------------------------")
    
    

## Modelo

In [27]:
# Random Forest pipeline: SMOTE oversampling followed by the classifier.
# NOTE(review): with SMOTE's default strategy the classes should already be
# balanced when the forest is fitted, so class_weight='balanced' presumably
# has little extra effect — confirm before relying on it.
rf_oversampler = SMOTE(random_state=42)
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    class_weight='balanced',
    random_state=42,
)
pipeline_rf = Pipeline(steps=[
    ('smote', rf_oversampler),
    ('clf', rf_classifier),
])

In [28]:
# Fit and evaluate the Random Forest pipeline; guardar keeps its default value.
evaluar(
    modelo=pipeline_rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="Random Forest",
)


Entrenando Random Forest ====



`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



⏱️ Tiempo de entrenamiento: 276.77 segundos

 Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.84      0.81    161486
           1       0.87      0.82      0.84    217426

    accuracy                           0.83    378912
   macro avg       0.82      0.83      0.83    378912
weighted avg       0.83      0.83      0.83    378912

Recall:0.8165
ROC AUC: 0.9095



X does not have valid feature names, but RandomForestClassifier was fitted with feature names




📁 Modelo guardado como: resultados/modelos/random_forest.pkl

 --------------------------------
