# Operación Anti-Zombie - Notebook Técnico
Evaluación 3: Integración de Modelos, Árboles de Decisión y Clúster Jerárquico




In [None]:
# %%
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

from sklearn.cluster import AgglomerativeClustering, KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA


## Carga de datasets del brote zombie (desde carpeta `../data`)

In [None]:
# %%
# Base path: go up one level from /notebook to the project folder.
BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_path = os.path.join(BASE_PATH, "data")

# Read the four outbreak workbooks in a fixed order and bind each one
# to its own DataFrame name.
pacientes, evolucion, tratamientos, red_contagios = (
    pd.read_excel(os.path.join(data_path, archivo))
    for archivo in (
        "pacientes_brote_zombie.xlsx",
        "evolucion_brote.xlsx",
        "tratamientos_experimentales.xlsx",
        "red_contagios.xlsx",
    )
)

# Preview the first rows of the two tables that feed the models below.
display(pacientes.head())
display(evolucion.head())


# 1. Comparación de modelos de clasificación (Estado_Actual)

In [None]:
# %%
# Print the patient table summary before modelling.
pacientes.info()

# Predictor columns shared by both classifiers.
features_clf = [
    "Edad", "Dias_Incubacion", "Nivel_Zombificacion",
    "Numero_Personas_Contagiadas", "Temperatura_Corporal",
    "Frecuencia_Cardiaca", "Nivel_Consciencia", "Agresividad",
    "Capacidad_Cognitiva", "Dias_Desde_Tratamiento", "Mejoria_Porcentual",
]
target_clf = "Estado_Actual"

X, y = pacientes[features_clf], pacientes[target_clf]

# 70/30 hold-out split, stratified on the target column.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Both pipelines impute missing values with the column median before
# handing the data to the estimator.
modelo_arbol = make_pipeline(
    SimpleImputer(strategy="median"),
    DecisionTreeClassifier(max_depth=5, random_state=42),
)
modelo_rf = make_pipeline(
    SimpleImputer(strategy="median"),
    RandomForestClassifier(n_estimators=200, random_state=42),
)

# fit() returns the fitted pipeline, so training and prediction are chained.
y_pred_arbol = modelo_arbol.fit(X_train, y_train).predict(X_test)
y_pred_rf = modelo_rf.fit(X_train, y_train).predict(X_test)


In [None]:
# %%
def resumen_clasificacion(y_true, y_pred, nombre_modelo):
    """Print a short evaluation summary for one classifier.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        nombre_modelo: Display name used in the header line.

    Prints the accuracy, the weighted F1 score, the scikit-learn
    classification report and a 60-dash separator. Returns nothing.
    """
    exactitud = accuracy_score(y_true, y_pred)
    f1_ponderado = f1_score(y_true, y_pred, average="weighted")

    # One print call with a newline separator emits the same four lines.
    print(
        f"=== {nombre_modelo} ===",
        f"Accuracy: {exactitud:.4f}",
        f"F1 (weighted): {f1_ponderado:.4f}",
        "\nClassification report:",
        sep="\n",
    )
    print(classification_report(y_true, y_pred))
    print("-" * 60)

# Text report for each fitted classifier.
resumen_clasificacion(y_test, y_pred_arbol, "Modelo 1: Árbol de decisión")
resumen_clasificacion(y_test, y_pred_rf, "Modelo 2: Random Forest")

# Test-set predictions keyed by the model name shown in the comparison table.
predicciones_por_modelo = {
    "Árbol de decisión": y_pred_arbol,
    "Random Forest": y_pred_rf,
}

# One row per model: accuracy and weighted F1 on the same test split.
resultados_modelos = pd.DataFrame({
    "Modelo": list(predicciones_por_modelo),
    "Accuracy": [
        accuracy_score(y_test, pred)
        for pred in predicciones_por_modelo.values()
    ],
    "F1_weighted": [
        f1_score(y_test, pred, average="weighted")
        for pred in predicciones_por_modelo.values()
    ],
})

display(resultados_modelos)


In [None]:
# %%
# Class labels in order of first appearance in the target column; the same
# order is used for the matrix rows/columns and for the tick labels.
etiquetas = y.unique()

cm_arbol = confusion_matrix(y_test, y_pred_arbol, labels=etiquetas)
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=etiquetas)

# One heatmap per model, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
paneles = zip(axes, (cm_arbol, cm_rf), ("Árbol de decisión", "Random Forest"))
for eje, matriz, titulo in paneles:
    sns.heatmap(matriz, annot=True, fmt="d", ax=eje,
                xticklabels=etiquetas, yticklabels=etiquetas)
    eje.set(title=titulo, xlabel="Predicho", ylabel="Real")

plt.tight_layout()
plt.show()


# 2. Árbol de decisión regresivo (Tasa_Contagio_R0)

In [None]:
# %%
# NOTE(review): this is not the last expression of the cell, so its result
# is presumably not rendered by the notebook — confirm whether a display()
# was intended.
evolucion.head()

target_reg = "Tasa_Contagio_R0"

# Every column except the date and the target itself is used as a predictor.
columnas_excluidas = {"Fecha", target_reg}
features_reg = [col for col in evolucion.columns if col not in columnas_excluidas]

Xr, yr = evolucion[features_reg], evolucion[target_reg]

# 70/30 random hold-out split.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, random_state=42, test_size=0.3
)

# Median imputation followed by a depth-limited regression tree.
modelo_regresivo = make_pipeline(
    SimpleImputer(strategy="median"),
    DecisionTreeRegressor(random_state=42, max_depth=5),
)

# fit() returns the fitted pipeline, so training and prediction are chained.
yr_pred = modelo_regresivo.fit(Xr_train, yr_train).predict(Xr_test)


In [None]:
# %%
# RMSE is computed as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# raises a TypeError. Taking the root works on every version.
rmse = np.sqrt(mean_squared_error(yr_test, yr_pred))
mae = mean_absolute_error(yr_test, yr_pred)
r2 = r2_score(yr_test, yr_pred)

# Test-set error metrics for the regression tree.
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R²  : {r2:.4f}")


In [None]:
# %%
# Pull the fitted tree out of the pipeline (make_pipeline names each step
# after its lowercase class name).
arbol_reg = modelo_regresivo.named_steps["decisiontreeregressor"]

# Tree diagram, drawn only down to depth 3 to keep it readable.
plt.figure(figsize=(18, 8))
plot_tree(arbol_reg, feature_names=features_reg, max_depth=3,
          filled=True, rounded=True)
plt.title("Árbol de decisión regresivo (profundidad hasta 3 para visualizar)")
plt.show()

# Real vs. predicted values, with a dashed y = x reference line spanning
# the range of the real values.
plt.figure(figsize=(6, 6))
plt.scatter(yr_test, yr_pred, alpha=0.7)
extremos = [yr_test.min(), yr_test.max()]
plt.plot(extremos, extremos, linestyle="--")
plt.gca().set(
    xlabel="Tasa_Contagio_R0 real",
    ylabel="Tasa_Contagio_R0 predicha",
    title="Comparación real vs predicho - Árbol regresivo",
)
plt.show()


# 3. Árbol de decisión lógico (clasificación Estado_Actual)

In [None]:
# %%
# Shallower, more regularized tree than the first classifier: depth 4 and
# at least 20 training samples per leaf.
modelo_arbol_clf = make_pipeline(
    SimpleImputer(strategy="median"),
    DecisionTreeClassifier(max_depth=4, min_samples_leaf=20, random_state=42),
)

# Train on the existing split and predict the held-out set in one chain.
y_pred_arbol_clf = modelo_arbol_clf.fit(X_train, y_train).predict(X_test)

acc_arbol_clf = accuracy_score(y_test, y_pred_arbol_clf)
f1_arbol_clf = f1_score(y_test, y_pred_arbol_clf, average="weighted")

print("Accuracy:", acc_arbol_clf)
print("F1 weighted:", f1_arbol_clf)


In [None]:
# %%
# Class labels in order of first appearance in the target column, shared by
# the matrix and both axes.
etiquetas = y.unique()
cm = confusion_matrix(y_test, y_pred_arbol_clf, labels=etiquetas)

plt.figure(figsize=(7, 6))
eje = sns.heatmap(cm, annot=True, fmt="d",
                  xticklabels=etiquetas, yticklabels=etiquetas)
eje.set(
    xlabel="Predicho",
    ylabel="Real",
    title="Matriz de confusión - Árbol de decisión lógico",
)
plt.show()

# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, y_pred_arbol_clf))


In [None]:
# %%
# Pull the fitted tree out of the pipeline (make_pipeline names each step
# after its lowercase class name).
arbol_clf = modelo_arbol_clf.named_steps["decisiontreeclassifier"]

# plot_tree expects class names as strings. `classes_` is a NumPy array whose
# elements may not be str, and some scikit-learn releases (1.3.0) reject a
# non-list argument, so convert explicitly to a list of str.
nombres_clases = [str(clase) for clase in arbol_clf.classes_]

plt.figure(figsize=(22, 10))
plot_tree(
    arbol_clf,
    feature_names=features_clf,
    class_names=nombres_clases,
    filled=True,
    rounded=True
)
plt.title("Árbol de decisión lógico - Estado_Actual")
plt.show()


# 4. Clustering jerárquico aglomerativo y divisivo

In [None]:
# %%
# Numeric patient columns used as clustering features.
num_cols_cluster = [
    "Edad", "Dias_Incubacion", "Nivel_Zombificacion",
    "Numero_Personas_Contagiadas", "Temperatura_Corporal",
    "Frecuencia_Cardiaca", "Nivel_Consciencia", "Agresividad",
    "Capacidad_Cognitiva", "Dias_Desde_Tratamiento", "Mejoria_Porcentual",
]
X_num = pacientes[num_cols_cluster]

# Step 1: fill missing values with each column's median.
imputer = SimpleImputer(strategy="median")
X_imp = imputer.fit_transform(X_num)

# Step 2: standardize every feature so that columns on larger scales do not
# dominate the distance computations used by the clustering steps.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)


In [None]:
# %%
# Ward linkage over the scaled features, for the dendrogram only.
linked = linkage(X_scaled, method="ward")

# Truncated dendrogram: only the last 20 merged groups are drawn.
plt.figure(figsize=(12, 6))
dendrogram(linked, p=20, truncate_mode="lastp")
plt.gca().set(
    title="Dendrograma (aglomerativo, truncado en 20 clústeres finales)",
    xlabel="Clústeres / Pacientes",
    ylabel="Distancia",
)
plt.show()

# Cut into 4 clusters with the same Ward criterion and store the label of
# each patient back on the patient table.
agglo = AgglomerativeClustering(linkage="ward", n_clusters=4)
pacientes["Cluster_Agg"] = clusters_agg = agglo.fit_predict(X_scaled)

# Per-cluster mean of every clustering feature, rounded to 2 decimals.
medias_agg = pacientes.groupby("Cluster_Agg")[num_cols_cluster].mean()
resumen_agg = medias_agg.round(2)
display(resumen_agg)


In [None]:
# %%
# Project the scaled features onto their first two principal components so
# the clusters can be drawn in 2-D.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
scatter = plt.scatter(pc1, pc2, c=clusters_agg, alpha=0.7)

# One legend entry per cluster label, derived from the point colours.
marcas, rotulos = scatter.legend_elements()
plt.legend(marcas, rotulos, title="Cluster_Agg")

plt.gca().set(
    title="Proyección PCA - Clúster aglomerativo",
    xlabel="PC1",
    ylabel="PC2",
)
plt.show()


In [None]:
# %%
def divisive_clustering(X, n_clusters=4, random_state=42):
    """Top-down (divisive) clustering by repeated 2-means bisection.

    All samples start in cluster 0. On each round the cluster with the most
    members is split in two with ``KMeans(n_clusters=2)``; the samples that
    KMeans labels ``1`` receive the next unused cluster id, the rest keep
    their current id.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). It is converted
            with ``np.asarray``, so nested lists and DataFrames are accepted
            in addition to ndarrays.
        n_clusters: Number of clusters requested. Values <= 1 yield a single
            cluster (all labels 0).
        random_state: Seed forwarded to every KMeans fit.

    Returns:
        Integer ndarray of length n_samples with labels ``0..k-1``. ``k`` is
        smaller than ``n_clusters`` when the largest cluster cannot be split
        any further (fewer than 2 samples, or KMeans leaves one side empty).
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    clusters = np.zeros(n_samples, dtype=int)

    # Nothing to split: avoids np.argmax failing on an empty count array.
    if n_samples == 0:
        return clusters

    # Ids are handed out sequentially, so the id created on each round is
    # simply the round number.
    for new_cluster_id in range(1, n_clusters):
        cluster_ids, counts = np.unique(clusters, return_counts=True)
        cluster_to_split = cluster_ids[np.argmax(counts)]
        idx = np.flatnonzero(clusters == cluster_to_split)

        if len(idx) < 2:
            break

        km = KMeans(n_clusters=2, random_state=random_state)
        sub_labels = km.fit_predict(X[idx])

        moved = idx[sub_labels == 1]
        # A one-sided result (e.g. all points identical) is not a real split;
        # stop instead of counting a cluster that was never created.
        if len(moved) == 0 or len(moved) == len(idx):
            break

        clusters[moved] = new_cluster_id

    return clusters

# Run the divisive clustering on the scaled features and store the label of
# each patient back on the patient table.
pacientes["Cluster_Div"] = clusters_div = divisive_clustering(X_scaled, n_clusters=4)

# Per-cluster mean of every clustering feature, rounded to 2 decimals.
medias_div = pacientes.groupby("Cluster_Div")[num_cols_cluster].mean()
resumen_div = medias_div.round(2)
display(resumen_div)


In [None]:
# %%
# Same 2-D PCA projection as before, now coloured by the divisive labels.
plt.figure(figsize=(8, 6))
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
scatter2 = plt.scatter(pc1, pc2, c=clusters_div, alpha=0.7)

# One legend entry per cluster label, derived from the point colours.
marcas, rotulos = scatter2.legend_elements()
plt.legend(marcas, rotulos, title="Cluster_Div")

plt.gca().set(
    title="Proyección PCA - Clúster divisivo",
    xlabel="PC1",
    ylabel="PC2",
)
plt.show()
