In [None]:
# ==========================================
# 1. SETUP Y LIBRERÍAS
# ==========================================
!pip install tensorflow keras scikit-learn matplotlib pandas numpy opencv-python-headless statsmodels scipy

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score,
                             roc_auc_score, confusion_matrix, matthews_corrcoef,
                             recall_score, precision_score)
from statsmodels.stats.contingency_tables import mcnemar
import scipy.stats

# Project directories
BASE_PATH = '/content/drive/MyDrive/proyecto_completo/'
DATA_PATH, MODELS_PATH = (
    os.path.join(BASE_PATH, subdir) for subdir in ('preprocesamiento', 'models')
)

# Load the test arrays (features first, labels second)
print("Cargando Test Set...")
X_test, y_test = (
    np.load(os.path.join(DATA_PATH, fname))
    for fname in ('X_test_improved.npy', 'y_test_improved.npy')
)
print(f"Test Set cargado: {X_test.shape}")

In [None]:
# ==========================================
# 2. MODELS TO EVALUATE
# ==========================================
# Scenario label -> weights file name (resolved against MODELS_PATH)
model_files = dict([
    ("Esc1_Base",    "resnet50_neumonia_final.h5"),
    ("Esc2_Focal",   "resnet50_focal_loss_final.h5"),
    ("Esc3_BWCCE",   "resnet50_bwcc_final.h5"),
    ("Esc4_LDAM",    "resnet50_ldam_final.h5"),
    ("Esc5_ROS",     "resnet50_ROS_best.h5"),
    ("Esc6_GAN",     "resnet50_Base_GAN_FINETUNED_best.h5"),
    ("Esc7_Hibrido", "resnet50_Focal_GAN_FINETUNED_best.h5"),
])

# Report which weight files are present on disk
for name, file in model_files.items():
    path = os.path.join(MODELS_PATH, file)
    if os.path.exists(path):
        exists = "✅"
    else:
        exists = "X"
    print(f"{exists} {name}: {file}")

In [None]:
# ==========================================
# 3. FUNCIÓN DE CÁLCULO DE MÉTRICAS
# ==========================================
def calculate_extended_metrics(y_true, y_pred_prob):
    """Compute classification metrics for a binary problem from probabilities.

    Probabilities are turned into hard labels with a fixed 0.5 threshold
    (strictly greater than 0.5 means class 1).

    Args:
        y_true: Ground-truth labels (0/1). Flattened to 1-D before use.
        y_pred_prob: Predicted probability of class 1, one value per sample.
            Flattened to 1-D before use.

    Returns:
        dict with the keys "Accuracy", "Bal_Accuracy", "Sensitivity",
        "Specificity", "Precision", "F1_Score", "MCC", "G_Mean" and "AUC".
        Ratios whose denominator is zero are reported as 0.0; "AUC" is NaN
        when it cannot be computed (e.g. only one class in ``y_true``).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred_prob = np.asarray(y_pred_prob).ravel()

    # Probability -> class label (threshold 0.5)
    y_pred = (y_pred_prob > 0.5).astype(int)

    # Force a 2x2 matrix even when a class is absent from y_true/y_pred,
    # otherwise the four-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Base metrics
    acc = accuracy_score(y_true, y_pred)
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    # Clinical metrics
    sensitivity = recall_score(y_true, y_pred, zero_division=0)  # recall of class 1
    negatives = tn + fp
    specificity = tn / negatives if negatives > 0 else 0.0       # recall of class 0
    precision = precision_score(y_true, y_pred, zero_division=0)

    # Composite metrics
    f1 = f1_score(y_true, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_true, y_pred)
    g_mean = np.sqrt(sensitivity * specificity)
    try:
        auc = roc_auc_score(y_true, y_pred_prob)
    except ValueError:
        # AUC is undefined here; 0.0 would be a valid AUC value and therefore
        # misleading, so NaN is used to mark "not available".
        auc = float("nan")

    return {
        "Accuracy": acc,
        "Bal_Accuracy": bal_acc,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "Precision": precision,
        "F1_Score": f1,
        "MCC": mcc,
        "G_Mean": g_mean,
        "AUC": auc
    }

In [None]:
# ==========================================
# 4. COMPARISON TABLE
# ==========================================
results_list = []
predictions_dict = {}  # class-1 probabilities per model, reused by the statistical tests

print("Evaluando modelos...")

for name, filename in model_files.items():
    path = os.path.join(MODELS_PATH, filename)
    if not os.path.exists(path):
        # Make skipped scenarios visible instead of dropping them silently.
        print(f"-> {name} omitido: no se encontró {filename}.")
        continue

    # compile=False: the model is only used for inference here
    model = load_model(path, compile=False)

    # Predict and reduce the output to one class-1 probability per sample
    probs = model.predict(X_test, verbose=0)
    if probs.ndim > 1 and probs.shape[1] > 1:
        probs = probs[:, 1]   # two-column (softmax) output
    else:
        probs = probs.ravel() # single-column (sigmoid) output

    # Keep for later use
    predictions_dict[name] = probs

    # Compute metrics
    metrics = calculate_extended_metrics(y_test, probs)
    metrics["Model"] = name
    results_list.append(metrics)

    print(f"-> {name} procesado.")
    del model
    tf.keras.backend.clear_session()

if not results_list:
    # Without this guard the column selection below fails with an opaque KeyError.
    raise RuntimeError(
        f"No se encontró ningún modelo en {MODELS_PATH}; no hay resultados que tabular."
    )

# Build the results table, best MCC first
df_results = pd.DataFrame(results_list)
cols = ["Model", "MCC", "Bal_Accuracy", "Sensitivity", "Specificity", "F1_Score", "AUC"]
df_final = df_results[cols].sort_values(by="MCC", ascending=False)

print("\n=== TABLA FINAL DE RESULTADOS ===")
display(df_final)
df_final.to_csv(os.path.join(BASE_PATH, 'tabla_resultados_finales.csv'), index=False)

In [None]:
# ==========================================
# 5. TESTS ESTADÍSTICOS (MCNEMAR & DELONG)
# ==========================================

# --- A. TEST DE MCNEMAR ---
def run_mcnemar(y_true, prob_a, prob_b):
    """Exact McNemar test comparing the 0.5-thresholded decisions of two models.

    Args:
        y_true: Ground-truth labels (0/1), one per sample.
        prob_a: Class-1 probabilities from model A.
        prob_b: Class-1 probabilities from model B.

    Returns:
        The p-value reported by ``mcnemar(..., exact=True)``.

    Raises:
        ValueError: If the three inputs do not have the same number of samples.
    """
    # Flatten everything: a column-shaped (N, 1) label array compared with
    # (N,) predictions would broadcast to an N x N matrix and corrupt the counts.
    y_true = np.asarray(y_true).ravel()
    pred_a = (np.asarray(prob_a).ravel() > 0.5).astype(int)
    pred_b = (np.asarray(prob_b).ravel() > 0.5).astype(int)
    if not (y_true.shape == pred_a.shape == pred_b.shape):
        raise ValueError("y_true, prob_a and prob_b must have the same number of samples")

    # Per-sample correctness of each model
    c_a = (pred_a == y_true)
    c_b = (pred_b == y_true)
    n01 = int(np.sum(~c_a & c_b))  # A wrong, B right
    n10 = int(np.sum(c_a & ~c_b))  # A right, B wrong

    # Only the discordant cells are filled in; the diagonal is left at 0.
    table = [[0, n10], [n01, 0]]
    res = mcnemar(table, exact=True)
    return res.pvalue

# --- B. TEST DE DELONG (Matemática Pura) ---
def compute_midrank(x):
    """Return the 1-based midranks of a 1-D sequence.

    Tied values all receive the average of the ranks they span, e.g.
    ``[0.1, 0.4, 0.4, 0.8] -> [1.0, 2.5, 2.5, 4.0]``.

    Delegates to ``scipy.stats.rankdata(method='average')`` instead of a
    hand-written Python loop. The previous loop never terminated when the
    input contained NaN, because a NaN never compares equal to itself.

    Args:
        x: 1-D array-like of scores.

    Returns:
        ``numpy.ndarray`` of dtype float64 with the same length as ``x``.
    """
    values = np.asarray(x)
    ranks = scipy.stats.rankdata(values, method='average')
    return np.asarray(ranks, dtype=np.float64)

def fastDeLong(predictions_sorted_transposed, label_1_count):
    """Compute per-model AUCs and their DeLong covariance from midranks.

    Args:
        predictions_sorted_transposed: 2-D array with one row per model and
            one column per sample. The first ``label_1_count`` columns are
            treated as the positive samples and the rest as negatives.
        label_1_count: Number of positive samples (leading columns).

    Returns:
        Tuple ``(aucs, delongcov)``: the AUC of each row and the covariance
        assembled from ``np.cov`` of the two structural-component matrices.
    """
    scores = predictions_sorted_transposed
    n_models, n_total = scores.shape[0], scores.shape[1]
    m = label_1_count
    n = n_total - m

    # Midranks over all samples, over positives only and over negatives only
    rank_all = np.empty([n_models, n_total], dtype=np.float64)
    rank_pos = np.empty([n_models, m], dtype=np.float64)
    rank_neg = np.empty([n_models, n], dtype=np.float64)
    for row, model_scores in enumerate(scores):
        rank_all[row, :] = compute_midrank(model_scores)
        rank_pos[row, :] = compute_midrank(model_scores[:m])
        rank_neg[row, :] = compute_midrank(model_scores[m:])

    pos_in_all = rank_all[:, :m]
    neg_in_all = rank_all[:, m:]

    aucs = pos_in_all.sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n

    # Structural components for positives (v01) and negatives (v10)
    v01 = (pos_in_all - rank_pos) / n
    v10 = 1.0 - (neg_in_all - rank_neg) / m

    delongcov = np.cov(v01) / m + np.cov(v10) / n
    return aucs, delongcov

def run_delong(y_true, prob_a, prob_b):
    """Two-sided DeLong test for the difference between two correlated AUCs.

    Args:
        y_true: Ground-truth labels; samples equal to 1 are the positives.
        prob_a: Scores of model A, one per sample.
        prob_b: Scores of model B, one per sample.

    Returns:
        The two-sided p-value as a float. NaN when ``y_true`` does not
        contain both positives and non-positives (the AUC is undefined).
        When the estimated variance of the AUC difference is not positive,
        returns 1.0 if the AUCs are equal and 0.0 otherwise.

    Raises:
        ValueError: If the three inputs do not have the same number of samples.
    """
    # Flatten so column-shaped (N, 1) inputs index and sort like (N,) ones.
    y_true = np.asarray(y_true).ravel()
    prob_a = np.asarray(prob_a, dtype=np.float64).ravel()
    prob_b = np.asarray(prob_b, dtype=np.float64).ravel()
    if not (y_true.shape == prob_a.shape == prob_b.shape):
        raise ValueError("y_true, prob_a and prob_b must have the same number of samples")

    is_positive = (y_true == 1)
    num_pos = int(np.sum(is_positive))
    if num_pos == 0 or num_pos == y_true.size:
        return float("nan")

    # fastDeLong expects the positive samples in the leading columns.
    order = np.argsort(~is_positive, kind="stable")
    data = np.array([prob_a[order], prob_b[order]])
    aucs, cov = fastDeLong(data, num_pos)

    # Variance of AUC_A - AUC_B via the contrast vector [1, -1]
    contrast = np.array([[1.0, -1.0]])
    var_diff = (contrast @ cov @ contrast.T)[0, 0]
    auc_diff = abs(aucs[0] - aucs[1])

    if var_diff <= 0:
        # Degenerate case (e.g. both models rank the samples identically):
        # avoid 0/0 and report the limiting p-value instead of NaN.
        return 1.0 if auc_diff == 0 else 0.0

    z = auc_diff / np.sqrt(var_diff)
    # sf(z) keeps precision for large z, where 1 - cdf(z) rounds to exactly 0.
    return float(2.0 * scipy.stats.norm.sf(z))

# --- PAIRWISE COMPARISON (every model vs. the GAN scenario) ---
ref_model = "Esc6_GAN"  # reference model
if ref_model in predictions_dict:
    ref_probs = predictions_dict[ref_model]

    print(f"\nCOMPARACIONES ESTADÍSTICAS (Referencia: {ref_model})")
    print(f"{'MODELO':<15} | {'McNemar p':<12} | {'DeLong p':<12} | {'Significancia'}")
    print("-" * 60)

    for name, probs in predictions_dict.items():
        if name != ref_model:
            p_mc = run_mcnemar(y_test, ref_probs, probs)
            p_dl = run_delong(y_test, ref_probs, probs)

            # Flag the pair when either test falls below the 0.05 level
            if any(p < 0.05 for p in (p_mc, p_dl)):
                sig = "SIGNIFICATIVO"
            else:
                sig = "NO Sig."
            print(f"{name:<15} | {p_mc:.4f}       | {p_dl:.4f}       | {sig}")

In [None]:
# ==========================================
# 6. AUDITORÍA DE EXPLICABILIDAD (ROI GRAD-CAM)
# ==========================================

def make_gradcam_heatmap(img_array, model, last_conv_layer_name, output_size=(224, 224)):
    """Build a Grad-CAM heatmap for the class with the highest predicted score.

    Args:
        img_array: Batch containing the image to explain; only the first
            element of the batch is used for the heatmap.
        model: Keras model exposing a layer named ``last_conv_layer_name``.
        last_conv_layer_name: Name of the convolutional layer whose
            activations are weighted by the pooled gradients.
        output_size: ``(width, height)`` passed to ``cv2.resize``. Defaults
            to ``(224, 224)``, the size that was previously hard-coded.

    Returns:
        2-D float array of shape ``(height, width)``. Values are in [0, 1]
        when the map has positive activation, and all zeros otherwise.
    """
    # Pass model.inputs directly; wrapping it in another list nests the input structure.
    grad_model = tf.keras.models.Model(
        model.inputs, [model.get_layer(last_conv_layer_name).output, model.output]
    )
    with tf.GradientTape() as tape:
        last_conv_layer_output, preds = grad_model(img_array)
        pred_index = tf.argmax(preds[0])
        class_channel = preds[:, pred_index]

    # Gradient of the selected class score w.r.t. the feature maps,
    # averaged over batch and spatial axes to get one weight per channel.
    grads = tape.gradient(class_channel, last_conv_layer_output)
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))

    feature_maps = last_conv_layer_output[0]
    heatmap = tf.squeeze(feature_maps @ pooled_grads[..., tf.newaxis])
    heatmap = tf.maximum(heatmap, 0)

    # Normalise only when there is positive activation; dividing by a zero
    # maximum would fill the map with NaN.
    peak = tf.math.reduce_max(heatmap)
    if float(peak) > 0:
        heatmap = heatmap / peak

    return cv2.resize(heatmap.numpy(), tuple(output_size))

def ejecutar_auditoria_roi(model_name):
    """Measure how often the Grad-CAM peak falls inside the central region.

    For every test sample labelled 1, a Grad-CAM heatmap is computed on layer
    ``'conv5_block3_out'`` and the location of its maximum is checked against
    a central window that excludes a 5% margin on each side. The percentage
    of samples whose peak lies inside that window is printed.

    Samples whose heatmap is empty or non-finite, or whose Grad-CAM
    computation raises, count as misses (they stay in the denominator) and
    are reported in a separate warning line.

    Args:
        model_name: Key of ``model_files`` identifying the weights to load.

    Returns:
        None. Results are printed.
    """
    print(f"\nAuditando explicabilidad de: {model_name}...")
    path = os.path.join(MODELS_PATH, model_files[model_name])
    model = load_model(path, compile=False)

    hits = 0
    skipped = 0        # empty / non-finite heatmaps
    failed = 0         # Grad-CAM raised
    first_error = None

    try:
        # Keep only the class-1 (pneumonia) samples
        indices_pneumonia = np.where(y_test == 1)[0]
        total = len(indices_pneumonia)
        if total == 0:
            print("No hay muestras de la clase 1 en y_test; no se puede calcular el ROI Score.")
            return

        for idx in indices_pneumonia:
            img_input = np.expand_dims(X_test[idx], axis=0)

            try:
                heatmap = make_gradcam_heatmap(img_input, model, 'conv5_block3_out')
            except Exception as exc:
                # Record the failure instead of hiding it: a systematic error
                # (e.g. missing layer) would otherwise look like a 0% score.
                failed += 1
                if first_error is None:
                    first_error = exc
                continue

            if not np.all(np.isfinite(heatmap)) or np.max(heatmap) <= 0:
                skipped += 1
                continue

            # Location of maximum attention
            height, width = heatmap.shape
            margin_y = int(height * 0.05)
            margin_x = int(width * 0.05)
            cY, cX = np.unravel_index(np.argmax(heatmap), heatmap.shape)

            # Is the peak inside the central window (ROI)?
            if (margin_x < cX < width - margin_x) and (margin_y < cY < height - margin_y):
                hits += 1
    finally:
        # Release the model, as the evaluation loop does after each model.
        del model
        tf.keras.backend.clear_session()

    score = (hits / total) * 100
    print(f"Robustez de Atención (ROI Score): {score:.2f}% ({hits}/{total})")
    if skipped or failed:
        print(f"Aviso: {skipped} mapas vacíos o no válidos y {failed} errores de Grad-CAM se contaron como fallos.")
        if first_error is not None:
            print(f"Primer error de Grad-CAM: {first_error!r}")

# Run the audit on the GAN model, but only when its weights file is available
# (the evaluation loop also skips models whose file is missing).
audit_target = "Esc6_GAN"
if audit_target in model_files and os.path.exists(
    os.path.join(MODELS_PATH, model_files[audit_target])
):
    ejecutar_auditoria_roi(audit_target)
else:
    print(f"Auditoría omitida: no se encontró el modelo {audit_target}.")