In [None]:
# ==========================================
# 1. INSTALACIÓN DE DEPENDENCIAS
# ==========================================
!pip install tensorflow keras scikit-learn matplotlib pandas numpy opencv-python-headless

import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import tensorflow.keras.backend as K
from sklearn.utils import resample, shuffle

# Seed configuration for reproducibility
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy's legacy global RNG and
    TensorFlow's global RNG, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Function-scope import so this cell does not depend on the import cell
    # being edited as well.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
    # here cannot change hashing in the already-running kernel; it is only
    # inherited by child processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything()
print(f"TensorFlow Version: {tf.__version__}")

In [None]:
# ==========================================
# 2. CONEXIÓN CON DRIVE Y CARGA DE DATOS
# ==========================================
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted Drive; every other location hangs off it.
BASE_PATH = '/content/drive/MyDrive/proyecto_completo/'
# Preprocessed arrays, trained models, and the folder holding the generated .npy.
DATA_PATH, MODELS_PATH, GAN_DATA_PATH = (
    os.path.join(BASE_PATH, subdir)
    for subdir in ('preprocesamiento', 'models', 'checkpoints')
)

os.makedirs(MODELS_PATH, exist_ok=True)

# Load the original (unbalanced) training split and the validation split.
print("Cargando datos originales...")
X_train_orig, y_train_orig, X_val, y_val = (
    np.load(os.path.join(DATA_PATH, array_file))
    for array_file in (
        'X_train_unbalanced.npy',
        'y_train_unbalanced.npy',
        'X_val_improved.npy',
        'y_val_improved.npy',
    )
)

print(f"X_train Original: {X_train_orig.shape}")
print(f"Distribución Original: {np.bincount(y_train_orig.astype(int))}")

In [None]:
# ==========================================
# 3. ESTRATEGIAS DE BALANCEO DE DATOS
# ==========================================

# --- A. RANDOM OVER SAMPLING (ROS) - used by Scenario 5 ---
print("\n--- Preparando Dataset ROS ---")
X_normal = X_train_orig[y_train_orig == 0]
X_pneumonia = X_train_orig[y_train_orig == 1]

# Resample class 0 with replacement until it matches the class 1 count.
# NOTE(review): this assumes class 0 is the minority class; if it were the
# majority, this call would shrink it instead. Confirm against the printed
# class distribution.
X_normal_ros = resample(X_normal, replace=True, n_samples=len(X_pneumonia), random_state=42)

X_train_ros = np.concatenate((X_normal_ros, X_pneumonia))
y_train_ros = np.concatenate((np.zeros(len(X_normal_ros)), np.ones(len(X_pneumonia))))
X_train_ros, y_train_ros = shuffle(X_train_ros, y_train_ros, random_state=42)
print(f"Dataset ROS listo: {X_train_ros.shape}")

# --- B. SYNTHETIC DATA AUGMENTATION (GAN) - used by Scenarios 6 and 7 ---
print("\n--- Preparando Dataset GAN ---")
# Locate the synthetic images produced by the GAN notebook. os.listdir returns
# entries in arbitrary order, so sort to make the selection deterministic when
# several matching files exist.
gan_candidates = sorted(
    f for f in os.listdir(GAN_DATA_PATH)
    if 'generated_data' in f and f.endswith('.npy')
)
if not gan_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No '*generated_data*.npy' file found in {GAN_DATA_PATH}; "
        "run the GAN generation notebook first."
    )
gan_file = gan_candidates[0]
synthetic_path = os.path.join(GAN_DATA_PATH, gan_file)
print(f"Cargando sintéticos desde: {synthetic_path}")

synthetic_imgs = np.load(synthetic_path)

# Bring the synthetic images to the classifier's input layout (N, H, W, 3).
if synthetic_imgs.ndim == 3:
    synthetic_imgs = np.expand_dims(synthetic_imgs, axis=-1)
# NOTE(review): dividing by 255 assumes the saved synthetic images are in
# [0, 255] and that X_train_orig is already scaled to [0, 1] — confirm both.
synthetic_imgs = synthetic_imgs.astype('float32') / 255.0
# Only replicate the channel when the images are single-channel; repeating an
# array that is already RGB would produce 9 channels and break concatenation.
if synthetic_imgs.shape[-1] == 1:
    synthetic_imgs = np.repeat(synthetic_imgs, 3, axis=-1)

if synthetic_imgs.shape[1:] != X_train_orig.shape[1:]:
    raise ValueError(
        f"Synthetic images have shape {synthetic_imgs.shape[1:]} per sample, "
        f"expected {X_train_orig.shape[1:]} to match X_train_orig."
    )

# Labels for the synthetic samples (assumes the GAN generated class 0).
synthetic_labels = np.zeros(len(synthetic_imgs))

# Append the synthetic samples to the original training set and shuffle.
X_train_gan = np.concatenate((X_train_orig, synthetic_imgs), axis=0)
y_train_gan = np.concatenate((y_train_orig, synthetic_labels), axis=0)
X_train_gan, y_train_gan = shuffle(X_train_gan, y_train_gan, random_state=42)

print(f"Dataset GAN listo: {X_train_gan.shape}")

In [None]:
# ==========================================
# 4. FUNCIONES DE PÉRDIDA PERSONALIZADAS
# ==========================================

# --- FOCAL LOSS ---
def focal_loss(gamma=2.0, alpha=0.25):
    """Build a binary focal loss function.

    Args:
        gamma: Exponent applied to ``(1 - pt)``.
        alpha: Weight applied where ``y_true == 1``; ``1 - alpha`` is applied
            elsewhere.

    Returns:
        A ``(y_true, y_pred) -> scalar`` loss callable.
    """
    def focal_loss_fixed(y_true, y_pred):
        labels = tf.cast(y_true, tf.float32)
        # Clip probabilities away from 0 and 1 so the log stays finite.
        probs = tf.clip_by_value(
            tf.cast(y_pred, tf.float32), K.epsilon(), 1. - K.epsilon()
        )
        is_positive = tf.equal(labels, 1)
        # pt is the probability the model assigned to the true class.
        pt = tf.where(is_positive, probs, 1 - probs)
        alpha_factor = tf.where(is_positive, alpha, 1 - alpha)
        modulating = K.pow(1 - pt, gamma)
        return K.mean(-alpha_factor * modulating * K.log(pt))
    return focal_loss_fixed

# --- BWCCE (Balanced Weighted Cross Entropy) ---
def bwce_loss(class_weights={0: 1.85, 1: 0.69}):
    """Build a class-weighted binary cross-entropy loss function.

    Args:
        class_weights: Mapping with keys ``0`` and ``1`` giving the weight
            applied to samples of each label. The mapping is only read.

    Returns:
        A ``(y_true, y_pred) -> scalar`` loss callable.
    """
    def bwce_loss_fixed(y_true, y_pred):
        labels = tf.cast(y_true, tf.float32)
        # Clip probabilities away from 0 and 1 so both logs stay finite.
        probs = tf.clip_by_value(
            tf.cast(y_pred, tf.float32), K.epsilon(), 1 - K.epsilon()
        )
        positive_term = labels * K.log(probs)
        negative_term = (1 - labels) * K.log(1 - probs)
        bce = -(positive_term + negative_term)
        # Per-sample weight selected by the label value.
        weights = labels * class_weights[1] + (1 - labels) * class_weights[0]
        return K.mean(weights * bce)
    return bwce_loss_fixed

# --- LDAM LOSS ---
# Per-class margins. The original note says they are frequency-based and were
# computed beforehand with C=0.4; the values are marked as approximate.
margin_normal = 0.04  # margin applied when y_true != 1
margin_pneumonia = 0.02  # margin applied when y_true == 1
def ldams_loss():
    """Build a margin-adjusted binary cross-entropy (LDAM-style) loss.

    The predicted probability is converted back to a logit, shifted against
    the true class by that class's margin (``margin_pneumonia`` for label 1,
    ``margin_normal`` otherwise), mapped back to a probability, and scored
    with binary cross-entropy.

    Returns:
        A ``(y_true, y_pred) -> scalar`` loss callable.
    """
    def ldams_loss_fixed(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        # Cast predictions too, as focal_loss and bwce_loss do; otherwise a
        # non-float32 model output cannot be combined with the float32 margins.
        y_pred = tf.cast(y_pred, tf.float32)
        y_pred = tf.clip_by_value(y_pred, K.epsilon(), 1 - K.epsilon())
        # Invert the sigmoid to recover the logit.
        logit = tf.math.log(y_pred / (1 - y_pred))
        is_pneumonia = tf.equal(y_true, 1.0)
        margin = tf.where(is_pneumonia, margin_pneumonia, margin_normal)
        # Shift the logit away from the true class by that class's margin.
        adjusted_logit = tf.where(is_pneumonia, logit - margin, logit + margin)
        exp_adjusted = tf.exp(adjusted_logit)
        adjusted_pred = exp_adjusted / (exp_adjusted + 1.0)
        bce = - (y_true * tf.math.log(adjusted_pred + K.epsilon()) +
                 (1 - y_true) * tf.math.log(1 - adjusted_pred + K.epsilon()))
        return K.mean(bce)
    return ldams_loss_fixed

In [None]:
# ==========================================
# 5. ARQUITECTURA CLASIFICADOR BASE
# ==========================================
def create_resnet50_model(input_shape=(224, 224, 3)):
    """Build a binary classifier on top of a frozen ImageNet ResNet50.

    Args:
        input_shape: Shape of one input image, passed to ``ResNet50``.

    Returns:
        An uncompiled ``Model`` mapping the ResNet50 input to a single
        sigmoid output.
    """
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    backbone.trainable = False  # freeze the convolutional base

    # Classification head: pooling, then two Dense+Dropout stages.
    features = GlobalAveragePooling2D()(backbone.output)
    for units, drop_rate in ((1024, 0.5), (512, 0.3)):
        features = Dense(units, activation='relu')(features)
        features = Dropout(drop_rate)(features)
    predictions = Dense(1, activation='sigmoid')(features)

    return Model(inputs=backbone.input, outputs=predictions)

In [None]:
# ==========================================
# 6. CONFIGURACIÓN COMÚN
# ==========================================
# Hyperparameters shared by every training scenario.
BATCH_SIZE = 32
EPOCHS = 50
LR = 1e-4

# On-the-fly augmentation; in this notebook it is applied to training data only.
train_datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Base callbacks
def get_callbacks(filename):
    """Create the callback list used by every training run.

    Args:
        filename: Checkpoint file name, joined onto ``MODELS_PATH``.

    Returns:
        A list with a ``ModelCheckpoint`` (best ``val_accuracy``), an
        ``EarlyStopping`` (on ``val_loss``, restoring the best weights) and a
        ``ReduceLROnPlateau`` (on ``val_loss``), in that order.
    """
    checkpoint_path = os.path.join(MODELS_PATH, filename)
    checkpoint = ModelCheckpoint(
        checkpoint_path, save_best_only=True, monitor='val_accuracy', mode='max'
    )
    early_stop = EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True
    )
    lr_decay = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7
    )
    return [checkpoint, early_stop, lr_decay]

In [None]:
# ==========================================
# 7. EJECUCIÓN DE ESCENARIOS (1 al 5)
# ==========================================

def _train_scenario(title, loss, X_train, y_train, checkpoint_name):
    """Create, compile and train one frozen-backbone ResNet50 scenario.

    Args:
        title: Scenario label inserted into the progress message,
            e.g. ``"1: BASE"``.
        loss: Loss passed to ``compile`` (a Keras loss name or a callable).
        X_train: Training images fed through ``train_datagen``.
        y_train: Training labels aligned with ``X_train``.
        checkpoint_name: File name handed to ``get_callbacks``.

    Returns:
        The trained model.
    """
    print(f"\n--- Entrenando Escenario {title} ---")
    model = create_resnet50_model()
    model.compile(optimizer=Adam(LR), loss=loss, metrics=['accuracy'])
    model.fit(
        train_datagen.flow(X_train, y_train, batch_size=BATCH_SIZE),
        steps_per_epoch=len(X_train) // BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_val, y_val),
        callbacks=get_callbacks(checkpoint_name),
    )
    return model


# --- SCENARIO 1: BASE (plain binary cross-entropy on the unbalanced data) ---
model_s1 = _train_scenario("1: BASE", 'binary_crossentropy',
                           X_train_orig, y_train_orig,
                           'resnet50_neumonia_final.h5')

# --- SCENARIO 2: FOCAL LOSS ---
model_s2 = _train_scenario("2: FOCAL", focal_loss(),
                           X_train_orig, y_train_orig,
                           'resnet50_focal_loss_final.h5')

# --- SCENARIO 3: BWCCE ---
model_s3 = _train_scenario("3: BWCCE", bwce_loss(),
                           X_train_orig, y_train_orig,
                           'resnet50_bwcc_final.h5')

# --- SCENARIO 4: LDAM ---
model_s4 = _train_scenario("4: LDAM", ldams_loss(),
                           X_train_orig, y_train_orig,
                           'resnet50_ldam_final.h5')

# --- SCENARIO 5: ROS (binary cross-entropy on the oversampled data) ---
model_s5 = _train_scenario("5: ROS", 'binary_crossentropy',
                           X_train_ros, y_train_ros,
                           'resnet50_ROS_best.h5')

In [None]:
# ==========================================
# 8. SCENARIO 6: BASE + GAN (with fine-tuning)
# ==========================================
print("\n--- Entrenando Escenario 6: GAN (Fase 1) ---")
model_s6 = create_resnet50_model()
model_s6.compile(optimizer=Adam(LR), loss='binary_crossentropy', metrics=['accuracy'])

# Phase 1: train the classification head while the backbone stays frozen.
hist_s6_1 = model_s6.fit(train_datagen.flow(X_train_gan, y_train_gan, batch_size=BATCH_SIZE),
                         steps_per_epoch=len(X_train_gan)//BATCH_SIZE, epochs=EPOCHS,
                         validation_data=(X_val, y_val),
                         callbacks=get_callbacks('resnet50_Base_GAN_temp.h5'))

print("\n--- Entrenando Escenario 6: GAN (Fase 2 - Fine Tuning) ---")
# Unfreeze the last 30 layers, but leave BatchNormalization layers frozen.
# Unfreezing them would let their moving statistics update on the small
# fine-tuning batches, which the Keras transfer-learning guidance warns can
# undo what the pretrained backbone learned.
for layer in model_s6.layers[-30:]:
    if not isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = True

# Recompile so the trainable changes take effect, with a lower learning rate.
model_s6.compile(optimizer=Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Phase 2: fine-tuning.
model_s6.fit(train_datagen.flow(X_train_gan, y_train_gan, batch_size=BATCH_SIZE),
             steps_per_epoch=len(X_train_gan)//BATCH_SIZE, epochs=30,
             validation_data=(X_val, y_val),
             callbacks=get_callbacks('resnet50_Base_GAN_FINETUNED_best.h5'))

In [None]:
# ==========================================
# 9. SCENARIO 7: HYBRID (FOCAL + GAN + fine-tuning)
# ==========================================
print("\n--- Entrenando Escenario 7: HÍBRIDO (Fase 1) ---")
model_s7 = create_resnet50_model()
model_s7.compile(optimizer=Adam(LR), loss=focal_loss(), metrics=['accuracy'])

# Phase 1: train the classification head while the backbone stays frozen.
model_s7.fit(train_datagen.flow(X_train_gan, y_train_gan, batch_size=BATCH_SIZE),
             steps_per_epoch=len(X_train_gan)//BATCH_SIZE, epochs=EPOCHS,
             validation_data=(X_val, y_val),
             callbacks=get_callbacks('resnet50_Focal_GAN_temp.h5'))

print("\n--- Entrenando Escenario 7: HÍBRIDO (Fase 2 - Fine Tuning) ---")
# Unfreeze the last 30 layers, but leave BatchNormalization layers frozen.
# Unfreezing them would let their moving statistics update on the small
# fine-tuning batches, which the Keras transfer-learning guidance warns can
# undo what the pretrained backbone learned.
for layer in model_s7.layers[-30:]:
    if not isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = True

# Recompile with a freshly built focal loss callable and a lower learning
# rate so the trainable changes take effect.
model_s7.compile(optimizer=Adam(1e-5), loss=focal_loss(), metrics=['accuracy'])

# Phase 2: fine-tuning.
model_s7.fit(train_datagen.flow(X_train_gan, y_train_gan, batch_size=BATCH_SIZE),
             steps_per_epoch=len(X_train_gan)//BATCH_SIZE, epochs=30,
             validation_data=(X_val, y_val),
             callbacks=get_callbacks('resnet50_Focal_GAN_FINETUNED_best.h5'))

print("¡Todos los escenarios entrenados y modelos guardados!")