# 01 — Config + Index + Split + Dataloaders (FIX)

Este notebook:
- Configura rutas/semilla
- Indexa imágenes
- Divide **train/val/test** (estratificado)
- **Guarda los splits** en `OUT_DIR` (CSV) para reutilizarlos en todos los notebooks
- Construye `train_loader`, `val_loader`, `test_loader` con transforms correctos (`train_tfms` y `eval_tfms`)


In [5]:
# ===== CONFIG (FIX) =====
import os, random
import numpy as np
import torch

# Paths. The defaults are the original machine-specific locations; set the
# COVID_DATA_DIR / COVID_OUT_DIR environment variables to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get("COVID_DATA_DIR", r"C:\Users\User\Downloads\Dataset_COVID\Balanceo")
OUT_DIR  = os.environ.get("COVID_OUT_DIR", r"C:\Users\User\Downloads\COVID_HYBRID_experiment2")
# Create the output folder up front so later cells can write into it.
os.makedirs(OUT_DIR, exist_ok=True)

# Reproducibility: seed every RNG the notebook relies on with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
# Seed every CUDA device too when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Device selection: use the GPU only when it is requested AND actually available.
USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)
if DEVICE == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# Base hyperparameters (same values as the original notebook)
IMG_SIZE    = 224
BATCH_SIZE  = 12
NUM_WORKERS = 0

CLASS_NAMES = ["No-COVID","COVID"]

# Persist a small runtime config so the other notebooks can stay in sync
import json
config_path = os.path.join(OUT_DIR, "config_runtime.json")
runtime_config = {
    "SEED": SEED,
    "IMG_SIZE": IMG_SIZE,
    "BATCH_SIZE": BATCH_SIZE,
    "NUM_WORKERS": NUM_WORKERS,
    "CLASS_NAMES": CLASS_NAMES,
}
with open(config_path, "w", encoding="utf-8") as cfg_file:
    json.dump(runtime_config, cfg_file, ensure_ascii=False, indent=2)

print("Config guardada en:", config_path)


DEVICE: cuda
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
Config guardada en: C:\Users\User\Downloads\COVID_HYBRID_experiment2\config_runtime.json


In [7]:
# ===== INDEX + SUBSET (usa tu función original, aquí va un ejemplo genérico) =====
import pandas as pd

def build_index(root):
    """Index the image files stored in one sub-folder per class under ``root``.

    Args:
        root: directory whose immediate sub-directories are the class folders.

    Returns:
        pandas.DataFrame with columns ``"path"`` (full file path) and ``"class"``
        (name of the sub-folder), one row per image file. Rows are ordered by
        class folder name, then by file name. The frame is empty (but still has
        both columns) when no image is found.

    Raises:
        OSError: propagated from ``os.listdir`` when ``root`` cannot be listed.
    """
    image_exts = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}
    rows = []
    for cls in sorted(os.listdir(root)):
        class_dir = os.path.join(root, cls)
        if not os.path.isdir(class_dir):
            continue
        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order (and any seeded shuffle/split built on it) the same everywhere.
        for fname in sorted(os.listdir(class_dir)):
            full_path = os.path.join(class_dir, fname)
            ext = os.path.splitext(fname)[1].lower()
            # Skip non-files (e.g. a directory that happens to end in ".png").
            if ext in image_exts and os.path.isfile(full_path):
                rows.append((full_path, cls))
    return pd.DataFrame(rows, columns=["path", "class"])

df = build_index(DATA_DIR)
# Fail fast with a clear message when DATA_DIR is wrong or holds no images,
# instead of hitting an obscure error later in the split cell.
if df.empty:
    raise RuntimeError(
        f"No images found under DATA_DIR={DATA_DIR!r}; expected one sub-folder per class."
    )

# Map class-folder names to explicit binary labels (adjust if your folders use other names).
# Convention: "COVID" -> 1; "No-COVID" / "Non-COVID" / "Normal" / anything else -> 0
def to_label(c):
    """Return the binary label for a class-folder name.

    Args:
        c: class name (any object; it is converted with ``str``).

    Returns:
        1 when the name mentions COVID without a negation prefix, otherwise 0.
        Matching is case-insensitive and ignores surrounding whitespace.
    """
    import re  # local import keeps this cell self-contained

    c_up = str(c).strip().upper()
    if "COVID" not in c_up:
        return 0
    # A plain substring test would label "No-COVID" as positive. Treat a
    # NO / NON / NOT prefix (optionally followed by space, "_" or "-") directly
    # before COVID as the negative class.
    if re.search(r"(?<![A-Z])(?:NO|NON|NOT)[\s_\-]*COVID", c_up):
        return 0
    return 1

# Attach the binary label column, then report dataset size and class balance.
class_labels = df["class"].apply(to_label)
df["label"] = class_labels
label_counts = df["label"].value_counts().sort_index()

print("Total imágenes:", len(df))
print(label_counts)


Total imágenes: 9564
label
0    5948
1    3616
Name: count, dtype: int64


In [9]:
# ===== SPLIT (estratificado) + GUARDAR CSV =====
from sklearn.model_selection import train_test_split

# Shuffle into a NEW frame instead of overwriting `df`: re-running this cell then
# starts from the same row order and reproduces exactly the same splits.
# (`df` itself keeps the index order produced by the previous cell.)
df_shuffled = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# Hold out 15% for test, then 15% of the remainder for validation; both
# splits are stratified on the binary label.
trainval_df, test_df = train_test_split(
    df_shuffled, test_size=0.15, stratify=df_shuffled["label"], random_state=SEED
)
train_df, val_df = train_test_split(
    trainval_df, test_size=0.15, stratify=trainval_df["label"], random_state=SEED
)

# Sanity checks: the three parts cover every image and no file is shared.
assert len(train_df) + len(val_df) + len(test_df) == len(df_shuffled), "splits do not cover the dataset"
_train_paths, _val_paths, _test_paths = (set(part["path"]) for part in (train_df, val_df, test_df))
assert not (_train_paths & _val_paths), "train/val leakage: shared image paths"
assert not (_train_paths & _test_paths), "train/test leakage: shared image paths"
assert not (_val_paths & _test_paths), "val/test leakage: shared image paths"

for name, part in [("train",train_df),("val",val_df),("test",test_df)]:
    print(f"{name} ({len(part)}):", part["label"].value_counts().sort_index().to_dict())

# Save the splits (key to NOT evaluating on the whole dataset by mistake)
train_path = os.path.join(OUT_DIR, "train_split.csv")
val_path   = os.path.join(OUT_DIR, "val_split.csv")
test_path  = os.path.join(OUT_DIR, "test_split.csv")

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print("Splits guardados:")
print(" -", train_path)
print(" -", val_path)
print(" -", test_path)


train (6909): {0: 4297, 1: 2612}
val (1220): {0: 759, 1: 461}
test (1435): {0: 892, 1: 543}
Splits guardados:
 - C:\Users\User\Downloads\COVID_HYBRID_experiment2\train_split.csv
 - C:\Users\User\Downloads\COVID_HYBRID_experiment2\val_split.csv
 - C:\Users\User\Downloads\COVID_HYBRID_experiment2\test_split.csv


In [11]:
# ===== DATASET + DATALOADERS (FIX: sin '...' y con eval_tfms) =====
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

class XRayDataset(Dataset):
    """Dataset of images described by a DataFrame with "path" and "label" columns.

    Args:
        df: frame with one row per image; it must contain a "path" column
            (image file location) and a "label" column (value convertible to int).
        transform: optional callable applied to the RGB PIL image before it is
            returned.
    """

    def __init__(self, df, transform=None):
        # Drop the incoming index so positions 0..len-1 address the rows.
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is converted to grayscale and then back to RGB, and the
        transform (if any) is applied to the result. The label is a Python int.
        """
        # Read just the two needed cells rather than building the whole row twice.
        p = self.df["path"].iloc[idx]
        y = int(self.df["label"].iloc[idx])
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied into the converted image.
        with Image.open(p) as raw:
            img = raw.convert("L").convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, y

# Values shared by the training and evaluation pipelines.
_target_size = (IMG_SIZE, IMG_SIZE)
_norm_mean = [0.5]*3
_norm_std = [0.5]*3

# Training pipeline: light augmentation (horizontal flip, small rotation).
train_tfms = transforms.Compose([
    transforms.Resize(_target_size),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(7),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
])

# Evaluation/test pipeline: deterministic, no augmentation.
eval_tfms = transforms.Compose([
    transforms.Resize(_target_size),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
])

# Only the training set is augmented; validation and test share the
# deterministic evaluation pipeline.
train_ds, val_ds, test_ds = (
    XRayDataset(frame, transform=tfms)
    for frame, tfms in ((train_df, train_tfms), (val_df, eval_tfms), (test_df, eval_tfms))
)

# Settings common to the three loaders; only the training loader shuffles.
loader_kwargs = {
    "batch_size": BATCH_SIZE,
    "num_workers": NUM_WORKERS,
    "pin_memory": DEVICE == "cuda",
}
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

# Smoke test: pull a single training batch and show its tensor shapes.
first_batch = next(iter(train_loader))
xb, yb = first_batch
print("Sanity batch:", xb.shape, yb.shape)


Sanity batch: torch.Size([12, 3, 224, 224]) torch.Size([12])
