In [23]:
import random
from pathlib import Path
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from torchvision.transforms import InterpolationMode

# === CONFIG ===
# Dataset root passed to ImageFolder below.
ROOT = "datasets"         # contains 'vacio/' and 'con_pez/' (images under con_pez/ are grouped by the path components 1_pez / 2_peces / 3_peces)
# Fraction of each group assigned to the training split; the rest goes to validation.
TRAIN_RATIO = 0.8
# Batch size used by both DataLoaders.
BATCH_SIZE  = 32
# Seed for Python's `random` module, which drives the shuffling done by the split.
SEED        = 2908
random.seed(SEED)


# Preprocessing pipelines. Only the final normalization follows the ImageNet
# convention (channel mean/std below); the augmentations are specific to this task.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]


def _tensor_and_normalize():
    """Return the tail shared by both pipelines: ToTensor followed by Normalize."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
    ]


# Light augmentations applied on the fly to training images only.
_train_augmentations = [
    # Small framing / zoom variation
    transforms.RandomResizedCrop(
        size=224,
        scale=(0.90, 1.00),
        ratio=(0.95, 1.05),
        interpolation=InterpolationMode.BICUBIC,
    ),
    # Horizontal mirroring
    transforms.RandomHorizontalFlip(p=0.5),
    # Slight rotation, applied only some of the time
    transforms.RandomApply(
        [transforms.RandomRotation(degrees=8, interpolation=InterpolationMode.BICUBIC)],
        p=0.4,
    ),
    # Brightness / contrast / saturation / hue jitter, applied only some of the time
    transforms.RandomApply(
        [transforms.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.10, hue=0.02)],
        p=0.5,
    ),
    # Mild Gaussian blur, applied only some of the time
    transforms.RandomApply(
        [transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 0.8))],
        p=0.3,
    ),
]
train_tf = transforms.Compose(_train_augmentations + _tensor_and_normalize())

# Validation: deterministic resize to the network input size, no augmentation.
val_tf = transforms.Compose(
    [transforms.Resize((224, 224), interpolation=InterpolationMode.BICUBIC)]
    + _tensor_and_normalize()
)

# Load the same folder tree twice so that training and validation can use
# different transforms. ImageFolder turns every first-level subfolder of ROOT
# into one class.
ds_train_view = datasets.ImageFolder(ROOT, transform=train_tf)
ds_val_view   = datasets.ImageFolder(ROOT, transform=val_tf)

# Sample positions computed from one view are later used to index the other,
# so both scans must list exactly the same files in the same order.
if ds_train_view.samples != ds_val_view.samples:
    raise RuntimeError(
        "Las dos vistas de ImageFolder no coinciden; los índices del split no serían válidos."
    )

print("Clases detectadas:", ds_train_view.classes)
class_to_idx = ds_train_view.class_to_idx   # class-name -> label mapping built by ImageFolder
idx_con = class_to_idx["con_pez"]
idx_vac = class_to_idx["vacio"]
print("Diccionario de clase: ", class_to_idx)

# List of (image path, numeric label) tuples, in dataset order.
samples = ds_train_view.samples

# --- Recolectar índices ---
def indices_con_pez_de(subcarpeta: str):
    """Return the positions in `samples` of 'con_pez' images stored under `subcarpeta`.

    A sample qualifies when its label equals `idx_con` and `subcarpeta` is an
    exact component of its file path (matched against `Path(...).parts`, so a
    partial folder name does not match).

    Args:
        subcarpeta: folder name to look for in each sample path.

    Returns:
        List of integer positions into the module-level `samples` list,
        in increasing order.
    """
    return [
        position
        for position, (image_path, label) in enumerate(samples)
        if label == idx_con and subcarpeta in Path(image_path).parts
    ]

# Positions of every sample labelled 'vacio'.
idx_vacio = [position for position, sample in enumerate(samples) if sample[1] == idx_vac]

# Positions of 'con_pez' samples, one list per fish-count subfolder.
idx_con_1, idx_con_2, idx_con_3 = (
    indices_con_pez_de(folder_name) for folder_name in ("1_pez", "2_peces", "3_peces")
)

def split_indices(idxs, ratio=TRAIN_RATIO):
    """Shuffle a copy of `idxs` and split it into (train, val) lists.

    The shuffle uses the module-level `random` generator, so the result
    depends on the current RNG state (seeded in the config section).

    Args:
        idxs: iterable of sample indices; it is copied, never modified.
        ratio: fraction in [0, 1] of the items that goes to the first list.
            Defaults to TRAIN_RATIO.

    Returns:
        Tuple `(train, val)`: the first `int(len(idxs) * ratio)` shuffled
        items and the remaining ones.

    Raises:
        ValueError: if `ratio` is outside [0, 1]. Without this check a
            negative ratio would silently slice from the end of the list.
    """
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"ratio debe estar en [0, 1]; se recibió {ratio!r}")

    shuffled = list(idxs)  # copy, and accept any iterable, not just lists
    random.shuffle(shuffled)
    cut = int(len(shuffled) * ratio)
    return shuffled[:cut], shuffled[cut:]
    
# Split every group on its own so each one keeps the same train/val proportion.
# The groups are processed in a fixed order because each call advances the
# shared `random` state.
(tr_vac, va_vac), (tr_1, va_1), (tr_2, va_2), (tr_3, va_3) = (
    split_indices(group) for group in (idx_vacio, idx_con_1, idx_con_2, idx_con_3)
)

train_indices = [*tr_vac, *tr_1, *tr_2, *tr_3]
val_indices   = [*va_vac, *va_1, *va_2, *va_3]

# Each subset points at the view carrying the matching transform:
# on-the-fly augmentation for training (train_tf), none for validation (val_tf).
train_ds = Subset(ds_train_view, train_indices)
val_ds   = Subset(ds_val_view,   val_indices)


# --- Quick report ---
print("Conteos por grupo (train | val):")
print(f"  vacio   : {len(tr_vac):4d} | {len(va_vac):4d}")
print(f"  1_pez   : {len(tr_1):4d} | {len(va_1):4d}")
print(f"  2_peces : {len(tr_2):4d} | {len(va_2):4d}")
print(f"  3_peces : {len(tr_3):4d} | {len(va_3):4d}")
print(f"TOTAL -> train: {len(train_ds)} | val: {len(val_ds)}")

Clases detectadas: ['con_pez', 'vacio']
Diccionario de clase:  {'con_pez': 0, 'vacio': 1}
Conteos por grupo (train | val):
  vacio   :  600 |  150
  1_pez   :  624 |  157
  2_peces :  209 |   53
  3_peces :   91 |   23
TOTAL -> train: 1524 | val: 383


In [32]:
import torch
from torch import nn, optim, amp
from torchvision import models

# ===== Training configuration =====
EPOCHS = 15
LR     = 3e-4
PATIENCE = 5  # early stopping: epochs allowed without val_acc improvement

# Basic reproducibility for torch-side randomness.
# NOTE(review): this seed (42) differs from SEED used for the split — confirm that is intended.
torch.manual_seed(42)


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                      num_workers=2, pin_memory=(DEVICE=="cuda"))
val_dl   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False,
                      num_workers=2, pin_memory=(DEVICE=="cuda"))

# ===== Model: ImageNet-pretrained MobileNetV3-Small with a new classification head =====
NUM_CLASSES = len(class_to_idx)
model = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.IMAGENET1K_V1)
# Replace only the last linear layer; its input width is read from the model
# instead of being hard-coded, and the output width follows the dataset classes.
head_in_features = model.classifier[3].in_features
model.classifier[3] = nn.Linear(head_in_features, NUM_CLASSES)
model = model.to(DEVICE)

# Class-weighted loss: 'con_pez' errors cost twice as much as 'vacio' errors.
# The weight is placed by class name so it stays correct if the label order changes.
class_weights = torch.ones(NUM_CLASSES)
class_weights[class_to_idx["con_pez"]] = 2.0
criterion = nn.CrossEntropyLoss(weight=class_weights.to(DEVICE))
optimizer = optim.Adam(model.parameters(), lr=LR)

# Halve the learning rate when val_loss has not improved for 2 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5,
                                                 patience=2)

# ===== Funciones auxiliares =====
# Gradient scaler for mixed precision. It is created once so the dynamic loss
# scale carries over between epochs instead of restarting on every call, and
# it is not rebuilt for validation passes. With enabled=False (CPU) it is a no-op.
_GRAD_SCALER = amp.GradScaler(device="cuda", enabled=(DEVICE == "cuda"))


def run_epoch(dataloader, train: bool):
    """Run one full pass over `dataloader` and return `(avg_loss, accuracy)`.

    Uses the module-level `model`, `criterion`, `optimizer` and `DEVICE`.

    Args:
        dataloader: iterable yielding `(inputs, labels)` batches.
        train: if True the model is put in training mode and the optimizer is
            stepped on every batch; if False the model is put in eval mode and
            gradients are disabled.

    Returns:
        Tuple `(avg_loss, acc)`: sample-weighted mean loss and the fraction of
        correctly classified samples over the whole pass.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.train(train)  # train(False) is equivalent to eval()

    use_amp = (DEVICE == "cuda")
    total, correct, loss_sum = 0, 0, 0.0

    for x, y in dataloader:
        x, y = x.to(DEVICE), y.to(DEVICE)

        if train:
            optimizer.zero_grad(set_to_none=True)

        with torch.set_grad_enabled(train):
            # Mixed precision is only active on GPU.
            with amp.autocast(device_type="cuda", enabled=use_amp):
                logits = model(x)
                loss = criterion(logits, y)

            if train:
                _GRAD_SCALER.scale(loss).backward()
                _GRAD_SCALER.step(optimizer)
                _GRAD_SCALER.update()

        batch_size = x.size(0)
        # Weight the batch loss by its size so the last (smaller) batch is not over-counted.
        loss_sum += loss.item() * batch_size
        correct += (logits.argmax(1) == y).sum().item()
        total += batch_size

    if total == 0:
        raise ValueError("run_epoch: el dataloader no produjo ninguna muestra.")

    return loss_sum / total, correct / total

# ===== Training loop with early stopping =====
best_acc, best_state, epochs_no_improve = 0.0, None, 0

for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = run_epoch(train_dl, train=True)
    val_loss,   val_acc   = run_epoch(val_dl,   train=False)

    # The LR scheduler watches validation loss.
    scheduler.step(val_loss)

    print(f"Epoch {epoch:02d} | "
          f"train_loss={train_loss:.4f} acc={train_acc:.3f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.3f} | "
          f"lr={optimizer.param_groups[0]['lr']:.2e}")

    # Strict improvement in validation accuracy: snapshot the weights on CPU
    # and reset the patience counter.
    if val_acc > best_acc:
        best_acc = val_acc
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        epochs_no_improve = 0
        continue

    # No improvement this epoch: stop once patience is exhausted.
    epochs_no_improve += 1
    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping: sin mejora en val_acc por {PATIENCE} épocas.")
        break

# ===== Save the best model =====
save_path = "mobilenetv3_binaria_best.pt"
if best_state is not None:
    # Restore the best snapshot before writing it to disk.
    model.load_state_dict(best_state)
torch.save(model.state_dict(), save_path)
print(f"\nMejor val_acc = {best_acc:.3f}. Pesos guardados en: {save_path}")



Epoch 01 | train_loss=0.1933 acc=0.885 | val_loss=1.4591 acc=0.473 | lr=3.00e-04
Epoch 02 | train_loss=0.0654 acc=0.969 | val_loss=0.7444 acc=0.768 | lr=3.00e-04
Epoch 03 | train_loss=0.0534 acc=0.973 | val_loss=0.4690 acc=0.854 | lr=3.00e-04
Epoch 04 | train_loss=0.0348 acc=0.989 | val_loss=2.3311 acc=0.496 | lr=3.00e-04
Epoch 05 | train_loss=0.0346 acc=0.985 | val_loss=1.2761 acc=0.668 | lr=3.00e-04
Epoch 06 | train_loss=0.0213 acc=0.993 | val_loss=0.6106 acc=0.825 | lr=1.50e-04
Epoch 07 | train_loss=0.0169 acc=0.993 | val_loss=0.2162 acc=0.945 | lr=1.50e-04
Epoch 08 | train_loss=0.0143 acc=0.995 | val_loss=0.0343 acc=0.982 | lr=1.50e-04
Epoch 09 | train_loss=0.0174 acc=0.992 | val_loss=0.0463 acc=0.982 | lr=1.50e-04
Epoch 10 | train_loss=0.0155 acc=0.993 | val_loss=0.0341 acc=0.984 | lr=1.50e-04
Epoch 11 | train_loss=0.0110 acc=0.994 | val_loss=0.0487 acc=0.979 | lr=1.50e-04
Epoch 12 | train_loss=0.0155 acc=0.995 | val_loss=0.1134 acc=0.953 | lr=1.50e-04
Epoch 13 | train_loss=0.0122

In [33]:
# ===== BLOQUE DE EVALUACIÓN FINAL =====
import torch
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)

# NOTE: the checkpoint being evaluated was selected by its accuracy on this
# same validation set, so the figures below are optimistic; a separate
# held-out test set would give an unbiased estimate.

# --- Evaluation mode ---
model.eval()
y_true, y_pred = [], []

# --- Collect every prediction over the validation set ---
with torch.no_grad():
    for x, y in val_dl:
        x, y = x.to(DEVICE), y.to(DEVICE)
        logits = model(x)
        preds = logits.argmax(1)
        y_true.extend(y.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# --- Class names ---
# Only the expected lookup failures fall back to the default names; any other
# error is allowed to surface instead of being swallowed.
try:
    clases = train_dl.dataset.dataset.classes
except (NameError, AttributeError):
    clases = ["con_pez", "vacio"]

# Fix the label set explicitly so that every metric array has one entry per
# class, in class order, even if a class is absent from y_true / y_pred.
labels = list(range(len(clases)))

# --- Confusion matrix (rows: true class, columns: predicted class) ---
cm = confusion_matrix(y_true, y_pred, labels=labels)
print("\n=== MATRIZ DE CONFUSIÓN ===")
print(cm)

# --- Per-class metrics ---
# zero_division=0 reports 0.0 (without a warning) for a class with no predictions/samples.
precision = precision_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
recall    = recall_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
f1        = f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0)

# --- Overall accuracy ---
accuracy = accuracy_score(y_true, y_pred)

# --- Detailed per-class report ---
print("\n=== MÉTRICAS POR CLASE ===")
for i, nombre in enumerate(clases):
    print(f"{nombre:10s} → Precision: {precision[i]:.3f} | Recall: {recall[i]:.3f} | F1-score: {f1[i]:.3f}")

print(f"\n=== ACCURACY GLOBAL ===\n{accuracy:.3f}")

# --- Short interpretation ---
print("\nInterpretación:")
print(" - Alta precisión en 'vacio' → el modelo elimina pocos frames con peces.")
print(" - Alto recall en 'vacio' → el modelo elimina la mayoría de los frames realmente vacíos.")
print(" - Accuracy mide el porcentaje total de aciertos sobre todo el conjunto de validación.")



=== MATRIZ DE CONFUSIÓN ===
[[232   1]
 [  5 145]]

=== MÉTRICAS POR CLASE ===
con_pez    → Precision: 0.979 | Recall: 0.996 | F1-score: 0.987
vacio      → Precision: 0.993 | Recall: 0.967 | F1-score: 0.980

=== ACCURACY GLOBAL ===
0.984

Interpretación:
 - Alta precisión en 'vacio' → el modelo elimina pocos frames con peces.
 - Alto recall en 'vacio' → el modelo elimina la mayoría de los frames realmente vacíos.
 - Accuracy mide el porcentaje total de aciertos sobre todo el conjunto de validación.
