In [1]:
import os
import torch
import torch.nn as nn
from torchvision import transforms, models
from PIL import Image
import sys

# === CONFIG ===
CLIP_LENGTH = 32  # Or 16 if fewer frames per clip were used during training
IMAGE_SIZE = 224  # Must match the image size used during training
CLASS_NAMES = ['entrada', 'salida']
NUM_CLASSES = len(CLASS_NAMES)  # one logit per entry in CLASS_NAMES
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# === TRANSFORM ===
# Per-frame preprocessing: resize to a square image, then convert to a tensor.
transform = transforms.Compose(
    [transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)), transforms.ToTensor()]
)

# === MODELO ===
class CNN_LSTM(nn.Module):
    """Clip classifier: frozen MobileNetV2 frame features fed to an LSTM.

    Every frame is passed through the MobileNetV2 convolutional trunk and a
    global average pool, giving one ``feature_dim``-sized vector per frame.
    The LSTM reads those vectors in temporal order and its output at the last
    time step is projected to class logits by a linear layer.

    Args:
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of logits produced per clip.
    """

    def __init__(self, hidden_dim=512, num_layers=1, num_classes=2):
        super().__init__()
        # NOTE(review): recent torchvision releases deprecate ``pretrained=``
        # in favour of ``weights=``; kept as-is so older installs still work.
        base_model = models.mobilenet_v2(pretrained=True)
        self.cnn = base_model.features
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.feature_dim = 1280

        # Freeze the CNN: only the LSTM and the linear head keep gradients.
        for param in self.cnn.parameters():
            param.requires_grad = False

        self.lstm = nn.LSTM(input_size=self.feature_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of clips.

        Args:
            x: Tensor of shape ``(B, T, C, H, W)`` (batch, frames, channels,
                height, width). It does not need to be contiguous.

        Returns:
            Tensor of shape ``(B, num_classes)`` with unnormalised logits.
        """
        B, T, C, H, W = x.shape
        # Fold time into the batch axis so the CNN sees plain images.
        # ``reshape`` (unlike ``view``) also accepts non-contiguous input,
        # e.g. a clip tensor that was transposed or permuted beforehand.
        x = x.reshape(B * T, C, H, W)
        # The CNN is frozen, so no autograd graph is needed for this part.
        with torch.no_grad():
            features = self.cnn(x)
            features = self.pool(features).reshape(B, T, -1)
        output, _ = self.lstm(features)
        # Only the last time step summarises the whole clip.
        final_output = output[:, -1, :]
        logits = self.fc(final_output)
        return logits

# === FUNCIÓN DE INFERENCIA ===
def inferir_clip(clip_path, model):
    """Classify a single clip stored as a folder of frame images.

    Frame files are taken in lexicographic filename order and at most
    ``CLIP_LENGTH`` of them are used. Each frame is converted to RGB, passed
    through ``transform`` and stacked into one ``(1, T, C, H, W)`` batch.

    Args:
        clip_path: Directory containing the frame images of the clip.
        model: Network to run; it is switched to evaluation mode.

    Returns:
        A ``(class_name, probabilities)`` tuple, where ``class_name`` is the
        entry of ``CLASS_NAMES`` with the highest probability and
        ``probabilities`` is a NumPy array of shape ``(1, num_classes)``.

    Raises:
        ValueError: If ``clip_path`` contains no frame files.
    """
    # Skip sub-directories and hidden entries (e.g. ``.ipynb_checkpoints`` or
    # ``.DS_Store``) that would otherwise be opened as if they were frames.
    frame_names = sorted(
        name for name in os.listdir(clip_path)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(clip_path, name))
    )
    if not frame_names:
        # Fail early with a clear message instead of an opaque stack() error.
        raise ValueError(f"No frame files found in clip folder: {clip_path!r}")

    clip = []
    for frame_name in frame_names[:CLIP_LENGTH]:
        img_path = os.path.join(clip_path, frame_name)
        # The context manager releases the file handle once the frame has
        # been decoded and converted.
        with Image.open(img_path) as image:
            clip.append(transform(image.convert('RGB')))

    clip_tensor = torch.stack(clip, dim=0).unsqueeze(0).to(DEVICE)  # (1, T, C, H, W)

    model.eval()
    with torch.no_grad():
        logits = model(clip_tensor)
        probs = torch.softmax(logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()

    return CLASS_NAMES[predicted], probs.cpu().numpy()

# === EJECUCIÓN ===
if __name__ == "__main__":
    # Replace this path with the folder of the clip you want to classify.
    clip_test_path = '../dataset/entrada/entrada_001'  # example

    # Build the network and place it on the target device first, then restore
    # the trained weights into it.
    model = CNN_LSTM(num_classes=NUM_CLASSES)
    model.to(DEVICE)  # nn.Module.to moves the parameters in place
    # NOTE(review): torch.load unpickles the file; only load checkpoints that
    # come from a trusted source.
    model.load_state_dict(
        torch.load('cnn_lstm_supermercado.pth', map_location=DEVICE)
    )

    clase, probs = inferir_clip(clip_test_path, model)
    print(f"🧠 Predicción: {clase} — Probabilidades: {probs}")




🧠 Predicción: entrada — Probabilidades: [[9.9975425e-01 2.4575321e-04]]
