In [3]:
import cv2
import mediapipe as mp
import torch
import numpy as np
from torchvision import transforms
from torchvision.models import resnet18
import torch.nn as nn

# Definición del modelo
class SignLanguageModel(nn.Module):
    """ResNet-18 classifier adapted to single-channel images.

    The stock first convolution is replaced by a freshly initialised one with
    a single input channel, and the final fully connected layer is replaced
    by a linear layer with ``num_classes`` outputs.

    Args:
        num_classes: Number of output classes (defaults to 29, the value
            that was previously hard-coded).
        pretrained: Forwarded to ``resnet18``. Pass ``False`` when a
            checkpoint is loaded right after construction to avoid
            downloading ImageNet weights that would be overwritten anyway.
    """

    def __init__(self, num_classes=29, pretrained=True):
        super().__init__()
        self.base_model = resnet18(pretrained=pretrained)
        # Replace the stem so the network accepts 1-channel input; this layer
        # does not inherit any pretrained weights.
        self.base_model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        # Replace the classification head with one sized for our classes.
        self.base_model.fc = nn.Linear(self.base_model.fc.in_features, num_classes)

    def forward(self, x):
        """Run ``x`` through the adapted ResNet-18 and return its output."""
        return self.base_model(x)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the trained model.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a CUDA machine can still be loaded on a CPU-only one.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model = SignLanguageModel().to(device)
model.load_state_dict(torch.load('modelo_final.pth', map_location=device))
model.eval()

# MediaPipe hand-detection solution
mp_hands = mp.solutions.hands

def detect_and_crop_hand(image, margin=20):
    """Detect one hand in an image and return the region around it.

    Args:
        image: Image array; it is converted with ``cv2.COLOR_BGR2RGB`` before
            detection, so BGR channel order is assumed.
        margin: Number of pixels added on every side of the bounding box of
            the detected hand landmarks.

    Returns:
        The slice of ``image`` covering the hand, or ``None`` when no hand is
        detected or the bounding box has no area inside the frame.
    """
    h, w = image.shape[:2]

    # NOTE(review): a new detector is created on every call, which is costly
    # when this function is called once per video frame.
    with mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5) as hands:
        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if not results.multi_hand_landmarks:
            return None
        # max_num_hands=1, so only the first detection is relevant.
        landmarks = results.multi_hand_landmarks[0].landmark
        xs = [landmark.x * w for landmark in landmarks]
        ys = [landmark.y * h for landmark in landmarks]

    # Add the margin and clamp BOTH ends of each range to the frame. Landmark
    # coordinates are normalised but may fall outside [0, 1]; an unclamped
    # negative upper bound would make the slice wrap around.
    x_min = min(max(int(min(xs) - margin), 0), w)
    x_max = min(max(int(max(xs) + margin), 0), w)
    y_min = min(max(int(min(ys) - margin), 0), h)
    y_max = min(max(int(max(ys) + margin), 0), h)

    # A zero-area crop cannot be resized downstream; report it as "no hand".
    if x_max <= x_min or y_max <= y_min:
        return None

    return image[y_min:y_max, x_min:x_max]

# Image preprocessing: resize to 224x224, reduce to one grayscale channel,
# convert to a tensor and normalise with mean 0.5 / std 0.5.
# NOTE(review): ToPILImage interprets a 3-channel array as RGB, while OpenCV
# frames are BGR; confirm this matches how the training images were loaded.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(size=(224, 224)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Capture frames from the webcam and classify the hand found in each one
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    # The loop below exits on the first failed read; tell the user why.
    print("Error: No se pudo abrir la cámara.")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        cropped_image = detect_and_crop_hand(frame)

        # Skip frames without a hand or with a degenerate (zero-area) crop,
        # which the resize step cannot handle.
        if cropped_image is not None and cropped_image.size > 0:
            input_tensor = transform(cropped_image).unsqueeze(0).to(device)
            # Inference only: do not build the autograd graph.
            with torch.no_grad():
                output = model(input_tensor)
            _, predicted = torch.max(output, 1)
            predicted_class = predicted.item()

            # Draw the predicted class index on the frame
            cv2.putText(frame, f'Predicted: {predicted_class}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)

        # Show the (possibly annotated) frame
        cv2.imshow('Sign Language Detection', frame)

        # Press "q" to stop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the windows even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()


Predicción a partir de una sola foto

In [10]:
import cv2
import mediapipe as mp
import torch
import numpy as np
from torchvision import transforms
from torchvision.models import resnet18
import torch.nn as nn
import matplotlib.pyplot as plt
import time

# Definición del modelo
class SignLanguageModel(nn.Module):
    """ResNet-18 classifier adapted to single-channel images.

    The stock first convolution is replaced by a freshly initialised one with
    a single input channel, and the final fully connected layer is replaced
    by a linear layer with ``num_classes`` outputs.

    Args:
        num_classes: Number of output classes (defaults to 29, the value
            that was previously hard-coded).
        pretrained: Forwarded to ``resnet18``. Pass ``False`` when a
            checkpoint is loaded right after construction to avoid
            downloading ImageNet weights that would be overwritten anyway.
    """

    def __init__(self, num_classes=29, pretrained=True):
        super().__init__()
        self.base_model = resnet18(pretrained=pretrained)
        # Replace the stem so the network accepts 1-channel input; this layer
        # does not inherit any pretrained weights.
        self.base_model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        # Replace the classification head with one sized for our classes.
        self.base_model.fc = nn.Linear(self.base_model.fc.in_features, num_classes)

    def forward(self, x):
        """Run ``x`` through the adapted ResNet-18 and return its output."""
        return self.base_model(x)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the trained model.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a CUDA machine can still be loaded on a CPU-only one.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model = SignLanguageModel().to(device)
model.load_state_dict(torch.load('modelo_final.pth', map_location=device))
model.eval()

# MediaPipe hand-detection solution
mp_hands = mp.solutions.hands

def detect_and_crop_hand(image, margin=20):
    """Detect one hand in an image and return the region around it.

    Args:
        image: Image array; it is converted with ``cv2.COLOR_BGR2RGB`` before
            detection, so BGR channel order is assumed.
        margin: Number of pixels added on every side of the bounding box of
            the detected hand landmarks.

    Returns:
        The slice of ``image`` covering the hand, or ``None`` when no hand is
        detected or the bounding box has no area inside the frame.
    """
    h, w = image.shape[:2]

    with mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5) as hands:
        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if not results.multi_hand_landmarks:
            return None
        # max_num_hands=1, so only the first detection is relevant.
        landmarks = results.multi_hand_landmarks[0].landmark
        xs = [landmark.x * w for landmark in landmarks]
        ys = [landmark.y * h for landmark in landmarks]

    # Add the margin and clamp BOTH ends of each range to the frame. Landmark
    # coordinates are normalised but may fall outside [0, 1]; an unclamped
    # negative upper bound would make the slice wrap around.
    x_min = min(max(int(min(xs) - margin), 0), w)
    x_max = min(max(int(max(xs) + margin), 0), w)
    y_min = min(max(int(min(ys) - margin), 0), h)
    y_max = min(max(int(max(ys) + margin), 0), h)

    # A zero-area crop cannot be resized downstream; report it as "no hand".
    if x_max <= x_min or y_max <= y_min:
        return None

    return image[y_min:y_max, x_min:x_max]

# Image preprocessing: resize to 224x224, reduce to one grayscale channel,
# convert to a tensor and normalise with mean 0.5 / std 0.5.
# NOTE(review): ToPILImage interprets a 3-channel array as RGB, while OpenCV
# frames are BGR; confirm this matches how the training images were loaded.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(size=(224, 224)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Class labels, one per model output (29 in total).
# The previous list had only 28 entries: "del" was missing, which shifted
# "nothing"/"space" by one and made index 28 raise IndexError.
# NOTE(review): order assumes alphabetically sorted class folders
# (A-Z, del, nothing, space) — confirm against the training label mapping.
labels = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ["del", "nothing", "space"]

# Capture a single image from the webcam and classify it
cap = cv2.VideoCapture(0)

# Make sure the camera is open
if not cap.isOpened():
    cap.release()
    print("Error: No se pudo abrir la cámara.")
else:
    try:
        # Wait briefly so the camera has time to initialise
        time.sleep(2)

        # The first reads after opening can fail on some cameras, so retry a
        # few times instead of giving up after a single attempt.
        ret, frame = False, None
        for _ in range(10):
            ret, frame = cap.read()
            if ret and frame is not None:
                break
            time.sleep(0.1)
        ret = bool(ret) and frame is not None
    finally:
        # Always free the device, even if reading raised.
        cap.release()

    if ret:
        cropped_image = detect_and_crop_hand(frame)

        # Treat a zero-area crop like "no hand": it cannot be resized.
        if cropped_image is not None and cropped_image.size > 0:
            input_tensor = transform(cropped_image).unsqueeze(0).to(device)

            # Show the crop that is fed to the model
            plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
            plt.title("Imagen procesada para predicción")
            plt.axis('off')
            plt.show()

            # Run the prediction without building the autograd graph
            with torch.no_grad():
                output = model(input_tensor)
            _, predicted = torch.max(output, 1)
            predicted_class = predicted.item()

            # Show the predicted class; fall back to the raw index if the
            # label list is shorter than the number of model outputs.
            if 0 <= predicted_class < len(labels):
                predicted_label = labels[predicted_class]
            else:
                predicted_label = f'#{predicted_class}'
            print(f'Predicted class: {predicted_label}')
        else:
            print("No se detectó ninguna mano en la imagen.")
    else:
        print("No se pudo capturar la imagen de la cámara.")


No se pudo capturar la imagen de la cámara.
