In [7]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import cv2
import pickle
import pyttsx3

In [9]:
# GRU model definition (same as trained model)
class GRUClassifier(nn.Module):
    def __init__(self, input_size=512, hidden_size=256, num_layers=2, num_classes=2000):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        gru_out, _ = self.gru(x)
        last_out = gru_out[:, -1, :]
        logits = self.fc(last_out)
        return logits

In [11]:
# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# Load ResNet18 feature extractor (remove last FC layer)
resnet = models.resnet18(pretrained=True)
resnet = nn.Sequential(*list(resnet.children())[:-1])
resnet.to(device).eval()




Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con

In [15]:
# Preprocessing for ResNet input
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [17]:
def extract_resnet_feature(frame):
    # Convert frame to PIL Image and preprocess
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    img = img.resize((224, 224))  # Resize explicitly to 224x224
    img_tensor = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        feature = resnet(img_tensor)  # Output shape: (1, 512, 1, 1)
    feature = feature.view(-1).cpu()  # Flatten to (512,)
    return feature

In [19]:
def predict_sign_from_video(video_path, model_path, label_map, num_frames=10):
    # Load your pretrained GRU model
    model = GRUClassifier(input_size=512)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device).eval()

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Could not open video {video_path}")
        return "Error"

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_count < num_frames:
        print(f"⚠️ Video too short ({frame_count} frames), requires at least {num_frames}.")
        cap.release()
        return "Too short"
    # Calculate evenly spaced frame indices (skip frame 0)
    frame_indices = [int(frame_count * i / num_frames) for i in range(1, num_frames + 1)]

    features_sequence = []

    for idx, frame_idx in enumerate(frame_indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if not ret:
            print(f"Warning: couldn't read frame {frame_idx}")
            continue
        feature = extract_resnet_feature(frame)
        features_sequence.append(feature)

    cap.release()

    if len(features_sequence) == 0:
        print("❌ No frames processed.")
        return "No frames"

    # Stack features into tensor [num_frames, 512]
    features_sequence = torch.stack(features_sequence)

    # Add batch dimension: [1, num_frames, 512]
    sequence = features_sequence.unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(sequence)
        _, predicted = torch.max(outputs, dim=1)

    # Map predicted class index to label
    return label_map.get(predicted.item(), "Unknown")


In [21]:
def speak_label(label):
    engine = pyttsx3.init()
    engine.say(f"The sign is {label}")
    engine.runAndWait()

In [45]:
if __name__ == "__main__":
    # Paths — update as needed
    video_path = "C:\\Users\\deeks_w4ub1k8\\Downloads\\Testing _videos\\delicious.mp4"
    model_path = "gru_classifier_model.pth"
    label_map_path = "label_map.pkl"

    # Load label map (invert if it's {label: index})
    with open(label_map_path, "rb") as f:
        label_map = pickle.load(f)
    if isinstance(next(iter(label_map.keys())), str):
        label_map = {v: k for k, v in label_map.items()}

    # Predict and speak
    predicted_label = predict_sign_from_video(video_path, model_path, label_map)
    print("🔤 Predicted Sign Label:", predicted_label)
    speak_label(predicted_label)

  model.load_state_dict(torch.load(model_path, map_location=device))


🔤 Predicted Sign Label: delicious
