# Model Testing

In [10]:
import cv2
import time
import torch
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn

##############################
#                            #
#  CNN Model Architecture    #
#                            #
##############################

class DeepCNN(nn.Module):
  
    def __init__(self, numClasses=22):
        
        super(DeepCNN, self).__init__()
        
        self.features = nn.Sequential(
            
            # Block#1: 3 -> 32
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block#2: 32 -> 64
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block#3: 64 -> 128
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block#4: 128 -> 256
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block#5: 256 -> 512
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(1024, numClasses)
        )

    def forward(self, x):
        x = self.features(x)           
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


#######################################################
#                                                     #
#  Setting up Device, Model, Classes and Transform    #
#                                                     #
#######################################################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

numClasses = 22
model = DeepCNN(numClasses=numClasses).to(device)

#loading the model
model_path = "best_deepcnn.pth"
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

#class names
classNames = [
    "A", "B", "C", "D", "E", "F", 
    "H", "I", "K", "L", "M", "N", 
    "O", "P", "Q", "R", "T", "U", 
    "V", "W", "X", "Y"
]

#image transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    ),
])


#####################################
#                                   #
#  Prediction of Image with Model   #
#                                   #
#####################################

def predict_letter(model, frameBgr, device):

    #converting BGR to RGB
    img_rgb = cv2.cvtColor(frameBgr, cv2.COLOR_BGR2RGB)
    
    #converting to a PIL Image
    pil_img = Image.fromarray(img_rgb)
    
    #applying transforms
    img_t = transform(pil_img).unsqueeze(0).to(device)
    
    with torch.no_grad():
        outputs = model(img_t)
        _, predicted_idx = torch.max(outputs, 1)
        
    return classNames[predicted_idx.item()]

#####################################
#                                   #
#  Webcam Access and Processing     #
#                                   #
#####################################

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Failed to open webcam.")
    exit()

print("Webcam opened. Press 'x' to exit.")

recognizedWord = ""
capture_interval = 3
start_time = time.time()
box_size = 224

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to read from webcam.")
        break

    frame_height, frame_width = frame.shape[:2]
    
    #computing center bounding box coordinates
    center_x = frame_width//2
    center_y = frame_height//2
    x1 = center_x-(box_size//2)
    y1 = center_y-(box_size//2)
    x2 = x1+box_size
    y2 = y1+box_size

    #drawing a bounding box on the frame
    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

    #displaying the recognized word so far
    cv2.putText(frame, f"Word: {recognizedWord}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 255), 2)

    #displaying an info text on the frame
    cv2.putText(frame, "Image every 3s. Press 'x' to exit.", (10, frame_height - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                (255, 255, 255), 2)

    #checking if the capture interval has elapsed
    current_time = time.time()
    if (current_time - start_time) >= capture_interval:
        #reseting the timer
        start_time = current_time

        #croping the center 224x224 region
        roi = frame[y1:y2, x1:x2]

        #pedicting the letter using the model
        letter = predict_letter(model, roi, device)
        print("Predicted letter:", letter)
        recognizedWord += letter

    #displaying the webcam feed
    cv2.imshow("Detection", frame)

    #exitting if user presses 'x' or 'X'
    key = cv2.waitKey(1) & 0xFF
    if key == ord('x') or key == ord('X'):
        print("User pressed 'x'. Exiting...")
        break

#cleaning up resources
cap.release()
cv2.destroyAllWindows()
print("Final recognized word:", recognizedWord)

Using device: cuda


  model.load_state_dict(torch.load(model_path, map_location=device))


Webcam opened. Press 'x' to exit.
Predicted letter: H
Predicted letter: Y
Predicted letter: W
Predicted letter: Q
User pressed 'x'. Exiting...
Final recognized word: HYWQ
