In [9]:
import os
import random
import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import models, transforms
import time
import matplotlib.pyplot as plt
# Turn off axis grid lines for every matplotlib figure created in this notebook.
plt.rcParams.update({"axes.grid": False})

In [10]:
# Use the CUDA device when PyTorch reports one as available, else fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load Mask R-CNN (ResNet-50 FPN) with pretrained weights, place it on
# `device`, and switch it to evaluation mode.
model = models.detection.maskrcnn_resnet50_fpn(pretrained=True)
model.to(device)
model.eval()

In [None]:
# Initialize webcam (device index 0).
cam = cv2.VideoCapture(0)
if not cam.isOpened():
    # Abort this cell with an exception instead of exit(): inside a Jupyter
    # kernel exit() shuts the kernel down, which would also discard the model
    # that has already been loaded. Release the unusable handle first.
    cam.release()
    raise RuntimeError("No camera detected!")

# Class names indexed by the integer label the detector returns.
# Laid out ten per row; the trailing comment gives the index range of the row.
coco_names = [
    'unlabeled', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',  # 0-9
    'traffic light', 'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',  # 10-19
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe',  # 20-29
    'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',  # 30-39
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork', 'knife',  # 40-49
    'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',  # 50-59
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk',  # 60-69
    'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',  # 70-79
    'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',  # 80-89
    'toothbrush',  # 90
]

# One random three-channel colour (each channel 0-255) per class, used for
# both the bounding box and the mask tint of that class.
colors = [
    [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)]
    for _ in range(len(coco_names))
]

# Preprocessing pipeline applied to each frame before inference: a Compose
# holding a single ToTensor step.
transform = transforms.Compose([transforms.ToTensor()])


# Tunable parameters for the live demo.
FRAME_SIZE = (640, 480)   # (width, height) each frame is resized to before inference
SCORE_THRESHOLD = 0.65    # detections must score strictly above this to be drawn
MASK_THRESHOLD = 0.5      # per-pixel cutoff that turns a predicted mask into a boolean mask

# Capture -> detect -> draw loop. Runs until 'q' is pressed in the preview
# window or a frame cannot be read. The try/finally guarantees the camera is
# released and the window closed even if the cell is interrupted or an
# exception is raised part-way through a frame.
try:
    while True:
        ret, frame = cam.read()
        if not ret:
            print("Failed to capture image")
            break

        frame = cv2.resize(frame, FRAME_SIZE)  # Resize frame for faster processing

        # Reorder channels BGR -> RGB, then build a batched tensor on `device`.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image_tensor = transform(image).unsqueeze(0).to(device)

        # Run inference; the batch holds one image, so take the first result.
        with torch.no_grad():
            output = model(image_tensor)[0]

        # Select the confident detections once and copy only those to the host,
        # instead of comparing and transferring every detection individually.
        keep = output['scores'] > SCORE_THRESHOLD
        boxes = output['boxes'][keep].cpu().tolist()
        labels = output['labels'][keep].cpu().tolist()
        scores = output['scores'][keep].cpu().tolist()
        # torchvision documents masks as [N, 1, H, W]; drop the singleton axis
        # and threshold to boolean [N, H, W].
        masks = (output['masks'][keep].squeeze(1) > MASK_THRESHOLD).cpu().numpy()

        # Boxes and labels are drawn on a copy; masks are accumulated separately
        # so they can be alpha-blended over the annotated frame afterwards.
        result_image = frame.copy()
        mask_overlay = np.zeros_like(frame, dtype=np.uint8)

        for box, label, score, mask in zip(boxes, labels, scores, masks):
            x1, y1, x2, y2 = map(int, box)
            color = colors[label]

            # Bounding box
            cv2.rectangle(result_image, (x1, y1), (x2, y2), color, 2)

            # Label just above the box; clamp the baseline so the text is not
            # drawn outside the image for boxes that touch the top edge.
            text = f"{coco_names[label]}: {score:.2f}"
            text_y = max(y1 - 5, 12)
            cv2.putText(result_image, text, (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

            # Paint this instance's pixels in its class colour and add them
            # (saturating) into the shared overlay.
            mask_colored = np.zeros_like(frame, dtype=np.uint8)
            mask_colored[mask] = color
            mask_overlay = cv2.add(mask_overlay, mask_colored)

        # Blend mask overlay with the result image
        result_image = cv2.addWeighted(result_image, 1, mask_overlay, 0.5, 0)

        # Show result
        cv2.imshow("Mask R-CNN - Real Time", result_image)

        # Exit on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Cleanup: always free the capture device and close the preview window.
    cam.release()
    cv2.destroyAllWindows()