In [5]:
import os
import random
import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import models, transforms
import time
import matplotlib.pyplot as plt
plt.rcParams["axes.grid"] = False

In [6]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Running on GPU.")
else:
    device = torch.device("cpu")
    print("Running on CPU.")

# Build the pretrained Mask R-CNN (ResNet-50 + FPN), move it to the chosen
# device and put it in evaluation mode for inference
model = models.detection.maskrcnn_resnet50_fpn(pretrained=True)
model = model.to(device)
model.eval()

Running on CPU.




In [7]:
import cv2
import numpy as np
from sklearn.decomposition import PCA

def calculate_mask_properties(mask):
    """Compute the centroid and principal axes of a binary mask.

    Args:
        mask: 2-D array; every non-zero (or True) element is treated as a
            mask pixel.

    Returns:
        A tuple ``(center, components, eigenvalues)``:

        * ``center`` -- ``(x, y)`` centroid of the mask pixels, truncated to
          Python ints.
        * ``components`` -- ``pca.components_`` of a 2-component PCA fitted
          on the ``(x, y)`` pixel coordinates (one direction per row).
        * ``eigenvalues`` -- ``pca.explained_variance_`` of the same fit,
          i.e. the variance along each row of ``components``.

        ``(None, None, None)`` is returned when the mask holds fewer than two
        pixels, because no axis can be estimated from a single point.
    """
    # np.nonzero returns indices in (row, col) order, i.e. (y, x)
    y_coords, x_coords = np.nonzero(mask)

    # A 2-component PCA needs at least two samples; scikit-learn rejects a
    # single-sample fit with ValueError, so bail out before reaching it.
    # All three results are None together so callers can test any one of them.
    if x_coords.size < 2:
        return None, None, None

    # Centroid of the mask pixels
    center_x = int(np.mean(x_coords))
    center_y = int(np.mean(y_coords))

    # One (x, y) row per mask pixel
    points = np.column_stack((x_coords, y_coords))

    pca = PCA(n_components=2)
    pca.fit(points)

    return (center_x, center_y), pca.components_, pca.explained_variance_

def draw_principal_axes(image, center, components, eigenvalues, color, scale=2.0):
    """Draw the two principal axes and the center point onto ``image``.

    Each axis is drawn as a line through ``center`` along the corresponding
    row of ``components``; its half-length is ``scale`` times the square root
    of the matching eigenvalue (``scale`` standard deviations).

    Args:
        image: Image passed to ``cv2.line`` / ``cv2.circle`` as the drawing
            target.
        center: ``(x, y)`` integer pixel coordinates of the center, or None.
        components: Indexable with two rows, each an ``(x, y)`` direction
            (NumPy array rows), or None.
        eigenvalues: Indexable with the variance along each row of
            ``components``, or None.
        color: Color handed unchanged to the OpenCV drawing calls.
        scale: Multiplier applied to the standard deviation to get each
            axis half-length. Defaults to 2.0.

    Returns:
        None. Nothing is drawn when any of ``center``, ``components`` or
        ``eigenvalues`` is None.
    """
    if center is None or components is None or eigenvalues is None:
        return

    center_x, center_y = center

    for i in range(2):
        # Clamp at zero so a tiny negative variance from round-off cannot
        # turn into NaN under the square root
        std_dev = np.sqrt(max(float(eigenvalues[i]), 0.0))

        # Scale x and y by the SAME factor: scaling them differently would
        # rotate the line away from the principal direction
        half_axis = components[i] * (scale * std_dev)
        dx, dy = float(half_axis[0]), float(half_axis[1])

        # Round (rather than truncate) to the nearest pixel
        start_point = (int(round(center_x - dx)), int(round(center_y - dy)))
        end_point = (int(round(center_x + dx)), int(round(center_y + dy)))

        cv2.line(image, start_point, end_point, color, 2)

    # Filled circle marking the center
    cv2.circle(image, (center_x, center_y), 4, color, -1)

In [8]:
# Tunable settings for the real-time demo
SCORE_THRESHOLD = 0.65      # detections at or below this confidence are skipped
MASK_THRESHOLD = 0.65       # per-pixel mask value above which a pixel is "on"
FRAME_SIZE = (1280, 720)    # (width, height) every captured frame is resized to
MASK_ALPHA = 0.5            # weight of the colored mask overlay in the final blend
WINDOW_NAME = "Mask R-CNN - Real Time"

# Label rendering settings (loop-invariant, so defined once)
TEXT_FONT = cv2.FONT_HERSHEY_DUPLEX
TEXT_SIZE = 1.0
TEXT_THICKNESS = 1
TEXT_LINE_TYPE = cv2.LINE_AA

# Initialize webcam
cam = cv2.VideoCapture(0)
if not cam.isOpened():
    cam.release()
    # Raise instead of calling exit(): exit() inside a notebook cell shuts
    # the kernel down rather than just stopping this cell.
    raise RuntimeError("No camera detected!")

# COCO labels
coco_names = [
    'unlabeled', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 
    'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 
    'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 
    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 
    'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 
    'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 
    'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 'toilet', 'door', 'tv', 'laptop', 'mouse', 
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Random colors for each class
colors = [[random.randint(0, 255) for _ in range(3)] for _ in coco_names]

# Preprocessing transformation
transform = transforms.Compose([
    transforms.ToTensor(),
])

# try/finally guarantees the camera and the window are released even when
# inference or drawing raises part-way through.
try:
    while True:
        ret, frame = cam.read()
        if not ret or frame is None:
            # A failed grab yields no image; stop instead of crashing in resize
            print("Failed to read a frame from the camera; stopping.")
            break

        frame = cv2.resize(frame, FRAME_SIZE)

        # OpenCV delivers BGR; convert to RGB before building the tensor
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image_tensor = transform(image).unsqueeze(0).to(device)

        # Run inference (single image in the batch, so take element 0)
        with torch.no_grad():
            output = model(image_tensor)[0]

        # Boxes, labels and axes are drawn on a copy; mask colors go into a
        # separate black overlay that is blended in once at the end
        result_image = frame.copy()
        mask_overlay = np.zeros_like(frame, dtype=np.uint8)

        for box, label, score, mask in zip(output['boxes'], output['labels'], output['scores'], output['masks']):
            score = float(score)
            if score <= SCORE_THRESHOLD:
                continue

            label = int(label)
            color = colors[label]
            x1, y1, x2, y2 = map(int, box.tolist())

            # Draw bounding box
            cv2.rectangle(result_image, (x1, y1), (x2, y2), color, 3)

            # Draw label just above the box
            text = f"{coco_names[label]}: {score:.2f}"
            cv2.putText(result_image, text, (x1, y1 - 5), TEXT_FONT, TEXT_SIZE, color, TEXT_THICKNESS, TEXT_LINE_TYPE)

            # Binarize the predicted mask
            mask = mask.squeeze().detach().cpu().numpy() > MASK_THRESHOLD

            # Calculate center and principal axes, then draw them
            center, components, eigenvalues = calculate_mask_properties(mask)
            if center is not None:
                draw_principal_axes(result_image, center, components, eigenvalues, color)

            # Paint this instance into the overlay only; the captured frame
            # itself is left untouched
            mask_overlay[mask] = color

        # Blend the colored overlay onto the annotated image. The overlay is
        # black outside the masks, so only masked pixels are tinted.
        result_image = cv2.addWeighted(result_image, 1.0, mask_overlay, MASK_ALPHA, 0)

        cv2.imshow(WINDOW_NAME, result_image)
        # Quit on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Cleanup
    cam.release()
    cv2.destroyAllWindows()