In [1]:
import cv2
import numpy as np
import torch
import torchvision
import time
from torchvision import transforms as T
from torchvision.models.detection import maskrcnn_resnet50_fpn_v2, MaskRCNN_ResNet50_FPN_V2_Weights
import os

In [2]:
def rcnnwebcam(model_path, output_file='rcnn-output.mp4', fps=20.0, frame_size=(640, 480)):
    # Load the Mask R-CNN model
    weights = MaskRCNN_ResNet50_FPN_V2_Weights.COCO_V1
    model = maskrcnn_resnet50_fpn_v2(weights=weights)
    model.load_state_dict(torch.load(model_path))
    model.eval()  # Set the model to evaluation mode
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Open the webcam
    vc = cv2.VideoCapture(0)
    vc.set(cv2.CAP_PROP_FRAME_WIDTH, frame_size[0])
    vc.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_size[1])

    # Define variables for VideoWriter initialization
    out = None
    recording = False
    output_path = os.path.abspath(output_file)  # Get the absolute path of the output file

    # Automatically start recording
    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Using 'mp4v' codec
    out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)

    # Check if VideoWriter opened successfully
    if not out.isOpened():
        print("Failed to open VideoWriter")
        return  # Exit the function if VideoWriter fails
    else:
        recording = True
        print(f"Recording started... Saving video to: {output_path}")

    # Define the image transformations
    transform = T.Compose([T.ToTensor()])

    while True:
        ret, frame = vc.read()
        if not ret:
            print("Failed to capture frame")
            break

        # Measure time for preprocessing
        start_preprocess = time.time()
        img_tensor = transform(frame).to(device)
        img_tensor = img_tensor.unsqueeze(0)  # Add batch dimension
        end_preprocess = time.time()

        # Run Mask R-CNN inference
        start_inference = time.time()
        with torch.no_grad():
            predictions = model(img_tensor)
        end_inference = time.time()

        # Measure time for postprocessing
        start_postprocess = time.time()
        boxes = predictions[0]['boxes'].cpu().numpy()  # Bounding boxes
        scores = predictions[0]['scores'].cpu().numpy()  # Confidence scores
        labels = predictions[0]['labels'].cpu().numpy()  # Class labels

        # Filter out low-confidence detections (e.g., score > 0.25)
        threshold = 0.25
        detected_objects = 0
        for i, box in enumerate(boxes):
            if scores[i] > threshold:
                detected_objects += 1
                x1, y1, x2, y2 = box.astype(int)
                label = f'Class {labels[i]}: {scores[i]:.2f}'
                color = (0, 255, 0)  # Green color for bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
                print(f"Box: [{x1} {y1} {x2} {y2}], Class: {labels[i]}, Confidence: {scores[i]:.2f}")
        end_postprocess = time.time()

        # Print detection summary
        print(f"0: {frame_size[1]}x{frame_size[0]} {detected_objects} object(s) detected, {end_inference - start_inference:.1f}ms")
        print(f"Speed: {(end_preprocess - start_preprocess) * 1000:.1f}ms preprocess, {(end_inference - start_inference) * 1000:.1f}ms inference, {(end_postprocess - start_postprocess) * 1000:.1f}ms postprocess per image at shape {img_tensor.shape}")

        # Show the frame
        cv2.imshow("Mask R-CNN Webcam", frame)

        # Write the frame to the output file
        if recording and out is not None:
            # Ensure the frame size matches the VideoWriter frame size
            resized_frame = cv2.resize(frame, frame_size)
            out.write(resized_frame)
            print("Frame written to output file")

        # Capture key press
        key = cv2.waitKey(1) & 0xFF

        # Check if 'q' is pressed to stop
        if key == ord('q'):
            print("Recording stopped by user")
            break

    # Release resources
    vc.release()
    if out is not None:
        out.release()
        # Verify if the file was created
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            print(f"Recording stopped and video saved at {output_path}")
        else:
            print(f"Failed to save the video or the file is empty at {output_path}")
    cv2.destroyAllWindows()

In [3]:
# Load and run the Mask R-CNN webcam function
model_path = './mask_rcnn_coco.pth'
rcnnwebcam(model_path)


  model.load_state_dict(torch.load(model_path))


Recording started... Saving video to: c:\Users\alway\OneDrive\Documents\GitHub\Applied-AI\hw2\rcnn\rcnn-output.mp4
Box: [119 13 623 480], Class: 1, Confidence: 0.38
0: 480x640 1 object(s) detected, 1.8ms
Speed: 12.0ms preprocess, 1777.2ms inference, 1.0ms postprocess per image at shape torch.Size([1, 3, 480, 640])
Frame written to output file
Box: [86 130 151 185], Class: 1, Confidence: 0.42
Box: [117 12 628 480], Class: 1, Confidence: 0.38
0: 480x640 2 object(s) detected, 0.2ms
Speed: 5.0ms preprocess, 153.9ms inference, 1.0ms postprocess per image at shape torch.Size([1, 3, 480, 640])
Frame written to output file
Box: [127 14 630 480], Class: 1, Confidence: 0.40
0: 480x640 1 object(s) detected, 0.1ms
Speed: 6.0ms preprocess, 103.2ms inference, 1.0ms postprocess per image at shape torch.Size([1, 3, 480, 640])
Frame written to output file
Box: [119 17 628 480], Class: 1, Confidence: 0.36
Box: [91 131 151 183], Class: 1, Confidence: 0.33
0: 480x640 2 object(s) detected, 0.1ms
Speed: 2.5