# Method that processes a video and detects objects with Faster R-CNN based on COCO

In [1]:
import json
import pdb
import cv2
import torch
import torchvision.transforms as transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import numpy as np

### If a file_path is given, load the desired model from it; if not, return the original model

In [2]:
def get_fasterrcnn(file_path = None, num_classes = 91):
    """Build a Faster R-CNN (ResNet-50 FPN) detector, optionally restoring a checkpoint.

    Args:
        file_path: Optional path to a checkpoint readable by ``torch.load`` that
            stores the weights under the ``'model_state_dict'`` key. When
            ``None`` only torchvision's ``'DEFAULT'`` weights are used.
        num_classes: Number of outputs of the box classifier (the label map in
            this file counts the background entry as class 0).

    Returns:
        The detection model. Switching to train/eval mode is left to the caller.
    """
    model = fasterrcnn_resnet50_fpn(weights='DEFAULT')

    # Resize the box predictor BEFORE restoring the checkpoint. Doing it
    # afterwards would overwrite the restored head with a freshly initialised
    # one, or make load_state_dict fail on a shape mismatch. When the size
    # already matches, the pretrained head is kept untouched.
    cls_score = model.roi_heads.box_predictor.cls_score
    if cls_score.out_features != num_classes:
        model.roi_heads.box_predictor = FastRCNNPredictor(
            cls_score.in_features, num_classes
        )

    if file_path is not None:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        # map_location lets checkpoints saved on a GPU load on CPU-only hosts.
        checkpoint = torch.load(file_path, map_location='cpu')
        model.load_state_dict(checkpoint['model_state_dict'])

    return model

In [3]:
# NOTE(review): no checkpoint path is passed here, so the 92-output box
# predictor is freshly initialised rather than trained - confirm this is intended.
model = get_fasterrcnn(num_classes = 92)
model.eval()

# Load COCO labels (JSON is UTF-8, so do not rely on the platform default encoding)
with open('annotations/coco_categories.json', 'r', encoding='utf-8') as file:
    categories_data = json.load(file)
categories = categories_data["categories"]

# Map class id -> display name, then register the extra custom class under id 91.
class_dict = {category["id"]: category["name"] for category in categories}
class_dict[91] = "keys"
print(class_dict)
print(len(class_dict))

{0: '__background__', 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove', 41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'ce

In [4]:
# Preprocessing pipeline: a single step that turns the input image into a tensor.
transform = transforms.Compose([transforms.ToTensor()])

### Function for drawing a bounding box and label on the image

In [16]:
def draw_box(image, box, label):
    """Draw a green rectangle and its class name on ``image`` (in place).

    Args:
        image: Image array accepted by the OpenCV drawing calls; it is modified
            in place.
        box: Sequence whose first four items are ``x1, y1, x2, y2`` corner
            coordinates (numbers; truncated to ``int`` for drawing).
        label: Class id looked up in the module-level ``class_dict``.

    Returns:
        The same ``image`` object that was passed in.
    """
    color = (0, 255, 0)  # Green
    thickness = 2
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5

    # The label map has gaps in its ids; fall back to the raw id instead of
    # raising KeyError in the middle of a video.
    text = f"{class_dict.get(label, label)}"

    x1, y1, x2, y2 = (int(coord) for coord in box[:4])
    cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)

    # putText anchors the text at its bottom-left corner. Prefer the spot just
    # above the box, but move the caption inside the box when it would be
    # clipped by the top edge of the image.
    (_, text_height), _ = cv2.getTextSize(text, font, font_scale, thickness)
    text_y = y1 - 5
    if text_y - text_height < 0:
        text_y = y1 + text_height + 5
    cv2.putText(image, text, (x1, text_y), font, font_scale, color, thickness)

    return image

### Function to compute the intersection over union (IoU) of two bounding boxes, used to limit the number of bounding boxes

In [17]:
def intersection_over_union(box_a, box_b):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both boxes are ``(x1, y1, x2, y2)`` sequences. Widths and heights use the
    inclusive ``+ 1`` convention, so a box with ``x1 == x2`` is one unit wide.

    Args:
        box_a: First box as ``(x1, y1, x2, y2)``.
        box_b: Second box as ``(x1, y1, x2, y2)``.

    Returns:
        The ratio ``intersection / union``, or ``0.0`` when the union is empty
        (degenerate or inverted boxes) instead of raising ZeroDivisionError.
    """
    x1_a, y1_a, x2_a, y2_a = box_a
    x1_b, y1_b, x2_b, y2_b = box_b

    # Clamp each side at zero so an inverted box cannot yield a positive area
    # through the product of two negative sides.
    area_a = max(0, x2_a - x1_a + 1) * max(0, y2_a - y1_a + 1)
    area_b = max(0, x2_b - x1_b + 1) * max(0, y2_b - y1_b + 1)

    x_intersection = max(0, min(x2_a, x2_b) - max(x1_a, x1_b) + 1)
    y_intersection = max(0, min(y2_a, y2_b) - max(y1_a, y1_b) + 1)
    intersection = x_intersection * y_intersection

    union = area_a + area_b - intersection
    if union <= 0:
        return 0.0

    return intersection / union

### Function to process 1 frame

In [18]:
def process_frame(frame, score_threshold=0.5, iou_threshold=0.5):
    """Detect objects in one frame and return the frame with boxes drawn on it.

    Uses the module-level ``model``, ``transform``, ``draw_box`` and
    ``intersection_over_union``.

    Args:
        frame: Image in OpenCV's BGR channel order.
        score_threshold: Detections with a score not above this are ignored.
        iou_threshold: A detection is suppressed when its IoU with an
            already-kept, higher-scoring detection exceeds this value.

    Returns:
        The annotated frame, converted back to BGR.
    """
    # Convert from BGR to RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    input_image = transform(frame)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        predictions = model([input_image])

    detection = predictions[0]
    labels = detection['labels'].tolist()
    boxes = detection['boxes'].tolist()
    scores = detection['scores'].tolist()

    # Keep confident detections, best score first.
    candidates = sorted(
        (
            (score, box, label)
            for box, label, score in zip(boxes, labels, scores)
            if score > score_threshold
        ),
        key=lambda item: item[0],
        reverse=True,
    )

    # Greedy suppression: of several overlapping boxes only the best-scoring
    # one survives. The previous all-pairs check dropped every member of an
    # overlapping group, erasing the object altogether.
    kept = []
    for _, box, label in candidates:
        if all(
            intersection_over_union(box, kept_box) <= iou_threshold
            for kept_box, _ in kept
        ):
            kept.append((box, label))

    # Draw bounding boxes and labels on the image
    for box, label in kept:
        frame = draw_box(frame, box, label)

    # Convert back from RGB to BGR
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    return frame

### Function to process the video, using all the functions above

In [19]:
def process_video(video_path, output_path):
    """Run ``process_frame`` on every frame of a video and write the result.

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated output video.

    Raises:
        OSError: If the input video cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise OSError(f"Could not open video: {video_path}")

    video_writer = None
    try:
        # Keep the fractional frame rate (e.g. 29.97); truncating it to an int
        # changes the playback speed and duration of the output.
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        frame_count = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # NOTE(review): 'XVID' is usually paired with .avi containers; confirm
        # the backend accepts it for the output extension in use.
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        frame_number = 0
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            processed_frame = process_frame(frame)
            video_writer.write(processed_frame)

            frame_number += 1
            if frame_count > 0:
                progress = (frame_number / frame_count) * 100
                print(f"Processing... {progress:.2f}% completed", end='\r')
            else:
                # Some sources report no frame count; avoid dividing by zero.
                print(f"Processing... frame {frame_number}", end='\r')
    finally:
        # Release both handles even if a frame fails, so the output file is
        # closed and the input is not left locked.
        video_capture.release()
        if video_writer is not None:
            video_writer.release()

    print("\nProcess completed")

In [20]:
# Source clip and destination of the annotated copy.
input_video_path, output_video_path = "input_video2.mp4", "output_video5.mp4"

# Run the full detection pipeline over the clip.
process_video(video_path=input_video_path, output_path=output_video_path)

Processing... 100.00% completed
Process completed
