In [12]:
import json
import pdb
import cv2
import torch
import torchvision.transforms as transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import numpy as np

In [13]:
def get_fasterrcnn(file_path = None, num_classes = 91):
    """Build a Faster R-CNN (ResNet-50 FPN) detector with a resized box predictor.

    The model is created with torchvision's default pretrained weights and its
    box predictor is replaced by a fresh ``FastRCNNPredictor`` with
    ``num_classes`` outputs. If ``file_path`` is given, weights are then loaded
    from that checkpoint.

    Args:
        file_path: Optional path to a checkpoint. It may be either a dict with
            a ``'model_state_dict'`` entry or a bare state dict.
        num_classes: Number of output classes for the box predictor.

    Returns:
        The constructed model (not switched to eval mode here).
    """
    model = fasterrcnn_resnet50_fpn(weights='DEFAULT')
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    print("Número de características de entrada antes de la modificación:", in_features)
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    if file_path is not None:
        # map_location='cpu' lets checkpoints saved from GPU tensors load on
        # machines without CUDA; load_state_dict copies values into the
        # model's own parameters afterwards.
        # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
        checkpoint = torch.load(file_path, map_location='cpu')
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            # Fall back to treating the file as a bare state dict.
            state_dict = checkpoint
        model.load_state_dict(state_dict)

    return model

In [14]:
# Build the detector with a 92-output box predictor, load the fine-tuned
# checkpoint, and switch to evaluation mode for inference.
model = get_fasterrcnn("model_finetuned_v2.pth", 92)
model.eval()

# Disabled alternative: build the label map from the COCO category file and
# append the extra "keys" class as id 91.
# with open('coco_categories.json', 'r') as file:
#     categories_data = json.load(file)
# categories = categories_data["categories"]

# class_dict = {category["id"]: category["name"] for category in categories}
# class_dict[91] = "keys"
# Label id -> display name used when annotating frames. Only ids 0 and 1 have
# names here, although the predictor head above is built with 92 classes.
# NOTE(review): confirm which label id the fine-tuned model emits for "keys".
class_dict = {0: "_", 1: "keys"}
print(class_dict)
print(len(class_dict))

Número de características de entrada antes de la modificación: 1024
{0: '_', 1: 'keys'}
2


In [15]:
# Frame preprocessing pipeline: a single ToTensor step wrapped in Compose so
# further transforms can be appended later.
transform = transforms.Compose([transforms.ToTensor()])

In [16]:
# Function for drawing a bounding box and label on the image
def draw_box(image, box, label):
    """Draw one detection (rectangle plus caption) onto ``image`` in place.

    Args:
        image: Image array to draw on; it is modified and also returned.
        box: Sequence ``(x1, y1, x2, y2)``; values are truncated to int.
        label: Class id. Its display name is looked up in ``class_dict``; ids
            without an entry are shown as the raw id instead of raising.

    Returns:
        The same ``image`` object, annotated.
    """
    color = (0, 255, 0)  # Green
    thickness = 2
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5

    # Fall back to the numeric id so an unmapped label cannot abort a whole
    # video run with a KeyError.
    text = str(class_dict.get(label, label))

    x1, y1, x2, y2 = (int(value) for value in box[:4])
    cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)

    # Put the caption just above the box; when the box touches the top of the
    # image, move it inside the box so it stays visible.
    text_y = y1 - 5
    if text_y < 10:
        text_y = y1 + 15
    cv2.putText(image, text, (x1, text_y), font, font_scale, color, thickness)

    return image

In [17]:
# Intersection-over-union (IoU) of two boxes, used to filter overlapping detections
def intersection_over_union(box_a, box_b):
    """Return the intersection-over-union of two axis-aligned boxes.

    Boxes are ``(x1, y1, x2, y2)`` in continuous coordinates, so widths and
    heights are plain differences (no ``+ 1`` inclusive-pixel adjustment, which
    would give disjoint boxes less than one unit apart a non-zero overlap).

    Args:
        box_a: First box as ``(x1, y1, x2, y2)``.
        box_b: Second box as ``(x1, y1, x2, y2)``.

    Returns:
        IoU as a float in ``[0, 1]``; ``0.0`` when the union has no area.
    """
    x1_a, y1_a, x2_a, y2_a = box_a
    x1_b, y1_b, x2_b, y2_b = box_b

    # Clamp at zero so inverted or degenerate boxes contribute no area.
    area_a = max(0, x2_a - x1_a) * max(0, y2_a - y1_a)
    area_b = max(0, x2_b - x1_b) * max(0, y2_b - y1_b)

    x_intersection = max(0, min(x2_a, x2_b) - max(x1_a, x1_b))
    y_intersection = max(0, min(y2_a, y2_b) - max(y1_a, y1_b))
    intersection = x_intersection * y_intersection

    union = area_a + area_b - intersection
    if union <= 0:
        # Both boxes are degenerate: define the overlap as zero rather than
        # dividing by zero.
        return 0.0

    return intersection / union

In [18]:
def process_frame(frame, score_threshold=0.5, iou_threshold=0.5):
    """Run the detector on one BGR frame and return the annotated BGR frame.

    Steps: convert BGR to RGB, run the global ``model`` on the transformed
    frame, keep detections scoring above ``score_threshold``, suppress
    overlapping boxes, draw the survivors, and convert back to BGR.

    Overlap suppression is greedy by descending score: a box is kept unless its
    IoU with an already-kept (higher-scoring) box exceeds ``iou_threshold``.
    This keeps the best box of every overlapping group instead of discarding
    the whole group.

    Args:
        frame: Frame in BGR channel order, as read by ``cv2.VideoCapture``.
        score_threshold: Minimum score (exclusive) for a detection to be kept.
        iou_threshold: IoU above which a lower-scoring box is suppressed.

    Returns:
        A new BGR frame with boxes and labels drawn on it.
    """
    # Convert from BGR to RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    input_image = transform(frame)

    # Inference only: avoid building an autograd graph for every frame.
    with torch.no_grad():
        predictions = model([input_image])

    detection = predictions[0]
    candidates = [
        (score, box, label)
        for box, label, score in zip(
            detection['boxes'].tolist(),
            detection['labels'].tolist(),
            detection['scores'].tolist(),
        )
        if score > score_threshold
    ]
    # Highest score first, so suppression always favours the better box.
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    kept = []
    for _, box, label in candidates:
        overlaps_kept = any(
            intersection_over_union(box, kept_box) > iou_threshold
            for kept_box, _ in kept
        )
        if not overlaps_kept:
            kept.append((box, label))

    # Draw bounding boxes and labels on the image
    for box, label in kept:
        frame = draw_box(frame, box, label)

    # Convert back from RGB to BGR
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    return frame

In [19]:
def process_video(video_path, output_path):
    """Annotate every frame of a video and write the result to ``output_path``.

    Frames are read one by one from ``video_path``, passed through
    ``process_frame`` and written with the source's frame rate and size.
    Progress is printed on a single, continuously overwritten line.

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated output video.

    Raises:
        IOError: If the input video cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open video: {video_path}")

    video_writer = None
    try:
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 would
        # change the duration of the output.
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        frame_count = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # NOTE(review): XVID is usually paired with .avi containers; confirm
        # the codec/container pairing when writing .mp4 outputs.
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        frame_number = 0
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            processed_frame = process_frame(frame)
            video_writer.write(processed_frame)

            frame_number += 1
            if frame_count > 0:
                progress = (frame_number / frame_count) * 100
                print(f"Processing... {progress:.2f}% completed", end='\r')
            else:
                # Some sources do not report a frame count; show a plain
                # counter instead of dividing by zero.
                print(f"Processing... frame {frame_number}", end='\r')
    finally:
        # Release both handles even if a frame fails to process.
        video_capture.release()
        if video_writer is not None:
            video_writer.release()

    print("\nProcess completed")

In [20]:
# Paths of the source clip and of the annotated result, relative to the
# notebook's working directory.
input_video_path = "input_video2.mp4"
output_video_path = "output_video5.mp4"

# Run detection over every frame and write the annotated video.
process_video(input_video_path, output_video_path)

Processing... 100.00% completed
Process completed
