In [26]:
import math

import cv2
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import torchvision.transforms as T
from IPython.display import display, clear_output
from PIL import Image, ImageDraw
from torch import nn
from torchvision.models import resnet50

%config InlineBackend.figure_format = 'retina'
torch.set_grad_enabled(False);

In [15]:
# Fetch the pretrained DETR checkpoint through torch.hub and switch it to
# evaluation mode. nn.Module.eval() returns the module itself, so chaining it
# leaves `model` bound to the loaded network.
model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True).eval()

Using cache found in C:\Users\Dell/.cache\torch\hub\facebookresearch_detr_main


In [20]:
def rescale_bboxes(out_bbox, size):
    """Turn relative centre/size boxes into absolute corner boxes.

    Args:
        out_bbox: tensor of shape (N, 4) whose columns are
            (centre_x, centre_y, width, height), each expressed as a
            fraction of the image size.
        size: ``(img_w, img_h)`` of the target image in pixels.

    Returns:
        Tensor of shape (N, 4) with columns (x0, y0, x1, y1) in pixels,
        located on the same device as ``out_bbox``. An input with zero rows
        yields an output with zero rows.
    """
    img_w, img_h = size
    # Build the scale on the boxes' own device; a CPU-only scale tensor would
    # raise a device-mismatch error as soon as the model runs on a GPU.
    scale = torch.tensor(
        [img_w, img_h, img_w, img_h],
        dtype=torch.float32,
        device=out_bbox.device,
    )
    b = out_bbox * scale
    centers = b[:, :2]
    half_extents = b[:, 2:] / 2
    # Convert to (x0, y0, x1, y1) format
    return torch.cat([centers - half_extents, centers + half_extents], dim=1)

In [24]:
# Class names indexed by the label ids the pretrained DETR checkpoint predicts.
# The checkpoint uses the original COCO category ids (0..90), which contain
# unused gaps, so the table needs 91 slots with 'N/A' placeholders. A compact
# 80-name list shifts every name after 'fire hydrant' and is too short for the
# highest ids.
COCO_CLASSES = [
    # 0-9
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    # 10-19
    'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    # 20-29
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A',
    # 30-39
    'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    # 40-49
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife',
    # 50-59
    'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    # 60-69
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A',
    # 70-79
    'toilet', 'N/A', 'TV', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # 80-89
    'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    # 90
    'toothbrush'
]

# Preprocessing pipeline applied to each frame before it reaches the model:
# resize, convert the PIL image to a tensor, then normalize each channel with
# the given per-channel mean and standard deviation.
transform = T.Compose([
    T.Resize(size=800),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def process_frame(frame, model, threshold=0.9):
    """Run the detector on one OpenCV frame and draw its detections.

    Args:
        frame: BGR image array as returned by ``cv2.VideoCapture.read``.
        model: callable that takes a batched image tensor and returns a
            mapping with ``'pred_logits'`` and ``'pred_boxes'`` entries.
        threshold: minimum class probability for a prediction to be drawn.
            Defaults to 0.9.

    Returns:
        A BGR image array of the same size as ``frame`` with a red rectangle
        and class name drawn for every kept prediction.
    """
    # OpenCV delivers BGR; PIL and the preprocessing pipeline expect RGB.
    pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Preprocess and add the batch dimension the model expects.
    img = transform(pil_img).unsqueeze(0)

    # Forward pass through the DETR model.
    outputs = model(img)

    # Drop the last logit column before thresholding so it can never be
    # selected as a class.
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > threshold
    # Boxes are rescaled against the original image size, not the resized
    # tensor, so they can be drawn directly on pil_img.
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], pil_img.size)

    # Convert to plain Python numbers: Pillow drawing calls and list indexing
    # should not depend on how tensor scalars happen to be coerced.
    boxes = bboxes_scaled.tolist()
    labels = probas[keep].argmax(-1).tolist()

    draw = ImageDraw.Draw(pil_img)
    for (x0, y0, x1, y1), label in zip(boxes, labels):
        # Fall back to the numeric id rather than crashing the whole video
        # run if the class table is shorter than the model's label space.
        name = COCO_CLASSES[label] if label < len(COCO_CLASSES) else str(label)
        draw.rectangle(((x0, y0), (x1, y1)), outline="red", width=3)
        draw.text((x0, y0), f'{name}', fill="red")

    # Convert back to OpenCV's BGR layout for writing.
    return cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

In [37]:
def process_video(input_video_path, output_video_path):
    """Run ``process_frame`` on every frame of a video and save the result.

    Reads ``input_video_path`` frame by frame, passes each frame together
    with the module-level ``model`` to ``process_frame``, and writes the
    returned frames to ``output_video_path`` with the 'mp4v' codec, using the
    input's reported width, height and frame rate.

    Args:
        input_video_path: path of the video to read.
        output_video_path: path of the video file to create.

    Raises:
        IOError: if the input cannot be opened for reading or the output
            cannot be opened for writing. Without these checks a bad path
            would silently produce an empty file and still report success.
    """
    # Capture the input video
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f'Could not open input video: {input_video_path}')

    out = None
    try:
        # Get video properties
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Frame count as reported by the capture backend; used only for the
        # progress messages, the loop itself stops when read() fails.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(total_frames)

        # Define the codec and create VideoWriter object to save the output video
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise IOError(f'Could not open output video for writing: {output_video_path}')

        frame_count = 0  # Initialize frame counter

        # Process each frame
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1  # Update the frame counter

            # Process the frame with DETR
            processed_frame = process_frame(frame, model)

            # Write the processed frame into the output video
            out.write(processed_frame)

            # Print the current frame and total frames
            print(f'Processing frame {frame_count}/{total_frames}')
    finally:
        # Release the handles even if a frame fails, so the input file is
        # unlocked and whatever was written so far is finalized.
        cap.release()
        if out is not None:
            out.release()

    print("Video processing complete!")

# Source clip to annotate, and where the annotated copy is saved
input_video = 'E:/Dataset_project/Crop_videos/vid2_27_7_FaisalTown.mp4'
output_video = 'E:/Dataset_project/Detection/vid2_27_7_FaisalTown.mp4'

# Run detection over the whole clip
process_video(input_video_path=input_video, output_video_path=output_video)

416
Processing frame 1/416
Processing frame 2/416
Processing frame 3/416
Processing frame 4/416
Processing frame 5/416
Processing frame 6/416
Processing frame 7/416
Processing frame 8/416
Processing frame 9/416
Processing frame 10/416
Processing frame 11/416
Processing frame 12/416
Processing frame 13/416
Processing frame 14/416
Processing frame 15/416
Processing frame 16/416
Processing frame 17/416
Processing frame 18/416
Processing frame 19/416
Processing frame 20/416
Processing frame 21/416
Processing frame 22/416
Processing frame 23/416
Processing frame 24/416
Processing frame 25/416
Processing frame 26/416
Processing frame 27/416
Processing frame 28/416
Processing frame 29/416
Processing frame 30/416
Processing frame 31/416
Processing frame 32/416
Processing frame 33/416
Processing frame 34/416
Processing frame 35/416
Processing frame 36/416
Processing frame 37/416
Processing frame 38/416
Processing frame 39/416
Processing frame 40/416
Processing frame 41/416
Processing frame 42/4

Processing frame 333/416
Processing frame 334/416
Processing frame 335/416
Processing frame 336/416
Processing frame 337/416
Processing frame 338/416
Processing frame 339/416
Processing frame 340/416
Processing frame 341/416
Processing frame 342/416
Processing frame 343/416
Processing frame 344/416
Processing frame 345/416
Processing frame 346/416
Processing frame 347/416
Processing frame 348/416
Processing frame 349/416
Processing frame 350/416
Processing frame 351/416
Processing frame 352/416
Processing frame 353/416
Processing frame 354/416
Processing frame 355/416
Processing frame 356/416
Processing frame 357/416
Processing frame 358/416
Processing frame 359/416
Processing frame 360/416
Processing frame 361/416
Processing frame 362/416
Processing frame 363/416
Processing frame 364/416
Processing frame 365/416
Processing frame 366/416
Processing frame 367/416
Processing frame 368/416
Processing frame 369/416
Processing frame 370/416
Processing frame 371/416
Processing frame 372/416
