In [7]:
import gc
import os
import time
from collections import deque

import cv2
import torch
import torchvision
from PIL import Image
from torchvision.models.detection import RetinaNet_ResNet50_FPN_V2_Weights, RetinaNet_ResNet50_FPN_Weights, retinanet_resnet50_fpn, retinanet_resnet50_fpn_v2
from torchvision.models.detection.retinanet import _COCO_CATEGORIES
from torchvision.transforms import functional as F


In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [9]:
# RetinaNet with a ResNet-50 FPN backbone (v2 recipe) and its default
# pretrained weights. The v1 variant is available via
# retinanet_resnet50_fpn(weights=RetinaNet_ResNet50_FPN_Weights.DEFAULT).
weights = RetinaNet_ResNet50_FPN_V2_Weights.DEFAULT
model = retinanet_resnet50_fpn_v2(weights=weights).to(device)
# Switch to evaluation mode; as the cell's last expression this also
# displays the module structure.
model.eval()

RetinaNet(
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      

In [10]:
# Input clip and destination for the annotated copy.
video_path = './data/lange_10.mp4'
output_path = './result/lange_10_Retina.mp4'

cap = cv2.VideoCapture(video_path)
# Fail fast: an unopened capture reports zeros for every property and the
# processing loop would otherwise silently produce an empty result.
if not cap.isOpened():
    raise FileNotFoundError(f'Cannot open input video: {video_path}')

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 would make the
# written video play at a different speed than the source.
fps = cap.get(cv2.CAP_PROP_FPS)

# VideoWriter does not create missing directories; it just fails to open.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

fourcc = cv2.VideoWriter_fourcc(*'mp4v')

out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
if not out.isOpened():
    # Do not leave the capture handle dangling when the writer is unusable.
    cap.release()
    raise RuntimeError(f'Cannot open output video for writing: {output_path}')


In [11]:
def detect(frame):
    """Run the detection model on a single video frame.

    Args:
        frame: Image in OpenCV's BGR channel order, as returned by
            ``cv2.VideoCapture.read``.

    Returns:
        The model's output for a batch containing only this frame. Callers
        index element ``0`` and read its ``'boxes'``, ``'scores'`` and
        ``'labels'`` entries.
    """
    # The model is fed RGB, while OpenCV decodes frames as BGR.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Convert to a tensor, add a leading batch dimension, and move the batch
    # to the same device as the model.
    batch = F.to_tensor(rgb_frame).unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        return model(batch)


In [12]:
# Minimum confidence for a detection to be drawn.
SCORE_THRESHOLD = 0.5

# Vehicle categories to keep. The label ids are looked up by name in
# _COCO_CATEGORIES because torchvision detection models use the 91-slot COCO
# indexing (car=3, motorcycle=4, bus=6, truck=8), not the 0-based 80-class
# indexing (2, 3, 5, 7), which in this table means bicycle/car/airplane/train.
VEHICLE_NAMES = ('car', 'motorcycle', 'bus', 'truck')
vehicle_classes = [_COCO_CATEGORIES.index(name) for name in VEHICLE_NAMES]

BOX_COLOR = (0, 255, 0)  # green in BGR
LINE_THICKNESS = 6

# Count the frames actually processed; the frame count reported by the
# container may differ from what the decoder really delivers.
processed_frames = 0
fps_start_time = time.perf_counter()

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        prediction = detect(frame)[0]
        # Move each result tensor to the host once per frame instead of
        # calling .item()/.tolist() per detection, which forces a separate
        # device synchronisation for every box.
        boxes = prediction['boxes'].cpu().tolist()
        scores = prediction['scores'].cpu().tolist()
        labels = prediction['labels'].cpu().tolist()

        for (x1, y1, x2, y2), score, label in zip(boxes, scores, labels):
            if score <= SCORE_THRESHOLD or label not in vehicle_classes:
                continue
            left, top = int(x1), int(y1)
            cv2.rectangle(frame, (left, top), (int(x2), int(y2)), BOX_COLOR, LINE_THICKNESS)
            cv2.putText(frame, f'{_COCO_CATEGORIES[label]}: {score:.2f}', (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.2, BOX_COLOR, LINE_THICKNESS)

        out.write(frame)
        processed_frames += 1

    fps_end_time = time.perf_counter()
finally:
    # Release resources even if inference or drawing raised, so the output
    # file is finalised and the input handle is not leaked.
    cap.release()
    out.release()
    cv2.destroyAllWindows()
    # Collect garbage first so the tensors it frees can then be returned
    # from the CUDA caching allocator.
    gc.collect()
    torch.cuda.empty_cache()

elapsed_seconds = fps_end_time - fps_start_time
average_fps = processed_frames / elapsed_seconds if elapsed_seconds > 0 else 0.0

print("Average FPS:", average_fps)

Average FPS: 1.447239785616117


0