In [1]:
import cv2
import numpy as np
import torch
from torchvision import models
from ultralytics import YOLO 

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
!pip install deep-sort-realtime

Defaulting to user installation because normal site-packages is not writeable
Collecting deep-sort-realtime
  Downloading deep_sort_realtime-1.3.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: deep-sort-realtime
Successfully installed deep-sort-realtime-1.3.2


In [2]:

import datetime
from ultralytics import YOLO
import cv2
from deep_sort_realtime.deepsort_tracker import DeepSort
# from google.colab.patches import cv2_imshow


def create_video_writer(video_cap, output_filename):

    # grab the width, height, and fps of the frames in the video stream.
    frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(video_cap.get(cv2.CAP_PROP_FPS))

    # initialize the FourCC and a video writer object
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')
    writer = cv2.VideoWriter(output_filename, fourcc, fps,
                             (frame_width, frame_height))

    return writer


CONFIDENCE_THRESHOLD = 0.8
GREEN = (0, 255, 0)
WHITE = (255, 255, 255)

# initialize the video capture object
video_cap = cv2.VideoCapture("Test.mp4")
# initialize the video writer object
writer = create_video_writer(video_cap, "Testoutput.mp4")

# load the pre-trained YOLOv8n model
model = YOLO("yolov8n.pt")
tracker = DeepSort(max_age=50)


while True:
    start = datetime.datetime.now()

    ret, frame = video_cap.read()

    if not ret:
        break

    # run the YOLO model on the frame
    detections = model(frame)[0]

    # initialize the list of bounding boxes and confidences
    results = []

    ######################################
    # DETECTION
    ######################################

    # loop over the detections
    for data in detections.boxes.data.tolist():
        # extract the confidence (i.e., probability) associated with the prediction
        confidence = data[4]

        # filter out weak detections by ensuring the 
        # confidence is greater than the minimum confidence
        if float(confidence) < 0.1:
            continue

        # if the confidence is greater than the minimum confidence,
        # get the bounding box and the class id
        xmin, ymin, xmax, ymax = int(data[0]), int(data[1]), int(data[2]), int(data[3])
        class_id = int(data[5])
        # add the bounding box (x, y, w, h), confidence and class id to the results list
        results.append([[xmin, ymin, xmax - xmin, ymax - ymin], confidence, class_id])

    ######################################
    # TRACKING
    ######################################

    # update the tracker with the new detections
    tracks = tracker.update_tracks(results, frame=frame)
    # loop over the tracks
    for track in tracks:
        # if the track is not confirmed, ignore it
        if not track.is_confirmed():
            continue

        # get the track id and the bounding box
        track_id = track.track_id
        ltrb = track.to_ltrb()

        xmin, ymin, xmax, ymax = int(ltrb[0]), int(
            ltrb[1]), int(ltrb[2]), int(ltrb[3])
        # draw the bounding box and the track id
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), GREEN, 2)
        cv2.rectangle(frame, (xmin, ymin - 20), (xmin + 20, ymin), GREEN, -1)
        cv2.putText(frame, str(track_id), (xmin + 5, ymin - 8),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, WHITE, 2)

    # end time to compute the fps
    end = datetime.datetime.now()
    # show the time it took to process 1 frame
    print(f"Time to process 1 frame: {(end - start).total_seconds() * 1000:.0f} milliseconds")
    # calculate the frame per second and draw it on the frame
    fps = f"FPS: {1 / (end - start).total_seconds():.2f}"
    cv2.putText(frame, fps, (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 8)

    # show the frame to our screen
    # cv2.imshow("Frame", frame)
    # cv2.imshow(frame)
    writer.write(frame)
    if cv2.waitKey(1) == ord("q"):
        break

video_cap.release()
writer.release()
cv2.destroyAllWindows()

OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'



0: 384x640 35 persons, 2 birds, 142.3ms
Speed: 6.4ms preprocess, 142.3ms inference, 5.3ms postprocess per image at shape (1, 3, 384, 640)
Time to process 1 frame: 15095 milliseconds

0: 384x640 34 persons, 3 birds, 213.1ms
Speed: 3.4ms preprocess, 213.1ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)
Time to process 1 frame: 3684 milliseconds

0: 384x640 35 persons, 3 birds, 132.4ms
Speed: 3.9ms preprocess, 132.4ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)
Time to process 1 frame: 1823 milliseconds

0: 384x640 36 persons, 2 birds, 120.6ms
Speed: 3.1ms preprocess, 120.6ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)
Time to process 1 frame: 1694 milliseconds

0: 384x640 37 persons, 2 birds, 125.6ms
Speed: 2.9ms preprocess, 125.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)
Time to process 1 frame: 2128 milliseconds

0: 384x640 37 persons, 2 birds, 129.2ms
Speed: 3.5ms preprocess, 129.2ms inference, 1.6m

KeyboardInterrupt: 