Способ 1: только встроенными средствами YOLO (трекер ByteTrack)

In [2]:
import torch
import sys
from collections import defaultdict
import time
import datetime

import cv2
import numpy as np

from ultralytics import YOLO

# print(cv2.getBuildInformation())

# Load the face-detection weights. Use the GPU when one is available and fall
# back to the CPU so the cell still runs on machines without CUDA.
model = YOLO('../../models/yolov8n-face.pt')
model.to("cuda" if torch.cuda.is_available() else "cpu")

# GStreamer pipeline that pulls the RTSP stream with minimal latency.
# NOTE(review): the RTSP user/password are hardcoded in the pipeline string;
# consider reading them from environment variables before sharing the notebook.
video_path = "rtspsrc location=rtsp://127.0.0.1:18554/test user-id=user user-pw=pass latency=0 ! rtpjitterbuffer drop-on-latency=true ! decodebin ! videoconvert ! appsink"
cap = cv2.VideoCapture(video_path, cv2.CAP_GSTREAMER)

# Recent (x, y) positions of every tracked box, keyed by track id.
track_history = defaultdict(list)

# try/finally guarantees the capture and the HighGUI windows are released even
# when the cell is interrupted from the notebook (KeyboardInterrupt).
try:
    while cap.isOpened():
        # Pressing "q" in the preview window stops the loop.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

        # perf_counter is monotonic and high resolution, unlike wall-clock time.
        start = time.perf_counter()
        success, frame = cap.read()
        if not success:
            print("no frame")
            continue

        # Run YOLOv8 tracking on the frame, persisting tracks between frames.
        results = model.track(frame, persist=True, tracker="../../cfg/bytetrack.yaml")

        # `boxes.id` is None while the tracker has not assigned any ids yet;
        # compare by identity because `!=` on a tensor is not a plain bool test.
        if len(results) != 0 and results[0].boxes.id is not None:
            boxes = results[0].boxes.xywh.cpu()
            track_ids = results[0].boxes.id.int().cpu().tolist()

            # Replace the raw frame with the annotated one (boxes, ids, scores).
            frame = results[0].plot()

            for (x, y, w, h), track_id in zip(boxes, track_ids):
                track = track_history[track_id]
                track.append((float(x), float(y)))
                # Keep only the 30 most recent points of each trajectory.
                if len(track) > 30:
                    track.pop(0)

                # polylines expects int32 points shaped (N, 1, 2).
                points = np.hstack(track).astype(np.int32).reshape((-1, 1, 2))
                cv2.polylines(frame, [points], isClosed=False, color=(230, 230, 230), thickness=10)

        # Guard against a zero interval so the overlay can never raise
        # ZeroDivisionError on a coarse clock.
        elapsed = time.perf_counter() - start
        fps = f"FPS: {1 / elapsed:.2f}" if elapsed > 0 else "FPS: inf"
        cv2.putText(frame, fps, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 8)

        cv2.imshow("YOLOv8 Tracking", frame)
finally:
    cap.release()
    cv2.destroyAllWindows()




0: 384x640 1 face, 3.8ms
Speed: 2.4ms preprocess, 3.8ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 3.9ms
Speed: 1.5ms preprocess, 3.9ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 4.6ms
Speed: 2.1ms preprocess, 4.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 6.0ms
Speed: 1.3ms preprocess, 6.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 5.6ms
Speed: 1.2ms preprocess, 5.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 4.8ms
Speed: 1.9ms preprocess, 4.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 5.4ms
Speed: 1.7ms preprocess, 5.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 4.0ms
Speed: 1.2ms preprocess, 4.0ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 face, 3.9m

KeyboardInterrupt: 

Способ 2: с помощью трекера DeepSORT (библиотека deep_sort_realtime)

In [3]:
from collections import defaultdict
import time
import cv2
import numpy as np
import datetime
from ultralytics import YOLO
import cv2
from deep_sort_realtime.deepsort_tracker import DeepSort
from ultralytics import YOLO

# Load the YOLOv8 face-detection model
model = YOLO('../../models/yolov8n-face.pt')

# Open the RTSP stream through a low-latency GStreamer pipeline.
# NOTE(review): the RTSP user/password are hardcoded in the pipeline string;
# consider reading them from environment variables before sharing the notebook.
video_path = "rtspsrc location=rtsp://127.0.0.1:18554/test user-id=user user-pw=pass latency=0 ! rtpjitterbuffer drop-on-latency=true ! decodebin ! videoconvert ! appsink"
cap = cv2.VideoCapture(video_path, cv2.CAP_GSTREAMER)

FPS = 10  # Desired FPS value (not referenced by the loop below)
CONFIDENCE_THRESHOLD = 0.4
GREEN = (0, 255, 0)
WHITE = (255, 255, 255)
# Store the track history (not referenced by the loop below)
track_history = defaultdict(lambda: [])
tracker = DeepSort(max_age=50)

# try/finally guarantees the capture and the HighGUI windows are released even
# when the cell is interrupted from the notebook (KeyboardInterrupt).
try:
    while cap.isOpened():
        # perf_counter is monotonic and high resolution, unlike wall-clock time.
        start = time.perf_counter()
        # Read a frame from the video
        success, frame = cap.read()

        if not success:
            # Keep polling the keyboard so "q" still stops the loop while the
            # stream delivers no frames.
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
            continue

        # Detection only: DeepSORT below does the tracking, so YOLO's built-in
        # tracker is not needed. With `model.track` the raw `boxes.data` rows
        # gain a track-id column, which shifts confidence/class away from
        # indices 4 and 5; the named accessors used below avoid that pitfall.
        detections = model.predict(frame)[0]
        boxes = detections.boxes

        ######################################
        # DETECTION
        ######################################

        # Collect detections as ([left, top, width, height], confidence, class id).
        # The list may be empty: the tracker is still updated so that stale
        # tracks age out and the frame is still displayed.
        results = []
        for xyxy, confidence, class_id in zip(
            boxes.xyxy.tolist(), boxes.conf.tolist(), boxes.cls.tolist()
        ):
            # filter out weak detections by ensuring the
            # confidence is greater than the minimum confidence
            if float(confidence) < CONFIDENCE_THRESHOLD:
                continue

            xmin, ymin, xmax, ymax = (int(value) for value in xyxy)
            results.append([[xmin, ymin, xmax - xmin, ymax - ymin], confidence, int(class_id)])

        ######################################
        # TRACKING
        ######################################

        # update the tracker with the new detections
        tracks = tracker.update_tracks(results, frame=frame)
        # loop over the tracks
        for track in tracks:
            # if the track is not confirmed, ignore it
            if not track.is_confirmed():
                continue

            # get the track id and the bounding box (left, top, right, bottom)
            track_id = track.track_id
            ltrb = track.to_ltrb()
            xmin, ymin, xmax, ymax = (int(value) for value in ltrb[:4])

            # draw the bounding box and the track id
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), GREEN, 2)
            cv2.rectangle(frame, (xmin, ymin - 20), (xmin + 20, ymin), GREEN, -1)
            cv2.putText(frame, str(track_id), (xmin + 5, ymin - 8),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, WHITE, 2)

        elapsed = time.perf_counter() - start
        print(f"Time to process 1 frame: {elapsed * 1000:.0f} milliseconds")
        # Guard against a zero interval so the overlay can never raise
        # ZeroDivisionError on a coarse clock.
        fps = f"FPS: {1 / elapsed:.2f}" if elapsed > 0 else "FPS: inf"
        cv2.putText(frame, fps, (50, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 8)

        # show the frame to our screen
        cv2.imshow("Frame", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the video capture object and close the display window
    cap.release()
    cv2.destroyAllWindows()



0: 384x640 1 face, 8.9ms
Speed: 1.9ms preprocess, 8.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


KeyboardInterrupt: 