Задание 1

In [3]:
from ultralytics import YOLO
import numpy as np
import cv2

In [37]:
# Load the YOLOv8x model
model = YOLO("yolov8x.pt")

# Open the input video. Fail fast when it cannot be opened: otherwise the
# processing loop is skipped silently and an empty output file is produced.
video = cv2.VideoCapture("test_video_short.mp4")
if not video.isOpened():
    raise IOError("Cannot open video file: test_video_short.mp4")
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Write the result at the source frame rate so it plays back at the original
# speed; fall back to 20 fps when the container does not report one (0.0).
fps = video.get(cv2.CAP_PROP_FPS) or 20.0

# track id -> (center_x, center_y) of the tracked person
track_ids = {}
# track id -> cv2.KalmanFilter used to smooth that person's trajectory
kalman_filters = {}

# Create the output video writer. 'mp4v' is the FourCC that matches the .mp4
# container ('XVID' belongs to .avi files).
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter('output.mp4', fourcc, fps, (width, height))
   

In [None]:
# Maximum distance in pixels between a track's predicted centre and a detection
# centre for the two to be considered the same person.
MAX_MATCH_DISTANCE = 50


def _new_kalman_filter(center_x, center_y):
    """Create a Kalman filter with state [x, y, vx, vy] located at the given centre.

    The transition matrix adds the velocity components to the position on every
    step, and only the position (x, y) is measured. The initial velocity is zero.
    """
    kalman = cv2.KalmanFilter(4, 2)
    kalman.measurementMatrix = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], np.float32)
    kalman.transitionMatrix = np.array(
        [[1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]], np.float32
    )
    initial_state = np.array([[center_x], [center_y], [0], [0]], np.float32)
    kalman.statePre = initial_state.copy()
    kalman.statePost = initial_state.copy()
    return kalman


try:
    while video.isOpened():
        ret, frame = video.read()
        if not ret:
            break

        results = model.predict(frame, classes=[0], conf=0.2, iou=0.5)

        # Bounding boxes (x1, y1, x2, y2) of the detected people
        detections = results[0].boxes.xyxy.cpu().numpy().astype(int)

        # Predict step: move every known track to where it is expected to be in
        # this frame. Tracks that get no detection below simply keep this
        # predicted position instead of being "corrected" with their own guess.
        for track_id, kalman in kalman_filters.items():
            prediction = kalman.predict()
            # Index both axes: prediction has shape (4, 1), and converting a
            # 1-element array with int() is deprecated in NumPy.
            track_ids[track_id] = (int(prediction[0, 0]), int(prediction[1, 0]))

        # Tracks already assigned in this frame; each track may be matched by
        # at most one detection.
        claimed = set()

        for det in detections:
            x1, y1, x2, y2 = (int(v) for v in det)
            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2

            # Pick the nearest unclaimed track within the matching radius
            best_id = None
            best_distance = MAX_MATCH_DISTANCE
            for track_id, (track_x, track_y) in track_ids.items():
                if track_id in claimed:
                    continue
                distance = np.sqrt((center_x - track_x) ** 2 + (center_y - track_y) ** 2)
                if distance < best_distance:
                    best_id, best_distance = track_id, distance

            if best_id is None:
                # No track is close enough: start a new one with its own filter.
                # Tracks are never removed, so len() + 1 is always a fresh id.
                best_id = len(track_ids) + 1
                kalman_filters[best_id] = _new_kalman_filter(center_x, center_y)
                track_ids[best_id] = (center_x, center_y)
            else:
                # Correct step: blend the prediction with the measured centre
                measurement = np.array([[center_x], [center_y]], np.float32)
                corrected = kalman_filters[best_id].correct(measurement)
                track_ids[best_id] = (int(corrected[0, 0]), int(corrected[1, 0]))
            claimed.add(best_id)

            # Draw the box together with the id assigned to this very detection,
            # so each box carries exactly one label.
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"ID: {best_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        cv2.putText(frame, f'People: {len(detections)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.imshow("Video", frame)
        writer.write(frame)

        # Allow the user to stop early with the 'q' key
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture, finalise the output file and close the preview
    # window even when inference or drawing raises.
    video.release()
    writer.release()
    cv2.destroyAllWindows()

Задание 2

In [15]:
# Load the YOLOv8x model
model = YOLO("yolov8x.pt")

# Open the input video. Fail fast when it cannot be opened: otherwise the
# processing loop is skipped silently and an empty output file is produced.
video = cv2.VideoCapture("test_video_short.mp4")
if not video.isOpened():
    raise IOError("Cannot open video file: test_video_short.mp4")
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Write the result at the source frame rate so it plays back at the original
# speed; fall back to 20 fps when the container does not report one (0.0).
fps = video.get(cv2.CAP_PROP_FPS) or 20.0

# Lower/upper HSV bounds used to pick out orange and green pixels
hsv_orange_lower = np.array([10, 100, 100], dtype="uint8")
hsv_orange_upper = np.array([25, 255, 255], dtype="uint8")
hsv_green_lower = np.array([40, 50, 50], dtype="uint8")
hsv_green_upper = np.array([80, 255, 255], dtype="uint8")

# Create the output video writer. 'mp4v' is the FourCC that matches the .mp4
# container ('XVID' belongs to .avi files).
# NOTE(review): this is the same 'output.mp4' path that the Task 1 cell writes,
# so running both tasks keeps only the last result - confirm this is intended.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter('output.mp4', fourcc, fps, (width, height))
   

In [16]:
def person_has_helmet(det, frame, min_pixels=1):
    """Tell whether the top third of a person's box contains helmet-coloured pixels.

    Args:
        det: Bounding box as (x1, y1, x2, y2) in pixel coordinates.
        frame: BGR image the box refers to.
        min_pixels: Minimum number of orange or green pixels required in the
            head region. The default of 1 reproduces the original check, which
            summed a 0/255 mask and compared it with 100, i.e. it fired on a
            single matching pixel. Raise it to make the check less sensitive.

    Returns:
        True when the orange mask or the green mask has at least `min_pixels`
        matching pixels; False otherwise, including when the head region is empty.
    """
    x1, y1, x2, y2 = (int(v) for v in det)
    # Bottom edge of the head region: one third of the box height below its top
    head_bottom = int(y1 + (y2 - y1) / 3)
    # Clamp to the image origin: negative indices would wrap around when slicing
    x1, y1 = max(x1, 0), max(y1, 0)
    person_roi = frame[y1:head_bottom, x1:x2]
    # Very small or degenerate boxes give an empty slice, which cvtColor rejects
    if person_roi.size == 0:
        return False
    hsv = cv2.cvtColor(person_roi, cv2.COLOR_BGR2HSV)
    mask_orange = cv2.inRange(hsv, hsv_orange_lower, hsv_orange_upper)
    mask_green = cv2.inRange(hsv, hsv_green_lower, hsv_green_upper)
    orange_pixels = cv2.countNonZero(mask_orange)
    green_pixels = cv2.countNonZero(mask_green)
    return orange_pixels >= min_pixels or green_pixels >= min_pixels

In [18]:
try:
    while video.isOpened():
        ret, frame = video.read()
        if not ret:
            break

        results = model.track(frame, persist=True, conf=0.2, classes=[0])

        boxes = results[0].boxes
        detections = boxes.xyxy.cpu().numpy().astype(int)
        # boxes.id is None when the tracker reports no track ids for the frame;
        # calling .int() on it would raise AttributeError.
        track_ids = boxes.id.int().cpu().tolist() if boxes.id is not None else []

        # Helmet check for every detected person. All flags are computed before
        # anything is drawn, so the rectangles never leak into the colour masks.
        no_helmet = [not person_has_helmet(det, frame) for det in detections]

        # Highlight people without a helmet in red
        for det, lacks_helmet in zip(detections, no_helmet):
            if lacks_helmet:
                x1, y1, x2, y2 = (int(v) for v in det)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)

        # Show the result and append it to the output video
        cv2.imshow("Video", frame)
        writer.write(frame)

        # Allow the user to stop early with the 'q' key
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture, finalise the output file and close the preview
    # window even when tracking or drawing raises.
    video.release()
    writer.release()
    cv2.destroyAllWindows()



0: 384x640 4 persons, 1755.6ms
Speed: 3.0ms preprocess, 1755.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 1714.6ms
Speed: 2.0ms preprocess, 1714.6ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1714.8ms
Speed: 2.0ms preprocess, 1714.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1700.5ms
Speed: 4.0ms preprocess, 1700.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1696.5ms
Speed: 2.0ms preprocess, 1696.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1700.5ms
Speed: 3.0ms preprocess, 1700.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1707.6ms
Speed: 3.0ms preprocess, 1707.6ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1710.6ms
Speed: 2.0ms preprocess, 1710.6ms inference, 2.0ms 