In [1]:
pip install -r requirements.txt

Collecting gitpython>=3.1.30
Note: you may need to restart the kernel to use updated packages.

  Downloading GitPython-3.1.44-py3-none-any.whl (207 kB)
     -------------------------------------- 207.6/207.6 kB 3.2 MB/s eta 0:00:00
Collecting numpy>=1.23.5
  Downloading numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
     ---------------------------------------- 15.9/15.9 MB 9.8 MB/s eta 0:00:00
Collecting pillow>=10.3.0
  Downloading pillow-11.1.0-cp39-cp39-win_amd64.whl (2.6 MB)
     ---------------------------------------- 2.6/2.6 MB 4.4 MB/s eta 0:00:00
Collecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
     ---------------------------------------- 64.9/64.9 kB 3.4 MB/s eta 0:00:00
Collecting thop>=0.1.1
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting torch>=1.8.0
  Downloading torch-2.5.1-cp39-cp39-win_amd64.whl (203.0 MB)
     -------------------------------------- 203.0/203.0 MB 6.4 MB/s eta 0:00:00
Collecting torchvision

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.4 which is incompatible.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.
conda-repo-cli 1.0.20 requires requests==2.28.1, but you have requests 2.32.3 which is incompatible.


In [8]:
import cv2
import json
import numpy as np
from ultralytics import YOLO

# Detector shared by the whole notebook: built once from the "yolov8n.pt"
# weights file so that detect_objects() can reuse it for every frame.
model = YOLO("yolov8n.pt")

# Box/label colour for each class name the detector may report. The tuples are
# handed unchanged to cv2.rectangle / cv2.putText in detect_objects(); on the
# BGR frames OpenCV decodes they are interpreted in (blue, green, red) order.
# Class names missing from this table are drawn with the caller's fallback.
color_mapping = {
    "person": (255, 0, 0),
    "car": (0, 255, 0),
    # COCO-trained YOLO weights name these classes "bicycle" and "motorcycle".
    # The original "bike" / "motorbike" keys never match those labels; they are
    # kept as aliases so any existing lookup by the old names still works.
    "bicycle": (0, 0, 255),
    "bike": (0, 0, 255),
    "motorcycle": (0, 255, 255),
    "motorbike": (0, 255, 255),
    "truck": (255, 255, 0),
    "bus": (255, 0, 255),
}

def detect_objects(video_path):
    """Run the YOLO model on every frame of a video and save the results as JSON.

    Each frame is passed to the module-level ``model``. Every detected box is
    drawn on the frame together with its class name and confidence, and the
    annotated frame is shown in an OpenCV window titled "frame".

    For each frame that contains at least one "person" detection, one record is
    appended to the results: the last "person" box found in that frame, with
    all non-person boxes of the same frame stored under its "subobject" key.
    After the video ends the records are written to ``detection_results.json``
    in the current working directory.

    Pressing ``q`` in the preview window stops processing early; the records
    collected so far are still written.

    Args:
        video_path: Value passed to ``cv2.VideoCapture`` to open the video.

    Raises:
        IOError: If the video source cannot be opened. Nothing is written in
            that case, so an existing results file is not overwritten with an
            empty list.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video source: {video_path!r}")

    results = []

    # try/finally guarantees the capture handle and preview window are released
    # even if inference or drawing raises part-way through the video.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a read failure): stop processing.
                break

            detections = model(frame)

            main_object = None
            sub_objects = []

            for result in detections:
                for box in result.boxes:
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    class_id = int(box.cls[0])
                    # Plain float keeps the label formatting independent of the
                    # container type the model returns.
                    confidence = float(box.conf[0])

                    object_name = model.names[class_id]
                    # NOTE(review): every detection in a frame receives the same
                    # id (number of records stored so far + 1), and when a frame
                    # has several persons only the last one is kept as the main
                    # object; the earlier persons are not recorded. Confirm
                    # that this is the intended schema.
                    detected_object = {
                        "object": object_name,
                        "id": len(results) + 1,
                        "bbox": [x1, y1, x2, y2],
                        "subobject": {}
                    }

                    # Unknown class names fall back to white.
                    box_color = color_mapping.get(object_name, (255, 255, 255))

                    if object_name == "person":
                        main_object = detected_object
                    else:
                        sub_objects.append(detected_object)

                    cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)
                    cv2.putText(frame, f"{object_name}: {confidence:.2f}", (x1, y1 - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

            if main_object:
                main_object["subobject"] = sub_objects
                results.append(main_object)

            cv2.imshow("frame", frame)
            # imshow only repaints when the GUI event loop runs; waitKey(1)
            # pumps it for ~1 ms and also lets the user quit with 'q'.
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

    with open('detection_results.json', 'w') as json_file:
        json.dump(results, json_file, indent=4)


# Process the sample clip; the detections are written to detection_results.json.
detect_objects("sample_video.mp4")



0: 640x384 2 persons, 2 cars, 1 truck, 350.5ms
Speed: 64.1ms preprocess, 350.5ms inference, 40.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 2 cars, 1 bus, 1 truck, 223.9ms
Speed: 5.9ms preprocess, 223.9ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 3 cars, 1 truck, 246.4ms
Speed: 10.1ms preprocess, 246.4ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 3 cars, 1 truck, 205.1ms
Speed: 12.6ms preprocess, 205.1ms inference, 3.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 1 car, 1 truck, 209.7ms
Speed: 5.0ms preprocess, 209.7ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 1 car, 1 truck, 193.2ms
Speed: 9.8ms preprocess, 193.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 2 cars, 1 bus, 1 truck, 227.2ms
Speed: 4.5ms preprocess, 227.2ms inference, 2.5ms postproces


0: 640x384 3 persons, 4 cars, 194.9ms
Speed: 8.2ms preprocess, 194.9ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 4 cars, 1 bus, 213.3ms
Speed: 4.8ms preprocess, 213.3ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 4 cars, 1 bus, 184.9ms
Speed: 6.8ms preprocess, 184.9ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 3 cars, 1 bus, 226.0ms
Speed: 9.5ms preprocess, 226.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 4 cars, 1 bus, 1 truck, 210.0ms
Speed: 3.7ms preprocess, 210.0ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 4 cars, 1 truck, 227.7ms
Speed: 8.5ms preprocess, 227.7ms inference, 7.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 4 cars, 1 truck, 213.6ms
Speed: 5.7ms preprocess, 213.6ms inference, 0.0ms postprocess per image at shape (1,

Speed: 11.0ms preprocess, 214.6ms inference, 10.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 3 cars, 203.1ms
Speed: 9.4ms preprocess, 203.1ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 3 cars, 188.8ms
Speed: 10.2ms preprocess, 188.8ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 3 cars, 195.7ms
Speed: 0.0ms preprocess, 195.7ms inference, 2.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 3 cars, 232.4ms
Speed: 0.0ms preprocess, 232.4ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 3 cars, 225.9ms
Speed: 3.0ms preprocess, 225.9ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 4 cars, 214.1ms
Speed: 11.5ms preprocess, 214.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 4 cars, 197.7ms
Speed: 7.0ms preprocess, 197.7ms infe