In [6]:
!python --version

Python 3.8.19


In [1]:
import ultralytics
ultralytics.__version__

'8.2.64'

In [2]:
import torch
torch.__version__

'2.4.0'

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA A100-PCIE-40GB'

# Detect, track and count Persons

In [10]:
%ls

baari.jpg   README.md         small_video.mp4             yolov8l.pt
convert.py  requirements.txt  track_count_persons .ipynb  yolov8n.pt
[0m[01;34mdeep_sort[0m/  [01;34mruns[0m/             video.mp4


In [4]:
from ultralytics import YOLO

import time
import torch
import cv2
import torch.backends.cudnn as cudnn
from PIL import Image
import colorsys
import numpy as np

In [5]:


# Load a model
model = YOLO("yolov8l.pt")  # load a pretrained model (recommended for training)

results = model("baari.jpg", save=True)



class_names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']

for result in results:
    boxes = result.boxes  # Boxes object for bbox outputs
    probs = result.probs  # Class probabilities for classification outputs
    cls = boxes.cls.tolist()  # Convert tensor to list
    xyxy = boxes.xyxy
    xywh = boxes.xywh  # box with xywh format, (N, 4)
    conf = boxes.conf
    print(cls)
    for class_index in cls:
        class_name = class_names[int(class_index)]
        print("Class:", class_name)


image 1/1 /nfsshare/baari/STUDIO/YOLO_DEEPSORT/baari.jpg: 640x640 2 persons, 1 cup, 1 chair, 1 refrigerator, 1 book, 10.6ms
Speed: 7.0ms preprocess, 10.6ms inference, 571.1ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns/detect/predict5[0m
[0.0, 56.0, 0.0, 41.0, 73.0, 72.0]
Class: person
Class: chair
Class: person
Class: cup
Class: book
Class: refrigerator


# DeepSORT

In [6]:
from deep_sort.utils.parser import get_config
from deep_sort.deep_sort import DeepSort
from deep_sort.sort.tracker import Tracker

deep_sort_weights = 'deep_sort/deep/checkpoint/ckpt.t7'
tracker = DeepSort(model_path=deep_sort_weights, max_age=70)

In [14]:
#%pip install easydict

In [15]:
!wget -O video.mp4 https://www.pexels.com/download/video/4296854/

--2024-07-27 18:41:55--  https://www.pexels.com/download/video/4296854/
Resolving www.pexels.com (www.pexels.com)... 104.18.66.220, 104.18.67.220, 2606:4700::6812:43dc, ...
Connecting to www.pexels.com (www.pexels.com)|104.18.66.220|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://videos.pexels.com/video-files/4296854/4296854-uhd_3840_2160_25fps.mp4 [following]
--2024-07-27 18:41:56--  https://videos.pexels.com/video-files/4296854/4296854-uhd_3840_2160_25fps.mp4
Resolving videos.pexels.com (videos.pexels.com)... 104.18.67.220, 104.18.66.220, 2606:4700::6812:43dc, ...
Connecting to videos.pexels.com (videos.pexels.com)|104.18.67.220|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27221342 (26M)
Saving to: ‘video.mp4’


2024-07-27 18:42:02 (4.15 MB/s) - ‘video.mp4’ saved [27221342/27221342]



In [7]:
# Define the video path
video_path = 'data/baari_shakthi_trimmed.mp4'
cap = cv2.VideoCapture(video_path)

# Get the video properties
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_path = 'output.mp4'
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load model once before the loop
model = YOLO("yolov8s.pt")  # Load the pretrained model


In [8]:
# Color array for bounding boxes
colors = [(0, 0, 255), (255, 0, 0), (0, 255, 0)]  # Red, Blue, Green

person_info = {}

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Convert the frame to RGB
    og_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Perform detection
    results = model(og_frame, device=device, classes=0, conf=0.8)
    class_names = ['person']

    # Prepare data for DeepSORT
    bbox_xywh = []
    confs = []

    for result in results:
        boxes = result.boxes
        cls = boxes.cls.tolist()
        xyxy = boxes.xyxy.cpu().numpy()
        conf = boxes.conf.cpu().numpy()

        for i, (bbox, cls_id, confidence) in enumerate(zip(xyxy, cls, conf)):
            class_name = class_names[int(cls_id)]
            if class_name == 'person':  # Track only 'person' class
                x1, y1, x2, y2 = map(int, bbox)
                w, h = x2 - x1, y2 - y1

                # Append to bbox list
                bbox_xywh.append([x1, y1, w, h])
                confs.append(float(confidence))

    # Convert lists to numpy arrays for DeepSORT
    bbox_xywh = np.array(bbox_xywh)
    confs = np.array(confs)

    # Run DeepSORT tracking
    outputs = tracker.update(bbox_xywh, confs, og_frame)

    # Process tracking outputs
    for output in outputs:
        x1, y1, x2, y2, track_id = output
        color = colors[track_id % len(colors)]


        # Ensure the coordinates are within the frame dimensions
        x1 = max(0, min(x1, frame_width))
        y1 = max(0, min(y1, frame_height))
        x2 = max(0, min(x2, frame_width))
        y2 = max(0, min(y2, frame_height))
        
        # Draw bounding box and ID
        cv2.rectangle(og_frame, (x1, y1), (x2, y2), color, 2)
        cv2.putText(og_frame, f"ID: {track_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Store person info in dictionary
        person_info[track_id] = {
            'bbox': (x1, y1, x2-x1, y2-y1),
            'confidence': confs[track_id % len(confs)],
            'color': color
        }

    # Draw person count on frame
    person_count = len(person_info)
    cv2.putText(og_frame, f"Person Count: {person_count}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    # Convert the frame back to BGR for OpenCV compatibility
    out_frame = cv2.cvtColor(og_frame, cv2.COLOR_RGB2BGR)
    out.write(out_frame)


cap.release()
out.release()
cv2.destroyAllWindows()

# Print person information
print("Person Information:")
for person_id, info in person_info.items():
    print(f"ID: {person_id}, BBox: {info['bbox']}, Confidence: {info['confidence']}, Color: {info['color']}")


0: 384x640 4 persons, 53.0ms
Speed: 2.0ms preprocess, 53.0ms inference, 8.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 8.6ms
Speed: 2.6ms preprocess, 8.6ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 7.4ms
Speed: 2.3ms preprocess, 7.4ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 7.3ms
Speed: 2.2ms preprocess, 7.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 7.0ms
Speed: 3.1ms preprocess, 7.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 6.9ms
Speed: 2.5ms preprocess, 6.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 7.0ms
Speed: 2.1ms preprocess, 7.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 7.3ms
Speed: 2.1ms preprocess, 7.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640