# Accuracy Measurements for Object Detection and Tracking with YOLOv8, ByteTrack, and the VIRAT Dataset

## First, obtain sample video from dataset for inference

In [10]:
import pandas as pd

In [1]:
# upload sample file: 'VIRAT_S_000200_03_000657_000899_first300.mp4'
# this is the first 300 frames of 'VIRAT_S_000200_03_000657_000899.mp4'

!gdown "https://drive.google.com/uc?id=1C-HPOEIdZnJm_RSbYGXH6gMLhlmobaxn" -O test.mp4

Downloading...
From: https://drive.google.com/uc?id=1C-HPOEIdZnJm_RSbYGXH6gMLhlmobaxn
To: /content/test.mp4
100% 10.4M/10.4M [00:00<00:00, 52.9MB/s]


The clip was created with:

`ffmpeg -i VIRAT_S_000200_03_000657_000899.mp4 -c:v copy -frames:v 300 -r 30 -an ~/Desktop/VIRAT_S_000200_03_000657_000899_first300.mp4`

In [2]:
# display details of clip
!ffprobe -v quiet -print_format json -show_format -show_streams test.mp4

{
    "streams": [
        {
            "index": 0,
            "codec_name": "mpeg4",
            "codec_long_name": "MPEG-4 part 2",
            "profile": "Simple Profile",
            "codec_type": "video",
            "codec_time_base": "1/30",
            "codec_tag_string": "mp4v",
            "codec_tag": "0x7634706d",
            "width": 1280,
            "height": 720,
            "coded_width": 1280,
            "coded_height": 720,
            "has_b_frames": 0,
            "sample_aspect_ratio": "1:1",
            "display_aspect_ratio": "16:9",
            "pix_fmt": "yuv420p",
            "level": 3,
            "color_range": "tv",
            "color_space": "bt709",
            "color_transfer": "bt709",
            "color_primaries": "bt709",
            "chroma_location": "left",
            "refs": 1,
            "quarter_sample": "false",
            "divx_packed": "false",
            "r_frame_rate": "30/1",
            "avg_frame_rate": "30/1",
            "tim

In [3]:
!ls -lh

total 10M
drwxr-xr-x 1 root root 4.0K Jun 23 01:15 sample_data
-rw-r--r-- 1 root root  10M Jun 23 18:46 test.mp4


## Secondly, obtain the ground truth bounding boxes and track IDs for this segment

In [4]:
# VIRAT_S_000200_03_000657_000899.types.yml
!gdown "https://drive.google.com/uc?id=12h_35hXzoSciduBzDWmpKB2-rgUuraiN" -O test.types.yml

Downloading...
From: https://drive.google.com/uc?id=12h_35hXzoSciduBzDWmpKB2-rgUuraiN
To: /content/test.types.yml
  0% 0.00/2.71k [00:00<?, ?B/s]100% 2.71k/2.71k [00:00<00:00, 19.3MB/s]


In [5]:
# VIRAT_S_000200_03_000657_000899.regions.yml
!gdown "https://drive.google.com/uc?id=1Ieau47ZxLLpE6mw04XwjfANeQPB9m7hY" -O test.regions.yml

Downloading...
From: https://drive.google.com/uc?id=1Ieau47ZxLLpE6mw04XwjfANeQPB9m7hY
To: /content/test.regions.yml
  0% 0.00/12.6M [00:00<?, ?B/s]100% 12.6M/12.6M [00:00<00:00, 142MB/s]


In [6]:
# VIRAT_S_000200_03_000657_000899.geom.yml
!gdown "https://drive.google.com/uc?id=1UH9s2MPSZFdJJ7TD827DrmqGPtR_CWdQ" -O test.geom.yml

Downloading...
From: https://drive.google.com/uc?id=1UH9s2MPSZFdJJ7TD827DrmqGPtR_CWdQ
To: /content/test.geom.yml
  0% 0.00/8.54M [00:00<?, ?B/s] 98% 8.39M/8.54M [00:00<00:00, 83.6MB/s]100% 8.54M/8.54M [00:00<00:00, 84.5MB/s]


In [7]:
# VIRAT_S_000200_03_000657_000899.activities.yml
!gdown "https://drive.google.com/uc?id=1tTAnWLE5f9FhbWCe4vElR7gD9aK0jYin" -O test.activities.yml

Downloading...
From: https://drive.google.com/uc?id=1tTAnWLE5f9FhbWCe4vElR7gD9aK0jYin
To: /content/test.activities.yml
  0% 0.00/14.3k [00:00<?, ?B/s]100% 14.3k/14.3k [00:00<00:00, 61.0MB/s]


In [8]:
!ls -lh

total 31M
drwxr-xr-x 1 root root 4.0K Jun 23 01:15 sample_data
-rw-r--r-- 1 root root  14K Jun 23 18:46 test.activities.yml
-rw-r--r-- 1 root root 8.2M Jun 23 18:46 test.geom.yml
-rw-r--r-- 1 root root  10M Jun 23 18:46 test.mp4
-rw-r--r-- 1 root root  12M Jun 23 18:46 test.regions.yml
-rw-r--r-- 1 root root 2.7K Jun 23 18:46 test.types.yml


In [9]:
# load the ground-truth annotation files; the per-frame bounding boxes,
# classes and track ids are extracted from `geom` and `types` in later cells
import yaml


def load_yaml(path):
  """Parse the YAML file at `path` and return the resulting Python object.

  Uses the LibYAML-backed `CSafeLoader` when PyYAML was built with it and
  falls back to the pure-Python `SafeLoader` otherwise. Both are "safe"
  loaders (equivalent to `yaml.safe_load`); the C loader is preferred only
  because the geometry file is several megabytes.
  """
  loader = getattr(yaml, 'CSafeLoader', yaml.SafeLoader)
  with open(path, 'r') as handle:
    return yaml.load(handle, Loader=loader)


geom = load_yaml('test.geom.yml')    # one record per (track, frame) bounding box
types = load_yaml('test.types.yml')  # one record per track with its class label(s)

### Create Ground Truth DataFrame

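As a `pd.DataFrame` with one row per ground-truth detection.<br>
Columns and where each value comes from:<br>
{idx: `id0` of the geom record,<br>
track_id: `id1` of the geom record,<br>
label: the key of `cset3` in the types record whose `id1` matches the track,<br>
conf: the value of `cset3` in the types record whose `id1` matches the track,<br>
frame: `ts0` of the geom record,<br>
xmin: geom_g0.split(' ')[0], ymin: geom_g0.split(' ')[1], xmax: geom_g0.split(' ')[2], ymax: geom_g0.split(' ')[3]}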

In [14]:
# helper function to get gt labels and confidence from types

def get_labels_conf(types, frame_num, track_ids=None):
  """Look up the ground-truth label and confidence for each track id.

  Args:
    types: parsed contents of the ``*.types.yml`` file -- an iterable of
      records; only records shaped like
      ``{'types': {'id1': <track id>, 'cset3': {<label>: <confidence>}}}``
      are used, anything else (e.g. metadata entries) is skipped.
    frame_num: unused; kept so existing two-argument calls keep working.
    track_ids: track ids to look up, in the order the results should be
      returned. When omitted, the notebook-level ``track_id`` list is used,
      which is what the original implementation read implicitly.

  Returns:
    ``(labels, conf)``: two lists with exactly one entry per element of
    ``track_ids``, in the same order. A track with no usable types record
    gets ``None`` in both lists, so the lists always line up with the
    track ids (and with DataFrame columns built from them).
  """
  if track_ids is None:
    # backward-compatible fallback to the notebook global
    track_ids = track_id

  # index the types records once: track id -> (label, confidence)
  by_track = {}
  for entry in types:
    record = entry.get('types') if isinstance(entry, dict) else None
    if not isinstance(record, dict) or 'id1' not in record:
      continue
    cset = record.get('cset3')
    if not cset:
      continue
    # as elsewhere in this notebook, the first cset3 item is taken as the label
    by_track.setdefault(record['id1'], next(iter(cset.items())))

  labels = []
  conf = []
  for tid in track_ids:
    label, score = by_track.get(tid, (None, None))
    labels.append(label)
    conf.append(score)

  return labels, conf

In [27]:
# create a ground truth dataframe, gtdf

frame_num = 0

# geom records belonging to frame `frame_num`; records without a 'geom'
# mapping (the ones the original skipped via a bare except) are filtered
# out explicitly so that real errors are no longer silenced
detections = [
  rec for rec in geom
  if isinstance(rec, dict)
  and isinstance(rec.get('geom'), dict)
  and rec['geom'].get('ts0') == frame_num
]

# parse data from detections
idx = [det['geom']['id0'] for det in detections]
# NOTE: `track_id` must be assigned before get_labels_conf() is called,
# because that helper reads this notebook-level list
track_id = [det['geom']['id1'] for det in detections]
label, conf = get_labels_conf(types, frame_num)
frame = [det['geom']['ts0'] for det in detections]
xmin, ymin, xmax, ymax = [], [], [], []
for det in detections:
  bb = det['geom']['g0'].split()  # 'xmin ymin xmax ymax'
  xmin.append(bb[0])
  ymin.append(bb[1])
  xmax.append(bb[2])
  ymax.append(bb[3])

gt = {'idx': idx, 'track_id': track_id, 'label': label, 'conf': conf, 'frame': frame, 'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax}
gtdf = pd.DataFrame(gt)

# g0 is parsed from text, so the coordinates arrive as strings; store them
# as numbers so they can be compared arithmetically with predicted boxes
box_cols = ['xmin', 'ymin', 'xmax', 'ymax']
gtdf[box_cols] = gtdf[box_cols].apply(pd.to_numeric)
gtdf

Unnamed: 0,idx,track_id,label,conf,frame,xmin,ymin,xmax,ymax
0,0,1,Vehicle,1.0,0,946,327,1010,360
1,7243,2,Vehicle,1.0,0,648,495,801,559
2,14486,3,Vehicle,1.0,0,114,375,237,419
3,21729,4,Vehicle,1.0,0,102,349,212,386
4,28972,5,Vehicle,1.0,0,902,659,1023,718
5,34911,33,Person,1.0,0,281,517,320,577
6,39594,34,Person,1.0,0,730,392,756,437
7,46239,35,Person,1.0,0,1149,644,1185,714
8,51025,5000,Parking_Meter,1.0,0,1056,368,1070,402
9,58268,5001,Dumpster,1.0,0,69,291,117,328


As a list:<br>
format: [[xmin, ymin, xmax, ymax, track_id, class], ... ]

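ISSUE BELOW:
many detections come out with class `-1`. This is not a parsing failure: the cell only maps the labels `Vehicle`, `Person` and `Bike` to class ids (2, 0 and 1), and every other ground-truth label keeps the default `-1`. For example, track 5000 is a `Parking_Meter` and track 5001 is a `Dumpster` in the table above, so both get `-1`.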

In [11]:
frame_num = 0

# Ground-truth label -> integer class id used in the output list. These are
# the same ids the tracker cell requests via classes=[0,1,2].
# Any other label (e.g. Parking_Meter, Dumpster) has no entry here and is
# reported as UNMAPPED_CLASS -- that is where the '-1's come from.
CLASS_ID_BY_LABEL = {'Person': 0, 'Bike': 1, 'Vehicle': 2}
UNMAPPED_CLASS = -1

# bounding boxes and track ids: geom records for frame `frame_num`
# (records without a 'geom' mapping are skipped explicitly instead of
# through a bare except)
frame_1_detections = [
  rec for rec in geom
  if isinstance(rec, dict)
  and isinstance(rec.get('geom'), dict)
  and rec['geom'].get('ts0') == frame_num
]

# index the types records by track id once, instead of rescanning `types`
# for every detection
types_by_id = {}
for entry in types:
  record = entry.get('types') if isinstance(entry, dict) else None
  if isinstance(record, dict) and 'id1' in record:
    types_by_id.setdefault(record['id1'], []).append(entry)

# classes: the types records of every frame detection, in detection order
all_labels = [
  entry
  for det in frame_1_detections
  for entry in types_by_id.get(det['geom'].get('id1'), [])
]

# create list of detection objects [[xmin, ymin, xmax, ymax, track_id, class], ... ]
# (coordinates are kept as the strings parsed from g0, as before)
all_detections_with_labels = []
for det in frame_1_detections:
  # loop-local names are used on purpose so this cell does not overwrite the
  # notebook-level `track_id` / `xmin` / ... lists built for the DataFrame
  box_xmin, box_ymin, box_xmax, box_ymax = det['geom']['g0'].split()
  det_track_id = det['geom']['id1']
  class_label = UNMAPPED_CLASS

  for entry in types_by_id.get(det_track_id, []):
    cset = entry['types'].get('cset3') or {}
    class_str = next(iter(cset), '')  # first cset3 key; '' when cset3 is empty
    # keep the previous value when this label has no mapping
    class_label = CLASS_ID_BY_LABEL.get(class_str, class_label)

  all_detections_with_labels.append(
    [box_xmin, box_ymin, box_xmax, box_ymax, det_track_id, class_label]
  )

all_detections_with_labels

[['946', '327', '1010', '360', 1, 2],
 ['648', '495', '801', '559', 2, 2],
 ['114', '375', '237', '419', 3, 2],
 ['102', '349', '212', '386', 4, 2],
 ['902', '659', '1023', '718', 5, 2],
 ['281', '517', '320', '577', 33, 0],
 ['730', '392', '756', '437', 34, 0],
 ['1149', '644', '1185', '714', 35, 0],
 ['1056', '368', '1070', '402', 5000, -1],
 ['69', '291', '117', '328', 5001, -1],
 ['0', '470', '226', '719', 5025, -1],
 ['72', '423', '95', '487', 5030, -1]]

In [11]:
frame_1_detections[0]

{'geom': {'id1': 1,
  'id0': 0,
  'ts0': 0,
  'ts1': 0,
  'g0': '946 327 1010 360',
  'src': 'truth'}}

In [12]:
all_labels[0]

{'types': {'id1': 1, 'cset3': {'Vehicle': 1.0}}}

In [12]:
key, val = next(iter(all_labels[0]['types']['cset3'].items()))
key

'Vehicle'

In [14]:
key = list(all_labels[0]['types']['cset3'].keys())[0]
key

'Vehicle'

In [15]:
# every `types` record whose track id (id1) matches the first frame detection
first_track_id = (
  frame_1_detections[0]['geom'].get('id1') if frame_1_detections else None
)

# explicit shape checks replace the original bare except, so records without
# a 'types' mapping are skipped while genuine errors still surface
labels_full = [
  entry for entry in types
  if first_track_id is not None
  and isinstance(entry, dict)
  and isinstance(entry.get('types'), dict)
  and entry['types'].get('id1') == first_track_id
]

labels_full

[{'types': {'id1': 1, 'cset3': {'Vehicle': 1.0}}}]

## Thirdly, get inference from YOLOv8 with ByteTracker on this test clip

Install YOLOv8

In [16]:
!pip -q install ultralytics
!pip -q install lap

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lap (setup.py) ... [?25l[?25hdone


In [17]:
from ultralytics import YOLO
import lap

# Build the detector from the `yolov8n.pt` checkpoint; in the recorded run the
# file was not present locally and was downloaded from the ultralytics assets
# release (see the cell output).
model = YOLO('yolov8n.pt')

Downloading https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt to yolov8n.pt...
100%|██████████| 6.23M/6.23M [00:00<00:00, 69.3MB/s]


In [18]:
# Tracking run on the sample clip, configured in one place.
# - classes 0/1/2 are the ids the ground-truth cell maps Person/Bike/Vehicle to
# - save_txt writes per-frame label files (read back later from
#   runs/detect/track/labels/)
# - NOTE(review): device=0 presumably selects the first GPU -- confirm a GPU
#   runtime is attached before running
track_options = {
    'source': '/content/test.mp4',
    'conf': 0.12,
    'iou': 0.5,
    'device': 0,
    'save_txt': True,
    'imgsz': 1280,
    'classes': [0, 1, 2],
    'tracker': "bytetrack.yaml",
    'stream': True,
}
results = model.track(**track_options)

# materialize the per-frame results so they can be indexed by frame below
list_results = list(results)


video 1/1 (1/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 62.9ms
video 1/1 (2/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (3/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (4/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (5/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.3ms
video 1/1 (6/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (7/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.3ms
video 1/1 (8/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 12.8ms
video 1/1 (9/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 12.8ms
video 1/1 (10/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 12.9ms
video 1/1 (11/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 12.8ms
video 1/1 (12/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 12.9ms
video 1/1 (13/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 12.8ms
video 1/1 (14/300) /content/test.mp4: 736x1280 3 persons, 5

In [19]:
# frame 1 detections as .txt file(s)
!cat runs/detect/track/labels/test_1.txt

2 0.137419 0.553092 0.0945308 0.0548357 1
2 0.763876 0.478838 0.0469688 0.0366087 2
2 0.123751 0.511463 0.0826546 0.044233 3
2 0.567661 0.733765 0.111929 0.0792986 4
0 0.232489 0.761387 0.019395 0.080564 5
0 0.581185 0.575837 0.0138645 0.05707 6
2 0.751597 0.962718 0.090546 0.0741219 7
0 0.909582 0.944357 0.0208885 0.095949 8


Format of the .txt files above, one line per detection:
- [class, xmid, ymid, w, h, track_id], where the box values are normalized to the 0–1 range<br><br>

In [20]:
# frame 1 detection as tensor objects
list_results[0].boxes



ultralytics.yolo.engine.results.Boxes object with attributes:

boxes: tensor([[1.1540e+02, 3.7849e+02, 2.3640e+02, 4.1797e+02, 1.0000e+00, 8.8784e-01, 2.0000e+00],
        [9.4770e+02, 3.3158e+02, 1.0078e+03, 3.5794e+02, 2.0000e+00, 8.5510e-01, 2.0000e+00],
        [1.0550e+02, 3.5233e+02, 2.1130e+02, 3.8418e+02, 3.0000e+00, 8.5349e-01, 2.0000e+00],
        [6.5497e+02, 4.9976e+02, 7.9824e+02, 5.5686e+02, 4.0000e+00, 8.5208e-01, 2.0000e+00],
        [2.8517e+02, 5.1920e+02, 3.1000e+02, 5.7720e+02, 5.0000e+00, 8.3465e-01, 0.0000e+00],
        [7.3504e+02, 3.9406e+02, 7.5279e+02, 4.3515e+02, 6.0000e+00, 7.6037e-01, 0.0000e+00],
        [9.0410e+02, 6.6647e+02, 1.0200e+03, 7.1984e+02, 7.0000e+00, 7.5590e-01, 2.0000e+00],
        [1.1509e+03, 6.4540e+02, 1.1776e+03, 7.1448e+02, 8.0000e+00, 6.6577e-01, 0.0000e+00]])
cls: tensor([2., 2., 2., 2., 0., 0., 2., 0.])
conf: tensor([0.8878, 0.8551, 0.8535, 0.8521, 0.8347, 0.7604, 0.7559, 0.6658])
data: tensor([[1.1540e+02, 3.7849e+02, 2.3640e+02, 4

ultralytics.yolo.engine.results.Boxes attribute formats ^:
- Boxes.boxes, Boxes.data use [xmin, ymin, xmax, ymax, track_id, conf, class]
- xywh, xywhn use [xmid, ymid, w, h]
- xyxy, xyxyn use [xmin, ymin, xmax, ymax]

In [21]:
list_results[0].boxes.data[0][6]

tensor(2.)