# YOLOv8n Model
# ("n" nano: smallest, quickest, but lowest accuracy)
## Inference and IOU Score on 300 frames of VIRAT Ground Video
- 5 Vehicles
- 3 Persons

## 1. Data Import

In [1]:
!python --version

Python 3.10.12


In [2]:
import pandas as pd
import numpy as np
import torch

In [3]:
# upload sample file: 'VIRAT_S_000200_03_000657_000899_first300.mp4'
# this is the first 300 frames of 'VIRAT_S_000200_03_000657_000899.mp4'
# viewable at: https://drive.google.com/file/d/1C-HPOEIdZnJm_RSbYGXH6gMLhlmobaxn/view?usp=drive_link

!gdown "https://drive.google.com/uc?id=1C-HPOEIdZnJm_RSbYGXH6gMLhlmobaxn" -O test.mp4

Downloading...
From: https://drive.google.com/uc?id=1C-HPOEIdZnJm_RSbYGXH6gMLhlmobaxn
To: /content/test.mp4
  0% 0.00/10.4M [00:00<?, ?B/s]100% 10.4M/10.4M [00:00<00:00, 144MB/s]


I used `ffmpeg -i VIRAT_S_000200_03_000657_000899.mp4 -c:v copy -frames:v 300 -r 30 -an ~/Desktop/VIRAT_S_000200_03_000657_000899_first300.mp4` to create cut the first 300 frames of this video to a clip



In [4]:
# display details of clip
!ffprobe -v quiet -print_format json -show_format -show_streams test.mp4

{
    "streams": [
        {
            "index": 0,
            "codec_name": "mpeg4",
            "codec_long_name": "MPEG-4 part 2",
            "profile": "Simple Profile",
            "codec_type": "video",
            "codec_time_base": "1/30",
            "codec_tag_string": "mp4v",
            "codec_tag": "0x7634706d",
            "width": 1280,
            "height": 720,
            "coded_width": 1280,
            "coded_height": 720,
            "has_b_frames": 0,
            "sample_aspect_ratio": "1:1",
            "display_aspect_ratio": "16:9",
            "pix_fmt": "yuv420p",
            "level": 3,
            "color_range": "tv",
            "color_space": "bt709",
            "color_transfer": "bt709",
            "color_primaries": "bt709",
            "chroma_location": "left",
            "refs": 1,
            "quarter_sample": "false",
            "divx_packed": "false",
            "r_frame_rate": "30/1",
            "avg_frame_rate": "30/1",
            "tim

In [5]:
!ls -lh

total 10M
drwxr-xr-x 1 root root 4.0K Jun 26 13:35 sample_data
-rw-r--r-- 1 root root  10M Jun 27 18:21 test.mp4


## 2. Ground Truth

- Data from https://gitlab.kitware.com/viratdata/viratannotations
- obtain the ground truth bounding boxes data
- store in Pandas DataFrame 'gtdf'

In [6]:
# VIRAT_S_000200_03_000657_000899.types.yml
!gdown "https://drive.google.com/uc?id=12h_35hXzoSciduBzDWmpKB2-rgUuraiN" -O test.types.yml

Downloading...
From: https://drive.google.com/uc?id=12h_35hXzoSciduBzDWmpKB2-rgUuraiN
To: /content/test.types.yml
  0% 0.00/2.71k [00:00<?, ?B/s]100% 2.71k/2.71k [00:00<00:00, 15.4MB/s]


In [7]:
# VIRAT_S_000200_03_000657_000899.regions.yml
!gdown "https://drive.google.com/uc?id=1Ieau47ZxLLpE6mw04XwjfANeQPB9m7hY" -O test.regions.yml

Downloading...
From: https://drive.google.com/uc?id=1Ieau47ZxLLpE6mw04XwjfANeQPB9m7hY
To: /content/test.regions.yml
  0% 0.00/12.6M [00:00<?, ?B/s]100% 12.6M/12.6M [00:00<00:00, 194MB/s]


In [8]:
# VIRAT_S_000200_03_000657_000899.geom.yml
!gdown "https://drive.google.com/uc?id=1UH9s2MPSZFdJJ7TD827DrmqGPtR_CWdQ" -O test.geom.yml

Downloading...
From: https://drive.google.com/uc?id=1UH9s2MPSZFdJJ7TD827DrmqGPtR_CWdQ
To: /content/test.geom.yml
  0% 0.00/8.54M [00:00<?, ?B/s]100% 8.54M/8.54M [00:00<00:00, 110MB/s]


In [9]:
# VIRAT_S_000200_03_000657_000899.activities.yml
!gdown "https://drive.google.com/uc?id=1tTAnWLE5f9FhbWCe4vElR7gD9aK0jYin" -O test.activities.yml

Downloading...
From: https://drive.google.com/uc?id=1tTAnWLE5f9FhbWCe4vElR7gD9aK0jYin
To: /content/test.activities.yml
  0% 0.00/14.3k [00:00<?, ?B/s]100% 14.3k/14.3k [00:00<00:00, 59.1MB/s]


In [10]:
!ls -lh

total 31M
drwxr-xr-x 1 root root 4.0K Jun 26 13:35 sample_data
-rw-r--r-- 1 root root  14K Jun 27 18:21 test.activities.yml
-rw-r--r-- 1 root root 8.2M Jun 27 18:21 test.geom.yml
-rw-r--r-- 1 root root  10M Jun 27 18:21 test.mp4
-rw-r--r-- 1 root root  12M Jun 27 18:21 test.regions.yml
-rw-r--r-- 1 root root 2.7K Jun 27 18:21 test.types.yml


In [11]:
# get bounding boxes, classes and track ids for detections in all 300 frames of video
import yaml

file_path = 'test.geom.yml'
with open(file_path, 'r') as file:
    geom = yaml.safe_load(file)
file_path = 'test.types.yml'
with open(file_path, 'r') as file:
    types = yaml.safe_load(file)

### Create Ground Truth DataFrame

AS pd.DataFrame </br>
format: </br>
{idx: geom_id0, </br>
track_id: geom_id1, </br>
label: types_cset3-key_on_id1, </br>
conf: types_cset3-value_on_id1, </br>
frame: geom_ts0, </br>
xmin: geom_g0.split(' ')[0], ymin: geom_g0.split(' ')[1], xmax: geom_g0.split(' ')[2], ymax: geom_g0.split(' ')[3]}

In [12]:
detections = []

# lists for populating dictionary then DataFrame
idx = []
track_id = []
labels = []
labels_ints = []
confs = []
frame = []
xmin, ymin, xmax, ymax = [], [], [], []

for i in geom:
  try:
    if i['geom']['ts0'] < 300:  # populates detections data from first 300 frames
      detections.append(i)
      idx.append(i['geom']['id0'])
      track_id.append(i['geom']['id1'])
      frame.append(i['geom']['ts0'])
      bb = i['geom']['g0'].split(' ')
      xmin.append(int(bb[0]))
      ymin.append(int(bb[1]))
      xmax.append(int(bb[2]))
      ymax.append(int(bb[3]))

      for j in types:
        try:
          if j['types']['id1'] == track_id[-1]:  # pulls labels and confidences (by track id --> 'id1') from types.yaml file
            label, conf = next(iter(j['types']['cset3'].items()))
            confs.append(conf)
            labels.append(label)
            # also save labels as ints for comparison with YOLOv8 Predictions later on
            if label == 'Person':
              labels_ints.append(0)
            elif label == 'Bike':
              labels_ints.append(1)
            elif label == 'Vehicle':
              labels_ints.append(2)
            else:
              labels_ints.append(-1)
        except:
          pass
  except:
    pass

print('idx: ', len(idx),
      'track_id: ', len(track_id),
      'label: ', len(labels),
      'conf: ', len(confs),
      'frame: ', len(frame))


gt = {'idx_gt': idx, 'track_id_gt': track_id, 'label_gt': labels,
      'label_as_int_gt': labels_ints, 'conf_gt': confs, 'frame_gt': frame,
      'xmin_gt': xmin, 'ymin_gt': ymin, 'xmax_gt': xmax, 'ymax_gt': ymax}
df_gt = pd.DataFrame(gt)
df_gt

idx:  3406 track_id:  3406 label:  3406 conf:  3406 frame:  3406


Unnamed: 0,idx_gt,track_id_gt,label_gt,label_as_int_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt
0,0,1,Vehicle,2,1.0,0,946,327,1010,360
1,1,1,Vehicle,2,1.0,1,945,326,1009,359
2,2,1,Vehicle,2,1.0,2,945,326,1009,359
3,3,1,Vehicle,2,1.0,3,945,326,1009,359
4,4,1,Vehicle,2,1.0,4,945,326,1009,359
...,...,...,...,...,...,...,...,...,...,...
3401,73049,5030,Door,-1,1.0,295,72,423,95,487
3402,73050,5030,Door,-1,1.0,296,72,423,95,487
3403,73051,5030,Door,-1,1.0,297,72,423,95,487
3404,73052,5030,Door,-1,1.0,298,72,423,95,487


## Let's drop all detections besides Cars, People, and Bikes

In [13]:
df_gt = df_gt[df_gt['label_as_int_gt'] != -1]
df_gt.groupby('label_as_int_gt').count()

Unnamed: 0_level_0,idx_gt,track_id_gt,label_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt
label_as_int_gt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,900,900,900,900,900,900,900,900,900
2,1306,1306,1306,1306,1306,1306,1306,1306,1306


In [14]:
df_gt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2206 entries, 0 to 2205
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   idx_gt           2206 non-null   int64  
 1   track_id_gt      2206 non-null   int64  
 2   label_gt         2206 non-null   object 
 3   label_as_int_gt  2206 non-null   int64  
 4   conf_gt          2206 non-null   float64
 5   frame_gt         2206 non-null   int64  
 6   xmin_gt          2206 non-null   int64  
 7   ymin_gt          2206 non-null   int64  
 8   xmax_gt          2206 non-null   int64  
 9   ymax_gt          2206 non-null   int64  
dtypes: float64(1), int64(8), object(1)
memory usage: 189.6+ KB


## 3. Get Predictions
- get inference from YOLOv8 with ByteTracker on this test clip
- store predictions in Pandas DataFrame 'preddf'

Install YOLOv8

In [15]:
!pip -q install ultralytics
!pip -q install lap

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.4/612.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lap (setup.py) ... [?25l[?25hdone


In [16]:
from ultralytics import YOLO
import lap

model = YOLO('yolov8n.pt')

Downloading https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt to yolov8n.pt...
100%|██████████| 6.23M/6.23M [00:00<00:00, 78.9MB/s]


In [17]:
results = model.track(source='/content/test.mp4', conf=0.12, iou=0.5,
                      device=0, save_txt=True, imgsz=1280, classes=[0,1,2],
                      tracker="bytetrack.yaml", stream=True)
list_results = list(results)


video 1/1 (1/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 106.0ms
video 1/1 (2/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.5ms
video 1/1 (3/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (4/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 14.7ms
video 1/1 (5/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (6/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (7/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.4ms
video 1/1 (8/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.1ms
video 1/1 (9/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 15.5ms
video 1/1 (10/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.1ms
video 1/1 (11/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.1ms
video 1/1 (12/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.1ms
video 1/1 (13/300) /content/test.mp4: 736x1280 3 persons, 5 cars, 13.1ms
video 1/1 (14/300) /content/test.mp4: 736x1280 3 persons, 

With `save_txt = True`, .txt files are in format `class_label xmid_ratio ymid_ratio width height track_id`

In [18]:
# frame 1 detections as .txt file(s)
!cat runs/detect/track/labels/test_1.txt

2 0.137419 0.553092 0.0945308 0.0548357 1
2 0.763876 0.478838 0.0469688 0.0366087 2
2 0.123751 0.511463 0.0826546 0.044233 3
2 0.567661 0.733765 0.111929 0.0792986 4
0 0.232489 0.761387 0.019395 0.080564 5
0 0.581185 0.575837 0.0138645 0.05707 6
2 0.751597 0.962718 0.090546 0.0741219 7
0 0.909582 0.944357 0.0208885 0.095949 8


A better way:</br>
ultralytics.yolo.engine.results.Boxes attribute formats:
- Boxes.data use [xmin, ymin, xmax, ymax, track_id, conf, class]

In [19]:
list_results[0].boxes.data

tensor([[1.1540e+02, 3.7849e+02, 2.3640e+02, 4.1797e+02, 1.0000e+00, 8.8784e-01, 2.0000e+00],
        [9.4770e+02, 3.3158e+02, 1.0078e+03, 3.5794e+02, 2.0000e+00, 8.5510e-01, 2.0000e+00],
        [1.0550e+02, 3.5233e+02, 2.1130e+02, 3.8418e+02, 3.0000e+00, 8.5349e-01, 2.0000e+00],
        [6.5497e+02, 4.9976e+02, 7.9824e+02, 5.5686e+02, 4.0000e+00, 8.5208e-01, 2.0000e+00],
        [2.8517e+02, 5.1920e+02, 3.1000e+02, 5.7720e+02, 5.0000e+00, 8.3465e-01, 0.0000e+00],
        [7.3504e+02, 3.9406e+02, 7.5279e+02, 4.3515e+02, 6.0000e+00, 7.6037e-01, 0.0000e+00],
        [9.0410e+02, 6.6647e+02, 1.0200e+03, 7.1984e+02, 7.0000e+00, 7.5590e-01, 2.0000e+00],
        [1.1509e+03, 6.4540e+02, 1.1776e+03, 7.1448e+02, 8.0000e+00, 6.6577e-01, 0.0000e+00]])

### Create DataFrame for predictions

In [20]:
len(list_results[0].boxes.data)

8

In [21]:
# Create dictionary of predictions `df_pred

track_id = []
label = []
conf = []
frame = []
xmin, ymin, xmax, ymax = [], [], [], []

for i in range(len(list_results)):  # loop through all frames
  for j in range(len(list_results[i].boxes.data)):  # loop through all detections per frame
    bb = [torch.round(k) for k in iter(list_results[i].boxes.data[j])]  # enumerate data per detection: [xmin, ymin, xmax, ymax, track_id, conf, class]
    xmin.append(int(bb[0]))
    ymin.append(int(bb[1]))
    xmax.append(int(bb[2]))
    ymax.append(int(bb[3]))
    track_id.append(int(bb[4]))
    conf.append(int(bb[5]))
    label.append(int(bb[6]))
    frame.append(i)  # frames in gt start from 0

pred = {'track_id_pred': track_id, 'label_pred': label, 'conf_pred': conf, 'frame_pred': frame, 'xmin_pred': xmin, 'ymin_pred': ymin, 'xmax_pred': xmax, 'ymax_pred': ymax}

df_pred = pd.DataFrame(pred)
df_pred.head()

Unnamed: 0,track_id_pred,label_pred,conf_pred,frame_pred,xmin_pred,ymin_pred,xmax_pred,ymax_pred
0,1,2,1,0,115,378,236,418
1,2,2,1,0,948,332,1008,358
2,3,2,1,0,106,352,211,384
3,4,2,1,0,655,500,798,557
4,5,0,1,0,285,519,310,577


In [22]:
df_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2181 entries, 0 to 2180
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   track_id_pred  2181 non-null   int64
 1   label_pred     2181 non-null   int64
 2   conf_pred      2181 non-null   int64
 3   frame_pred     2181 non-null   int64
 4   xmin_pred      2181 non-null   int64
 5   ymin_pred      2181 non-null   int64
 6   xmax_pred      2181 non-null   int64
 7   ymax_pred      2181 non-null   int64
dtypes: int64(8)
memory usage: 136.4 KB


## Compare/Contrast dataframes

How many track_ids?

In [23]:
# ground truth
df_gt.groupby('track_id_gt').count()

Unnamed: 0_level_0,idx_gt,label_gt,label_as_int_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt
track_id_gt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,300,300,300,300,300,300,300,300,300
2,300,300,300,300,300,300,300,300,300
3,300,300,300,300,300,300,300,300,300
4,300,300,300,300,300,300,300,300,300
5,106,106,106,106,106,106,106,106,106
33,300,300,300,300,300,300,300,300,300
34,300,300,300,300,300,300,300,300,300
35,300,300,300,300,300,300,300,300,300


In [24]:
# predicted
df_pred.groupby('track_id_pred').count()

Unnamed: 0_level_0,label_pred,conf_pred,frame_pred,xmin_pred,ymin_pred,xmax_pred,ymax_pred
track_id_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,300,300,300,300,300,300,300
2,300,300,300,300,300,300,300
3,300,300,300,300,300,300,300
4,300,300,300,300,300,300,300
5,300,300,300,300,300,300,300
6,300,300,300,300,300,300,300
7,81,81,81,81,81,81,81
8,300,300,300,300,300,300,300


## Get IOU scores for each frame & object of video

In [25]:
def get_iou(boxA, boxB):
    """
    Calculate the Intersection over Union (IoU) of two bounding boxes.

    Parameters:
    boxA -- [xmin, ymin, xmax, ymax] of box A
    boxB -- [xmin, ymin, xmax, ymax] of box B

    Returns:
    iou   -- Intersection over Union
    """
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)

    # compute the area of both the prediction and ground-truth rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # compute the intersection over union by taking the intersection area and dividing it by the sum of prediction + ground-truth areas - the intersection area
    iou = interArea / float(boxAArea + boxBArea - interArea)

    # return the intersection over union value
    return iou

In [26]:
# Create a cross join dataframe
df_gt['key'] = 1
df_pred['key'] = 1
df_cross = pd.merge(df_gt, df_pred, left_on=['key', 'frame_gt'], right_on=['key', 'frame_pred']).drop("key", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gt['key'] = 1


In [27]:
df_cross.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16090 entries, 0 to 16089
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   idx_gt           16090 non-null  int64  
 1   track_id_gt      16090 non-null  int64  
 2   label_gt         16090 non-null  object 
 3   label_as_int_gt  16090 non-null  int64  
 4   conf_gt          16090 non-null  float64
 5   frame_gt         16090 non-null  int64  
 6   xmin_gt          16090 non-null  int64  
 7   ymin_gt          16090 non-null  int64  
 8   xmax_gt          16090 non-null  int64  
 9   ymax_gt          16090 non-null  int64  
 10  track_id_pred    16090 non-null  int64  
 11  label_pred       16090 non-null  int64  
 12  conf_pred        16090 non-null  int64  
 13  frame_pred       16090 non-null  int64  
 14  xmin_pred        16090 non-null  int64  
 15  ymin_pred        16090 non-null  int64  
 16  xmax_pred        16090 non-null  int64  
 17  ymax_pred   

In [28]:
%%time

# Calculate IOU for each pair
df_cross['iou'] = df_cross.apply(lambda row: get_iou(row[['xmin_gt', 'ymin_gt', 'xmax_gt', 'ymax_gt']], row[['xmin_pred', 'ymin_pred', 'xmax_pred', 'ymax_pred']]), axis=1)

# Find the pairs with maximum IOU for each ground truth bounding box
df_matched = df_cross.loc[df_cross.groupby(['idx_gt'])['iou'].idxmax()]

CPU times: user 14.5 s, sys: 67.3 ms, total: 14.6 s
Wall time: 14.7 s


In [29]:
df_matched.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2206 entries, 1 to 16089
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   idx_gt           2206 non-null   int64  
 1   track_id_gt      2206 non-null   int64  
 2   label_gt         2206 non-null   object 
 3   label_as_int_gt  2206 non-null   int64  
 4   conf_gt          2206 non-null   float64
 5   frame_gt         2206 non-null   int64  
 6   xmin_gt          2206 non-null   int64  
 7   ymin_gt          2206 non-null   int64  
 8   xmax_gt          2206 non-null   int64  
 9   ymax_gt          2206 non-null   int64  
 10  track_id_pred    2206 non-null   int64  
 11  label_pred       2206 non-null   int64  
 12  conf_pred        2206 non-null   int64  
 13  frame_pred       2206 non-null   int64  
 14  xmin_pred        2206 non-null   int64  
 15  ymin_pred        2206 non-null   int64  
 16  xmax_pred        2206 non-null   int64  
 17  ymax_pred    

In [30]:
# How many missed predictions? ie ground truth box exists but no prediction box was given
df_matched[df_matched['iou']<.01].count()

idx_gt             25
track_id_gt        25
label_gt           25
label_as_int_gt    25
conf_gt            25
frame_gt           25
xmin_gt            25
ymin_gt            25
xmax_gt            25
ymax_gt            25
track_id_pred      25
label_pred         25
conf_pred          25
frame_pred         25
xmin_pred          25
ymin_pred          25
xmax_pred          25
ymax_pred          25
iou                25
dtype: int64

In [31]:
len(df_gt) - len(df_pred)

25

In [32]:
df_matched[df_matched['iou']<.01]

Unnamed: 0,idx_gt,track_id_gt,label_gt,label_as_int_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt,track_id_pred,label_pred,conf_pred,frame_pred,xmin_pred,ymin_pred,xmax_pred,ymax_pred,iou
4124,29036,5,Vehicle,2,1.0,64,1123,519,1203,579,1,2,1,64,115,379,236,418,0.0
4180,29037,5,Vehicle,2,1.0,65,1126,517,1205,577,1,2,1,65,115,379,236,418,0.0
4236,29038,5,Vehicle,2,1.0,66,1129,515,1207,575,1,2,1,66,115,378,236,418,0.0
4292,29039,5,Vehicle,2,1.0,67,1132,514,1209,573,1,2,1,67,115,378,236,418,0.0
4348,29040,5,Vehicle,2,1.0,68,1135,512,1211,571,1,2,1,68,115,378,237,418,0.0
4404,29041,5,Vehicle,2,1.0,69,1138,510,1213,569,1,2,1,69,115,378,236,418,0.0
4460,29042,5,Vehicle,2,1.0,70,1141,508,1216,567,1,2,1,70,114,378,236,418,0.0
4900,29049,5,Vehicle,2,1.0,77,1161,495,1231,553,1,2,1,77,115,378,236,418,0.0
4956,29050,5,Vehicle,2,1.0,78,1163,494,1233,551,1,2,1,78,116,378,236,418,0.0
5012,29051,5,Vehicle,2,1.0,79,1165,493,1235,549,1,2,1,79,115,378,236,418,0.0


In [33]:
df_matched

Unnamed: 0,idx_gt,track_id_gt,label_gt,label_as_int_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt,track_id_pred,label_pred,conf_pred,frame_pred,xmin_pred,ymin_pred,xmax_pred,ymax_pred,iou
1,0,1,Vehicle,2,1.0,0,946,327,1010,360,2,2,1,0,948,332,1008,358,0.745249
65,1,1,Vehicle,2,1.0,1,945,326,1009,359,2,2,1,1,947,331,1008,358,0.785520
129,2,1,Vehicle,2,1.0,2,945,326,1009,359,2,2,1,2,947,331,1008,358,0.785520
193,3,1,Vehicle,2,1.0,3,945,326,1009,359,2,2,1,3,948,331,1008,358,0.772851
257,4,1,Vehicle,2,1.0,4,945,326,1009,359,2,2,1,4,948,331,1008,358,0.772851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15893,46534,35,Person,0,1.0,295,1192,575,1240,639,8,0,1,295,1201,578,1227,636,0.500157
15942,46535,35,Person,0,1.0,296,1192,575,1240,639,8,0,1,296,1201,578,1227,635,0.491680
15991,46536,35,Person,0,1.0,297,1192,575,1240,639,8,0,1,297,1201,578,1227,635,0.491680
16040,46537,35,Person,0,1.0,298,1192,575,1240,639,8,0,1,298,1201,577,1227,635,0.500157


In [34]:
df_matched.describe()

Unnamed: 0,idx_gt,track_id_gt,label_as_int_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt,track_id_pred,label_pred,conf_pred,frame_pred,xmin_pred,ymin_pred,xmax_pred,ymax_pred,iou
count,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0,2206.0
mean,23867.247053,15.471442,1.184044,1.0,144.839075,611.516319,440.395286,690.878513,491.404352,4.212149,1.184044,0.966455,144.839075,603.762013,443.074343,676.12874,488.421578,0.722826
std,15723.507103,15.424598,0.983141,0.0,87.283254,403.081605,96.108346,384.374472,109.153721,2.269258,0.983141,0.180095,87.283254,402.77466,96.166437,382.701641,109.00804,0.14514
min,0.0,1.0,0.0,1.0,0.0,102.0,326.0,212.0,359.0,1.0,0.0,0.0,0.0,105.0,331.0,211.0,358.0,0.0
25%,7494.25,2.0,0.0,1.0,68.25,114.0,349.0,237.0,386.0,2.0,0.0,1.0,68.25,116.0,353.0,236.0,384.0,0.604839
50%,21931.5,4.0,2.0,1.0,142.0,647.0,389.0,801.0,434.0,4.0,2.0,1.0,142.0,655.0,392.0,798.0,431.0,0.772851
75%,39641.75,34.0,2.0,1.0,221.0,945.0,516.0,1009.0,577.0,6.0,2.0,1.0,221.0,948.0,518.0,1008.0,577.0,0.834739
max,46538.0,35.0,2.0,1.0,299.0,1223.0,659.0,1268.0,718.0,8.0,2.0,1.0,299.0,1217.0,666.0,1257.0,720.0,0.903763


### IOU Scores averaged for all 300 frames
- for all detections collectively
- per object individually

In [35]:
print('average IOU for all detections across the 300 frames')
df_matched['iou'].mean()

average IOU for all detections across the 300 frames


0.7228258909968059

In [36]:
print('average IOU for each distinct object')
df_matched.groupby('track_id_gt')[['label_as_int_gt', 'iou']].mean()

average IOU for each distinct object


Unnamed: 0_level_0,label_as_int_gt,iou
track_id_gt,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.0,0.771644
2,2.0,0.830496
3,2.0,0.883685
4,2.0,0.831063
5,2.0,0.597647
33,0.0,0.587742
34,0.0,0.59615
35,0.0,0.603232


In [37]:
print('the People on average had the worst IOU scores')
print('the Vehicles did very well, \nexcept for the one vehicle which is moving and disappears behind tree branches. \nThe detection model missed the last 25 frames of \nground truth for this. \nPartial Occlusion appears to be an issue.')

the People on average had the worst IOU scores
the Vehicles did very well, 
except for the one vehicle which is moving and disappears behind tree branches. 
The detection model missed the last 25 frames of 
ground truth for this. 
Partial Occlusion appears to be an issue.
