# Creating a Custom Dataset for training YOLOv8

In [1]:
import yaml
import numpy as np
import pandas as pd
import os
import cv2

In [2]:
# Prints a fixed string; presumably a quick check that the kernel is running.
# Nothing defined here is used by later cells.
print('open the pod bay doors, hal')

open the pod bay doors, hal


### Get File Names of geom.yml and types.yml annotation files from train and validate folders
* `diva_train_path`
    - `train_files`
* `diva_validate_path`
    - `valid_files`

In [3]:
# get diva annotation files from training folder
diva_train_path = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/diva_annotations/train'
train_dir = os.listdir(diva_train_path)

# Keep only the geometry / types annotation files, matched on the full suffix
# rather than on a fixed character slice of the file name.
train_geom_types = sorted(
    item for item in train_dir if item.endswith(('.geom.yml', '.types.yml'))
)

# Pair each geom file with the types file that shares its prefix. Pairing by
# name (instead of zipping alternating entries of the sorted list) means a video
# missing one of its two files is skipped rather than shifting every later pair.
train_names_present = set(train_geom_types)
train_files = []
for geom_name in train_geom_types:
    if not geom_name.endswith('.geom.yml'):
        continue
    types_name = geom_name[:-len('.geom.yml')] + '.types.yml'
    if types_name in train_names_present:
        train_files.append((geom_name, types_name))
train_files

[('VIRAT_S_000000.geom.yml', 'VIRAT_S_000000.types.yml'),
 ('VIRAT_S_000001.geom.yml', 'VIRAT_S_000001.types.yml'),
 ('VIRAT_S_000002.geom.yml', 'VIRAT_S_000002.types.yml'),
 ('VIRAT_S_000005.geom.yml', 'VIRAT_S_000005.types.yml'),
 ('VIRAT_S_000200_03_000657_000899.geom.yml',
  'VIRAT_S_000200_03_000657_000899.types.yml'),
 ('VIRAT_S_000200_05_001525_001575.geom.yml',
  'VIRAT_S_000200_05_001525_001575.types.yml'),
 ('VIRAT_S_000201_03_000640_000672.geom.yml',
  'VIRAT_S_000201_03_000640_000672.types.yml'),
 ('VIRAT_S_000201_05_001081_001215.geom.yml',
  'VIRAT_S_000201_05_001081_001215.types.yml'),
 ('VIRAT_S_000201_06_001354_001397.geom.yml',
  'VIRAT_S_000201_06_001354_001397.types.yml'),
 ('VIRAT_S_000201_07_001485_001581.geom.yml',
  'VIRAT_S_000201_07_001485_001581.types.yml'),
 ('VIRAT_S_000201_08_001652_001838.geom.yml',
  'VIRAT_S_000201_08_001652_001838.types.yml'),
 ('VIRAT_S_000202_02_001527_001560.geom.yml',
  'VIRAT_S_000202_02_001527_001560.types.yml'),
 ('VIRAT_S_00020

In [4]:
# Number of (geom, types) annotation pairs found in the training folder.
len(train_files)

64

In [5]:
# get diva annotation files from validation folder
diva_validate_path = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/diva_annotations/validate'
valid_dir = os.listdir(diva_validate_path)

# Keep only the geometry / types annotation files, matched on the full suffix
# rather than on a fixed character slice of the file name.
valid_geom_types = sorted(
    item for item in valid_dir if item.endswith(('.geom.yml', '.types.yml'))
)

# Pair each geom file with the types file that shares its prefix. Pairing by
# name (instead of zipping alternating entries of the sorted list) means a video
# missing one of its two files is skipped rather than shifting every later pair.
valid_names_present = set(valid_geom_types)
valid_files = []
for geom_name in valid_geom_types:
    if not geom_name.endswith('.geom.yml'):
        continue
    types_name = geom_name[:-len('.geom.yml')] + '.types.yml'
    if types_name in valid_names_present:
        valid_files.append((geom_name, types_name))
valid_files

[('VIRAT_S_000007.geom.yml', 'VIRAT_S_000007.types.yml'),
 ('VIRAT_S_000008.geom.yml', 'VIRAT_S_000008.types.yml'),
 ('VIRAT_S_000200_00_000100_000171.geom.yml',
  'VIRAT_S_000200_00_000100_000171.types.yml'),
 ('VIRAT_S_000200_02_000479_000635.geom.yml',
  'VIRAT_S_000200_02_000479_000635.types.yml'),
 ('VIRAT_S_000201_00_000018_000380.geom.yml',
  'VIRAT_S_000201_00_000018_000380.types.yml'),
 ('VIRAT_S_000201_01_000384_000589.geom.yml',
  'VIRAT_S_000201_01_000384_000589.types.yml'),
 ('VIRAT_S_000201_02_000590_000623.geom.yml',
  'VIRAT_S_000201_02_000590_000623.types.yml'),
 ('VIRAT_S_000201_04_000682_000822.geom.yml',
  'VIRAT_S_000201_04_000682_000822.types.yml'),
 ('VIRAT_S_000203_01_000171_000345.geom.yml',
  'VIRAT_S_000203_01_000171_000345.types.yml'),
 ('VIRAT_S_000203_08_001702_001734.geom.yml',
  'VIRAT_S_000203_08_001702_001734.types.yml'),
 ('VIRAT_S_000204_07_001577_001611.geom.yml',
  'VIRAT_S_000204_07_001577_001611.types.yml'),
 ('VIRAT_S_000204_09_001768_001849.geo

In [6]:
# Number of (geom, types) annotation pairs found in the validation folder.
len(valid_files)

55

### Take a sample of the Train and Validate files for data extraction
- 30 from Train
- 10 from Validate

In [7]:
import random
# Fixed seed so the same annotation pairs are drawn on every run. The training
# sample is drawn first, so reordering the two sample() calls would change both.
random.seed(40)

# Draw 30 training and 10 validation (geom, types) pairs without replacement;
# each sample is sorted so its order does not depend on the draw order.
train_sample = sorted(random.sample(train_files, 30))
valid_sample = sorted(random.sample(valid_files, 10))

In [8]:
# Confirm the size of the training sample.
len(train_sample)

30

In [9]:
# Confirm the size of the validation sample.
len(valid_sample)

10

### Match Annotation Files to Video Files
* `train_sample` <--> `train_videos`
* `valid_sample` <--> `valid_videos`

In [10]:
# Video file name for each sampled pair: the text before the first '.' of the
# geom annotation name, with '.mp4' appended.
train_videos = [pair[0].split('.')[0] + '.mp4' for pair in train_sample]
len(train_videos)

30

In [11]:
# Spot-check entry 27: the geom annotation name and the video name derived from
# it should share the same prefix.
print(train_sample[27][0])
print(train_videos[27])

VIRAT_S_050000_01_000207_000361.geom.yml
VIRAT_S_050000_01_000207_000361.mp4


In [12]:
# Video file name for each sampled pair: the text before the first '.' of the
# geom annotation name, with '.mp4' appended.
valid_videos = [pair[0].split('.')[0] + '.mp4' for pair in valid_sample]
len(valid_videos)

10

In [13]:
# Spot-check entry 3: the types annotation name and the video name should share
# the same prefix.
print(valid_sample[3][1])
print(valid_videos[3])

VIRAT_S_000206_08_001618_001712.types.yml
VIRAT_S_000206_08_001618_001712.mp4


### Get 200 images of 'Persons' with ground truth from single Training Video

1. Import the geom.yml and types.yml files
2. Build a DataFrame from (1)
3. Filter for 'Persons' and create a sample of frames
4. Write the frames as .jpgs to the Custom_Dataset/train/images/ folder, with '[file-prefix]_[frame-num].jpg' as the file name
5. Write .txt files by querying the full DataFrame per frame for the detections' labels and bboxes. Write them to the Custom_Dataset/train/labels/ folder, with '[file-prefix]_[frame-num].txt' as the file name
6. Repeat 1-5 for the Validation Set

In [14]:
# First sampled training pair as (geom file, types file).
train_sample[0]

('VIRAT_S_000001.geom.yml', 'VIRAT_S_000001.types.yml')

In [15]:
%%time
# Prefer the LibYAML-backed safe loader when PyYAML was built with it: it is
# still a "safe" loader but is typically much faster on large annotation files.
# Fall back to the pure-Python SafeLoader (what yaml.safe_load uses) otherwise.
annotation_loader = getattr(yaml, 'CSafeLoader', yaml.SafeLoader)

# geom
geom_path = diva_train_path + '/' + train_sample[0][0]
with open(geom_path, 'r') as file:
    geom = yaml.load(file, Loader=annotation_loader)

# types
types_path = diva_train_path + '/' + train_sample[0][1]
with open(types_path, 'r') as file:
    types = yaml.load(file, Loader=annotation_loader)

CPU times: user 54.2 s, sys: 434 ms, total: 54.6 s
Wall time: 54.6 s


In [17]:
# Turns the geom.yml and types.yml files into a single dataframe

# Integer class ids, saved for comparison with YOLOv8 predictions later on;
# any label not listed here is stored as -1.
label_to_int = {'Person': 0, 'Bike': 1, 'Vehicle': 2}

# Index the types file once by track id ('id1') -> (label, confidence), so each
# detection is resolved with a dict lookup instead of rescanning every types entry.
track_labels = {}
for entry in types or []:
    info = entry.get('types') if isinstance(entry, dict) else None
    if not isinstance(info, dict) or 'id1' not in info:
        continue
    cset = info.get('cset3')
    if not isinstance(cset, dict) or not cset:
        continue
    # The first item of cset3 is used as the track's label; if a track id
    # appears more than once, the first record wins.
    track_labels.setdefault(info['id1'], next(iter(cset.items())))

detections = []  # all detections
id0 = []  # unique detection index
id1 = []  # track_id
ts0 = []  # frame
labels = []  # string class labels
labels_ints = []  # integer class labels
confs = []
xmin, ymin, xmax, ymax = [], [], [], []
skipped = 0  # detections dropped because they were malformed or had no label

for entry in geom or []:
    g = entry.get('geom') if isinstance(entry, dict) else None
    if not isinstance(g, dict) or g.get('ts0') is None:
        continue  # not a detection record (no 'geom' key or no frame number)

    # Parse everything for this detection before appending anything, so a bad
    # record can never leave the parallel lists with different lengths.
    try:
        det_id, track_id, frame = g['id0'], g['id1'], g['ts0']
        x0, y0, x1, y1 = (int(v) for v in g['g0'].split()[:4])
        label, conf = track_labels[track_id]
    except (KeyError, ValueError, AttributeError, TypeError):
        skipped += 1
        continue

    detections.append(entry)
    id0.append(det_id)
    id1.append(track_id)
    ts0.append(frame)
    xmin.append(x0)
    ymin.append(y0)
    xmax.append(x1)
    ymax.append(y1)
    labels.append(label)
    confs.append(conf)
    labels_ints.append(label_to_int.get(label, -1))

print('idx: ', len(id0),
      'track_id: ', len(id1),
      'label: ', len(labels),
      'conf: ', len(confs),
      'frame: ', len(ts0))  # sanity check
if skipped:
    print('skipped malformed or unlabeled detections: ', skipped)


gt = {'idx_gt': id0, 'track_id_gt': id1, 'label_gt': labels,
      'label_as_int_gt': labels_ints, 'conf_gt': confs, 'frame_gt': ts0,
      'xmin_gt': xmin, 'ymin_gt': ymin, 'xmax_gt': xmax, 'ymax_gt': ymax}
df_gt = pd.DataFrame(gt)
df_gt

idx:  179573 track_id:  179573 label:  179573 conf:  179573 frame:  179573


Unnamed: 0,idx_gt,track_id_gt,label_gt,label_as_int_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt
0,0,1,Person,0,1.0,3455,1,663,77,795
1,1,1,Person,0,1.0,3456,1,663,77,795
2,2,1,Person,0,1.0,3457,1,663,77,795
3,3,1,Person,0,1.0,3458,1,663,77,795
4,4,1,Person,0,1.0,3459,1,663,77,795
...,...,...,...,...,...,...,...,...,...,...
179568,179568,5003,Other,-1,1.0,20650,760,758,778,830
179569,179569,5003,Other,-1,1.0,20651,760,758,778,830
179570,179570,5003,Other,-1,1.0,20652,760,758,778,830
179571,179571,5003,Other,-1,1.0,20653,760,758,778,830


In [18]:
# filter for cars and people

# CHANGE TO JUST PEOPLE
# Integer class column: 0 marks 'Person' rows and 2 marks 'Vehicle' rows.
class_ids = df_gt['label_as_int_gt']
df_persons = df_gt[class_ids == 0]
df_cars = df_gt[class_ids == 2]

In [19]:
# sample frames: 30 cars and 70 persons

# CHANGE TO 200 PEOPLE. NO CARS.
# Keep one row per frame before sampling so the same frame is never drawn twice
# (a frame containing several persons has several rows in df_persons).
persons_one_per_frame = df_persons.drop_duplicates(subset='frame_gt')
cars_one_per_frame = df_cars.drop_duplicates(subset='frame_gt')

# random_state makes the draw repeatable across re-runs; min() avoids a
# ValueError when a video has fewer distinct frames than requested.
df_persons_70 = persons_one_per_frame.sample(
    n=min(70, len(persons_one_per_frame)), random_state=40)
df_cars_30 = cars_one_per_frame.sample(
    n=min(30, len(cars_one_per_frame)), random_state=40)

In [20]:
# get frame numbers from these samples

# Ascending frame numbers of the sampled rows, as plain Python lists.
persons_frames = df_persons_70['frame_gt'].sort_values().tolist()
cars_frames = df_cars_30['frame_gt'].sort_values().tolist()

In [28]:
# Sorted frame numbers (frame_gt) taken from the sampled person rows.
persons_frames

[4739,
 4752,
 5095,
 5288,
 5729,
 5756,
 5943,
 6184,
 6272,
 6314,
 6480,
 7132,
 7533,
 7669,
 7777,
 7863,
 7940,
 8015,
 8237,
 8414,
 8500,
 8515,
 8530,
 8849,
 8991,
 9185,
 9218,
 9479,
 9597,
 9633,
 9669,
 10046,
 10058,
 10099,
 10112,
 10260,
 11061,
 11296,
 11353,
 11742,
 11917,
 12236,
 12241,
 12306,
 12562,
 12682,
 12789,
 13051,
 13441,
 13475,
 13788,
 14111,
 14367,
 14749,
 15016,
 15111,
 15368,
 15646,
 15904,
 16215,
 16234,
 16529,
 16796,
 16801,
 17056,
 17764,
 18758,
 19777,
 20087,
 20191]

In [29]:
# grab .mp4

# Folder holding the original videos (trailing '/' included), followed by the
# full path of the first sampled training video.
vid_path = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/videos_original/'
this_vid_path = f'{vid_path}{train_videos[0]}'
this_vid_path

'/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/videos_original/VIRAT_S_000001.mp4'

In [25]:
# declare destination
# Root folder of the custom dataset.
# NOTE(review): the writer cell below hard-codes its own train/labels and
# train/images paths and does not read `loc` — confirm whether it is still needed.

loc = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/Custom/'

In [45]:
def convert_bbox_format(bbox, total_width, total_height):
    """
    Turn a corner-style box into a normalized center-style box.

    Parameters:
    bbox (list): Four numbers in the order [xmin, ymin, xmax, ymax].
    total_width (int): Width of the frame the box was measured in.
    total_height (int): Height of the frame the box was measured in.

    Returns:
    list: [x_mid, y_mid, width, height], where the x values are divided by
    total_width and the y values by total_height.
    """
    left, top, right, bottom = bbox

    # Box extent along each axis.
    box_w = right - left
    box_h = bottom - top

    # Box center: the near edge plus half the extent.
    center_x = left + box_w / 2
    center_y = top + box_h / 2

    # Scale every value by the matching frame dimension.
    return [
        center_x / total_width,
        center_y / total_height,
        box_w / total_width,
        box_h / total_height,
    ]

In [46]:
%%time
# write .jpgs and .txts for the sampled person frames of train_videos[vid_num]
# to '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/Custom/train/[images, labels]'

vid_num = 0
vid_path = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/videos_original/'
this_vid_path = vid_path + train_videos[vid_num]
file_prefix = train_videos[vid_num].split('.')[0]

labels_loc = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/Custom/train/labels/'
images_loc = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/Custom/train/images/'
# cv2.imwrite reports failure through its return value instead of raising, so
# make sure both destination folders exist before writing anything.
os.makedirs(labels_loc, exist_ok=True)
os.makedirs(images_loc, exist_ok=True)

# begin video capture
cap = cv2.VideoCapture(this_vid_path)
if not cap.isOpened():
    raise IOError('Could not open video: ' + this_vid_path)

try:
    total_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    total_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)

    # persons_frames can list a frame more than once (one entry per sampled
    # row); each frame only needs to be written once.
    for frame_num in sorted(set(persons_frames)):
        # Read and save the image first: a label file is only written when its
        # image exists, so images/ and labels/ never get out of step.
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)  # set frame position
        success, frame = cap.read()  # read frame
        if not success:
            print("Failed to read the frame", frame_num)
            continue

        filepath = images_loc + file_prefix + '_' + str(frame_num) + '.jpg'
        if not cv2.imwrite(filepath, frame):
            print("Failed to write the image", filepath)
            continue

        # get persons in frame
        persons_in_frame_df = df_gt[(df_gt['frame_gt'] == frame_num) & (df_gt['label_as_int_gt'] == 0)]

        # one "label x_mid y_mid width height" line per person, normalized
        label_lines = []
        for person in persons_in_frame_df.itertuples(index=False):
            x_mid, y_mid, width, height = convert_bbox_format(
                [float(person.xmin_gt), float(person.ymin_gt),
                 float(person.xmax_gt), float(person.ymax_gt)],
                total_width, total_height)
            label_lines.append(
                str(person.label_as_int_gt) + ' ' + str(x_mid) + ' ' + str(y_mid)
                + ' ' + str(width) + ' ' + str(height))

        # write the .txt file (lines separated by '\n', no trailing newline)
        txt_path = labels_loc + file_prefix + '_' + str(frame_num) + '.txt'
        with open(txt_path, 'w') as f:
            f.write('\n'.join(label_lines))
finally:
    # end capture, even if a frame fails part-way through
    cap.release()

CPU times: user 1min 38s, sys: 2.88 s, total: 1min 41s
Wall time: 20.9 s


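.txt file format: one line per detection, written as
"label x_mid y_mid width height", where the four box values are normalized by the frame width and height.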

In [None]:
# NOW WRITE GROUND TRUTH TO .TXT FILE

for i in persons_frames:

    # get persons in the current frame: filter on the loop variable `i`, not on
    # persons_frames[0], so each iteration selects its own frame
    persons_in_frame_df = df_gt[(df_gt['frame_gt'] == i) & (df_gt['label_as_int_gt'] == 0)]
    
    

In [30]:
# Person rows (class id 0) belonging to the first sampled frame.
first_frame_mask = df_gt['frame_gt'] == persons_frames[0]
person_mask = df_gt['label_as_int_gt'] == 0
label = df_gt[first_frame_mask & person_mask]
label

Unnamed: 0,idx_gt,track_id_gt,label_gt,label_as_int_gt,conf_gt,frame_gt,xmin_gt,ymin_gt,xmax_gt,ymax_gt
6303,6303,4,Person,0,1.0,4739,497,759,562,919
22964,22964,19,Person,0,1.0,4739,357,716,422,863
39947,39947,20,Person,0,1.0,4739,366,681,444,835


In [38]:
# Frame number of the third row of `label`; the hard-coded position 2 requires
# the selection to contain at least three rows.
label.iloc[2]['frame_gt']

4739

In [36]:
# Number of person rows selected for that frame.
len(label)

3

In [2]:
# List the files now present in the train/images folder. The listing is not
# sorted and includes non-image entries such as '.DS_Store'.
os.listdir('/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/Custom/train/images')

['VIRAT_S_000001_15904.jpg',
 'VIRAT_S_000001_14367.jpg',
 'VIRAT_S_000001_9218.jpg',
 'VIRAT_S_000001_12306.jpg',
 'VIRAT_S_000001_9185.jpg',
 'VIRAT_S_000001_6314.jpg',
 'VIRAT_S_000001_16215.jpg',
 'VIRAT_S_000001_10260.jpg',
 'VIRAT_S_000001_8515.jpg',
 'VIRAT_S_000001_8500.jpg',
 'VIRAT_S_000001_5756.jpg',
 'VIRAT_S_000001_17056.jpg',
 'VIRAT_S_000001_14749.jpg',
 'VIRAT_S_000001_9597.jpg',
 '.DS_Store',
 'VIRAT_S_000001_6272.jpg',
 'VIRAT_S_000001_9633.jpg',
 'VIRAT_S_000001_10099.jpg',
 'VIRAT_S_000001_18758.jpg',
 'VIRAT_S_000001_10112.jpg',
 'VIRAT_S_000001_4739.jpg',
 'VIRAT_S_000001_11742.jpg',
 'VIRAT_S_000001_15646.jpg',
 'VIRAT_S_000001_10058.jpg',
 'VIRAT_S_000001_11353.jpg',
 'VIRAT_S_000001_11917.jpg',
 'VIRAT_S_000001_12562.jpg',
 'VIRAT_S_000001_8414.jpg',
 'VIRAT_S_000001_11296.jpg',
 'VIRAT_S_000001_12789.jpg',
 'VIRAT_S_000001_7533.jpg',
 'VIRAT_S_000001_14111.jpg',
 'VIRAT_S_000001_9669.jpg',
 'VIRAT_S_000001_7863.jpg',
 'VIRAT_S_000001_13051.jpg',
 'VIRAT_S_0000