# YOLOv8 Training on Custom Dataset

## Create the Dataset

In [43]:
import yaml
import numpy as np
import pandas as pd
import os
import cv2

In [1]:
# Scratch cell: prints a fixed string; it defines nothing used by the cells below.
print('open the pod bay doors, hal')

open the pod bay doors, hal


### Get File Names of geom.yml and types.yml annotation files from train and validate folders
* `diva_train_path`
    - `train_files`
* `diva_validate_path`
    - `valid_files`

In [19]:
# get diva annotation files from training folder
# NOTE: absolute, machine-specific path -- point it at your local copy of the dataset.
diva_train_path = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/diva_annotations/train'
train_dir = os.listdir(diva_train_path)

# Keep names whose 5 characters before the 4-character extension are ".geom"
# or "types", i.e. the *.geom.yml / *.types.yml annotation files.
keep_these = {".geom", "types"}
train_geom_types = sorted([item for item in train_dir if item[-9:-4] in keep_these])

# Pair each geom file with the types file of the SAME clip by name.
# Pairing by position (even/odd slots of the sorted list) silently shifts every
# later pair as soon as one clip is missing either of its two files.
geom_suffix, types_suffix = '.geom.yml', '.types.yml'
train_available = set(train_geom_types)
train_files = [
    (geom_name, geom_name[:-len(geom_suffix)] + types_suffix)
    for geom_name in train_geom_types  # already sorted, so pair order is stable
    if geom_name.endswith(geom_suffix)
    and geom_name[:-len(geom_suffix)] + types_suffix in train_available
]

# Surface anything that could not be paired instead of dropping it silently.
train_unpaired = sorted(train_available - {name for pair in train_files for name in pair})
if train_unpaired:
    print('skipping annotation files without a geom/types partner:', train_unpaired)

train_files

[('VIRAT_S_000000.geom.yml', 'VIRAT_S_000000.types.yml'),
 ('VIRAT_S_000001.geom.yml', 'VIRAT_S_000001.types.yml'),
 ('VIRAT_S_000002.geom.yml', 'VIRAT_S_000002.types.yml'),
 ('VIRAT_S_000005.geom.yml', 'VIRAT_S_000005.types.yml'),
 ('VIRAT_S_000200_03_000657_000899.geom.yml',
  'VIRAT_S_000200_03_000657_000899.types.yml'),
 ('VIRAT_S_000200_05_001525_001575.geom.yml',
  'VIRAT_S_000200_05_001525_001575.types.yml'),
 ('VIRAT_S_000201_03_000640_000672.geom.yml',
  'VIRAT_S_000201_03_000640_000672.types.yml'),
 ('VIRAT_S_000201_05_001081_001215.geom.yml',
  'VIRAT_S_000201_05_001081_001215.types.yml'),
 ('VIRAT_S_000201_06_001354_001397.geom.yml',
  'VIRAT_S_000201_06_001354_001397.types.yml'),
 ('VIRAT_S_000201_07_001485_001581.geom.yml',
  'VIRAT_S_000201_07_001485_001581.types.yml'),
 ('VIRAT_S_000201_08_001652_001838.geom.yml',
  'VIRAT_S_000201_08_001652_001838.types.yml'),
 ('VIRAT_S_000202_02_001527_001560.geom.yml',
  'VIRAT_S_000202_02_001527_001560.types.yml'),
 ('VIRAT_S_00020

In [21]:
# Number of (geom, types) annotation pairs found in the training folder.
len(train_files)

64

In [20]:
# get diva annotation files from validation folder
# NOTE: absolute, machine-specific path -- point it at your local copy of the dataset.
diva_validate_path = '/Users/p/Documents/Code/VIRAT/VIRAT_Ground_Dataset/diva_annotations/validate'
valid_dir = os.listdir(diva_validate_path)

# Keep names whose 5 characters before the 4-character extension are ".geom"
# or "types", i.e. the *.geom.yml / *.types.yml annotation files.
keep_these = {".geom", "types"}
valid_geom_types = sorted([item for item in valid_dir if item[-9:-4] in keep_these])

# Pair each geom file with the types file of the SAME clip by name.
# Pairing by position (even/odd slots of the sorted list) silently shifts every
# later pair as soon as one clip is missing either of its two files.
geom_suffix, types_suffix = '.geom.yml', '.types.yml'
valid_available = set(valid_geom_types)
valid_files = [
    (geom_name, geom_name[:-len(geom_suffix)] + types_suffix)
    for geom_name in valid_geom_types  # already sorted, so pair order is stable
    if geom_name.endswith(geom_suffix)
    and geom_name[:-len(geom_suffix)] + types_suffix in valid_available
]

# Surface anything that could not be paired instead of dropping it silently.
valid_unpaired = sorted(valid_available - {name for pair in valid_files for name in pair})
if valid_unpaired:
    print('skipping annotation files without a geom/types partner:', valid_unpaired)

valid_files

[('VIRAT_S_000007.geom.yml', 'VIRAT_S_000007.types.yml'),
 ('VIRAT_S_000008.geom.yml', 'VIRAT_S_000008.types.yml'),
 ('VIRAT_S_000200_00_000100_000171.geom.yml',
  'VIRAT_S_000200_00_000100_000171.types.yml'),
 ('VIRAT_S_000200_02_000479_000635.geom.yml',
  'VIRAT_S_000200_02_000479_000635.types.yml'),
 ('VIRAT_S_000201_00_000018_000380.geom.yml',
  'VIRAT_S_000201_00_000018_000380.types.yml'),
 ('VIRAT_S_000201_01_000384_000589.geom.yml',
  'VIRAT_S_000201_01_000384_000589.types.yml'),
 ('VIRAT_S_000201_02_000590_000623.geom.yml',
  'VIRAT_S_000201_02_000590_000623.types.yml'),
 ('VIRAT_S_000201_04_000682_000822.geom.yml',
  'VIRAT_S_000201_04_000682_000822.types.yml'),
 ('VIRAT_S_000203_01_000171_000345.geom.yml',
  'VIRAT_S_000203_01_000171_000345.types.yml'),
 ('VIRAT_S_000203_08_001702_001734.geom.yml',
  'VIRAT_S_000203_08_001702_001734.types.yml'),
 ('VIRAT_S_000204_07_001577_001611.geom.yml',
  'VIRAT_S_000204_07_001577_001611.types.yml'),
 ('VIRAT_S_000204_09_001768_001849.geo

In [22]:
# Number of (geom, types) annotation pairs found in the validation folder.
len(valid_files)

55

### Take a sample of the Train and Validate files for data extraction
- 30 from Train
- 10 from Validate

In [26]:
import random

# Seed the global RNG so the same clips are drawn on every run.
random.seed(40)

# Draw without replacement (training first, then validation, so the RNG is
# consumed in a fixed order), then put each sample in file-name order.
train_sample = random.sample(train_files, 30)
train_sample.sort()

valid_sample = random.sample(valid_files, 10)
valid_sample.sort()

In [29]:
# Confirm the training sample holds the 30 pairs requested from random.sample.
len(train_sample)

30

In [30]:
# Confirm the validation sample holds the 10 pairs requested from random.sample.
len(valid_sample)

10

### Match Annotation Files to Video Files
* `train_sample` <--> `train_videos`
* `valid_sample` <--> `valid_videos`

In [34]:
# Derive the video file name for every sampled training clip: keep the geom
# annotation name up to its first '.', then append the '.mp4' extension.
train_videos = [pair[0].split('.')[0] + '.mp4' for pair in train_sample]
len(train_videos)

30

In [36]:
# Spot-check one entry: the geom annotation name and the derived video name
# at the same index are printed on consecutive lines for comparison.
print(train_sample[27][0], train_videos[27], sep='\n')

VIRAT_S_050000_01_000207_000361.geom.yml
VIRAT_S_050000_01_000207_000361.mp4


In [35]:
# Derive the video file name for every sampled validation clip: keep the geom
# annotation name up to its first '.', then append the '.mp4' extension.
valid_videos = [pair[0].split('.')[0] + '.mp4' for pair in valid_sample]
len(valid_videos)

10

In [37]:
# Spot-check one entry: the types annotation name and the derived video name
# at the same index are printed on consecutive lines for comparison.
print(valid_sample[3][1], valid_videos[3], sep='\n')

VIRAT_S_000206_08_001618_001712.types.yml
VIRAT_S_000206_08_001618_001712.mp4


### Grab the Track_ID of one person in the Training Set

In [None]:
%%time
# geom
geom_path = diva_train_path + '/' + train_sample[0][0]
with open(geom_path, 'r') as file:
    geom = yaml.safe_load(file)

# types
types_path = diva_train_path + '/' + train_sample[0][1]
with open(types_path, 'r') as file:
    types = yaml.safe_load(file)