@@ -0,0 +1,204 @@
"""
Convert Okutama Action dataset to VOC similar dataset
Store the processed data into HDF5 file
The original dataset is for pedestrian tracking and people action understanding
We remap the original labels by only keeping object detection labels
"""
# Original Labels: Each line contains 10+ columns, separated by spaces. The definition of these columns are:
# Track ID. All rows with the same ID belong to the same person for 180 frames. Then the person gets a new ID for the next 180 frames. We will soon release an update to make the IDs consistent.
# xmin. The top left x-coordinate of the bounding box.
# ymin. The top left y-coordinate of the bounding box.
# xmax. The bottom right x-coordinate of the bounding box.
# ymax. The bottom right y-coordinate of the bounding box.
# frame. The frame that this annotation represents.
# lost. If 1, the annotation is outside of the view screen.
# occluded. If 1, the annotation is occluded.
# generated. If 1, the annotation was automatically interpolated.
# label. The label for this annotation, enclosed in quotation marks.
# (+) actions. Each column after this is an action.

# There are two label files for each video;
# one for single-action detection and one for multi-action detection.
# Note that labels for single-action detection has been created from the multi-action detection labels
# (for more details please refer to our publication).
# For pedestrian detection task, the columns describing the actions should be ignored.

# Object detection Labels:
# labels. Always be 0 ('person') for this dataset
# xmin. The top left x-coordinate of the bounding box.
# ymin. The top left y-coordinate of the bounding box.
# xmax. The bottom right x-coordinate of the bounding box.
# ymax. The bottom right y-coordinate of the bounding box.

import numpy as np
import os
import glob
import cv2
import argparse
import fnmatch
import h5py
import random
import copy
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Class list stored in the output HDF5; index order defines the class ids.
classes = ["person", "bus", "car", "train"]

parser = argparse.ArgumentParser(
    description='Merge multiple HDF5 datasets into a single one.')

parser.add_argument(
    '-o',
    '--data_output',
    help='path to output HDF5',
    default='~/data/')

parser.add_argument(
    '-i',
    '--input_hdf5s',
    help='path to input hdf5 files',
    nargs=argparse.ONE_OR_MORE)

# Bug fix: the flag previously used `default=False` with no action, so it
# required a value and ANY non-empty string (including "false") enabled it.
# store_true makes `-d` a proper boolean switch.
parser.add_argument(
    '-d',
    '--draw',
    help='draw bounding boxes on each image and output to /tmp',
    action='store_true')

def draw_bboxes(image, bboxes):
    """Decode a stored JPEG byte buffer and draw its bounding boxes.

    Parameters
    ----------
    image : array of uint8
        JPEG-encoded image bytes as stored in the HDF5 dataset.
    bboxes : array or None
        Rows of (class, xmin, ymin, xmax, ymax); None draws nothing.

    Returns
    -------
    The decoded BGR image with green rectangles drawn on it.
    """
    decoded_image = copy.deepcopy(image)
    decoded_image = cv2.imdecode(decoded_image, 1)
    if bboxes is None:
        return decoded_image
    # Drop the class column; only the corner coordinates are drawn.
    corners = bboxes[:, 1:]
    # Bug fix: np.int was removed in NumPy 1.24; the builtin int is equivalent.
    corners = np.array(corners, dtype=int)
    for corner in corners:
        cv2.rectangle(decoded_image, (corner[0], corner[1]),
                      (corner[2], corner[3]), (0, 255, 0), 5)
    return decoded_image

def draw_on_images(dataset_images, dataset_boxes, out_dir='/tmp/combined/'):
    """Render every sample with its boxes and write the JPEGs to out_dir."""
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    sample_count = dataset_images.shape[0]
    for idx in range(sample_count):
        annots = np.array(dataset_boxes[idx]).reshape(-1, 5)
        rendered = draw_bboxes(dataset_images[idx], annots)
        cv2.imwrite(os.path.join(out_dir, str(idx) + '.jpg'), rendered)
    return

def _main(args):
    """Merge the train/valid splits of several HDF5 files into one file.

    args comes from the module-level parser: `input_hdf5s` lists the
    source files, `data_output` is the output directory, and `draw`
    optionally renders the first samples with boxes for a visual check.
    """
    draw_enabled = args.draw
    output_path = os.path.expanduser(args.data_output)

    input_hdf5 = []
    num_datasets = len(args.input_hdf5s)
    if num_datasets < 2:
        # Merging needs at least two inputs; bail out otherwise.
        print('Number of hd5f files is :' + str(num_datasets))
        print('Nothing to combine')
        return

    num_samples_images_train = 0
    num_samples_bboxes_train = 0
    num_samples_images_valid = 0
    num_samples_bboxes_valid = 0

    # First pass: open every input and total the sample counts so the
    # output datasets can be created at their final fixed size.
    for f in args.input_hdf5s:
        in_file = h5py.File(f, 'r')
        input_hdf5.append(in_file)
        num_samples_images_train += in_file['train/images'].shape[0]
        num_samples_bboxes_train += in_file['train/boxes'].shape[0]
        num_samples_images_valid += in_file['valid/images'].shape[0]
        num_samples_bboxes_valid += in_file['valid/boxes'].shape[0]

    # images and boxes must have the same size
    assert(num_samples_images_train == num_samples_bboxes_train)
    assert(num_samples_images_valid == num_samples_bboxes_valid)

    num_samples_train = num_samples_images_train
    num_samples_valid = num_samples_images_valid

    # Create HDF5 dataset structure
    print('Creating output HDF5 dataset structure.')
    print('Total train: ' + str(num_samples_train))
    print('Total valid: ' + str(num_samples_valid))

    if not os.path.exists(output_path):
        print('Creating ' + output_path)
        os.mkdir(output_path)

    fname = os.path.join(output_path, 'combined.hdf5')
    if os.path.exists(fname):
        print('Removing old ' + fname)
        os.remove(fname)

    # Create HDF5 dataset structure
    print('Creating HDF5 dataset structure.')
    combined = h5py.File(fname, 'w')

    uint8_dt = h5py.special_dtype(
        vlen=np.dtype('uint8'))  # variable length uint8 (encoded images)
    int32_dt = h5py.special_dtype(
        vlen=np.dtype('int32'))  # variable length int32 (flattened boxes)

    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))  # variable length default int (unused here)

    train_group = combined.create_group('train')
    valid_group = combined.create_group('valid')

    # store class list for reference class ids as csv fixed-length numpy string
    combined.attrs['classes'] = np.string_(str.join(',', classes))

    # store images as variable length uint8 arrays
    dataset_train_images = train_group.create_dataset(
        'images', shape=(num_samples_train, ), dtype=uint8_dt)

    dataset_valid_images = valid_group.create_dataset(
        'images', shape=(num_samples_valid, ), dtype=uint8_dt)

    # store boxes as variable length int32 arrays
    dataset_train_boxes = train_group.create_dataset(
        'boxes', shape=(num_samples_train, ), dtype=int32_dt)

    dataset_valid_boxes = valid_group.create_dataset(
        'boxes', shape=(num_samples_valid, ), dtype=int32_dt)

    # combine the input hdf5 into the new dataset
    # combine train and valid data
    Xtrain = []
    ytrain = []
    Xvalid = []
    yvalid = []
    # Note: this might use a large chunk of memory as all the data are
    # loaded into memory first and then we randomly shuffle it
    for hdf5 in input_hdf5:
        for i in range(hdf5['train/images'].shape[0]):
            Xtrain.append(hdf5['train/images'][i])
            ytrain.append(hdf5['train/boxes'][i])
        for i in range(hdf5['valid/images'].shape[0]):
            Xvalid.append(hdf5['valid/images'][i])
            yvalid.append(hdf5['valid/boxes'][i])

    # Shuffle so samples from different source datasets are interleaved.
    Xtrain, ytrain = shuffle(Xtrain, ytrain)
    Xvalid, yvalid = shuffle(Xvalid, yvalid)

    for i in range(num_samples_train):
        dataset_train_images[i] = Xtrain[i]
        dataset_train_boxes[i] = ytrain[i]
    for i in range(num_samples_valid):
        dataset_valid_images[i] = Xvalid[i]
        dataset_valid_boxes[i] = yvalid[i]

    if draw_enabled:
        # Spot-check the first samples by rendering them to /tmp.
        num_check = 1000
        draw_on_images(dataset_train_images[0:num_check], dataset_train_boxes[0:num_check], '/tmp/train')
        draw_on_images(dataset_valid_images[0:num_check], dataset_valid_boxes[0:num_check], '/tmp/valid')

    combined.close()
    for hdf5 in input_hdf5:
        hdf5.close()
    print('Done combining')

if __name__ == '__main__':
    _main(parser.parse_args())

Large diffs are not rendered by default.

@@ -0,0 +1,372 @@
"""
Convert UAV123 dataset to VOC similar dataset
Store the processed data into HDF5 file
The original dataset is for UAV object tracking
"""

import numpy as np
import os
import glob
import cv2
import argparse
import fnmatch
import h5py
import random
import copy
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Class list stored in the output HDF5; index order defines the class ids.
classes = ['person', 'car']

parser = argparse.ArgumentParser(
    description='Convert UAV123 dataset to HDF5.')

parser.add_argument(
    '-p',
    '--seq_path',
    help='path to UAV123 dataseq',
    default='~/data/UAV123/UAV123_10fps/data_seq/UAV123_10fps/')

parser.add_argument(
    '-a',
    '--anno_path',
    help='path to UAV123 annotation',
    default='~/data/UAV123/UAV123_10fps/anno/UAV123_10fps/')

parser.add_argument(
    '-f',
    '--hdf5_path',
    help='path to output UAV123 hdf5',
    default='~/data/UAV123/UAV123_10fps/')

# Bug fixes: the help text was a copy-paste error from another script, and
# `default=False` with no action meant the flag required a value and ANY
# non-empty string enabled it.  store_true makes it a proper boolean switch.
parser.add_argument(
    '-d',
    '--verify_enabled',
    help='verify an existing HDF5 by drawing its boxes to /tmp instead of converting',
    action='store_true')


def find_car_person_folders(seq_path):
    """Find sequence folders holding car, person or group videos.

    Returns the sorted car/person folder names followed by the sorted
    group folder names.
    """
    assert(os.path.exists(seq_path))
    folders = os.listdir(seq_path)
    # Bug fix: the original tested `if len(folders):`, printing the warning
    # when folders WERE present; the warning belongs to the empty case.
    if not folders:
        print('No folders in ' + seq_path)
    car_person_folders = []
    group_person_folders = []
    for folder in folders:
        person_match = re.search(r'person\d+$', folder)
        car_match = re.search(r'car\d+$', folder)
        group_match = re.search(r'group\d+$', folder)
        if person_match is not None:
            car_person_folders.append(person_match.group(0))
            print('Adding folder ' + person_match.group(0))
        elif car_match is not None:
            car_person_folders.append(car_match.group(0))
            print('Adding folder ' + car_match.group(0))
        elif group_match is not None:
            group_person_folders.append(group_match.group(0))
            print('Adding folder ' + group_match.group(0))
    car_person_folders = sorted(car_person_folders)
    group_person_folders = sorted(group_person_folders)
    return car_person_folders + group_person_folders

def find_car_person_anns(ann_path):
    """Find annotation files for cars, persons and groups.

    Returns the sorted car/person annotation file names followed by the
    sorted group annotation file names.
    """
    # Bug fix: the '.' before 'txt' was unescaped, so e.g. 'car1_txt'
    # would also have matched.
    car_re = r'car\d+(_\d+)?\.txt'
    person_re = r'person\d+(_\d+)?\.txt'
    group_re = r'group\d+(_\d+)?\.txt'
    car_person_ann = []
    group_ann = []
    for ann in os.listdir(ann_path):
        car_match = re.match(car_re, ann)
        person_match = re.match(person_re, ann)
        group_match = re.match(group_re, ann)
        if car_match:
            car_person_ann.append(car_match.group(0))
            print('Adding Car anno ' + car_match.group(0))
        elif person_match:
            car_person_ann.append(person_match.group(0))
            print('Adding Person anno ' + person_match.group(0))
        elif group_match:
            group_ann.append(group_match.group(0))
            # Bug fix: this branch previously printed 'Person anno'.
            print('Adding Group anno ' + group_match.group(0))
    car_person_ann = sorted(car_person_ann)
    group_ann = sorted(group_ann)
    return car_person_ann + group_ann

def match_dataseq_anno(seq_path, ann_path):
    """Pair every video folder with its annotation file(s) and parse them.

    Some videos are split across several label files (name_1, name_2, ...);
    their contents are concatenated in sorted order.
    Note: There is a new line missing in car7 label file and it has been
    manually fixed.

    Returns
    -------
    (object_images, annotations, car_person_folders): per-video sorted
    image paths, parsed (class, xmin, ymin, xmax, ymax) arrays and the
    folder names.
    NOTE(review): group folders hit the 'unknown label' branch below and
    are skipped from object_images/annotations but still appear in the
    returned folder list, so the three lists can get out of step — confirm
    downstream indexing.
    """
    car_person_folders = find_car_person_folders(seq_path)
    car_person_anns = find_car_person_anns(ann_path)
    annotations = []
    object_images = []
    for folder in car_person_folders:
        imgs = os.listdir(os.path.join(seq_path, folder))
        imgs = sorted(imgs)
        imgs = [os.path.join(seq_path, folder, img) for img in imgs]
        print('Folder name: ' + folder)
        print('Total images: ' + str(len(imgs)))
        # A video has either one exact label file or several 'name_k' parts.
        ann_name1 = folder + '.txt'
        ann_name2 = folder + '_'
        anns = [ann for ann in car_person_anns if ann_name1 in ann or ann_name2 in ann]
        anns = sorted(anns)
        anns = [os.path.join(ann_path, ann) for ann in anns]
        ann_data = ''.join([open(f).read() for f in anns])
        ann_data = ann_data.split('\n')
        parsed_anns = []
        for ann in ann_data:
            ann = ann.split(',')
            try:
                ann = [int(a) for a in ann]
            except ValueError:
                # Non-numeric lines (e.g. NaN placeholders) become zeros.
                ann = [0, 0, 0, 0]
            parsed_anns.append(ann)
        parsed_anns = np.array(parsed_anns)
        # convert the (xmin, ymin, w, h) to (xmin, ymin, xmax, ymax)
        parsed_anns[:, 2] = parsed_anns[:, 0] + parsed_anns[:, 2]
        parsed_anns[:, 3] = parsed_anns[:, 1] + parsed_anns[:, 3]
        # Bug fix: np.int was removed in NumPy 1.24; builtin int is equivalent.
        label = np.zeros((parsed_anns.shape[0], 1), dtype=int)
        if 'car' in folder.lower():
            label.fill(classes.index('car'))
        elif 'person' in folder.lower():
            label.fill(classes.index('person'))
        else:
            print('warning: unknown label')
            continue
        # final label is (class, xmin, ymin, xmax, ymax), same as the VOC parse script
        parsed_anns = np.concatenate((label, parsed_anns), axis=1)
        print('Total annotations: ' + str(len(parsed_anns)))
        if len(parsed_anns) != len(imgs):
            print('warning: image and anno have different size. Turncating')
            num = min(len(parsed_anns), len(imgs))
            parsed_anns = parsed_anns[:num, :]
            imgs = imgs[:num]
        annotations.append(parsed_anns)
        object_images.append(imgs)
    return object_images, annotations, car_person_folders


def select_object_detection_images(list_videos, list_annos, list_folders, clean_info = '~/data/UAV123/UAV123_10fps_clean'):
    """Keep only the frames known to have every visible object labelled.

    The original UAV123 dataset was built for object tracking, so usually
    only one object per frame is labelled.  For detection training, frames
    containing unlabelled objects must be removed, otherwise the model may
    take longer to converge.  The clean_info directory encodes the vetted
    subset: one sub-directory per kept video (named by its index), holding
    one '<frame_idx>.jpg' entry per kept frame.

    Parameters
    ----------
    list_videos : list
        Per-video lists of image paths.
    list_annos : list
        Per-video arrays of (class, xmin, ymin, xmax, ymax) rows.
    list_folders : list
        Per-video folder names, parallel to list_videos.
    clean_info : str
        Directory describing the manually vetted video/frame subset.

    Returns
    -------
    (out_images, out_annos, out_folders) restricted to the vetted frames.
    """
    clean_info = os.path.expanduser(clean_info)
    assert(os.path.exists(clean_info))
    assert(len(list_videos) == len(list_annos))
    assert(len(list_videos) > 0)
    out_images = []
    out_annos = []
    out_folders = []
    selected_video = os.listdir(clean_info)
    selected_video = [int(i) for i in selected_video]
    selected_video = sorted(selected_video)
    selected_imgs = dict.fromkeys(selected_video)
    for video_file in selected_video:
        img_idxs = os.listdir(os.path.join(clean_info, str(video_file)))
        img_idxs = sorted([int(item.split('.jpg')[0]) for item in img_idxs])
        selected_imgs[video_file] = img_idxs
    # now parse list_videos, list_annos based on above selected_imgs information
    # Bug fix: dict.iteritems() is Python 2 only and raises AttributeError
    # on Python 3; items() behaves the same for this use.
    for key, labled_images in selected_imgs.items():
        raw_video = list_videos[key]  # this in fact is a list of images
        raw_anns = list_annos[key]
        video = []
        anns = []
        for i in labled_images:
            # Drop boxes that are too small to be useful.
            # NOTE(review): after the earlier xywh->xyxy conversion these
            # indices are (ymin, xmax), so this is not literally w*h — confirm.
            if raw_anns[i][2] * raw_anns[i][3] < 16:
                continue
            video.append(raw_video[i])
            anns.append(raw_anns[i])
        out_images.append(video)
        out_annos.append(np.array(anns))
        out_folders.append(list_folders[key])
    return out_images, out_annos, out_folders

def balance_video_annos(videos, annos, max_allowed_sample=100):
    """Cap the number of sampled frames per video to balance the dataset.

    The number of images per video has large variance, which can make a
    detector overfit on the longest clips.  At most max_allowed_sample
    frames per clip are randomly selected into the "balanced" outputs;
    leftover frames from long clips go into the "unbalanced" outputs
    (usable for validation).

    Parameters
    ----------
    videos : list
        List of lists of image paths, one inner list per video clip.
    annos : list
        List of per-clip annotation arrays, parallel to `videos`.
    max_allowed_sample : int
        Maximum number of frames sampled from each clip.  The cap is
        first clamped down to the shortest clip length, then back up to
        at least 100 (NOTE(review): the max(..., 100) can push the cap
        above the shortest clip again — confirm this is intended).

    Returns
    -------
    balance_images : list
        Selected image paths across all clips.
    balance_labels : np.ndarray
        Annotations parallel to balance_images.
    unbalance_images : list
        Frames left over from clips longer than the cap; clips at or
        under the cap contribute nothing here.
    unbalance_labels : np.ndarray
        Annotations parallel to unbalance_images.
    """
    assert(len(videos) == len(annos))
    balance_images = []
    balance_labels = []
    unbalance_images = []
    unbalance_labels = []
    min_samples = min([len(video) for video in videos])
    max_allowed_sample = min(min_samples, max_allowed_sample)
    max_allowed_sample = max(max_allowed_sample, 100)
    for i in range(len(videos)):
        images = np.array(videos[i])
        labels = np.array(annos[i])
        # TODO: for better performance, sample images with fixed interval
        if len(images) > max_allowed_sample:
            # Random subset (without replacement) goes to the balanced split;
            # every other frame of the clip goes to the unbalanced split.
            rnd_idxs = sorted(np.random.choice(len(images), max_allowed_sample, replace=False))
            balance_images.extend(images[rnd_idxs].tolist())
            balance_labels.extend(labels[rnd_idxs].tolist())
            unbalance_images.extend([images[j] for j in range(len(images)) if not (j in rnd_idxs)])
            unbalance_labels.extend([labels[j] for j in range(len(labels)) if not (j in rnd_idxs)])
        else:
            balance_images.extend(images)
            balance_labels.extend(labels)
    return balance_images, np.array(balance_labels), unbalance_images, np.array(unbalance_labels)

def get_image_for_id(images, image_id):
    """Read the image file at index image_id and return its raw bytes.

    The bytes are kept JPEG-encoded and wrapped as a uint8 array for
    variable-length HDF5 storage.
    Use of encoding based on: https://github.com/h5py/h5py/issues/745
    """
    assert(image_id < len(images))
    fname = images[image_id]
    with open(fname, 'rb') as in_file:
        data = in_file.read()
    # Bug fix: np.fromstring is deprecated for binary input;
    # np.frombuffer is the supported equivalent.
    return np.frombuffer(data, dtype='uint8')

def add_to_dataset(dataset_images, dataset_boxes, images, bboxes, start=0):
    """Store image bytes and flattened bboxes into the HDF5 datasets.

    Grows both datasets by len(bboxes) rows, then writes each sample at
    offset start + i.  Boxes are flattened row-major to fit the
    variable-length integer dtype.

    NOTE(review): the resize adds len(bboxes) to the CURRENT length, so
    appending is only consistent when start equals the previous length
    (call sites here use start=0 on freshly created empty datasets).
    Also returns the last loop index rather than the written count, and
    would raise NameError on empty input — verify before reusing.
    """
    current_rows = len(bboxes)
    total_rows = current_rows + dataset_images.shape[0]
    dataset_images.resize(total_rows, axis=0)
    dataset_boxes.resize(total_rows, axis=0)
    for i in range(min(len(images), len(bboxes))):
        dataset_images[start + i] = get_image_for_id(images, i)
        dataset_boxes[start + i] = bboxes[i].flatten('C')
    return i

def draw_on_image_files(images, bboxes, name_hint='debug'):
    """Draw xywh boxes onto image files and save them under /tmp/<name_hint>.

    Unlike draw_bboxes, the rows here are unlabelled (xmin, ymin, w, h),
    so they are converted to corner form before drawing.
    """
    xmin, ymin = bboxes[:, 0], bboxes[:, 1]
    xmax, ymax = xmin + bboxes[:, 2], ymin + bboxes[:, 3]
    corners = np.concatenate((xmin.reshape(-1, 1), ymin.reshape(-1, 1),
                              xmax.reshape(-1, 1), ymax.reshape(-1, 1)), axis=1)
    # Bug fix: np.int was removed in NumPy 1.24; builtin int is equivalent.
    corners = np.array(corners, dtype=int)
    for i in range(min(len(images), len(bboxes))):
        img = cv2.imread(images[i])
        corner = corners[i]
        cv2.rectangle(img, (corner[0], corner[1]), (corner[2], corner[3]), (0, 255, 0), 10)
        out_dir = os.path.join('/tmp', name_hint)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        out_img_path = os.path.join(out_dir, str(i) + '.jpg')
        cv2.imwrite(out_img_path, img)

def draw_bboxes(image, bboxes):
    """Draw (class, xmin, ymin, xmax, ymax) boxes on a stored image.

    The input may be raw JPEG bytes (a long 1-D uint8 buffer) or an
    already-decoded array; buffers with a first dimension above 3180 are
    treated as encoded and decoded first.  NOTE(review): this size
    heuristic would misclassify small inputs — confirm the threshold.
    """
    decoded_image = copy.deepcopy(image)
    if image.shape[0] > 3180:
        decoded_image = cv2.imdecode(image, 1)
    if bboxes is None:
        return decoded_image
    # Drop the class column; only corner coordinates are drawn.
    corners = bboxes[:, 1:]
    # Bug fix: np.int was removed in NumPy 1.24; builtin int is equivalent.
    corners = np.array(corners, dtype=int)
    for corner in corners:
        cv2.rectangle(decoded_image, (corner[0], corner[1]),
                      (corner[2], corner[3]), (0, 255, 0), 10)
    return decoded_image

def draw_on_images(dataset_images, dataset_boxes, out_dir='/tmp/uav123/'):
    """Render every stored sample with its boxes into out_dir as JPEGs."""
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    sample_count = dataset_images.shape[0]
    for idx in range(sample_count):
        labelled = np.array(dataset_boxes[idx]).reshape(-1, 5)
        rendered = draw_bboxes(dataset_images[idx], labelled)
        cv2.imwrite(os.path.join(out_dir, str(idx) + '.jpg'), rendered)
    return

def _main(args):
    """Convert the UAV123 sequences + annotations into one HDF5 file.

    With --verify_enabled, instead re-opens an existing UAV123.hdf5 and
    renders its samples to /tmp for visual inspection.
    """
    seq_path = os.path.expanduser(args.seq_path)
    anno_path = os.path.expanduser(args.anno_path)
    hdf5_path = os.path.expanduser(args.hdf5_path)
    verify_enabled = args.verify_enabled
    assert(os.path.exists(seq_path))
    assert(os.path.exists(anno_path))
    if verify_enabled:
        # Verification mode: draw boxes from an existing HDF5, then exit.
        hdf5_path = os.path.join(hdf5_path, 'UAV123.hdf5')
        print("Verifying the HD5 data....")
        if not os.path.exists(hdf5_path):
            print(hdf5_path + " does not exits!")
            return
        uav123 = h5py.File(hdf5_path, 'r')
        print("Verifying the training data....")
        draw_on_images(uav123['train/images'], uav123['train/boxes'])
        print("Verifying the validation data....")
        draw_on_images(uav123['valid/images'], uav123['valid/boxes'])
        print("Verification is done")
        return
    # Pair every video folder with its (possibly multiple) label files.
    list_videos, list_annos, list_folders = match_dataseq_anno(seq_path, anno_path)
    print(len(list_videos), len(list_annos))
    # Keep only the hand-vetted frames where all objects are labelled.
    videos, annos, folders = select_object_detection_images(list_videos, list_annos, list_folders)
    print('Total number of images: '+ str(sum([len(i) for i in videos])))
    balance_images, balance_annos, unbalance_images, unbalance_annos = balance_video_annos(videos, annos)
    Xtrain, ytrain = shuffle(balance_images, balance_annos, random_state=0)
    _, Xvalid, _, yvalid = train_test_split(unbalance_images, unbalance_annos, test_size=0.15, random_state=42)
    # draw_on_images(Xtrain, ytrain, name_hint='train')
    # draw_on_images(Xvalid, yvalid, name_hint='valid')
    # We will use balance_images, balance_annos as train data
    # and select a portion from unbalance_images, unbalance_annos to use as validation data
    if not os.path.exists(hdf5_path):
        print('Creating ' + hdf5_path)
        os.mkdir(hdf5_path)
    # Create HDF5 dataset structure
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(hdf5_path, 'UAV123.hdf5')

    if os.path.exists(fname):
        print('Removing old HDF5')
        os.remove(fname)
    uav123_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(
        vlen=np.dtype('uint8'))  # variable length uint8 (encoded images)
    uint32_dt = h5py.special_dtype(
        vlen=np.dtype('uint32'))  # variable length uint32 (flattened boxes)
    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))  # variable length default int (unused here)
    train_group = uav123_h5file.create_group('train')
    valid_group = uav123_h5file.create_group('valid')
    # store class list for reference class ids as csv fixed-length numpy string
    uav123_h5file.attrs['classes'] = np.string_(str.join(',', classes))

    # store images as variable length uint8 arrays
    dataset_train_images = train_group.create_dataset(
        'images', shape=(0, ), maxshape=(None, ), dtype=uint8_dt)

    dataset_valid_images = valid_group.create_dataset(
        'images', shape=(0, ), maxshape=(None, ), dtype=uint8_dt)

    # store boxes as variable length uint32 arrays
    dataset_train_boxes = train_group.create_dataset(
        'boxes', shape=(0, ), maxshape=(None, ), dtype=uint32_dt)

    dataset_valid_boxes = valid_group.create_dataset(
        'boxes', shape=(0, ), maxshape=(None, ), dtype=uint32_dt)

    print('Adding ' + str(len(Xtrain)) + ' training data')
    add_to_dataset(dataset_train_images, dataset_train_boxes, Xtrain, ytrain, start=0)
    print('Adding ' + str(len(Xvalid)) + ' validation data')
    add_to_dataset(dataset_valid_images, dataset_valid_boxes, Xvalid, yvalid, start=0)
    print('Closing HDF5 file.')
    uav123_h5file.close()
    print('Done.')

if __name__ == '__main__':
    _main(parser.parse_args())
@@ -0,0 +1,219 @@
"""
Convert Pascal VOC 2007+2012 detection dataset to HDF5.
Does not preserve full XML annotations.
Combines all VOC subsets (train, val, test) with VOC2012 train for full
training set as done in Faster R-CNN paper.
Code based on:
https://github.com/pjreddie/darknet/blob/master/scripts/voc_label.py
"""

import argparse
import os
import xml.etree.ElementTree as ElementTree

import h5py
import numpy as np

# Dataset splits: VOC2007 train+val is folded into training alongside
# VOC2012 train; VOC2012 val is validation; VOC2007 test is test.
sets_from_2007 = [('2007', 'train'), ('2007', 'val')]
train_set = [('2012', 'train')]
val_set = [('2012', 'val')]
test_set = [('2007', 'test')]

# Full 20-class Pascal VOC label set (index order defines the class ids).
voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]
# We only care about below two classes (for now)
vehicles = ["bus", "car", "train"]  # all collapsed into the 'vehicle' class
aerial_classes = ["person", "vehicle"]

parser = argparse.ArgumentParser(
    description='Convert Pascal VOC 2007+2012 detection dataset to HDF5.')
parser.add_argument(
    '-p',
    '--path_to_voc',
    help='path to VOCdevkit directory',
    default='~/data/PascalVOC/VOCdevkit')


def get_boxes_for_id(voc_path, year, image_id):
    """Collect the bounding-box annotations for one image.

    Reads VOC{year}/Annotations/{image_id}.xml under voc_path and returns
    a flat int array with (class, xmin, ymin, xmax, ymax) per kept object.
    Objects marked difficult, and labels other than 'person' or the
    vehicle classes, are skipped; every vehicle label is collapsed to
    'vehicle'.

    Parameters
    ----------
    voc_path : str
        Path to VOCdevkit directory.
    year : str
        Year of dataset containing image. Either '2007' or '2012'.
    image_id : str
        Pascal VOC identifier for given image.

    Returns
    -------
    boxes : 1-D array of int
        Concatenated 5-tuples; empty array when nothing qualifies.
    """
    anno_file = os.path.join(
        voc_path, 'VOC{}/Annotations/{}.xml'.format(year, image_id))
    with open(anno_file) as handle:
        root = ElementTree.parse(handle).getroot()
    collected = []
    for obj in root.iter('object'):
        difficult_flag = obj.find('difficult').text
        name = obj.find('name').text
        if int(difficult_flag) == 1:  # exclude difficult or unlisted classes
            continue
        if (name != 'person') and (name not in vehicles):
            continue
        # map 'car', 'bus' and 'train' to the single label 'vehicle'
        mapped = 'vehicle' if name in vehicles else name
        box_node = obj.find('bndbox')
        collected.extend((
            aerial_classes.index(mapped),
            int(box_node.find('xmin').text),
            int(box_node.find('ymin').text),
            int(box_node.find('xmax').text),
            int(box_node.find('ymax').text)))
    return np.array(collected)

def get_image_for_id(voc_path, year, image_id):
    """Get image data as uint8 array for given image.

    Parameters
    ----------
    voc_path : str
        Path to VOCdevkit directory.
    year : str
        Year of dataset containing image. Either '2007' or '2012'.
    image_id : str
        Pascal VOC identifier for given image.

    Returns
    -------
    image_data : array of uint8
        Compressed JPEG byte string represented as array of uint8.
    """
    fname = os.path.join(voc_path, 'VOC{}/JPEGImages/{}.jpg'.format(year,
                                                                    image_id))
    with open(fname, 'rb') as in_file:
        data = in_file.read()
    # Use of encoding based on: https://github.com/h5py/h5py/issues/745
    # Bug fix: np.fromstring is deprecated for binary input;
    # np.frombuffer is the supported equivalent.
    return np.frombuffer(data, dtype='uint8')


def get_ids(voc_path, datasets):
    """Collect image identifiers for the given dataset identifiers.

    Parameters
    ----------
    voc_path : str
        Path to VOCdevkit directory.
    datasets : list of str tuples
        Dataset identifiers as (year, image_set) pairs.

    Returns
    -------
    ids : list of str
        All image identifiers for the given datasets, in file order.
    """
    identifiers = []
    for year, image_set in datasets:
        list_file = os.path.join(
            voc_path, 'VOC{}/ImageSets/Main/{}.txt'.format(year, image_set))
        with open(list_file, 'r') as handle:
            identifiers.extend(line.strip() for line in handle)
    return identifiers


def add_to_dataset(voc_path, year, ids, images, boxes, start=0):
    """Load every id's image and boxes and write them into the datasets.

    Samples are written contiguously from offset `start`; images without
    any relevant objects are skipped.  Returns the number of samples
    actually written.
    """
    written = 0
    for voc_id in ids:
        image_data = get_image_for_id(voc_path, year, voc_id)
        image_boxes = get_boxes_for_id(voc_path, year, voc_id)
        # ignore images without interesting objects
        if image_boxes.shape[0] == 0:
            continue
        images[start + written] = image_data
        boxes[start + written] = image_boxes
        written += 1
    return written

def _main(args):
    """Build the person/vehicle HDF5 from the Pascal VOC 2007+2012 sets."""
    voc_path = os.path.expanduser(args.path_to_voc)
    train_ids = get_ids(voc_path, train_set)
    val_ids = get_ids(voc_path, val_set)
    test_ids = get_ids(voc_path, test_set)
    # VOC2007 train+val is folded into the training split.
    train_ids_2007 = get_ids(voc_path, sets_from_2007)
    total_train_ids = len(train_ids) + len(train_ids_2007)

    # Create HDF5 dataset structure
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_07_12_person_vehicle.hdf5')
    if os.path.exists(fname):
        print('Removing old ' + fname)
        os.remove(fname)

    voc_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(
        vlen=np.dtype('uint8'))  # variable length uint8
    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))  # variable length default int
    train_group = voc_h5file.create_group('train')
    val_group = voc_h5file.create_group('valid')
    test_group = voc_h5file.create_group('test')

    # store class list for reference class ids as csv fixed-length numpy string
    voc_h5file.attrs['classes'] = np.string_(str.join(',', aerial_classes))

    # store images as variable length uint8 arrays
    train_images = train_group.create_dataset(
        'images', shape=(total_train_ids, ), dtype=uint8_dt, chunks=True)
    val_images = val_group.create_dataset(
        'images', shape=(len(val_ids), ), dtype=uint8_dt, chunks=True)
    test_images = test_group.create_dataset(
        'images', shape=(len(test_ids), ), dtype=uint8_dt, chunks=True)

    # store boxes as class_id, xmin, ymin, xmax, ymax
    train_boxes = train_group.create_dataset(
        'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt, chunks=True)
    val_boxes = val_group.create_dataset(
        'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt, chunks=True)
    test_boxes = test_group.create_dataset(
        'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt, chunks=True)

    # process all ids and add to datasets
    print('Processing Pascal VOC 2007 datasets for training set.')
    last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images,
                               train_boxes)
    print('Processing Pascal VOC 2012 training set.')
    total = add_to_dataset(
        voc_path,
        '2012',
        train_ids,
        train_images,
        train_boxes,
        start=last_2007)
    # Shrink datasets to the samples actually written (ids without any
    # person/vehicle objects were skipped).
    # NOTE(review): add_to_dataset returns only the count written in THIS
    # call, so resizing to `total` (the 2012 count alone) appears to drop
    # the 2007 samples — should this be last_2007 + total?  Confirm.
    train_images.resize(total, axis=0)
    train_boxes.resize(total, axis=0)

    print('Processing Pascal VOC 2012 val set.')
    total = add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes)
    val_images.resize(total, axis=0)
    val_boxes.resize(total, axis=0)

    print('Processing Pascal VOC 2007 test set.')
    total = add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)
    test_images.resize(total, axis=0)
    test_boxes.resize(total, axis=0)

    print('Closing HDF5 file.')
    voc_h5file.close()
    print('Done.')

if __name__ == '__main__':
    _main(parser.parse_args())
@@ -0,0 +1,244 @@
"""Convert Pascal VOC 2007+2012 detection dataset to TFRecords.
Does not preserve full XML annotations.
Combines all VOC 2007 subsets (train, val) with VOC2012 for training.
Uses VOC2012 val for val and VOC2007 test for test.
Code based on:
https://github.com/pjreddie/darknet/blob/master/scripts/voc_label.py
https://github.com/tensorflow/models/blob/master/inception/inception/data/build_image_data.py
"""

import argparse
import os
import xml.etree.ElementTree as ElementTree
from datetime import datetime

import numpy as np
import tensorflow as tf

from voc_to_hdf5 import get_ids

# Splits: VOC2007 train+val merges with VOC2012 trainval for training;
# VOC2007 test is held out for testing.
sets_from_2007 = [('2007', 'train'), ('2007', 'val')]
train_set = [('2012', 'train'), ('2012', 'val')]
test_set = [('2007', 'test')]

# Full Pascal VOC label set; index order defines the integer class ids.
classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]

parser = argparse.ArgumentParser(
    description='Convert Pascal VOC 2007+2012 detection dataset to TFRecords.')
parser.add_argument(
    '-p',
    '--path_to_voc',
    help='path to Pascal VOC dataset',
    default='~/data/PascalVOC/VOCdevkit')

# Small graph for image decoding (TF1-style: one session and placeholder
# reused for every decode_jpeg call)
decoder_sess = tf.Session()
image_placeholder = tf.placeholder(dtype=tf.string)
decoded_jpeg = tf.image.decode_jpeg(image_placeholder, channels=3)


def process_image(image_path):
    """Decode image at given path and return (raw bytes, height, width).

    The decode only validates and measures the image; the original
    encoded bytes are what get stored in the TFRecord.
    """
    with open(image_path, 'rb') as f:
        image_data = f.read()
    image = decoder_sess.run(decoded_jpeg,
                             feed_dict={image_placeholder: image_data})
    assert len(image.shape) == 3
    assert image.shape[2] == 3
    height = image.shape[0]
    # Bug fix: width is dimension 1 of an HWC image; the original returned
    # shape[2] (the channel count, always 3) as the width.
    width = image.shape[1]
    return image_data, height, width


def process_anno(anno_path):
    """Parse a Pascal VOC XML annotation into normalized box dicts.

    Each kept object becomes a dict with its class index and corner
    coordinates scaled to [0, 1] by the image size; difficult objects
    and unlisted labels are skipped.
    """
    with open(anno_path) as f:
        root = ElementTree.parse(f).getroot()
    size_node = root.find('size')
    img_h = float(size_node.find('height').text)
    img_w = float(size_node.find('width').text)
    parsed = []
    for obj in root.iter('object'):
        difficult_flag = obj.find('difficult').text
        name = obj.find('name').text
        # exclude difficult or unlisted classes
        if name not in classes or int(difficult_flag) == 1:
            continue
        box_node = obj.find('bndbox')
        parsed.append({
            'class': classes.index(name),
            'y_min': float(box_node.find('ymin').text) / img_h,
            'x_min': float(box_node.find('xmin').text) / img_w,
            'y_max': float(box_node.find('ymax').text) / img_h,
            'x_max': float(box_node.find('xmax').text) / img_w
        })
    return parsed


def convert_to_example(image_data, boxes, filename, height, width):
    """Convert Pascal VOC ground truth to a TFExample protobuf.

    Parameters
    ----------
    image_data : bytes
        Encoded image bytes.
    boxes : list of dict
        Bounding box corners and class labels.
    filename : string
        Path to image file; only its basename is stored.
    height : int
        Image height.
    width : int
        Image width.

    Returns
    -------
    example : protobuf
        Tensorflow Example protobuf containing image and bounding boxes.
    """
    # Tiny helpers so each feature line stays readable.
    def _bytes(values):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))

    def _int64(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    def _floats(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    feature_map = {
        'filename': _bytes([tf.compat.as_bytes(os.path.basename(filename))]),
        'height': _int64([height]),
        'width': _int64([width]),
        'classes': _int64([b['class'] for b in boxes]),
        'y_mins': _floats([b['y_min'] for b in boxes]),
        'x_mins': _floats([b['x_min'] for b in boxes]),
        'y_maxes': _floats([b['y_max'] for b in boxes]),
        'x_maxes': _floats([b['x_max'] for b in boxes]),
        'encoded': _bytes([tf.compat.as_bytes(image_data)]),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))


def get_image_path(voc_path, year, image_id):
    """Get path to image for given year and image id."""
    relative = 'VOC{}/JPEGImages/{}.jpg'.format(year, image_id)
    return os.path.join(voc_path, relative)


def get_anno_path(voc_path, year, image_id):
    """Get path to image annotation for given year and image id."""
    relative = 'VOC{}/Annotations/{}.xml'.format(year, image_id)
    return os.path.join(voc_path, relative)


def process_dataset(name, image_paths, anno_paths, result_path, num_shards):
    """Process selected Pascal VOC dataset to generate TFRecords files.
    Parameters
    ----------
    name : string
        Name of resulting dataset 'train' or 'test'.
    image_paths : list
        List of paths to images to include in dataset.
    anno_paths : list
        List of paths to corresponding image annotations.
    result_path : string
        Path to put resulting TFRecord files.
    num_shards : int
        Number of shards to split TFRecord files into.
    """
    # NOTE(review): `tf` and `datetime` are not among the imports visible at
    # the top of this file -- confirm they are imported elsewhere.
    # Evenly partition the dataset indices into num_shards contiguous ranges.
    shard_ranges = np.linspace(0, len(image_paths), num_shards + 1).astype(int)
    counter = 0
    for shard in range(num_shards):
        # Shard file name, e.g. 'train-00003-of-00060'.
        output_filename = '{}-{:05d}-of-{:05d}'.format(name, shard, num_shards)
        output_file = os.path.join(result_path, output_filename)

        shard_counter = 0
        # Use the writer as a context manager so the file is flushed and
        # closed even when processing a record raises (the original leaked
        # the writer on error).
        with tf.python_io.TFRecordWriter(output_file) as writer:
            for i in range(shard_ranges[shard], shard_ranges[shard + 1]):
                image_file = image_paths[i]
                anno_file = anno_paths[i]

                # processes image + anno
                image_data, height, width = process_image(image_file)
                boxes = process_anno(anno_file)

                # convert to example
                example = convert_to_example(image_data, boxes, image_file,
                                             height, width)

                # write to writer
                writer.write(example.SerializeToString())

                shard_counter += 1
                counter += 1

                if not counter % 1000:
                    print('{} : Processed {:d} of {:d} images.'.format(
                        datetime.now(), counter, len(image_paths)))
        print('{} : Wrote {} images to {}'.format(
            datetime.now(), shard_counter, output_filename))

    print('{} : Wrote {} images to {} shards'.format(datetime.now(), counter,
                                                     num_shards))


def _main(args):
    """Locate files for train and test sets and then generate TFRecords.

    Training set: VOC2012 trainval + VOC2007 trainval. Test set: VOC2007 test.
    Output goes to <path_to_voc>/TFRecords/{train,test}.
    """
    voc_path = args.path_to_voc
    voc_path = os.path.expanduser(voc_path)
    result_path = os.path.join(voc_path, 'TFRecords')
    print('Saving results to {}'.format(result_path))

    train_path = os.path.join(result_path, 'train')
    test_path = os.path.join(result_path, 'test')

    # NOTE(review): `get_ids`, `train_set`, `test_set` and `sets_from_2007`
    # are not visible in this portion of the file -- presumably module-level
    # helpers/constants defined earlier; verify they exist.
    train_ids = get_ids(voc_path, train_set)  # 2012 trainval
    test_ids = get_ids(voc_path, test_set)  # 2007 test
    train_ids_2007 = get_ids(voc_path, sets_from_2007)  # 2007 trainval
    total_train_ids = len(train_ids) + len(train_ids_2007)
    print('{} train examples and {} test examples'.format(total_train_ids,
                                                          len(test_ids)))

    # Build parallel lists of image and annotation paths (2012 first, then
    # the extra 2007 training data).
    train_image_paths = [
        get_image_path(voc_path, '2012', i) for i in train_ids
    ]
    train_image_paths.extend(
        [get_image_path(voc_path, '2007', i) for i in train_ids_2007])
    test_image_paths = [get_image_path(voc_path, '2007', i) for i in test_ids]

    train_anno_paths = [get_anno_path(voc_path, '2012', i) for i in train_ids]
    train_anno_paths.extend(
        [get_anno_path(voc_path, '2007', i) for i in train_ids_2007])
    test_anno_paths = [get_anno_path(voc_path, '2007', i) for i in test_ids]

    process_dataset(
        'train',
        train_image_paths,
        train_anno_paths,
        train_path,
        num_shards=60)
    process_dataset(
        'test', test_image_paths, test_anno_paths, test_path, num_shards=20)


if __name__ == '__main__':
    # Bug fix: the original passed an undefined name `args` to parse_args();
    # with no argument, argparse reads sys.argv itself.
    _main(parser.parse_args())
@@ -0,0 +1,247 @@
import os

import tensorflow as tf
from keras.models import Model, load_model
from keras.layers import Reshape, Activation, Conv2D, Input
from keras.layers import MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda
from keras.layers.merge import concatenate
from keras.applications.mobilenet import MobileNet
from keras.layers.advanced_activations import LeakyReLU

from keras_darknet19 import Darknet19
from keras_mobilenet import depthwise_conv_block, relu6

def space_to_depth_x2(x):
    """Thin wrapper for Tensorflow space_to_depth with block_size=2."""
    # Import inside the function so the Lambda layer stays serializable.
    # See: https://github.com/fchollet/keras/issues/5088#issuecomment-273851273
    import tensorflow as tf
    return tf.space_to_depth(x, block_size=2)

def space_to_depth_x2_output_shape(input_shape):
    """Determine space_to_depth output shape for block_size=2.

    Spatial dims are halved and channels multiplied by 4. When the height
    is unknown (None), the spatial dims of the result are left unknown.
    Note: For Lambda with TensorFlow backend, output shape may not be needed.
    """
    batch, height, width, channels = input_shape
    if not height:
        return (batch, None, None, 4 * channels)
    return (batch, height // 2, width // 2, 4 * channels)


def space_to_depth_x4(x):
    """Thin wrapper for Tensorflow space_to_depth with block_size=4."""
    # Import inside the function so the Lambda layer stays serializable.
    import tensorflow as tf
    return tf.space_to_depth(x, block_size=4)

def space_to_depth_x4_output_shape(input_shape):
    """Determine space_to_depth output shape for block_size=4.

    Spatial dims are divided by 4 and channels multiplied by 16. When the
    height is unknown (None), the spatial dims of the result are left unknown.
    """
    batch, height, width, channels = input_shape
    if not height:
        return (batch, None, None, 16 * channels)
    return (batch, height // 4, width // 4, 16 * channels)

class FeatureExtractor(object):
    """Abstract base class for YOLO feature extractors.

    Subclasses must build ``self.feature_model`` (a Keras ``Model``) in
    ``__init__`` and implement ``normalize`` for backbone-specific input
    preprocessing.
    """

    def __init__(self, input_tensor):
        # Must be overridden: build self.feature_model from input_tensor.
        # (The original raised with the placeholder text "error message".)
        raise NotImplementedError(
            "FeatureExtractor.__init__ must be implemented by a subclass")

    def normalize(self, image):
        # Must be overridden: scale raw pixel values for this backbone.
        raise NotImplementedError(
            "FeatureExtractor.normalize must be implemented by a subclass")

    def output_shape(self):
        """Spatial (height, width) of the final feature map."""
        return self.feature_model.get_output_shape_at(-1)[1:3]

    def get_feature_model(self):
        """Return the underlying Keras feature model."""
        return self.feature_model

class Darknet19Feature(FeatureExtractor):
    """Original YOLOv2 feature extractor built on Darknet19.
    Parameters
    ----------
    input_tensor : tensor
        Input tensor with shape (height, width, num_channel)
    weights :
        If 'COCO', copy weights from a COCO-pretrained model on disk.
    shallow_detection : bool
        Whether use a shallow net. In the original YoLov2 design, the size of the last
        feature is 1/32 of input due to 5 maxpooling. When shallow_detection is enabled,
        we only keep all the layers up to the first 4 maxpooling. This is to hopefully
        improve performance for small objects.
    three_scale_detection : bool
        Whether to use 3 scales of features for detection. The original YoLov2 uses the
        feature from the last layer and one earlier feature and concatenates them.
        We extend this idea by introducing an extra scale of feature.
    """
    def __init__(self, input_tensor, weights='COCO', shallow_detection=True, three_scale_detection=False):
        # Candidate fine-grained layer indices at [1/4, 1/8, 1/16] scale.
        fine_grained_layers = [17, 27, 43]
        if shallow_detection:
            fine_grained_layers = fine_grained_layers[0:2]
            num_final_layers = 512       # fixed typo: was `num_fina_layers`
            final_feature_layer = 43     # truncate backbone (total 44 layers)
        else:
            fine_grained_layers = fine_grained_layers[1:]
            num_final_layers = 1024
            final_feature_layer = -1     # keep full backbone (total 75 layers)

        feature_model = Darknet19(input_tensor, include_top=False)
        feature_model = Model(inputs=feature_model.input,
                              outputs=feature_model.layers[final_feature_layer].output)

        if weights == 'COCO':
            print("Loading trained COCO weights...")
            model_path = os.path.join('weights', 'yolo-coco-m.h5')
            trained_model = load_model(model_path)
            trained_layers = trained_model.layers
            feature_layers = feature_model.layers
            # Copy weights layer-by-layer. Use a distinct local name so the
            # `weights` argument is not shadowed (the original reused it).
            for i in range(0, min(len(feature_layers), len(trained_layers))):
                layer_weights = trained_layers[i].get_weights()
                feature_layers[i].set_weights(layer_weights)

        x0 = feature_model.layers[fine_grained_layers[0]].output
        x1 = feature_model.layers[fine_grained_layers[1]].output
        x2 = feature_model.output

        # Compress the fine-grained features with 1x1 convolutions before
        # reorganizing them down to the detection resolution.
        # (num_final_layers was redundantly re-assigned here; already set above.)
        if shallow_detection:
            x0 = Conv2D(8, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(32, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x1)
        else:
            x0 = Conv2D(16, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(64, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x1)

        # Layer 19
        x2 = Conv2D(num_final_layers, (3, 3), strides=(1, 1), padding='same', name='conv_19', use_bias=False)(x2)
        x2 = BatchNormalization(name='norm_19')(x2)
        x2 = LeakyReLU(alpha=0.1)(x2)

        # Layer 20
        x2 = Conv2D(num_final_layers, (3, 3), strides=(1, 1), padding='same', name='conv_20', use_bias=False)(x2)
        x2 = BatchNormalization(name='norm_20')(x2)
        x2 = LeakyReLU(alpha=0.1)(x2)

        # Earliest fine-grained feature, reorganized by 4x space-to-depth.
        x0 = BatchNormalization(name='norm_space_to_depth_x4')(x0)
        x0 = LeakyReLU(alpha=0.1)(x0)
        x0_reshaped = Lambda(
            space_to_depth_x4,
            output_shape=space_to_depth_x4_output_shape,
            name='space_to_depth_x4')(x0)

        # Middle fine-grained feature, reorganized by 2x space-to-depth.
        x1 = BatchNormalization(name='norm_space_to_depth_x2')(x1)
        x1 = LeakyReLU(alpha=0.1)(x1)
        x1_reshaped = Lambda(
            space_to_depth_x2,
            output_shape=space_to_depth_x2_output_shape,
            name='space_to_depth_x2')(x1)

        if three_scale_detection:
            x = concatenate([x0_reshaped, x1_reshaped, x2])
        else:
            x = concatenate([x1_reshaped, x2])

        x = Conv2D(num_final_layers, (3, 3), strides=(1, 1), padding='same', name='conv_detection', use_bias=False)(x)
        x = BatchNormalization(name='norm_detection_feature')(x)
        x = LeakyReLU(alpha=0.1)(x)
        self.feature_model = Model(feature_model.inputs, x)

    def normalize(self, image):
        """Scale pixel values from [0, 255] to [0, 1]."""
        return image / 255.


class MobileNetFeature(FeatureExtractor):
    """MobileNet-based YOLO feature extractor.
    Parameters
    ----------
    input_tensor : tensor
        Input tensor with shape (height, width, num_channel)
    weights : string
        If 'imagenet', copy weights from an ImageNet-pretrained MobileNet.
    shallow_detection : bool
        Whether use a shallow net. In the original YoLov2 design, the size of the last
        feature is 1/32 of input due to 5 maxpooling. When shallow_detection is enabled,
        we only keep all the layers up to the first 4 maxpooling. This is to hopefully
        improve performance for small objects.
    three_scale_detection : bool
        Whether to use 3 scales of features for detection. The original YoLov2 uses the
        feature from the last layer and one earlier feature and concatenates them.
        We extend this idea by introducing an extra scale of feature.
    """

    def __init__(self, input_tensor, weights='imagenet', shallow_detection=False, three_scale_detection=False):
        # Candidate fine-grained layer indices at [1/4, 1/8, 1/16] scale.
        fine_grained_layers = [21, 33, 69]

        if shallow_detection:
            fine_grained_layers = fine_grained_layers[0:2]
            final_feature_layer = 69
        else:
            fine_grained_layers = fine_grained_layers[1:]
            final_feature_layer = -1

        # Build the backbone without weights; pretrained weights (trained at
        # 224x224) are copied layer-by-layer below so an arbitrary input
        # size can be used.
        feature_model = MobileNet(input_tensor=input_tensor, include_top=False, weights=None)
        feature_model = Model(inputs=feature_model.input,
                              outputs=feature_model.layers[final_feature_layer].output)

        if weights == 'imagenet':
            print('Loading pretrained weights from ImageNet...')
            trained_model = MobileNet(input_shape=(224, 224, 3), include_top=False, weights=weights)
            trained_layers = trained_model.layers
            feature_layers = feature_model.layers
            # Use a distinct local name so the `weights` argument is not
            # shadowed (the original reused it as the loop variable).
            for i in range(0, min(len(feature_layers), len(trained_layers))):
                layer_weights = trained_layers[i].get_weights()
                feature_layers[i].set_weights(layer_weights)

        x0 = feature_model.layers[fine_grained_layers[0]].output
        x1 = feature_model.layers[fine_grained_layers[1]].output
        x2 = feature_model.output

        # Compress fine-grained features with 1x1 convolutions before
        # reorganizing them down to the detection resolution.
        if shallow_detection:
            x0 = Conv2D(8, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(32, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x1)
            num_final_layers = 512
        else:
            x0 = Conv2D(16, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(64, (1, 1), strides=(1, 1), padding='same', use_bias=False)(x1)
            num_final_layers = 1024

        x2 = depthwise_conv_block(x2, num_final_layers, 1.0, block_id=14)
        x2 = depthwise_conv_block(x2, num_final_layers, 1.0, block_id=15)

        x1 = BatchNormalization()(x1)
        x1 = Lambda(relu6)(x1)
        x1_reshaped = Lambda(
            space_to_depth_x2,
            output_shape=space_to_depth_x2_output_shape,
            name='space_to_depth_x2')(x1)

        x0 = BatchNormalization()(x0)
        x0 = Lambda(relu6)(x0)
        x0_reshaped = Lambda(
            space_to_depth_x4,
            output_shape=space_to_depth_x4_output_shape,
            name='space_to_depth_x4')(x0)

        if three_scale_detection:
            x = concatenate([x0_reshaped, x1_reshaped, x2])
        else:
            x = concatenate([x1_reshaped, x2])
        x = depthwise_conv_block(x, num_final_layers, 1.0, block_id=16)
        self.feature_model = Model(feature_model.inputs, x)

    def normalize(self, image):
        """Scale pixel values from [0, 255] to [-1, 1] (MobileNet preprocessing)."""
        image = image / 255.
        image = image - 0.5
        image = image * 2.
        return image




@@ -0,0 +1,79 @@
"""
DarKNet19 Keras Implementation:
YOLO9000: Better, Faster, Stronger
https://arxiv.org/pdf/1612.08242
"""
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPool2D
from keras.layers import BatchNormalization, Activation
from keras.layers import GlobalAvgPool2D
from keras.layers.advanced_activations import LeakyReLU

def Darknet19(image_tensor=None, num_classes=1000, include_top=False):
    """
    DarkNet-19 Architecture Definition
    Parameters
    ----------
    image_tensor: tensor
        Input tensor. Default: None
    num_classes: int
        Number of classes for classification tasks. Default: 1000
    include_top: bool
        Whether to include the classification head (only needed for
        classification tasks). Default: False
    """
    if image_tensor is None:
        image_tensor = Input(shape=(None, None, 3))

    # (filters, kernel) sequences for each stage; every stage below is
    # followed by a 2x2 max-pool (5 pools total -> 1/32 resolution).
    pooled_stages = [
        [(32, 3)],
        [(64, 3)],
        [(128, 3), (64, 1), (128, 3)],
        [(256, 3), (128, 1), (256, 3)],
        [(512, 3), (256, 1), (512, 3), (256, 1), (512, 3)],
    ]

    x = image_tensor
    for stage in pooled_stages:
        for filters, k in stage:
            x = conv_block(x, filters, (k, k))
        x = MaxPool2D(strides=2)(x)

    # Final stage -- feature extraction ends here (no pooling afterwards).
    for filters, k in [(1024, 3), (512, 1), (1024, 3), (512, 1), (1024, 3)]:
        x = conv_block(x, filters, (k, k))

    if include_top:
        x = Conv2D(num_classes, (1, 1), activation='linear', padding='same')(x)
        x = GlobalAvgPool2D()(x)
        x = Activation(activation='softmax')(x)

    return Model(image_tensor, x)


def conv_block(x, filters, kernel_size, name=None):
    """
    Standard YOLOv2 convolutional block (Conv2D -> BatchNorm -> LeakyReLU)
    as suggested in the YOLO9000 paper.
    :param x: input tensor
    :param filters: number of convolution filters
    :param kernel_size: convolution kernel size, e.g. (3, 3)
    :param name: optional base name. When given, the conv layer is named
        `name` and the batch-norm / activation layers get prefixed names;
        when None, all three layers are left unnamed (Keras auto-names them).
    :return: output tensor of the block
    """
    x = Conv2D(filters=filters, kernel_size=kernel_size, padding='same',
               use_bias=False, name=name)(x)
    x = BatchNormalization(name=name if name is None else 'batch_norm_%s' % name)(x)
    x = LeakyReLU(alpha=0.1, name=name if name is None else 'leaky_relu_%s' % name)(x)
    return x
@@ -0,0 +1,198 @@
"""
MobileNet Implementation in Keras
Author: https://github.com/fchollet/keras/blob/master/keras/applications/mobilenet.py
"""
import keras.backend as K
from keras.layers import Input, InputSpec
from keras.layers import Conv2D
from keras.layers import BatchNormalization
from keras.layers import Activation
from keras.layers import GlobalAvgPool2D, Reshape, Dropout
from keras.models import Model
from keras import initializers, regularizers, constraints
from keras.utils import conv_utils

def keras_mobile_net(input_size=(224, 224, 3), include_top=True, n_classes=1000, alpha=1.0, depth_multiplier=1):
    """Build a MobileNet model (optionally with its classification head).

    :param input_size: input shape tuple, or None for fully-convolutional use
    :param include_top: include the global-pool + softmax classifier head
    :param n_classes: number of output classes for the head
    :param alpha: width multiplier applied to every block's filter count
    :param depth_multiplier: depthwise channel multiplier
    :return: a Keras Model
    """
    if input_size is None:
        img_input = Input(shape=(None, None, 3))
    else:
        img_input = Input(shape=input_size)

    x = _conv_block(img_input, 32, alpha, strides=(2, 2))

    # (pointwise filters, block id, strides) for the 13 depthwise blocks.
    block_specs = [
        (64, 1, (1, 1)),
        (128, 2, (2, 2)), (128, 3, (1, 1)),
        (256, 4, (2, 2)), (256, 5, (1, 1)),
        (512, 6, (2, 2)), (512, 7, (1, 1)), (512, 8, (1, 1)),
        (512, 9, (1, 1)), (512, 10, (1, 1)), (512, 11, (1, 1)),
        (1024, 12, (2, 2)), (1024, 13, (1, 1)),
    ]
    for filters, block_id, strides in block_specs:
        x = depthwise_conv_block(x, filters, alpha, depth_multiplier,
                                 block_id=block_id, strides=strides)

    if include_top:
        pooled_shape = (1, 1, int(1024 * alpha))
        x = GlobalAvgPool2D()(x)
        x = Reshape(pooled_shape, name='reshape_1')(x)
        x = Dropout(0.0, name='dropout')(x)

        x = Conv2D(n_classes, (1, 1), padding='same', name='conv_preds')(x)
        x = Activation('softmax', name='act_softmax')(x)
        x = Reshape((n_classes,), name='reshape_2')(x)

    return Model(inputs=img_input, outputs=x)


def relu6(x):
    """ReLU capped at 6 (MobileNet activation): min(max(x, 0), 6)."""
    return K.relu(x, max_value=6)

def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1), name='conv1'):
    """Standard conv -> batch-norm -> ReLU6 block, width-scaled by `alpha`."""
    scaled_filters = int(filters * alpha)
    conv = Conv2D(scaled_filters, kernel, padding='same', use_bias=False,
                  strides=strides, name=name)(inputs)
    normed = BatchNormalization(name='%s_bn' % name)(conv)
    return Activation(relu6, name='%s_relu' % name)(normed)


def depthwise_conv_block(inputs, pointwise_conv_filters, alpha, depth_multiplier=1, strides=(1, 1), block_id=1):
    """
    Depthwise separable block: 3x3 depthwise conv + BN + ReLU6, then a
    1x1 pointwise conv + BN + ReLU6. `alpha` scales the pointwise width.
    """
    scaled_filters = int(pointwise_conv_filters * alpha)

    # Depthwise stage: one filter per input channel.
    dw = DepthwiseConv2D((3, 3), padding='same',
                         depth_multiplier=depth_multiplier, strides=strides,
                         use_bias=False, name='conv_dw_%d' % block_id)(inputs)
    dw = BatchNormalization(name='conv_dw_%d_bn' % block_id)(dw)
    dw = Activation(relu6, name='conv_dw_%d_relu' % block_id)(dw)

    # Pointwise stage: 1x1 conv mixes channels.
    pw = Conv2D(scaled_filters, (1, 1), padding='same', use_bias=False,
                strides=(1, 1), name='conv_pw_%d' % block_id)(dw)
    pw = BatchNormalization(name='conv_pw_%d_bn' % block_id)(pw)
    return Activation(relu6, name='conv_pw_%d_relu' % block_id)(pw)


class DepthwiseConv2D(Conv2D):
    """
    Depthwise 2D convolution: applies `depth_multiplier` filters to each
    input channel independently (no pointwise channel mixing).
    Reference: https://github.com/fchollet/keras/blob/master/keras/applications/mobilenet.py
    """
    def __init__(self, kernel_size, strides=(1, 1), padding='valid', depth_multiplier=1,
                 data_format=None, activation=None, use_bias=True,
                 depthwise_initializer='glorot_uniform', bias_initializer='zeros',
                 depthwise_regularizer=None, bias_regularizer=None, activity_regularizer=None, depthwise_constraint=None,
                 bias_constraint=None, **kwargs):
        # filters=None: the output channel count is derived in build() as
        # input_dim * depth_multiplier.
        super(DepthwiseConv2D, self).__init__(
            filters=None,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
            data_format=data_format,
            activation=activation,
            use_bias=use_bias,
            bias_regularizer=bias_regularizer,
            activity_regularizer=activity_regularizer,
            bias_constraint=bias_constraint,
            **kwargs)
        self.depth_multiplier = depth_multiplier
        self.depthwise_initializer = initializers.get(depthwise_initializer)
        self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
        self.depthwise_constraint = constraints.get(depthwise_constraint)
        self.bias_initializer = initializers.get(bias_initializer)

    def build(self, input_shape):
        """Create the depthwise kernel (and optional bias) for the known input shape."""
        if len(input_shape) < 4:
            raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
                             'Received input shape:', str(input_shape))
        if self.data_format == 'channels_first':
            channel_axis = 1
        else:
            channel_axis = 3
        if input_shape[channel_axis] is None:
            raise ValueError('The channel dimension of the inputs to '
                             '`DepthwiseConv2D` '
                             'should be defined. Found `None`.')
        input_dim = int(input_shape[channel_axis])
        # One (kh, kw) kernel per input channel, `depth_multiplier` deep.
        depthwise_kernel_shape = (self.kernel_size[0],
                                  self.kernel_size[1],
                                  input_dim,
                                  self.depth_multiplier)

        self.depthwise_kernel = self.add_weight(
            shape=depthwise_kernel_shape,
            initializer=self.depthwise_initializer,
            name='depthwise_kernel',
            regularizer=self.depthwise_regularizer,
            constraint=self.depthwise_constraint)

        if self.use_bias:
            self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        # Set input spec.
        self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
        self.built = True

    def call(self, inputs, training=None):
        outputs = K.depthwise_conv2d(
            inputs,
            self.depthwise_kernel,
            strides=self.strides,
            padding=self.padding,
            dilation_rate=self.dilation_rate,
            data_format=self.data_format)

        # Bug fix: the original tested `if self.bias:`, which evaluates the
        # truth value of a backend variable (invalid for tensors) instead of
        # checking whether a bias exists. Test the layer flag as upstream
        # Keras does.
        if self.use_bias:
            outputs = K.bias_add(
                outputs,
                self.bias,
                data_format=self.data_format)

        if self.activation is not None:
            return self.activation(outputs)

        return outputs

    def compute_output_shape(self, input_shape):
        """Spatial dims follow standard conv arithmetic; channels become
        input_channels * depth_multiplier."""
        if self.data_format == 'channels_first':
            rows = input_shape[2]
            cols = input_shape[3]
            out_filters = input_shape[1] * self.depth_multiplier
        elif self.data_format == 'channels_last':
            rows = input_shape[1]
            cols = input_shape[2]
            out_filters = input_shape[3] * self.depth_multiplier

        rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
                                             self.padding,
                                             self.strides[0])
        cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
                                             self.padding,
                                             self.strides[1])

        if self.data_format == 'channels_first':
            return (input_shape[0], out_filters, rows, cols)
        elif self.data_format == 'channels_last':
            return (input_shape[0], rows, cols, out_filters)

    def get_config(self):
        """Serialize config: drop Conv2D's pointwise-kernel fields and add
        the depthwise equivalents."""
        config = super(DepthwiseConv2D, self).get_config()
        config.pop('filters')
        config.pop('kernel_initializer')
        config.pop('kernel_regularizer')
        config.pop('kernel_constraint')
        config['depth_multiplier'] = self.depth_multiplier
        config['depthwise_initializer'] = initializers.serialize(self.depthwise_initializer)
        config['depthwise_regularizer'] = regularizers.serialize(self.depthwise_regularizer)
        config['depthwise_constraint'] = constraints.serialize(self.depthwise_constraint)
        return config

Large diffs are not rendered by default.

@@ -0,0 +1,143 @@
"""
YOLOv2 Loss Function Implementation
Input: out feature map from network
Output:
A scalar - loss value for back propagation
------------------
Loss of YOLOv2 Implementation. Few Notes
* What we get from the CNN is a feature map (imagine as a 3-D box)
* Each cell in a feature map is a vector size CFG.N_ANCHORS* (5 + N_CLASSES) as:
------------- ANCHOR 1--------------- -------- ANCHORS 2 ------------- ...... ------------ANCHOR N -----------
[tx, ty, tw, th, to , label_vector..],[tx1, ty1, tw1, th1, label_vector]......[tx_n, ty_n, tw_n, th_m, label...]
----------------------------------------------------------------------------------------------------------------
One cell in a feature map
* tx, ty : predicts of relative center of bounding box to its current cell. Therefore, true center points of
a prediction would be :
xc = sigmoid(tx) + cx
yc = sigmoid(ty) + cy
* tw, th: predicts the scaling value for true width and height of the bounding box based on the anchor as:
w = exp(tw) * px
h = exp(th) * py
* to : objectiveness of the cell : the probability of having an object in the cell
* label: classification vector to calculate soft-max
"""
import re
import numpy as np
import tensorflow as tf
import keras.backend as K
from yolo_uav import get_anchors
import cfg as CFG
import pdb
import math

def custom_loss(y_true, y_pred):
    """
    Loss Function of YOLOv2 (sum-squared error over location, objectness
    and class probabilities).
    :param y_true: a Tensor [batch_size, GRID_H, GRID_W, CFG.N_ANCHORS*(N_CLASSES + 5)]
    :param y_pred: a Tensor [batch_size, GRID_H, GRID_H, N_ANCHOR*(N_CLASSES + 5)]
    :return: a scalar
        loss value
    """
    # Config Anchors
    anchors = get_anchors(CFG.ANCHORS_PATH)
    if CFG.SHALLOW_DETECTOR:
        # Shallow detector output grid is twice as fine, so anchor sizes
        # (measured in grid cells) are doubled.
        anchors = anchors * 2
    # pdb.set_trace()
    pred_shape = K.shape(y_pred)[1:3]
    gt_shape = K.shape(y_true)  # shape of ground truth value
    GRID_H = tf.cast(pred_shape[0], tf.int32)  # shape of output feature map
    GRID_W = tf.cast(pred_shape[1], tf.int32)

    output_size = tf.cast(tf.reshape([GRID_W, GRID_H], [1, 1, 1, 1, 2]), tf.float32)
    # Split the flat channel dimension into (anchor, 5 + classes).
    y_pred = tf.reshape(y_pred, [-1, pred_shape[0], pred_shape[1], CFG.N_ANCHORS, CFG.N_CLASSES + 5])
    y_true = tf.reshape(y_true, [-1, gt_shape[1], gt_shape[2], CFG.N_ANCHORS, CFG.N_CLASSES + 5])

    # Grid Map to calculate offset
    c_xy = _create_offset_map(K.shape(y_pred))

    # Scale anchors to correct aspect ratio
    pred_box_xy = (tf.sigmoid(y_pred[:, :, :, :, :2]) + c_xy) / output_size
    pred_box_wh = tf.exp(y_pred[:, :, :, :, 2:4]) * np.reshape(anchors, [1, 1, 1, CFG.N_ANCHORS, 2]) / output_size
    # Width/height are compared in sqrt space so large boxes don't dominate.
    pred_box_wh = tf.sqrt(pred_box_wh)
    pred_box_conf = tf.sigmoid(y_pred[:, :, :, :, 4:5])
    pred_box_prob = tf.nn.softmax(y_pred[:, :, :, :, 5:])

    # Adjust ground truth
    true_box_xy = y_true[:, :, :, :, 0:2]
    true_box_wh = tf.sqrt(y_true[:, :, :, :, 2:4])

    # adjust confidence: recover actual widths/heights (undo the sqrt) to
    # compute box corners and areas for the IoU below.
    pred_tem_wh = tf.pow(pred_box_wh, 2) * output_size
    pred_box_ul = pred_box_xy - 0.5 * pred_tem_wh
    pred_box_bd = pred_box_xy + 0.5 * pred_tem_wh
    pred_box_area = pred_tem_wh[:, :, :, :, 0] * pred_tem_wh[:, :, :, :, 1]

    true_tem_wh = tf.pow(true_box_wh, 2) * output_size
    true_box_ul = true_box_xy - 0.5 * true_tem_wh
    true_box_bd = true_box_xy + 0.5 * true_tem_wh
    true_box_area = true_tem_wh[:, :, :, :, 0] * true_tem_wh[:, :, :, :, 1]

    # IoU between each predicted box and the ground truth in its cell.
    intersect_ul = tf.maximum(pred_box_ul, true_box_ul)
    intersect_br = tf.minimum(pred_box_bd, true_box_bd)
    intersect_wh = tf.maximum(intersect_br - intersect_ul, 0.0)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]

    iou = tf.truediv(intersect_area, true_box_area + pred_box_area - intersect_area)
    # Only the best-matching anchor in each cell is responsible for the object.
    best_box = tf.equal(iou, tf.reduce_max(iou, [3], True))
    best_box = tf.to_float(best_box)
    true_box_conf = tf.expand_dims(best_box * y_true[:, :, :, :, 4], -1)
    true_box_prob = y_true[:, :, :, :, 5:]

    # Localization Loss (weight 5.0 on responsible anchors)
    weight_coor = 5.0 * tf.concat(4 * [true_box_conf], 4)
    true_boxes = tf.concat([true_box_xy, true_box_wh], 4)
    pred_boxes = tf.concat([pred_box_xy, pred_box_wh], 4)
    loc_loss = tf.pow(true_boxes - pred_boxes, 2) * weight_coor
    loc_loss = tf.reshape(loc_loss, [-1, tf.cast(GRID_W * GRID_H, tf.int32) * CFG.N_ANCHORS * 4])
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, 1))

    # # NOTE: YOLOv2 does not use cross-entropy loss.
    # Object Confidence Loss (no-object anchors weighted 0.5, objects 5.0)
    weight_conf = 0.5 * (1. - true_box_conf) + 5.0 * true_box_conf
    obj_conf_loss = tf.pow(true_box_conf - pred_box_conf, 2) * weight_conf
    obj_conf_loss = tf.reshape(obj_conf_loss, [-1, tf.cast(GRID_W * GRID_H, tf.int32) * CFG.N_ANCHORS])
    obj_conf_loss = tf.reduce_mean(tf.reduce_sum(obj_conf_loss, 1))

    # Category Loss
    weight_prob = 1.0 * tf.concat(CFG.N_CLASSES * [true_box_conf], 4)
    category_loss = tf.pow(true_box_prob - pred_box_prob, 2) * weight_prob
    category_loss = tf.reshape(category_loss, [-1, tf.cast(GRID_W * GRID_H, tf.int32) * CFG.N_ANCHORS * CFG.N_CLASSES])
    category_loss = tf.reduce_mean(tf.reduce_sum(category_loss, 1))

    loss = 0.5 * (loc_loss + obj_conf_loss + category_loss)
    # loss = tf.Print(loss, [loc_loss, obj_conf_loss, category_loss], message='Loc, obj, conf ')
    return loss


def _create_offset_map(output_shape):
    """
    Build the per-cell (cx, cy) offset grid from the YOLO9000 paper, used
    to convert cell-relative predictions into feature-map coordinates.
    """
    grid_h = tf.cast(output_shape[1], tf.int32)  # shape of output feature map
    grid_w = tf.cast(output_shape[2], tf.int32)

    # Column indices 0..W-1, repeated for every row.
    col_offsets = tf.cast((K.arange(0, stop=grid_w)), dtype=tf.float32)
    col_offsets = K.tile(col_offsets, [grid_h])
    col_offsets = K.reshape(col_offsets, [-1, grid_h, grid_w, 1])

    # Row indices 0..H-1, repeated across every column.
    row_offsets = K.cast((K.arange(0, stop=grid_h)), dtype=tf.float32)
    row_offsets = K.reshape(row_offsets, [-1, 1])
    row_offsets = K.tile(row_offsets, [1, grid_w])
    row_offsets = K.reshape(row_offsets, [-1])
    row_offsets = K.reshape(row_offsets, [-1, grid_h, grid_w, 1])

    offsets = tf.stack([col_offsets, row_offsets], -1)
    return K.cast(offsets, tf.float32)
@@ -0,0 +1,80 @@
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
@@ -0,0 +1,20 @@
aeroplane
bicycle
bird
boat
bottle
bus
car
cat
chair
cow
diningtable
dog
horse
motorbike
person
pottedplant
sheep
sofa
train
tvmonitor
@@ -0,0 +1,5 @@
14.001599, 14.765134
1.691922, 3.070213
6.153262, 13.402029
3.111511, 7.955340
8.375247, 5.988119
@@ -0,0 +1,2 @@
person
vehicle
@@ -0,0 +1 @@
0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
@@ -0,0 +1,122 @@
"""
Overfit one image with 1000 epochs to test the loss function properly
"""
import random
import h5py
import os
import PIL
import io
import cv2

from argparse import ArgumentParser
from loss import custom_loss
from yolo_uav import *
import numpy as np
import cfg as CFG

import keras
import tensorflow as tf
from datagen import DataBatchGenerator
from keras.models import load_model
import keras.backend as K
import matplotlib.pyplot as plt

# Command-line interface for the retraining script.
parser = ArgumentParser(
    description="Retrain the Yolo-UAV for a dataset")

# HDF5 dataset produced by the conversion scripts (contains train/valid splits).
parser.add_argument('-p',
                    '--data_path',
                    help='path to HDF5 file containing dataset',
                    default='~/data/PascalVOC/VOCdevkit/pascal_voc_07_12_person_vehicle.hdf5')


parser.add_argument('-w',
                    '--weights_path',
                    help="Path to pre-trained weight files",
                    type=str, default=None)

parser.add_argument('-e',
                    '--num_epochs',
                    help='Number of epochs for training',
                    type=int, default=100)

parser.add_argument('-b',
                    '--batch_size',
                    help='Number of batch size',
                    type=int, default=CFG.BATCH_SIZE)


def _main_():
    """Train the YOLO-UAV detector on the HDF5 dataset given on the CLI."""
    args = parser.parse_args()
    data_path = args.data_path
    weights_path = args.weights_path
    batch_size = args.batch_size
    num_epochs = args.num_epochs

    # ###################
    # PREPARE DATA INPUT
    # ###################

    anchors = get_anchors(CFG.ANCHORS_PATH)
    classes = get_classes(CFG.CLASSES_PATH)
    data_path = os.path.expanduser(data_path)

    if CFG.SHALLOW_DETECTOR:
        # Shallow detector grid is twice as fine, so anchor sizes
        # (in grid-cell units) are doubled.
        anchors = anchors * 2
    assert(CFG.N_ANCHORS == len(anchors))
    assert(CFG.N_CLASSES == len(classes))
    assert(os.path.exists(data_path))
    hdf5_data = h5py.File(data_path, 'r')
    num_training = hdf5_data['train/images'].shape[0]

    print("==========================")
    print('\t anchors:', anchors)
    print('\t classes:', classes)
    print('\t train_path:', data_path)
    print('\t num_training:', num_training)
    print("==========================")

    yolo_detector = YOLODetector(feature_extractor_name=CFG.FEATURE_EXTRACTOR)
    detect_model = yolo_detector.model
    detect_model.summary()
    # #################
    # COMPILE AND RUN
    # #################
    detect_model.compile(optimizer='adam', loss=custom_loss)

    train_batch_gen = DataBatchGenerator(hdf5_data, train='train', jitter=True)
    valid_batch_gen = DataBatchGenerator(hdf5_data, train='valid')

    # NOTE(review): TensorBoard / EarlyStopping / ModelCheckpoint are not
    # imported in this file -- presumably provided by `from yolo_uav import *`;
    # verify.
    logging = TensorBoard()
    # NOTE(review): early_stopping is created but never added to the
    # callbacks list passed to fit_generator below -- confirm intentional.
    early_stopping = EarlyStopping(
        monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')
    train_steps_per_epoch = train_batch_gen.training_instances // batch_size
    valid_steps_per_epoch = valid_batch_gen.training_instances // batch_size
    print('train_steps_per_epoch=', train_steps_per_epoch)
    print('valid_steps_per_epoch=', valid_steps_per_epoch)

    # Train in short loops so weights are saved and recall/precision is
    # evaluated every `num_loop_epochs` epochs.
    num_loop_epochs = 5
    loop = num_epochs // num_loop_epochs
    for i in range(loop):
        weight_name = 'weights/' + 'best_{}{}{}_loop_{}.h5'.format(
            CFG.FEATURE_EXTRACTOR, int(CFG.SHALLOW_DETECTOR), int(CFG.USE_THREE_SCALE_FEATURE), i)

        checkpoint = ModelCheckpoint(
            weight_name, monitor='val_loss', save_weights_only=True, save_best_only=True)
        detect_model.fit_generator(generator=train_batch_gen.flow_from_hdf5(),
                                   validation_data=valid_batch_gen.flow_from_hdf5(),
                                   steps_per_epoch=train_steps_per_epoch,
                                   validation_steps=valid_steps_per_epoch,
                                   callbacks=[checkpoint, logging],
                                   epochs=num_loop_epochs,
                                   workers=1,
                                   verbose=1)
        weight_name = 'weights/' + '{}{}{}_loop_{}.h5'.format(
            CFG.FEATURE_EXTRACTOR, CFG.SHALLOW_DETECTOR, CFG.USE_THREE_SCALE_FEATURE, i)
        detect_model.save_weights(weight_name)
        compute_recall_precision(
            hdf5_data, yolo_detector, weight_name, train='valid', num_samples=1024)


if __name__ == "__main__":
_main_()
@@ -0,0 +1,154 @@
"""
Overfit one image with 1000 epochs to test the loss function properly
"""
import random
import h5py
import os
import PIL
import io
import cv2

from argparse import ArgumentParser
from loss import custom_loss
from yolo_uav import *
import numpy as np
import cfg as CFG

import keras
import tensorflow as tf
from keras.models import load_model
import keras.backend as K
import matplotlib.pyplot as plt

# Command-line interface for the overfit sanity-check script.
# NOTE: parsing happens at import time, so importing this module as a library
# would also consume sys.argv — acceptable for a standalone debugging script.
parser = ArgumentParser(description="Over-fit one sample to validate YOLOv2 Loss Function")

# Path to a training text file (unused by _main_ below — TODO confirm whether
# it is still needed).
parser.add_argument('-p', '--path', help="Path to training text file ",
type=str, default=None)

# Optional pre-trained weights to start from (None = train from scratch).
parser.add_argument('-w', '--weights', help="Path to pre-trained weight files",
type=str, default=None)

# Number of training epochs for the overfit run.
parser.add_argument('-e', '--epochs', help='Number of epochs for training',
type=int, default=1000)

# Mini-batch size.
parser.add_argument('-b', '--batch', help='Number of batch size',
type=int, default=1)

# HDF5 dataset produced by the VOC/Okutama conversion scripts.
parser.add_argument(
'-d',
'--data_path',
help='path to HDF5 file containing pascal voc dataset',
default='~/data/PascalVOC/VOCdevkit/pascal_voc_07_12_person_vehicle.hdf5')

# Module-level configuration derived from the CLI (read by _main_ below).
args = parser.parse_args()
annotation_path = args.path
WEIGHTS_FILE = args.weights
BATCH_SIZE = args.batch
EPOCHS = args.epochs

def _main_():
    """Overfit a small batch of training images to sanity-check the YOLO loss.

    Reads 128 random samples from the HDF5 dataset given by ``--data_path``,
    builds (image, feature-map) training pairs, trains the detector on them,
    then decodes the predictions, writes visualizations to /tmp/output/, and
    reports recall/precision against the ground-truth boxes.
    """
    # ###################
    # PREPARE DATA INPUT
    # ###################
    anchors = get_anchors(CFG.ANCHORS_PATH)
    classes = get_classes(CFG.CLASSES_PATH)

    if CFG.SHALLOW_DETECTOR:
        # Presumably the shallow detector halves the stride, so anchor sizes
        # double in feature-map units — TODO confirm against yolo_uav.
        anchors = anchors * 2
    print(anchors)
    test_size = 128
    voc_path = os.path.expanduser(args.data_path)

    x_batch = np.zeros((test_size, CFG.IMAGE_HEIGHT, CFG.IMAGE_WIDTH, 3))
    y_batch = np.zeros((test_size, CFG.FEAT_H, CFG.FEAT_W, CFG.N_ANCHORS, 5 + CFG.N_CLASSES))
    b_batch = []  # flat list of ground-truth BoundBox objects across the whole batch

    # Use a context manager so the HDF5 file is closed once the batch is built
    # (the original left it open for the life of the process).
    with h5py.File(voc_path, 'r') as voc:
        total_test_instances = voc['train/images'].shape[0]
        test_list = np.random.choice(total_test_instances, test_size, replace=False)

        cur_id = 0
        for test_id in sorted(test_list):
            # Original boxes stored as 1D list of class, x_min, y_min, x_max, y_max.
            image = PIL.Image.open(io.BytesIO(voc['train/images'][test_id]))
            orig_size = np.array([image.width, image.height])
            orig_size = np.expand_dims(orig_size, axis=0)

            image = image.resize((CFG.IMAGE_WIDTH, CFG.IMAGE_HEIGHT), PIL.Image.BICUBIC)
            # np.float was removed in NumPy 1.24; the builtin float is the
            # identical 64-bit type it aliased.
            image_data = np.array(image, dtype=float)
            image_data /= 255.
            x_batch[cur_id] = image_data

            boxes = voc['train/boxes'][test_id]
            boxes = boxes.reshape((-1, 5))

            # Convert to x_center, y_center, box_width, box_height (normalized
            # to the ORIGINAL image size), followed by the class id.
            boxes_xy = 0.5 * (boxes[:, 3:5] + boxes[:, 1:3])
            boxes_wh = boxes[:, 3:5] - boxes[:, 1:3]
            boxes_xy = boxes_xy / orig_size
            boxes_wh = boxes_wh / orig_size
            boxes = np.concatenate((boxes_xy, boxes_wh, boxes[:, 0:1]), axis=1)

            for box in boxes:
                label = int(box[-1])
                one_hot = np.eye(CFG.N_CLASSES)[label]
                xc, yc, w, h = box[0:4]
                b_batch.append(BoundBox(xc, yc, w, h, c=1.0, classes=one_hot))

                # Ground-truth vector for one grid cell:
                # [x, y, w, h, objectness, class one-hot...]
                object_mask = np.concatenate([[xc, yc, w, h], [1.0], one_hot])

                center_x = xc * CFG.FEAT_W
                center_y = yc * CFG.FEAT_H
                r = int(np.floor(center_x))  # grid column (x axis)
                c = int(np.floor(center_y))  # grid row (y axis)
                fw = w * CFG.FEAT_W
                fh = h * CFG.FEAT_H

                # Find the anchor that best predicts this box: compare IoU of
                # the (origin-centered) box against each anchor shape.
                best_anchor = -1
                max_iou = -1
                shifted_box = BoundBox(0, 0, fw, fh)
                for i in range(len(anchors)):
                    anchor_bb = BoundBox(0, 0, anchors[i][0], anchors[i][1])
                    iou = bbox_iou(shifted_box, anchor_bb)
                    if max_iou < iou:
                        best_anchor = i
                        max_iou = iou
                print(c, r, best_anchor, max_iou, object_mask)
                if r < CFG.FEAT_W and c < CFG.FEAT_H:
                    # Construct feature-map ground truth (row index = c, col = r).
                    y_batch[cur_id, c, r, best_anchor, :] = object_mask

            cur_id += 1

    # Flatten the per-anchor dimension into the channel axis expected by the model.
    y_batch = y_batch.reshape([test_size, CFG.FEAT_H, CFG.FEAT_W, CFG.N_ANCHORS * (5 + CFG.N_CLASSES)])

    yolo_detector = YOLODetector(feature_extractor_name=CFG.FEATURE_EXTRACTOR)
    # #################
    # COMPILE AND RUN
    # #################
    yolo_detector.model.compile(optimizer='adam', loss=custom_loss)

    # The original hard-coded 200 steps and silently ignored the --weights and
    # --epochs CLI arguments; wire them up (defaults: no weights, 1000 epochs,
    # matching the module docstring).
    if WEIGHTS_FILE is not None:
        yolo_detector.model.load_weights(WEIGHTS_FILE)
    num_steps = EPOCHS

    yolo_detector.model.fit(x_batch, y_batch, batch_size=CFG.BATCH_SIZE, epochs=num_steps)
    yolo_detector.model.save_weights('overfit.weights')

    netout = yolo_detector.model.predict(x_batch, batch_size=CFG.BATCH_SIZE)
    netouts = netout.reshape(-1, CFG.FEAT_H, CFG.FEAT_W, CFG.N_ANCHORS, (5 + CFG.N_CLASSES))
    boxes_pred = []
    # cv2.imwrite fails silently when the target directory does not exist.
    os.makedirs('/tmp/output', exist_ok=True)
    for i in range(len(netouts)):
        image_data = x_batch[i]
        boxes = yolo_detector.decode_netout(netouts[i])
        boxes_pred += boxes
        img = draw_boxes(image_data, boxes, classes)
        img = np.array(img * 255, dtype=np.uint8)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        cv2.imwrite('/tmp/output/img_' + str(i) + '.jpg', img)
    get_recall_precision(boxes_pred, b_batch)
# Entry point: run the overfit sanity check only when executed as a script.
if __name__ == "__main__":
    _main_()

# "Large diffs are not rendered by default." -- GitHub rendering artifact from extraction, not source code