| @@ -0,0 +1,204 @@ | ||
| """ | ||
| Convert Okutama Action dataset to VOC similar dataset | ||
| Store the processed data into HDF5 file | ||
| The original dataset is for object predestria tracking and people action understasnding | ||
| We remap the orignal labels by only keeping object detection labels | ||
| """ | ||
| # Original Labels: Each line contains 10+ columns, separated by spaces. The definition of these columns are: | ||
| # Track ID. All rows with the same ID belong to the same person for 180 frames. Then the person gets a new ID for the next 180 frames. We will soon release an update to make the IDs consistent. | ||
| # xmin. The top left x-coordinate of the bounding box. | ||
| # ymin. The top left y-coordinate of the bounding box. | ||
| # xmax. The bottom right x-coordinate of the bounding box. | ||
| # ymax. The bottom right y-coordinate of the bounding box. | ||
| # frame. The frame that this annotation represents. | ||
| # lost. If 1, the annotation is outside of the view screen. | ||
| # occluded. If 1, the annotation is occluded. | ||
| # generated. If 1, the annotation was automatically interpolated. | ||
| # label. The label for this annotation, enclosed in quotation marks. | ||
| # (+) actions. Each column after this is an action. | ||
|
|
||
| # There are two label files for each video; | ||
| # one for single-action detection and one for multi-action detection. | ||
| # Note that labels for single-action detection has been created from the multi-action detection labels | ||
| # (for more details please refer to our publication). | ||
| # For pedestrian detection task, the columns describing the actions should be ignored. | ||
|
|
||
| # Object detection Labels: | ||
| # labels. Always be 0 ('person') for this dataset | ||
| # xmin. The top left x-coordinate of the bounding box. | ||
| # ymin. The top left y-coordinate of the bounding box. | ||
| # xmax. The bottom right x-coordinate of the bounding box. | ||
| # ymax. The bottom right y-coordinate of the bounding box. | ||
|
|
||
| import numpy as np | ||
| import os | ||
| import glob | ||
| import cv2 | ||
| import argparse | ||
| import fnmatch | ||
| import h5py | ||
| import random | ||
| import copy | ||
| from sklearn.model_selection import train_test_split | ||
| from sklearn.utils import shuffle | ||
|
|
||
| classes = ["person", "bus", "car", "train"] | ||
|
|
||
| parser = argparse.ArgumentParser( | ||
| description='Merge multiple HDF5 datasets into a singleone.') | ||
|
|
||
| parser.add_argument( | ||
| '-o', | ||
| '--data_output', | ||
| help='path to output HDF5', | ||
| default='~/data/') | ||
|
|
||
| parser.add_argument( | ||
| '-i', | ||
| '--input_hdf5s', | ||
| help='path to input hdf5 files', | ||
| nargs=argparse.ONE_OR_MORE) | ||
|
|
||
| parser.add_argument( | ||
| '-d', | ||
| '--draw', | ||
| help='draw bound boxes on each image and output to /tmp', | ||
| default=False) | ||
|
|
||
def draw_bboxes(image, bboxes):
    """Draw bounding boxes on a JPEG-encoded image.

    Parameters
    ----------
    image : np.ndarray of uint8
        Compressed JPEG byte string represented as a uint8 array.
    bboxes : np.ndarray or None
        Rows of (class, xmin, ymin, xmax, ymax); may be None.

    Returns
    -------
    np.ndarray
        The decoded BGR image with green rectangles drawn on it.
    """
    decoded_image = copy.deepcopy(image)
    decoded_image = cv2.imdecode(decoded_image, 1)
    if bboxes is None:
        return decoded_image
    # Drop the leading class-id column; the rest are the box corners.
    # np.int was removed in NumPy 1.24; plain int is equivalent here.
    corners = np.array(bboxes[:, 1:], dtype=int)
    for corner in corners:
        cv2.rectangle(decoded_image, (corner[0], corner[1]),
                      (corner[2], corner[3]), (0, 255, 0), 5)
    return decoded_image
|
|
||
def draw_on_images(dataset_images, dataset_boxes, out_dir='/tmp/combined/'):
    """Decode every stored image, draw its boxes, and write '<i>.jpg' files.

    Boxes are stored flattened, so each entry is reshaped back to rows of
    (class, xmin, ymin, xmax, ymax) before drawing.
    """
    # makedirs instead of mkdir: also creates missing parents and is safe
    # to call when the directory already exists.
    os.makedirs(out_dir, exist_ok=True)
    for i in range(dataset_images.shape[0]):
        boxes = np.array(dataset_boxes[i]).reshape(-1, 5)
        img = draw_bboxes(dataset_images[i], boxes)
        out_img_path = os.path.join(out_dir, str(i) + '.jpg')
        cv2.imwrite(out_img_path, img)
|
|
||
def _main(args):
    """Merge the train/valid splits of several HDF5 files into one file.

    Every sample from every input file is loaded into memory, each pool is
    shuffled, and everything is written to <data_output>/combined.hdf5 with
    the same 'train'/'valid' group layout as the inputs.
    """
    draw_enabled = args.draw
    output_path = os.path.expanduser(args.data_output)

    input_hdf5 = []
    num_datasets = len(args.input_hdf5s)
    if num_datasets < 2:
        print('Number of hd5f files is :' + str(num_datasets))
        print('Nothing to combine')
        return

    # Running totals across all inputs, used to size the fixed-length
    # output datasets up front.
    num_samples_images_train = 0
    num_samples_bboxes_train = 0
    num_samples_images_valid = 0
    num_samples_bboxes_valid = 0

    for f in args.input_hdf5s:
        in_file = h5py.File(f, 'r')
        input_hdf5.append(in_file)  # kept open until the copy loop below finishes
        num_samples_images_train += in_file['train/images'].shape[0]
        num_samples_bboxes_train += in_file['train/boxes'].shape[0]
        num_samples_images_valid += in_file['valid/images'].shape[0]
        num_samples_bboxes_valid += in_file['valid/boxes'].shape[0]

    # images and boxes must have the same size
    assert(num_samples_images_train == num_samples_bboxes_train)
    assert(num_samples_images_valid == num_samples_bboxes_valid)

    num_samples_train = num_samples_images_train
    num_samples_valid = num_samples_images_valid

    # Create HDF5 dataset structure
    print('Creating output HDF5 dataset structure.')
    print('Total train: ' + str(num_samples_train))
    print('Total valid: ' + str(num_samples_valid))

    if not os.path.exists(output_path):
        print('Creating ' + output_path)
        os.mkdir(output_path)

    fname = os.path.join(output_path, 'combined.hdf5')
    if os.path.exists(fname):
        print('Removing old ' + fname)
        os.remove(fname)

    # Create HDF5 dataset structure
    print('Creating HDF5 dataset structure.')
    combined = h5py.File(fname, 'w')

    uint8_dt = h5py.special_dtype(
        vlen=np.dtype('uint8'))  # variable length uint8 (JPEG byte streams)
    int32_dt = h5py.special_dtype(
        vlen=np.dtype('int32'))  # variable length int32 (flattened boxes)

    # NOTE(review): vlen_int_dt is created but never used in this function.
    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))  # variable length default int

    train_group = combined.create_group('train')
    valid_group = combined.create_group('valid')

    # store class list for reference class ids as csv fixed-length numpy string
    combined.attrs['classes'] = np.string_(str.join(',', classes))

    # store images as variable length uint8 arrays
    dataset_train_images = train_group.create_dataset(
        'images', shape=(num_samples_train, ), dtype=uint8_dt)

    dataset_valid_images = valid_group.create_dataset(
        'images', shape=(num_samples_valid, ), dtype=uint8_dt)

    # store flattened (class, xmin, ymin, xmax, ymax) boxes as int32 arrays
    dataset_train_boxes = train_group.create_dataset(
        'boxes', shape=(num_samples_train, ), dtype=int32_dt)

    dataset_valid_boxes = valid_group.create_dataset(
        'boxes', shape=(num_samples_valid, ), dtype=int32_dt)

    # combine the input hdf5 into the new dataset
    # combine train and valid data
    Xtrain = []
    ytrain = []
    Xvalid = []
    yvalid = []
    # Note: this might use a large chunk of memory as all the data are
    # loaded into memory first and then we randomly shuffle it
    for hdf5 in input_hdf5:
        for i in range(hdf5['train/images'].shape[0]):
            Xtrain.append(hdf5['train/images'][i])
            ytrain.append(hdf5['train/boxes'][i])
        for i in range(hdf5['valid/images'].shape[0]):
            Xvalid.append(hdf5['valid/images'][i])
            yvalid.append(hdf5['valid/boxes'][i])

    # Shuffle so samples from different source files are interleaved.
    Xtrain, ytrain = shuffle(Xtrain, ytrain)
    Xvalid, yvalid = shuffle(Xvalid, yvalid)

    for i in range(num_samples_train):
        dataset_train_images[i] = Xtrain[i]
        dataset_train_boxes[i] = ytrain[i]
    for i in range(num_samples_valid):
        dataset_valid_images[i] = Xvalid[i]
        dataset_valid_boxes[i] = yvalid[i]

    # Optional visual sanity check of the first samples of each split.
    if draw_enabled:
        num_check = 1000
        draw_on_images(dataset_train_images[0:num_check], dataset_train_boxes[0:num_check], '/tmp/train')
        draw_on_images(dataset_valid_images[0:num_check], dataset_valid_boxes[0:num_check], '/tmp/valid')

    combined.close()
    for hdf5 in input_hdf5:
        hdf5.close()
    print('Done combining')


if __name__ == '__main__':
    _main(parser.parse_args())
| @@ -0,0 +1,372 @@ | ||
| """ | ||
| Convert UAV123 dataset to VOC similar dataset | ||
| Store the processed data into HDF5 file | ||
| The original dataset is for UAV object tracking | ||
| """ | ||
|
|
||
| import numpy as np | ||
| import os | ||
| import glob | ||
| import cv2 | ||
| import argparse | ||
| import fnmatch | ||
| import h5py | ||
| import random | ||
| import copy | ||
| import re | ||
| from sklearn.utils import shuffle | ||
| from sklearn.model_selection import train_test_split | ||
|
|
||
classes = ['person', 'car']

# Command-line interface for the UAV123 -> HDF5 conversion.
parser = argparse.ArgumentParser(
    description='Convert UAV123 dataset to HDF5.')

parser.add_argument(
    '-p',
    '--seq_path',
    help='path to UAV123 dataseq',
    default='~/data/UAV123/UAV123_10fps/data_seq/UAV123_10fps/')

parser.add_argument(
    '-a',
    '--anno_path',
    help='path to UAV123 annotation',
    default='~/data/UAV123/UAV123_10fps/anno/UAV123_10fps/')

parser.add_argument(
    '-f',
    '--hdf5_path',
    help='path to output UAV123 hdf5',
    default='~/data/UAV123/UAV123_10fps/')

# BUG FIX: the original help text ('path to classes file, defaults to
# pascal_classes.txt') was copied from another script; this describes what
# the flag actually does (see the verify branch in _main).
parser.add_argument(
    '-d',
    '--verify_enabled',
    help='verify the generated HDF5 by drawing labelled boxes on the images',
    default=False)
|
|
||
|
|
||
def find_car_person_folders(seq_path):
    """Return sorted car/person sequence folders followed by sorted group folders.

    Only folder names ending in 'car<N>', 'person<N>' or 'group<N>' are
    kept; everything else under seq_path is ignored.
    """
    assert(os.path.exists(seq_path))
    folders = os.listdir(seq_path)
    # BUG FIX: the original tested `if len(folders):`, which printed the
    # "No folders" warning whenever the directory was NOT empty.
    if not folders:
        print('No folders in ' + seq_path)
    car_person_folders = []
    group_person_folders = []
    for folder in folders:
        person_match = re.search(r'person\d+$', folder)
        car_match = re.search(r'car\d+$', folder)
        group_match = re.search(r'group\d+$', folder)
        if person_match is not None:
            car_person_folders.append(person_match.group(0))
            print('Adding folder ' + person_match.group(0))
        elif car_match is not None:
            car_person_folders.append(car_match.group(0))
            print('Adding folder ' + car_match.group(0))
        elif group_match is not None:
            group_person_folders.append(group_match.group(0))
            print('Adding folder ' + group_match.group(0))
    # Car/person sequences first, then the multi-person group sequences.
    return sorted(car_person_folders) + sorted(group_person_folders)
|
|
||
def find_car_person_anns(ann_path):
    """Return sorted car/person annotation file names followed by group files.

    Matches files named like 'car1.txt', 'person2_1.txt' or 'group3.txt'.
    """
    # BUG FIX: the '.' before 'txt' was unescaped, so names like 'car1_txt'
    # also matched; escape it to require a literal dot.
    car_re = r'car\d+(_\d+)?\.txt'
    person_re = r'person\d+(_\d+)?\.txt'
    group_re = r'group\d+(_\d+)?\.txt'
    car_person_ann = []
    group_ann = []
    for ann in os.listdir(ann_path):
        car_match = re.match(car_re, ann)
        person_match = re.match(person_re, ann)
        group_match = re.match(group_re, ann)
        if car_match:
            car_person_ann.append(car_match.group(0))
            print('Adding Car anno ' + car_match.group(0))
        elif person_match:
            car_person_ann.append(person_match.group(0))
            print('Adding Person anno ' + person_match.group(0))
        elif group_match:
            group_ann.append(group_match.group(0))
            # BUG FIX: the log message previously said 'Person anno'.
            print('Adding Group anno ' + group_match.group(0))
    return sorted(car_person_ann) + sorted(group_ann)
|
|
||
def match_dataseq_anno(seq_path, ann_path):
    """Pair every car/person/group image folder with its parsed annotations.

    Some videos are split over multiple label files (e.g. person1_1.txt,
    person1_2.txt); those are concatenated in sorted order.
    Note: there is a newline missing in the car7 label file and it has been
    manually fixed.

    Returns
    -------
    object_images : list of list of str
        Image paths per folder.
    annotations : list of np.ndarray
        Per-folder arrays with rows (class, xmin, ymin, xmax, ymax).
    car_person_folders : list of str
        All matched folder names.
        NOTE(review): 'group' folders are skipped by the label loop below
        but remain in this list, so it can be longer than the other two —
        confirm downstream indexing is unaffected.
    """
    car_person_folders = find_car_person_folders(seq_path)
    car_person_anns = find_car_person_anns(ann_path)
    annotations = []
    object_images = []
    for folder in car_person_folders:
        imgs = sorted(os.listdir(os.path.join(seq_path, folder)))
        imgs = [os.path.join(seq_path, folder, img) for img in imgs]
        print('Folder name: ' + folder)
        print('Total images: ' + str(len(imgs)))
        # Label files are either '<folder>.txt' or '<folder>_<part>.txt'.
        ann_name1 = folder + '.txt'
        ann_name2 = folder + '_'
        anns = sorted(ann for ann in car_person_anns
                      if ann_name1 in ann or ann_name2 in ann)
        anns = [os.path.join(ann_path, ann) for ann in anns]
        ann_data = ''.join([open(f).read() for f in anns]).split('\n')
        parsed_anns = []
        for ann in ann_data:
            ann = ann.split(',')
            try:
                ann = [int(a) for a in ann]
            except ValueError:
                # Blank / NaN lines (lost frames) become an all-zero box.
                ann = [0, 0, 0, 0]
            parsed_anns.append(ann)
        parsed_anns = np.array(parsed_anns)
        # Convert (xmin, ymin, w, h) to (xmin, ymin, xmax, ymax).
        parsed_anns[:, 2] = parsed_anns[:, 0] + parsed_anns[:, 2]
        parsed_anns[:, 3] = parsed_anns[:, 1] + parsed_anns[:, 3]
        # np.int was removed in NumPy 1.24; plain int behaves the same here.
        label = np.zeros((parsed_anns.shape[0], 1), dtype=int)
        if 'car' in folder.lower():
            label.fill(classes.index('car'))
        elif 'person' in folder.lower():
            label.fill(classes.index('person'))
        else:
            print('warning: unknown label')
            continue
        # Final rows are (class, xmin, ymin, xmax, ymax), matching the VOC
        # parse script.
        parsed_anns = np.concatenate((label, parsed_anns), axis=1)
        print('Total annotations: ' + str(len(parsed_anns)))
        if len(parsed_anns) != len(imgs):
            print('warning: image and anno have different size. Turncating')
            num = min(len(parsed_anns), len(imgs))
            parsed_anns = parsed_anns[:num, :]
            imgs = imgs[:num]
        annotations.append(parsed_anns)
        object_images.append(imgs)
    return object_images, annotations, car_person_folders
|
|
||
|
|
||
def select_object_detection_images(list_videos, list_annos, list_folders,
                                   clean_info='~/data/UAV123/UAV123_10fps_clean'):
    """Keep only the frames listed in the manually cleaned index.

    The original UAV123 dataset is for object tracking and usually only one
    object per frame is labelled. For detection training we keep only frames
    where all objects are labelled. `clean_info` holds one directory per
    selected video (named by its index into `list_videos`) containing the
    kept '<frame>.jpg' file names.
    """
    clean_info = os.path.expanduser(clean_info)
    assert(os.path.exists(clean_info))
    assert(len(list_videos) == len(list_annos))
    assert(len(list_videos) > 0)
    out_images = []
    out_annos = []
    out_folders = []
    selected_video = sorted(int(i) for i in os.listdir(clean_info))
    selected_imgs = dict.fromkeys(selected_video)
    for video_file in selected_video:
        img_idxs = os.listdir(os.path.join(clean_info, str(video_file)))
        img_idxs = sorted([int(item.split('.jpg')[0]) for item in img_idxs])
        selected_imgs[video_file] = img_idxs
    # Now filter list_videos / list_annos down to the selected frames.
    # BUG FIX: dict.iteritems() is Python 2 only; items() works on Python 3.
    for key, labled_images in selected_imgs.items():
        raw_video = list_videos[key]  # this in fact is a list of image paths
        raw_anns = list_annos[key]
        video = []
        anns = []
        for i in labled_images:
            # Drop tiny boxes. NOTE(review): with rows of
            # (class, xmin, ymin, xmax, ymax) the columns 2 and 3 are ymin
            # and xmax, not w and h as the original comment claimed —
            # confirm the intended filter.
            if raw_anns[i][2] * raw_anns[i][3] < 16:
                continue
            video.append(raw_video[i])
            anns.append(raw_anns[i])
        out_images.append(video)
        out_annos.append(np.array(anns))
        out_folders.append(list_folders[key])
    return out_images, out_annos, out_folders
|
|
||
def balance_video_annos(videos, annos, max_allowed_sample=100):
    """Cap the number of frames taken from each video clip.

    Frame counts vary a lot between clips, which can make a detector
    overfit on certain types of images. At most `max_allowed_sample` frames
    per clip go into the "balanced" output (training); the leftovers go into
    the "unbalanced" output, usable for validation.

    Parameters
    ----------
    videos : list
        List of list of images.
    annos : list
        List of list of annotation rows, parallel to `videos`.
    max_allowed_sample : int
        Upper bound on frames kept per clip (clamped, see below).

    Returns
    -------
    balance_images : list
        Capped per-clip images, concatenated.
    balance_labels : np.ndarray
        Their annotations.
    unbalance_images : list
        The frames left over from over-long clips.
    unbalance_labels : np.ndarray
        Their annotations.
    """
    assert(len(videos) == len(annos))
    # Clamp the cap: never above the shortest clip, then never below 100.
    shortest = min(len(clip) for clip in videos)
    cap = max(min(shortest, max_allowed_sample), 100)
    kept_imgs, kept_labels = [], []
    rest_imgs, rest_labels = [], []
    for clip, clip_annos in zip(videos, annos):
        frames = np.array(clip)
        labels = np.array(clip_annos)
        # TODO: for better performance, sample images with a fixed interval.
        if len(frames) <= cap:
            kept_imgs.extend(frames)
            kept_labels.extend(labels)
            continue
        chosen = sorted(np.random.choice(len(frames), cap, replace=False))
        kept_imgs.extend(frames[chosen].tolist())
        kept_labels.extend(labels[chosen].tolist())
        rest_imgs.extend(frames[idx] for idx in range(len(frames)) if idx not in chosen)
        rest_labels.extend(labels[idx] for idx in range(len(labels)) if idx not in chosen)
    return kept_imgs, np.array(kept_labels), rest_imgs, np.array(rest_labels)
|
|
||
def get_image_for_id(images, image_id):
    """Read the image file at index `image_id` and return its raw bytes.

    Returns
    -------
    np.ndarray of uint8
        The (typically JPEG-compressed) file contents as a uint8 array.
        Use of this encoding is based on: https://github.com/h5py/h5py/issues/745
    """
    assert(image_id < len(images))
    fname = images[image_id]
    with open(fname, 'rb') as in_file:
        data = in_file.read()
    # BUG FIX: np.fromstring is deprecated (removed for bytes input on
    # modern NumPy); frombuffer is the supported zero-copy replacement.
    return np.frombuffer(data, dtype='uint8')
|
|
||
def add_to_dataset(dataset_images, dataset_boxes, images, bboxes, start=0):
    """Append images and flattened boxes to the resizable HDF5 datasets.

    Both datasets are grown by len(bboxes) rows and the new samples are
    written beginning at index `start`.

    Returns
    -------
    int
        Number of samples written. (BUG FIX: the original returned the bare
        loop variable, which raised NameError on empty input; no caller in
        this script uses the return value.)
    """
    current_rows = len(bboxes)
    total_rows = current_rows + dataset_images.shape[0]
    dataset_images.resize(total_rows, axis=0)
    dataset_boxes.resize(total_rows, axis=0)
    count = min(len(images), len(bboxes))
    for i in range(count):
        dataset_images[start + i] = get_image_for_id(images, i)
        # Boxes are stored as one flat row-major int vector per image.
        dataset_boxes[start + i] = bboxes[i].flatten('C')
    return count
|
|
||
def draw_on_image_files(images, bboxes, name_hint='debug'):
    """Draw boxes onto image files and dump them under /tmp/<name_hint>.

    Debug helper for eyeballing parsed annotations.
    NOTE(review): this treats bbox columns 0-3 as (xmin, ymin, w, h), while
    the parsed annotations elsewhere are (class, xmin, ymin, xmax, ymax) —
    confirm which format the caller passes.
    """
    xmin, ymin = bboxes[:, 0], bboxes[:, 1]
    xmax, ymax = xmin + bboxes[:, 2], ymin + bboxes[:, 3]
    corners = np.concatenate((xmin.reshape(-1, 1), ymin.reshape(-1, 1),
                              xmax.reshape(-1, 1), ymax.reshape(-1, 1)), axis=1)
    # np.int was removed in NumPy 1.24; plain int is equivalent here.
    corners = np.array(corners, dtype=int)
    out_dir = os.path.join('/tmp', name_hint)
    # Hoisted out of the loop: the directory only needs creating once.
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    for i in range(min(len(images), len(bboxes))):
        img = cv2.imread(images[i])
        corner = corners[i]
        cv2.rectangle(img, (corner[0], corner[1]), (corner[2], corner[3]),
                      (0, 255, 0), 10)
        out_img_path = os.path.join(out_dir, str(i) + '.jpg')
        cv2.imwrite(out_img_path, img)
|
|
||
def draw_bboxes(image, bboxes):
    """Draw (class, xmin, ymin, xmax, ymax) boxes on an image array.

    If the first dimension is large, the input is assumed to be a
    JPEG-compressed byte buffer and is decoded first.
    """
    decoded_image = copy.deepcopy(image)
    # Heuristic: a buffer with more than 3180 leading elements is treated
    # as compressed bytes. NOTE(review): 3180 looks arbitrary — confirm
    # against the stored data sizes.
    if image.shape[0] > 3180:
        decoded_image = cv2.imdecode(image, 1)
    if bboxes is None:
        return decoded_image
    # Drop the leading class-id column.
    # np.int was removed in NumPy 1.24; plain int is equivalent here.
    corners = np.array(bboxes[:, 1:], dtype=int)
    for corner in corners:
        cv2.rectangle(decoded_image, (corner[0], corner[1]),
                      (corner[2], corner[3]), (0, 255, 0), 10)
    return decoded_image
|
|
||
def draw_on_images(dataset_images, dataset_boxes, out_dir='/tmp/uav123/'):
    """Decode each stored image, draw its boxes, write '<i>.jpg' under out_dir."""
    # makedirs: also creates missing parents and tolerates existing dirs.
    os.makedirs(out_dir, exist_ok=True)
    for i in range(dataset_images.shape[0]):
        # Boxes are stored flattened; restore (class, xmin, ymin, xmax, ymax) rows.
        boxes = np.array(dataset_boxes[i]).reshape(-1, 5)
        img = draw_bboxes(dataset_images[i], boxes)
        out_img_path = os.path.join(out_dir, str(i) + '.jpg')
        cv2.imwrite(out_img_path, img)
|
|
||
def _main(args):
    """Convert UAV123 sequences and annotations into a train/valid HDF5 file.

    With --verify_enabled, instead reads back an existing UAV123.hdf5 and
    dumps annotated images to /tmp for visual inspection.
    """
    seq_path = os.path.expanduser(args.seq_path)
    anno_path = os.path.expanduser(args.anno_path)
    hdf5_path = os.path.expanduser(args.hdf5_path)
    verify_enabled = args.verify_enabled
    assert(os.path.exists(seq_path))
    assert(os.path.exists(anno_path))
    if verify_enabled:
        # Verification mode: draw boxes from the existing HDF5 and exit.
        hdf5_path = os.path.join(hdf5_path, 'UAV123.hdf5')
        print("Verifying the HD5 data....")
        if not os.path.exists(hdf5_path):
            print(hdf5_path + " does not exits!")
            return
        uav123 = h5py.File(hdf5_path, 'r')
        print("Verifying the training data....")
        draw_on_images(uav123['train/images'], uav123['train/boxes'])
        print("Verifying the validation data....")
        draw_on_images(uav123['valid/images'], uav123['valid/boxes'])
        print("Verification is done")
        return
    # Pair image folders with parsed annotations, keep only the cleaned
    # frames, then cap the per-clip sample count.
    list_videos, list_annos, list_folders = match_dataseq_anno(seq_path, anno_path)
    print(len(list_videos), len(list_annos))
    videos, annos, folders = select_object_detection_images(list_videos, list_annos, list_folders)
    print('Total number of images: '+ str(sum([len(i) for i in videos])))
    balance_images, balance_annos, unbalance_images, unbalance_annos = balance_video_annos(videos, annos)
    # Fixed random_state values keep the split reproducible across runs.
    Xtrain, ytrain = shuffle(balance_images, balance_annos, random_state=0)
    _, Xvalid, _, yvalid = train_test_split(unbalance_images, unbalance_annos, test_size=0.15, random_state=42)
    # draw_on_images(Xtrain, ytrain, name_hint='train')
    # draw_on_images(Xvalid, yvalid, name_hint='valid')
    # We will use balance_images, balance_annos as train data
    # and select a portion from unbalance_images, unbalance_annos to use as validation data
    if not os.path.exists(hdf5_path):
        print('Creating ' + hdf5_path)
        os.mkdir(hdf5_path)
    # Create HDF5 dataset structure
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(hdf5_path, 'UAV123.hdf5')

    if os.path.exists(fname):
        print('Removing old HDF5')
        os.remove(fname)
    uav123_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(
        vlen=np.dtype('uint8'))  # variable length uint8 (JPEG byte streams)
    uint32_dt = h5py.special_dtype(
        vlen=np.dtype('uint32'))  # variable length uint32 (flattened boxes)
    # NOTE(review): vlen_int_dt is created but never used in this function.
    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))  # variable length default int
    train_group = uav123_h5file.create_group('train')
    valid_group = uav123_h5file.create_group('valid')
    # store class list for reference class ids as csv fixed-length numpy string
    uav123_h5file.attrs['classes'] = np.string_(str.join(',', classes))

    # store images as variable length uint8 arrays (resizable, start empty)
    dataset_train_images = train_group.create_dataset(
        'images', shape=(0, ), maxshape=(None, ), dtype=uint8_dt)

    dataset_valid_images = valid_group.create_dataset(
        'images', shape=(0, ), maxshape=(None, ), dtype=uint8_dt)

    # store flattened (class, xmin, ymin, xmax, ymax) boxes per image
    dataset_train_boxes = train_group.create_dataset(
        'boxes', shape=(0, ), maxshape=(None, ), dtype=uint32_dt)

    dataset_valid_boxes = valid_group.create_dataset(
        'boxes', shape=(0, ), maxshape=(None, ), dtype=uint32_dt)

    print('Adding ' + str(len(Xtrain)) + ' training data')
    add_to_dataset(dataset_train_images, dataset_train_boxes, Xtrain, ytrain, start=0)
    print('Adding ' + str(len(Xvalid)) + ' validation data')
    add_to_dataset(dataset_valid_images, dataset_valid_boxes, Xvalid, yvalid, start=0)
    print('Closing HDF5 file.')
    uav123_h5file.close()
    print('Done.')


if __name__ == '__main__':
    _main(parser.parse_args())
| @@ -0,0 +1,219 @@ | ||
| """ | ||
| Convert Pascal VOC 2007+2012 detection dataset to HDF5. | ||
| Does not preserve full XML annotations. | ||
| Combines all VOC subsets (train, val, test) with VOC2012 train for full | ||
| training set as done in Faster R-CNN paper. | ||
| Code based on: | ||
| https://github.com/pjreddie/darknet/blob/master/scripts/voc_label.py | ||
| """ | ||
|
|
||
| import argparse | ||
| import os | ||
| import xml.etree.ElementTree as ElementTree | ||
|
|
||
| import h5py | ||
| import numpy as np | ||
|
|
||
# Dataset splits: VOC2007 train+val join VOC2012 train for training;
# VOC2012 val is validation and VOC2007 test is the test set.
sets_from_2007 = [('2007', 'train'), ('2007', 'val')]
train_set = [('2012', 'train')]
val_set = [('2012', 'val')]
test_set = [('2007', 'test')]

# The 20 standard Pascal VOC object classes (kept for reference).
voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]
# We only care about below two classes (for now); every vehicle type is
# collapsed into the single 'vehicle' class in get_boxes_for_id.
vehicles = ["bus", "car", "train"]
aerial_classes = ["person", "vehicle"]

parser = argparse.ArgumentParser(
    description='Convert Pascal VOC 2007+2012 detection dataset to HDF5.')
parser.add_argument(
    '-p',
    '--path_to_voc',
    help='path to VOCdevkit directory',
    default='~/data/PascalVOC/VOCdevkit')
|
|
||
|
|
||
def get_boxes_for_id(voc_path, year, image_id):
    """Get object bounding box annotations for the given image.

    Only non-difficult 'person' and vehicle ('bus', 'car', 'train') objects
    are kept; all vehicle labels are remapped to the single class 'vehicle'.

    Parameters
    ----------
    voc_path : str
        Path to VOCdevkit directory.
    year : str
        Year of dataset containing image. Either '2007' or '2012'.
    image_id : str
        Pascal VOC identifier for given image.

    Returns
    -------
    boxes : np.ndarray of int
        Flat array of (class, xmin, ymin, xmax, ymax) groups, 5*N values.
        (Flattened on purpose for variable-length HDF5 storage.)
    """
    fname = os.path.join(voc_path, 'VOC{}/Annotations/{}.xml'.format(year,
                                                                     image_id))
    with open(fname) as in_file:
        xml_tree = ElementTree.parse(in_file)
    root = xml_tree.getroot()
    boxes = []
    for obj in root.iter('object'):
        # Robustness fix: some annotation files lack the <difficult> tag;
        # treat a missing tag as "not difficult" instead of crashing.
        difficult_node = obj.find('difficult')
        difficult = difficult_node.text if difficult_node is not None else '0'
        label = obj.find('name').text
        if int(difficult) == 1:  # exclude difficult objects
            continue
        if (label != 'person') and (label not in vehicles):
            continue  # exclude unlisted classes
        # map 'car', 'bus' and 'train' to label 'vehicle'
        if label in vehicles:
            label = 'vehicle'
        xml_box = obj.find('bndbox')
        bbox = (aerial_classes.index(label), int(xml_box.find('xmin').text),
                int(xml_box.find('ymin').text), int(xml_box.find('xmax').text),
                int(xml_box.find('ymax').text))
        boxes.extend(bbox)  # extend, not append: stored flat on purpose
    return np.array(boxes)
|
|
||
def get_image_for_id(voc_path, year, image_id):
    """Get image data as a uint8 array for the given image.

    Parameters
    ----------
    voc_path : str
        Path to VOCdevkit directory.
    year : str
        Year of dataset containing image. Either '2007' or '2012'.
    image_id : str
        Pascal VOC identifier for given image.

    Returns
    -------
    image_data : np.ndarray of uint8
        Compressed JPEG byte string represented as an array of uint8.
        Use of this encoding is based on: https://github.com/h5py/h5py/issues/745
    """
    fname = os.path.join(voc_path, 'VOC{}/JPEGImages/{}.jpg'.format(year,
                                                                    image_id))
    with open(fname, 'rb') as in_file:
        data = in_file.read()
    # BUG FIX: np.fromstring is deprecated (removed for bytes input on
    # modern NumPy); frombuffer is the supported zero-copy replacement.
    return np.frombuffer(data, dtype='uint8')
|
|
||
|
|
||
def get_ids(voc_path, datasets):
    """Collect image identifiers for the given list of dataset identifiers.

    Parameters
    ----------
    voc_path : str
        Path to VOCdevkit directory.
    datasets : list of str tuples
        Dataset identifiers as (year, image_set) pairs.

    Returns
    -------
    list of str
        All image identifiers for the given datasets, in file order.
    """
    ids = []
    for year, image_set in datasets:
        id_file = os.path.join(
            voc_path, 'VOC{}/ImageSets/Main/{}.txt'.format(year, image_set))
        with open(id_file, 'r') as image_ids:
            for line in image_ids:
                ids.append(line.strip())
    return ids
|
|
||
|
|
||
def add_to_dataset(voc_path, year, ids, images, boxes, start=0):
    """Load each id's image and boxes and store them in the datasets.

    Images that contain no person/vehicle boxes are skipped. Writing begins
    at dataset index `start`; returns the number of samples written.
    """
    written = 0
    for voc_id in ids:
        image_data = get_image_for_id(voc_path, year, voc_id)
        image_boxes = get_boxes_for_id(voc_path, year, voc_id)
        # ignore images without interesting objects
        if image_boxes.shape[0] == 0:
            continue
        images[start + written] = image_data
        boxes[start + written] = image_boxes
        written += 1
    return written
|
|
||
| def _main(args): | ||
| voc_path = os.path.expanduser(args.path_to_voc) | ||
| train_ids = get_ids(voc_path, train_set) | ||
| val_ids = get_ids(voc_path, val_set) | ||
| test_ids = get_ids(voc_path, test_set) | ||
| train_ids_2007 = get_ids(voc_path, sets_from_2007) | ||
| total_train_ids = len(train_ids) + len(train_ids_2007) | ||
|
|
||
| # Create HDF5 dataset structure | ||
| print('Creating HDF5 dataset structure.') | ||
| fname = os.path.join(voc_path, 'pascal_voc_07_12_person_vehicle.hdf5') | ||
| if os.path.exists(fname): | ||
| print('Removing old ' + fname) | ||
| os.remove(fname) | ||
|
|
||
| voc_h5file = h5py.File(fname, 'w') | ||
| uint8_dt = h5py.special_dtype( | ||
| vlen=np.dtype('uint8')) # variable length uint8 | ||
| vlen_int_dt = h5py.special_dtype( | ||
| vlen=np.dtype(int)) # variable length default int | ||
| train_group = voc_h5file.create_group('train') | ||
| val_group = voc_h5file.create_group('valid') | ||
| test_group = voc_h5file.create_group('test') | ||
|
|
||
| # store class list for reference class ids as csv fixed-length numpy string | ||
| voc_h5file.attrs['classes'] = np.string_(str.join(',', aerial_classes)) | ||
|
|
||
| # store images as variable length uint8 arrays | ||
| train_images = train_group.create_dataset( | ||
| 'images', shape=(total_train_ids, ), dtype=uint8_dt, chunks=True) | ||
| val_images = val_group.create_dataset( | ||
| 'images', shape=(len(val_ids), ), dtype=uint8_dt, chunks=True) | ||
| test_images = test_group.create_dataset( | ||
| 'images', shape=(len(test_ids), ), dtype=uint8_dt, chunks=True) | ||
|
|
||
| # store boxes as class_id, xmin, ymin, xmax, ymax | ||
| train_boxes = train_group.create_dataset( | ||
| 'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt, chunks=True) | ||
| val_boxes = val_group.create_dataset( | ||
| 'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt, chunks=True) | ||
| test_boxes = test_group.create_dataset( | ||
| 'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt, chunks=True) | ||
|
|
||
| # process all ids and add to datasets | ||
| print('Processing Pascal VOC 2007 datasets for training set.') | ||
| last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images, | ||
| train_boxes) | ||
| print('Processing Pascal VOC 2012 training set.') | ||
| total = add_to_dataset( | ||
| voc_path, | ||
| '2012', | ||
| train_ids, | ||
| train_images, | ||
| train_boxes, | ||
| start=last_2007) | ||
| train_images.resize(total, axis=0) | ||
| train_boxes.resize(total, axis=0) | ||
|
|
||
| print('Processing Pascal VOC 2012 val set.') | ||
| total = add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes) | ||
| val_images.resize(total, axis=0) | ||
| val_boxes.resize(total, axis=0) | ||
|
|
||
| print('Processing Pascal VOC 2007 test set.') | ||
| total = add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes) | ||
| test_images.resize(total, axis=0) | ||
| test_boxes.resize(total, axis=0) | ||
|
|
||
| print('Closing HDF5 file.') | ||
| voc_h5file.close() | ||
| print('Done.') | ||
|
|
||
if __name__ == '__main__':
    # Script entry point: parse CLI args and run the HDF5 conversion.
    _main(parser.parse_args())
| @@ -0,0 +1,244 @@ | ||
| """Convert Pascal VOC 2007+2012 detection dataset to TFRecords. | ||
| Does not preserve full XML annotations. | ||
| Combines all VOC 2007 subsets (train, val) with VOC2012 for training. | ||
| Uses VOC2012 val for val and VOC2007 test for test. | ||
| Code based on: | ||
| https://github.com/pjreddie/darknet/blob/master/scripts/voc_label.py | ||
| https://github.com/tensorflow/models/blob/master/inception/inception/data/build_image_data.py | ||
| """ | ||
|
|
||
| import argparse | ||
| import os | ||
| import xml.etree.ElementTree as ElementTree | ||
| from datetime import datetime | ||
|
|
||
| import numpy as np | ||
| import tensorflow as tf | ||
|
|
||
| from voc_to_hdf5 import get_ids | ||
|
|
||
# Dataset splits expressed as (year, image-set) pairs.
sets_from_2007 = [('2007', 'train'), ('2007', 'val')]  # VOC2007 trainval, folded into training
train_set = [('2012', 'train'), ('2012', 'val')]  # VOC2012 trainval
test_set = [('2007', 'test')]  # VOC2007 test

# The 20 Pascal VOC object classes; a class id is its index in this list.
classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]

parser = argparse.ArgumentParser(
    description='Convert Pascal VOC 2007+2012 detection dataset to TFRecords.')
parser.add_argument(
    '-p',
    '--path_to_voc',
    help='path to Pascal VOC dataset',
    default='~/data/PascalVOC/VOCdevkit')

# Small graph for image decoding.
# One shared TF session/placeholder pair reused by process_image so the
# decode graph is built only once for the whole conversion run.
decoder_sess = tf.Session()
image_placeholder = tf.placeholder(dtype=tf.string)
decoded_jpeg = tf.image.decode_jpeg(image_placeholder, channels=3)
|
|
||
|
|
||
def process_image(image_path):
    """Read a JPEG from disk and decode it.

    Parameters
    ----------
    image_path : string
        Path to a JPEG image file.

    Returns
    -------
    (image_data, height, width) : (bytes, int, int)
        The raw encoded bytes plus the decoded spatial dimensions.
    """
    # Context manager ensures the file handle is closed even on error.
    with open(image_path, 'rb') as f:
        image_data = f.read()
    image = decoder_sess.run(decoded_jpeg,
                             feed_dict={image_placeholder: image_data})
    assert len(image.shape) == 3
    height = image.shape[0]
    # Bug fix: width is axis 1. The old code read shape[2] (the channel
    # count, always 3), so every example recorded width == 3.
    width = image.shape[1]
    assert image.shape[2] == 3
    return image_data, height, width
|
|
||
|
|
||
def process_anno(anno_path):
    """Parse a Pascal VOC XML annotation file.

    Returns a list of dicts with the class index and box corners
    normalized by image size, skipping objects that are marked difficult
    or whose label is not in the global `classes` list.
    """
    with open(anno_path) as f:
        root = ElementTree.parse(f).getroot()

    size = root.find('size')
    height = float(size.find('height').text)
    width = float(size.find('width').text)

    boxes = []
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        label = obj.find('name').text
        # Exclude unlisted classes and difficult objects.
        if label not in classes or int(difficult) == 1:
            continue
        corners = obj.find('bndbox')
        boxes.append({
            'class': classes.index(label),
            'y_min': float(corners.find('ymin').text) / height,
            'x_min': float(corners.find('xmin').text) / width,
            'y_max': float(corners.find('ymax').text) / height,
            'x_max': float(corners.find('xmax').text) / width,
        })
    return boxes
|
|
||
|
|
||
def convert_to_example(image_data, boxes, filename, height, width):
    """Convert Pascal VOC ground truth to TFExample protobuf.
    Parameters
    ----------
    image_data : bytes
        Encoded image bytes.
    boxes : list of dict
        Bounding box corners (normalized to [0, 1]) and class labels,
        one dict per object as produced by `process_anno`.
    filename : string
        Path to image file.
    height : int
        Image height.
    width : int
        Image width.
    Returns
    -------
    example : protobuf
        Tensorflow Example protobuf containing image and bounding boxes.
    """
    # Transpose the list of per-object dicts into per-field columns, the
    # layout required by TF feature lists.
    box_classes = [b['class'] for b in boxes]
    box_ymin = [b['y_min'] for b in boxes]
    box_xmin = [b['x_min'] for b in boxes]
    box_ymax = [b['y_max'] for b in boxes]
    box_xmax = [b['x_max'] for b in boxes]
    encoded_image = [tf.compat.as_bytes(image_data)]
    base_name = [tf.compat.as_bytes(os.path.basename(filename))]

    example = tf.train.Example(features=tf.train.Features(feature={
        'filename':
        tf.train.Feature(bytes_list=tf.train.BytesList(value=base_name)),
        'height':
        tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
        'width':
        tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
        'classes':
        tf.train.Feature(int64_list=tf.train.Int64List(value=box_classes)),
        'y_mins':
        tf.train.Feature(float_list=tf.train.FloatList(value=box_ymin)),
        'x_mins':
        tf.train.Feature(float_list=tf.train.FloatList(value=box_xmin)),
        'y_maxes':
        tf.train.Feature(float_list=tf.train.FloatList(value=box_ymax)),
        'x_maxes':
        tf.train.Feature(float_list=tf.train.FloatList(value=box_xmax)),
        'encoded':
        tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded_image))
    }))
    return example
|
|
||
|
|
||
def get_image_path(voc_path, year, image_id):
    """Get path to image for given year and image id."""
    relative = 'VOC{}/JPEGImages/{}.jpg'.format(year, image_id)
    return os.path.join(voc_path, relative)
|
|
||
|
|
||
def get_anno_path(voc_path, year, image_id):
    """Get path to image annotation for given year and image id."""
    relative = 'VOC{}/Annotations/{}.xml'.format(year, image_id)
    return os.path.join(voc_path, relative)
|
|
||
|
|
||
def process_dataset(name, image_paths, anno_paths, result_path, num_shards):
    """Process selected Pascal VOC dataset to generate TFRecords files.
    Parameters
    ----------
    name : string
        Name of resulting dataset 'train' or 'test'.
    image_paths : list
        List of paths to images to include in dataset.
    anno_paths : list
        List of paths to corresponding image annotations.
    result_path : string
        Path to put resulting TFRecord files.
    num_shards : int
        Number of shards to split TFRecord files into.
    """
    # Fix: the output directory was never created anywhere before the
    # shard writers tried to open files in it; create it up front.
    os.makedirs(result_path, exist_ok=True)

    # Evenly partition example indices across shards.
    shard_ranges = np.linspace(0, len(image_paths), num_shards + 1).astype(int)
    counter = 0
    for shard in range(num_shards):
        # Generate shard file name
        output_filename = '{}-{:05d}-of-{:05d}'.format(name, shard, num_shards)
        output_file = os.path.join(result_path, output_filename)
        writer = tf.python_io.TFRecordWriter(output_file)

        shard_counter = 0
        files_in_shard = range(shard_ranges[shard], shard_ranges[shard + 1])
        for i in files_in_shard:
            image_file = image_paths[i]
            anno_file = anno_paths[i]

            # processes image + anno
            image_data, height, width = process_image(image_file)
            boxes = process_anno(anno_file)

            # convert to example
            example = convert_to_example(image_data, boxes, image_file, height,
                                         width)

            # write to writer
            writer.write(example.SerializeToString())

            shard_counter += 1
            counter += 1

            # Progress heartbeat every 1000 examples.
            if not counter % 1000:
                print('{} : Processed {:d} of {:d} images.'.format(
                    datetime.now(), counter, len(image_paths)))
        writer.close()
        print('{} : Wrote {} images to {}'.format(
            datetime.now(), shard_counter, output_filename))

    print('{} : Wrote {} images to {} shards'.format(datetime.now(), counter,
                                                     num_shards))
|
|
||
|
|
||
def _main(args):
    """Locate files for train and test sets and then generate TFRecords."""
    voc_path = args.path_to_voc
    voc_path = os.path.expanduser(voc_path)
    result_path = os.path.join(voc_path, 'TFRecords')
    print('Saving results to {}'.format(result_path))

    # Per-split output directories.
    # NOTE(review): nothing here creates these directories — verify they
    # exist (or are created downstream) before the shard writers open files.
    train_path = os.path.join(result_path, 'train')
    test_path = os.path.join(result_path, 'test')

    train_ids = get_ids(voc_path, train_set)  # 2012 trainval
    test_ids = get_ids(voc_path, test_set)  # 2007 test
    train_ids_2007 = get_ids(voc_path, sets_from_2007)  # 2007 trainval
    total_train_ids = len(train_ids) + len(train_ids_2007)
    print('{} train examples and {} test examples'.format(total_train_ids,
                                                          len(test_ids)))

    # Image paths: VOC2012 trainval first, then VOC2007 trainval appended.
    train_image_paths = [
        get_image_path(voc_path, '2012', i) for i in train_ids
    ]
    train_image_paths.extend(
        [get_image_path(voc_path, '2007', i) for i in train_ids_2007])
    test_image_paths = [get_image_path(voc_path, '2007', i) for i in test_ids]

    # Annotation paths built in the same order so index i matches image i.
    train_anno_paths = [get_anno_path(voc_path, '2012', i) for i in train_ids]
    train_anno_paths.extend(
        [get_anno_path(voc_path, '2007', i) for i in train_ids_2007])
    test_anno_paths = [get_anno_path(voc_path, '2007', i) for i in test_ids]

    process_dataset(
        'train',
        train_image_paths,
        train_anno_paths,
        train_path,
        num_shards=60)
    process_dataset(
        'test', test_image_paths, test_anno_paths, test_path, num_shards=20)
|
|
||
|
|
||
if __name__ == '__main__':
    # Bug fix: the previous call was `parser.parse_args(args)`, but `args`
    # is undefined at module level, so the script raised NameError on start.
    _main(parser.parse_args())
| @@ -0,0 +1,247 @@ | ||
| import os | ||
|
|
||
| import tensorflow as tf | ||
| from keras.models import Model, load_model | ||
| from keras.layers import Reshape, Activation, Conv2D, Input | ||
| from keras.layers import MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda | ||
| from keras.layers.merge import concatenate | ||
| from keras.applications.mobilenet import MobileNet | ||
| from keras.layers.advanced_activations import LeakyReLU | ||
|
|
||
| from keras_darknet19 import Darknet19 | ||
| from keras_mobilenet import depthwise_conv_block, relu6 | ||
|
|
||
def space_to_depth_x2(tensor):
    """Thin wrapper for Tensorflow space_to_depth with block_size=2."""
    # The local import is required so the Lambda layer stays serializable.
    # See: https://github.com/fchollet/keras/issues/5088#issuecomment-273851273
    import tensorflow as tf
    return tf.space_to_depth(tensor, block_size=2)
|
|
||
def space_to_depth_x2_output_shape(input_shape):
    """Determine space_to_depth output shape for block_size=2.

    Spatial dims halve and channels quadruple; unknown (None) spatial
    dims are propagated as None.
    """
    batch, rows, cols, channels = input_shape
    if rows:
        return (batch, rows // 2, cols // 2, 4 * channels)
    return (batch, None, None, 4 * channels)
|
|
||
|
|
||
def space_to_depth_x4(tensor):
    """Thin wrapper for Tensorflow space_to_depth with block_size=4."""
    # Local import keeps the Lambda layer serializable.
    import tensorflow as tf
    return tf.space_to_depth(tensor, block_size=4)
|
|
||
def space_to_depth_x4_output_shape(input_shape):
    """Determine space_to_depth output shape for block_size=4.

    Spatial dims shrink by 4 and channels grow 16x; unknown (None)
    spatial dims are propagated as None.
    """
    batch, rows, cols, channels = input_shape
    if rows:
        return (batch, rows // 4, cols // 4, 16 * channels)
    return (batch, None, None, 16 * channels)
|
|
||
class FeatureExtractor(object):
    """Abstract base class for a backbone feature extractor.

    Subclasses must build ``self.feature_model`` (a Keras Model) in
    ``__init__`` and implement ``normalize`` with the pixel scaling that
    backbone expects.
    """

    def __init__(self, input_tensor):
        """Build the feature model; must be overridden by subclasses."""
        # Improved over the old placeholder text "error message" so the
        # failure explains what a subclass is required to do.
        raise NotImplementedError(
            'FeatureExtractor subclasses must implement __init__ and '
            'construct self.feature_model')

    def normalize(self, image):
        """Scale raw pixel values into the backbone's expected input range."""
        raise NotImplementedError(
            'FeatureExtractor subclasses must implement normalize')

    def output_shape(self):
        """Return the (height, width) of the final feature map."""
        return self.feature_model.get_output_shape_at(-1)[1:3]

    def get_feature_model(self):
        """Return the underlying Keras feature model."""
        return self.feature_model
|
|
||
class Darknet19Feature(FeatureExtractor):
    """Original YoLov2 with Darknet19 as feature extractor
    Parameters
    ----------
    input_tensor : tensor
        Input tensor with shape (height, width, num_channel)
    weights : string
        If 'COCO', load pretrained weights trained on the COCO dataset
        from weights/yolo-coco-m.h5.
    shallow_detection : bool
        Whether use a shallow net. In the original YoLov2 design, the size of the last
        feature is 1/32 of input due to 5 maxpooling. When shallow_detection is enabled,
        we only keep all the layers up to the first 4 maxpooling. This is to hopefully improve
        performance for small objects.
    three_scale_detection : bool
        Whether to use 3 scales of features for detection. The original YoLov2 uses the feature
        from the last layer plus one earlier feature and concatenates them together. We extend
        this idea by introducing an extra scale of feature to improve detection accuracy.
    """
    def __init__(self, input_tensor, weights='COCO', shallow_detection=True, three_scale_detection=False):

        # Darknet19 layer indices of the 1/4, 1/8 and 1/16 scale feature maps.
        fine_grained_layers = [17, 27, 43] #[1/4, 1/8, 1/16]
        if shallow_detection:
            # Keep the 1/4 and 1/8 skip features; truncate the backbone at
            # layer 43 so the final map is 1/16 of the input.
            fine_grained_layers = fine_grained_layers[0:2]
            num_fina_layers = 512  # NOTE: "fina" is a typo for "final"; name kept as-is
            final_feature_layer = 43 # Total 44 layer
        else:
            fine_grained_layers = fine_grained_layers[1:]
            num_fina_layers = 1024
            final_feature_layer = -1 # total 75 layers

        feature_model = Darknet19(input_tensor, include_top=False)
        # Truncate the backbone at the chosen final feature layer.
        feature_model = Model(inputs=feature_model.input, outputs=feature_model.layers[final_feature_layer].output)

        if weights == 'COCO':
            print("Loading trained COCO weights...")
            model_path = os.path.join('weights', 'yolo-coco-m.h5')
            trained_model = load_model(model_path)
            trained_layers = trained_model.layers
            feature_layers = feature_model.layers
            # Copy weights layer-by-layer over the shared prefix of the two
            # models.  NOTE(review): the loop rebinds the `weights` argument;
            # harmless since it is not read afterwards, but worth renaming.
            for i in range(0, min(len(feature_layers), len(trained_layers))):
                weights = trained_layers[i].get_weights()
                feature_layers[i].set_weights(weights)

        # Skip-connection features at two earlier scales plus the final map.
        x0 = feature_model.layers[fine_grained_layers[0]].output
        x1 = feature_model.layers[fine_grained_layers[1]].output
        x2 = feature_model.output

        if shallow_detection:
            # 1x1 bottlenecks shrink the skip features before space_to_depth
            # so the concatenated channel count stays manageable.
            x0 = Conv2D(8, (1,1), strides=(1,1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(32, (1,1), strides=(1,1), padding='same', use_bias=False)(x1)
            num_fina_layers = 512  # redundant: already set above

        else:
            x0 = Conv2D(16, (1,1), strides=(1,1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(64, (1,1), strides=(1,1), padding='same', use_bias=False)(x1)
            num_fina_layers = 1024  # redundant: already set above

        # Layer 19
        x2 = Conv2D(num_fina_layers, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x2)
        x2 = BatchNormalization(name='norm_19')(x2)
        x2 = LeakyReLU(alpha=0.1)(x2)

        # Layer 20
        x2 = Conv2D(num_fina_layers, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x2)
        x2 = BatchNormalization(name='norm_20')(x2)
        x2 = LeakyReLU(alpha=0.1)(x2)

        # Earlier (finest-scale) feature, packed into channels so its grid
        # matches x2's spatial resolution.
        x0 = BatchNormalization(name='norm_space_to_depth_x4')(x0)
        x0 = LeakyReLU(alpha=0.1)(x0)
        x0_reshaped = Lambda(
            space_to_depth_x4,
            output_shape=space_to_depth_x4_output_shape,
            name='space_to_depth_x4')(x0)

        # Earlier (mid-scale) feature, likewise packed to match x2's grid.
        x1 = BatchNormalization(name='norm_space_to_depth_x2')(x1)
        x1 = LeakyReLU(alpha=0.1)(x1)
        x1_reshaped = Lambda(
            space_to_depth_x2,
            output_shape=space_to_depth_x2_output_shape,
            name='space_to_depth_x2')(x1)

        if three_scale_detection:
            x = concatenate([x0_reshaped, x1_reshaped, x2])
        else:
            x = concatenate([x1_reshaped, x2])

        # Final detection feature layer fusing the concatenated scales.
        x = Conv2D(num_fina_layers, (3,3), strides=(1,1), padding='same', name='conv_detection', use_bias=False)(x)
        x = BatchNormalization(name='norm_detection_feature')(x)
        x = LeakyReLU(alpha=0.1)(x)
        self.feature_model = Model(feature_model.inputs, x)

    def normalize(self, image):
        """Scale raw [0, 255] pixels to [0, 1] as Darknet expects."""
        return image / 255.
|
|
||
|
|
||
class MobileNetFeature(FeatureExtractor):
    """MobileNet based YoLo
    Parameters
    ----------
    input_tensor : tensor
        Input tensor with shape (height, width, num_channel)
    weights : string
        If 'imagenet', load pretrained ImageNet weights into the backbone.
    shallow_detection : bool
        Whether use a shallow net. In the original YoLov2 design, the size of the last
        feature is 1/32 of input due to 5 maxpooling. When shallow_detection is enabled,
        we only keep all the layers up to the first 4 maxpooling. This is to hopefully improve
        performance for small objects.
    three_scale_detection : bool
        Whether to use 3 scales of features for detection. The original YoLov2 uses the feature
        from the last layer plus one earlier feature and concatenates them together. We extend
        this idea by introducing an extra scale of feature to improve detection accuracy.
    """

    def __init__(self, input_tensor, weights='imagenet', shallow_detection=False, three_scale_detection=False):

        # MobileNet layer indices of the 1/4, 1/8 and 1/16 scale feature maps.
        fine_grained_layers = [21, 33, 69] #[1/4, 1/8, 1/16]

        if shallow_detection:
            # Keep the 1/4 and 1/8 skip features; truncate at the 1/16 map.
            fine_grained_layers = fine_grained_layers[0:2]
            final_feature_layer = 69
        else:
            fine_grained_layers = fine_grained_layers[1:]
            final_feature_layer = -1

        feature_model = MobileNet(input_tensor=input_tensor, include_top=False, weights=None)
        # Truncate the backbone at the chosen final feature layer.
        feature_model = Model(inputs=feature_model.input, outputs=feature_model.layers[final_feature_layer].output)

        if weights == 'imagenet':
            print('Loading pretrained weights from ImageNet...')
            # Load a fixed 224x224 ImageNet model and copy weights over the
            # shared layer prefix (conv weights are input-size agnostic).
            trained_model = MobileNet(input_shape=(224, 224, 3), include_top=False, weights=weights)
            trained_layers = trained_model.layers
            feature_layers = feature_model.layers
            # NOTE(review): the loop rebinds the `weights` argument; harmless
            # since it is not read afterwards, but worth renaming.
            for i in range(0, min(len(feature_layers), len(trained_layers))):
                weights = trained_layers[i].get_weights()
                feature_layers[i].set_weights(weights)

        # Skip-connection features at two earlier scales plus the final map.
        x0 = feature_model.layers[fine_grained_layers[0]].output
        x1 = feature_model.layers[fine_grained_layers[1]].output
        x2 = feature_model.output

        if shallow_detection:
            # 1x1 bottlenecks shrink the skip features before space_to_depth.
            x0 = Conv2D(8, (1,1), strides=(1,1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(32, (1,1), strides=(1,1), padding='same', use_bias=False)(x1)
            num_final_layers = 512

        else:
            x0 = Conv2D(16, (1,1), strides=(1,1), padding='same', use_bias=False)(x0)
            x1 = Conv2D(64, (1,1), strides=(1,1), padding='same', use_bias=False)(x1)
            num_final_layers = 1024

        # Two extra depthwise-separable blocks on top of the backbone.
        x2 = depthwise_conv_block(x2, num_final_layers, 1.0, block_id=14)
        x2 = depthwise_conv_block(x2, num_final_layers, 1.0, block_id=15)

        # Mid-scale feature packed into channels to match x2's grid.
        x1 = BatchNormalization()(x1)
        x1 = Lambda(relu6)(x1)
        x1_reshaped = Lambda(
            space_to_depth_x2,
            output_shape=space_to_depth_x2_output_shape,
            name='space_to_depth_x2')(x1)

        # Finest-scale feature packed into channels to match x2's grid.
        x0 = BatchNormalization()(x0)
        x0 = Lambda(relu6)(x0)
        x0_reshaped = Lambda(
            space_to_depth_x4,
            output_shape=space_to_depth_x4_output_shape,
            name='space_to_depth_x4')(x0)

        if three_scale_detection:
            x = concatenate([x0_reshaped, x1_reshaped, x2])
        else:
            x = concatenate([x1_reshaped, x2])
        # Final fusion block over the concatenated scales.
        x = depthwise_conv_block(x, num_final_layers, 1.0, block_id=16)
        self.feature_model = Model(feature_model.inputs, x)

    def normalize(self, image):
        """Scale raw [0, 255] pixels to [-1, 1] as MobileNet expects."""
        image = image / 255.
        image = image - 0.5
        image = image * 2.
        return image
|
|
||
|
|
||
|
|
||
|
|
| @@ -0,0 +1,79 @@ | ||
| """ | ||
| DarKNet19 Keras Implementation: | ||
| YOLO9000: Better, Faster, Stronger | ||
| https://arxiv.org/pdf/1612.08242 | ||
| """ | ||
| from keras.models import Model | ||
| from keras.layers import Input, Conv2D, MaxPool2D | ||
| from keras.layers import BatchNormalization, Activation | ||
| from keras.layers import GlobalAvgPool2D | ||
| from keras.layers.advanced_activations import LeakyReLU | ||
|
|
||
def Darknet19(image_tensor=None, num_classes=1000, include_top=False):
    """
    DarkNet-19 Architecture Definition
    Parameters
    ----------
    image_tensor: tensor
        Input tensor. Default: None
    num_classes: int
        Number of classes for classification tasks. Default: 1000
    include_top: bool
        Whether to include the last layer (only needed for classification
        tasks). Default: False
    """
    if image_tensor is None:
        image_tensor = Input(shape=(None, None, 3))

    # Backbone described as data: each stage is a list of
    # (filters, kernel_side) conv blocks followed by a 2x2 max-pool.
    pooled_stages = [
        [(32, 3)],
        [(64, 3)],
        [(128, 3), (64, 1), (128, 3)],
        [(256, 3), (128, 1), (256, 3)],
        [(512, 3), (256, 1), (512, 3), (256, 1), (512, 3)],
    ]
    x = image_tensor
    for stage in pooled_stages:
        for filters, side in stage:
            x = conv_block(x, filters, (side, side))
        x = MaxPool2D(strides=2)(x)

    # Final stage (no pooling) — feature extraction ends here.
    for filters, side in [(1024, 3), (512, 1), (1024, 3), (512, 1), (1024, 3)]:
        x = conv_block(x, filters, (side, side))

    if include_top:
        # Classification head: 1x1 conv to class logits, global average
        # pooling, then softmax.
        x = Conv2D(num_classes, (1, 1), activation='linear', padding='same')(x)
        x = GlobalAvgPool2D()(x)
        x = Activation(activation='softmax')(x)

    return Model(image_tensor, x)
|
|
||
|
|
||
def conv_block(x, filters, kernel_size, name=None):
    """
    Standard YOLOv2 convolutional block (Conv2D -> BatchNorm -> LeakyReLU)
    as suggested in the YOLO9000 paper.

    :param x: input tensor
    :param filters: number of convolution filters
    :param kernel_size: convolution kernel size, e.g. (3, 3)
    :param name: optional base name; when given, the conv layer is named
        `name` and the norm/activation layers get `batch_norm_<name>` /
        `leaky_relu_<name>`. When None, Keras auto-names all three layers.
    :return: output tensor of the block
    """
    # Fix: the old docstring listed a nonexistent `kernel_regularizer`
    # parameter and omitted `name`. Behavior is unchanged; the derived
    # names are just computed up front for readability.
    bn_name = None if name is None else 'batch_norm_%s' % name
    act_name = None if name is None else 'leaky_relu_%s' % name
    x = Conv2D(filters=filters, kernel_size=kernel_size, padding='same',
               use_bias=False, name=name)(x)
    x = BatchNormalization(name=bn_name)(x)
    x = LeakyReLU(alpha=0.1, name=act_name)(x)
    return x
| @@ -0,0 +1,198 @@ | ||
| """ | ||
| MobileNet Implementation in Keras | ||
| Author: https://github.com/fchollet/keras/blob/master/keras/applications/mobilenet.py | ||
| """ | ||
| import keras.backend as K | ||
| from keras.layers import Input, InputSpec | ||
| from keras.layers import Conv2D | ||
| from keras.layers import BatchNormalization | ||
| from keras.layers import Activation | ||
| from keras.layers import GlobalAvgPool2D, Reshape, Dropout | ||
| from keras.models import Model | ||
| from keras import initializers, regularizers, constraints | ||
| from keras.utils import conv_utils | ||
|
|
||
def keras_mobile_net(input_size=(224, 224, 3), include_top=True, n_classes=1000, alpha=1.0, depth_multiplier=1):
    """Build a MobileNet v1 model.

    input_size of None yields a fully-convolutional net accepting any
    spatial size; include_top adds the ImageNet classification head.
    """
    if input_size is None:
        img_input = Input(shape=(None, None, 3))
    else:
        img_input = Input(shape=input_size)

    x = _conv_block(img_input, 32, alpha, strides=(2, 2))

    # (pointwise filters, strides) for depthwise blocks 1..13.
    block_specs = [
        (64, (1, 1)), (128, (2, 2)), (128, (1, 1)), (256, (2, 2)),
        (256, (1, 1)), (512, (2, 2)), (512, (1, 1)), (512, (1, 1)),
        (512, (1, 1)), (512, (1, 1)), (512, (1, 1)), (1024, (2, 2)),
        (1024, (1, 1)),
    ]
    for block_id, (filters, strides) in enumerate(block_specs, start=1):
        x = depthwise_conv_block(x, filters, alpha, depth_multiplier,
                                 strides=strides, block_id=block_id)

    if include_top:
        # Classification head: pool, 1x1 conv to logits, softmax.
        x = GlobalAvgPool2D()(x)
        x = Reshape((1, 1, int(1024 * alpha)), name='reshape_1')(x)
        x = Dropout(0.0, name='dropout')(x)
        x = Conv2D(n_classes, (1, 1), padding='same', name='conv_preds')(x)
        x = Activation('softmax', name='act_softmax')(x)
        x = Reshape((n_classes,), name='reshape_2')(x)

    return Model(inputs=img_input, outputs=x)
|
|
||
|
|
||
def relu6(x):
    """ReLU activation capped at 6, as used throughout MobileNet."""
    capped = K.relu(x, max_value=6)
    return capped
|
|
||
def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1), name='conv1'):
    """Standard convolutional block: Conv2D -> BatchNorm -> ReLU6,
    with the filter count scaled by the width multiplier `alpha`."""
    scaled_filters = int(filters * alpha)
    x = Conv2D(scaled_filters, kernel, padding='same', use_bias=False,
               strides=strides, name=name)(inputs)
    x = BatchNormalization(name='%s_bn' % name)(x)
    x = Activation(relu6, name='%s_relu' % name)(x)
    return x
|
|
||
|
|
||
def depthwise_conv_block(inputs, pointwise_conv_filters, alpha, depth_multiplier=1, strides=(1, 1), block_id=1):
    """
    Depthwise-separable convolution block: a 3x3 depthwise conv followed
    by a 1x1 pointwise conv, each with batch norm and ReLU6. The
    pointwise filter count is scaled by the width multiplier `alpha`.
    """
    n_filters = int(pointwise_conv_filters * alpha)
    dw_name = 'conv_dw_%d' % block_id
    pw_name = 'conv_pw_%d' % block_id

    x = DepthwiseConv2D((3, 3), padding='same',
                        depth_multiplier=depth_multiplier, strides=strides,
                        use_bias=False, name=dw_name)(inputs)
    x = BatchNormalization(name=dw_name + '_bn')(x)
    x = Activation(relu6, name=dw_name + '_relu')(x)

    x = Conv2D(n_filters, (1, 1), padding='same', use_bias=False,
               strides=(1, 1), name=pw_name)(x)
    x = BatchNormalization(name=pw_name + '_bn')(x)
    return Activation(relu6, name=pw_name + '_relu')(x)
|
|
||
|
|
||
class DepthwiseConv2D(Conv2D):
    """
    Depthwise separable 2D convolution.

    Applies one spatial filter per input channel (times depth_multiplier)
    without mixing channels. Subclasses Conv2D but replaces the kernel
    with a depthwise kernel of shape (kh, kw, in_channels, depth_multiplier).
    Reference: https://github.com/fchollet/keras/blob/master/keras/applications/mobilenet.py
    """
    def __init__(self, kernel_size, strides=(1, 1), padding='valid', depth_multiplier=1,
                 data_format=None, activation=None, use_bias=True,
                 depthwise_initializer='glorot_uniform', bias_initializer='zeros',
                 depthwise_regularizer=None, bias_regularizer=None, activity_regularizer=None, depthwise_constraint=None,
                 bias_constraint=None, **kwargs):
        # filters=None: the parent never builds its own kernel; we build
        # the depthwise kernel ourselves in build().
        super(DepthwiseConv2D, self).__init__(
            filters=None,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
            data_format=data_format,
            activation=activation,
            use_bias=use_bias,
            bias_regularizer=bias_regularizer,
            activity_regularizer=activity_regularizer,
            bias_constraint=bias_constraint,
            **kwargs)
        self.depth_multiplier = depth_multiplier
        self.depthwise_initializer = initializers.get(depthwise_initializer)
        self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
        self.depthwise_constraint = constraints.get(depthwise_constraint)
        self.bias_initializer = initializers.get(bias_initializer)

    def build(self, input_shape):
        """Create the depthwise kernel (and bias) once the input shape is known."""
        if len(input_shape) < 4:
            raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
                             'Received input shape:', str(input_shape))
        if self.data_format == 'channels_first':
            channel_axis = 1
        else:
            channel_axis = 3
        if input_shape[channel_axis] is None:
            raise ValueError('The channel dimension of the inputs to '
                             '`DepthwiseConv2D` '
                             'should be defined. Found `None`.')
        input_dim = int(input_shape[channel_axis])
        depthwise_kernel_shape = (self.kernel_size[0],
                                  self.kernel_size[1],
                                  input_dim,
                                  self.depth_multiplier)

        self.depthwise_kernel = self.add_weight(
            shape=depthwise_kernel_shape,
            initializer=self.depthwise_initializer,
            name='depthwise_kernel',
            regularizer=self.depthwise_regularizer,
            constraint=self.depthwise_constraint)

        if self.use_bias:
            # One bias per output channel (in_channels * depth_multiplier).
            self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        # Set input spec.
        self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
        self.built = True

    def call(self, inputs, training=None):
        outputs = K.depthwise_conv2d(
            inputs,
            self.depthwise_kernel,
            strides=self.strides,
            padding=self.padding,
            dilation_rate=self.dilation_rate,
            data_format=self.data_format)

        # Bug fix: was `if self.bias:` — evaluating a backend variable's
        # truth value is ambiguous (and raises under TF graph mode).
        # Upstream Keras tests `is not None`, matching use_bias.
        if self.bias is not None:
            outputs = K.bias_add(
                outputs,
                self.bias,
                data_format=self.data_format)

        if self.activation is not None:
            return self.activation(outputs)

        return outputs

    def compute_output_shape(self, input_shape):
        """Return the output shape for a given input shape (rank-4)."""
        if self.data_format == 'channels_first':
            rows = input_shape[2]
            cols = input_shape[3]
            out_filters = input_shape[1] * self.depth_multiplier
        elif self.data_format == 'channels_last':
            rows = input_shape[1]
            cols = input_shape[2]
            out_filters = input_shape[3] * self.depth_multiplier

        rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
                                             self.padding,
                                             self.strides[0])
        cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
                                             self.padding,
                                             self.strides[1])

        if self.data_format == 'channels_first':
            return (input_shape[0], out_filters, rows, cols)
        elif self.data_format == 'channels_last':
            return (input_shape[0], rows, cols, out_filters)

    def get_config(self):
        """Serialize config, swapping the parent's kernel_* keys for depthwise_* ones."""
        config = super(DepthwiseConv2D, self).get_config()
        config.pop('filters')
        config.pop('kernel_initializer')
        config.pop('kernel_regularizer')
        config.pop('kernel_constraint')
        config['depth_multiplier'] = self.depth_multiplier
        config['depthwise_initializer'] = initializers.serialize(self.depthwise_initializer)
        config['depthwise_regularizer'] = regularizers.serialize(self.depthwise_regularizer)
        config['depthwise_constraint'] = constraints.serialize(self.depthwise_constraint)
        return config
| @@ -0,0 +1,143 @@ | ||
| """ | ||
| YOLOv2 Loss Function Implementation | ||
| Input: out feature map from network | ||
| Output: | ||
| A scalar - loss value for back propagation | ||
| ------------------ | ||
| Loss of YOLOv2 Implementation. Few Notes | ||
| * What we get from the CNN is a feature map (imagine as a 3-D box) | ||
| * Each cell in a feature map is a vector size CFG.N_ANCHORS* (5 + N_CLASSES) as: | ||
| ------------- ANCHOR 1--------------- -------- ANCHORS 2 ------------- ...... ------------ANCHOR N ----------- | ||
[tx, ty, tw, th, to, label_vector..], [tx1, ty1, tw1, th1, label_vector]......[tx_n, ty_n, tw_n, th_n, label...]
| ---------------------------------------------------------------------------------------------------------------- | ||
| One cell in a feature map | ||
| * tx, ty : predicts of relative center of bounding box to its current cell. Therefore, true center points of | ||
| a prediction would be : | ||
| xc = sigmoid(tx) + cx | ||
| yc = sigmoid(ty) + cy | ||
| * tw, th: predicts the scaling value for true width and height of the bounding box based on the anchor as: | ||
| w = exp(tw) * px | ||
| h = exp(th) * py | ||
| * to : objectiveness of the cell : the probability of having an object in the cell | ||
| * label: classification vector to calculate soft-max | ||
| """ | ||
| import re | ||
| import numpy as np | ||
| import tensorflow as tf | ||
| import keras.backend as K | ||
| from yolo_uav import get_anchors | ||
| import cfg as CFG | ||
| import pdb | ||
| import math | ||
|
|
||
def custom_loss(y_true, y_pred):
    """
    Loss Function of YOLOv2.

    The total loss is (half) the sum of three squared-error terms over the
    output feature map:
      * localization loss on (x, y, sqrt(w), sqrt(h)) -- weight 5.0 on cells
        responsible for an object,
      * objectness (confidence) loss -- weight 5.0 for object cells, 0.5 for
        background cells,
      * classification loss on the softmax class probabilities -- weight 1.0.

    :param y_true: a Tensor [batch_size, GRID_H, GRID_W, CFG.N_ANCHORS*(N_CLASSES + 5)]
    :param y_pred: a Tensor [batch_size, GRID_H, GRID_W, CFG.N_ANCHORS*(N_CLASSES + 5)]
    :return: a scalar
        loss value
    """
    # Anchor shapes (in grid units) are loaded from the configured file.
    anchors = get_anchors(CFG.ANCHORS_PATH)
    if CFG.SHALLOW_DETECTOR:
        # Shallow detector -> feature-map stride doubles, so anchors scale by 2.
        anchors = anchors * 2
    pred_shape = K.shape(y_pred)[1:3]
    gt_shape = K.shape(y_true)  # shape of ground truth tensor
    GRID_H = tf.cast(pred_shape[0], tf.int32)  # output feature-map height
    GRID_W = tf.cast(pred_shape[1], tf.int32)  # output feature-map width

    # Broadcastable (1, 1, 1, 1, 2) tensor holding (GRID_W, GRID_H) as floats,
    # used to normalize box coordinates to the [0, 1] range.
    output_size = tf.cast(tf.reshape([GRID_W, GRID_H], [1, 1, 1, 1, 2]), tf.float32)
    # Split the last axis into per-anchor vectors of length N_CLASSES + 5.
    y_pred = tf.reshape(y_pred, [-1, pred_shape[0], pred_shape[1],CFG.N_ANCHORS, CFG.N_CLASSES + 5])
    y_true = tf.reshape(y_true, [-1, gt_shape[1], gt_shape[2], CFG.N_ANCHORS, CFG.N_CLASSES + 5])

    # Grid map of cell offsets (cx, cy) used to turn cell-relative predictions
    # into absolute box centers.
    c_xy = _create_offset_map(K.shape(y_pred))

    # Decode predictions: sigmoid(tx, ty) + cell offset, exp(tw, th) * anchor.
    pred_box_xy = (tf.sigmoid(y_pred[:, :, :, :, :2]) + c_xy) / output_size
    pred_box_wh = tf.exp(y_pred[:, :, :, :, 2:4]) * np.reshape(anchors, [1, 1, 1, CFG.N_ANCHORS, 2]) / output_size
    # sqrt(w), sqrt(h) are regressed (YOLOv1 trick) so that an absolute error
    # on a small box costs more than the same error on a large box.
    pred_box_wh = tf.sqrt(pred_box_wh)
    pred_box_conf = tf.sigmoid(y_pred[:, :, :, :, 4:5])
    pred_box_prob = tf.nn.softmax(y_pred[:, :, :, :, 5:])

    # Ground truth: xy is already absolute/normalized; sqrt of wh to match.
    true_box_xy = y_true[:, :, :, :, 0:2]
    true_box_wh = tf.sqrt(y_true[:, :, :, :, 2:4])

    # IoU between predicted and true boxes. Squaring undoes the sqrt above,
    # and multiplying by output_size converts back to grid units.
    pred_tem_wh = tf.pow(pred_box_wh, 2) * output_size
    pred_box_ul = pred_box_xy - 0.5 * pred_tem_wh
    pred_box_bd = pred_box_xy + 0.5 * pred_tem_wh
    pred_box_area = pred_tem_wh[:, :, :, :, 0] * pred_tem_wh[:, :, :, :, 1]

    true_tem_wh = tf.pow(true_box_wh, 2) * output_size
    true_box_ul = true_box_xy - 0.5 * true_tem_wh
    true_box_bd = true_box_xy + 0.5 * true_tem_wh
    true_box_area = true_tem_wh[:, :, :, :, 0] * true_tem_wh[:, :, :, :, 1]

    intersect_ul = tf.maximum(pred_box_ul, true_box_ul)
    intersect_br = tf.minimum(pred_box_bd, true_box_bd)
    intersect_wh = tf.maximum(intersect_br - intersect_ul, 0.0)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]

    iou = tf.truediv(intersect_area, true_box_area + pred_box_area - intersect_area)
    # Only the anchor with the best IoU in each cell is "responsible" for
    # predicting the object in that cell.
    best_box = tf.equal(iou, tf.reduce_max(iou, [3], True))
    # NOTE(review): tf.to_float is a TF1 API removed in TF2;
    # tf.cast(..., tf.float32) is the modern equivalent.
    best_box = tf.to_float(best_box)
    true_box_conf = tf.expand_dims(best_box * y_true[:, :, :, :, 4], -1)
    true_box_prob = y_true[:, :, :, :, 5:]

    # Localization loss: weight 5.0 on responsible cells; the concat
    # broadcasts the 1-channel mask over the 4 box coordinates.
    weight_coor = 5.0 * tf.concat(4 * [true_box_conf], 4)
    true_boxes = tf.concat([true_box_xy, true_box_wh], 4)
    pred_boxes = tf.concat([pred_box_xy, pred_box_wh], 4)
    loc_loss = tf.pow(true_boxes - pred_boxes, 2) * weight_coor
    loc_loss = tf.reshape(loc_loss, [-1, tf.cast(GRID_W * GRID_H, tf.int32) * CFG.N_ANCHORS * 4])
    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, 1))

    # Objectness loss: squared error (YOLOv2 does not use cross-entropy here),
    # weighted 0.5 on background cells and 5.0 on object cells.
    weight_conf = 0.5 * (1. - true_box_conf) + 5.0 * true_box_conf
    obj_conf_loss = tf.pow(true_box_conf - pred_box_conf, 2) * weight_conf
    obj_conf_loss = tf.reshape(obj_conf_loss, [-1, tf.cast(GRID_W * GRID_H, tf.int32) * CFG.N_ANCHORS])
    obj_conf_loss = tf.reduce_mean(tf.reduce_sum(obj_conf_loss, 1))

    # Classification loss: squared error on softmax outputs, masked to the
    # responsible cells (mask broadcast over the N_CLASSES channels).
    weight_prob = 1.0 * tf.concat(CFG.N_CLASSES * [true_box_conf], 4)
    category_loss = tf.pow(true_box_prob - pred_box_prob, 2) * weight_prob
    category_loss = tf.reshape(category_loss, [-1, tf.cast(GRID_W * GRID_H, tf.int32) * CFG.N_ANCHORS * CFG.N_CLASSES])
    category_loss = tf.reduce_mean(tf.reduce_sum(category_loss, 1))

    loss = 0.5 * (loc_loss + obj_conf_loss + category_loss)
    return loss
|
|
||
|
|
||
def _create_offset_map(output_shape):
    """
    Build the (cx, cy) grid-offset map from the Yolo9000 paper: for every cell
    of the output feature map, the tensor holds that cell's column and row
    index, so cell-relative predictions can be shifted to absolute positions.
    """
    grid_h = tf.cast(output_shape[1], tf.int32)  # feature-map height
    grid_w = tf.cast(output_shape[2], tf.int32)  # feature-map width

    # meshgrid with the default 'xy' indexing yields [grid_h, grid_w] tensors
    # where col_idx[i, j] == j and row_idx[i, j] == i.
    col_idx, row_idx = tf.meshgrid(K.arange(0, stop=grid_w),
                                   K.arange(0, stop=grid_h))
    cx = K.reshape(col_idx, [-1, grid_h, grid_w, 1])
    cy = K.reshape(row_idx, [-1, grid_h, grid_w, 1])

    # Stack into [..., 2] = (cx, cy) and cast once at the end.
    c_xy = K.cast(tf.stack([cx, cy], -1), tf.float32)
    return c_xy
| @@ -0,0 +1,80 @@ | ||
| person | ||
| bicycle | ||
| car | ||
| motorbike | ||
| aeroplane | ||
| bus | ||
| train | ||
| truck | ||
| boat | ||
| traffic light | ||
| fire hydrant | ||
| stop sign | ||
| parking meter | ||
| bench | ||
| bird | ||
| cat | ||
| dog | ||
| horse | ||
| sheep | ||
| cow | ||
| elephant | ||
| bear | ||
| zebra | ||
| giraffe | ||
| backpack | ||
| umbrella | ||
| handbag | ||
| tie | ||
| suitcase | ||
| frisbee | ||
| skis | ||
| snowboard | ||
| sports ball | ||
| kite | ||
| baseball bat | ||
| baseball glove | ||
| skateboard | ||
| surfboard | ||
| tennis racket | ||
| bottle | ||
| wine glass | ||
| cup | ||
| fork | ||
| knife | ||
| spoon | ||
| bowl | ||
| banana | ||
| apple | ||
| sandwich | ||
| orange | ||
| broccoli | ||
| carrot | ||
| hot dog | ||
| pizza | ||
| donut | ||
| cake | ||
| chair | ||
| sofa | ||
| pottedplant | ||
| bed | ||
| diningtable | ||
| toilet | ||
| tvmonitor | ||
| laptop | ||
| mouse | ||
| remote | ||
| keyboard | ||
| cell phone | ||
| microwave | ||
| oven | ||
| toaster | ||
| sink | ||
| refrigerator | ||
| book | ||
| clock | ||
| vase | ||
| scissors | ||
| teddy bear | ||
| hair drier | ||
| toothbrush |
| @@ -0,0 +1,20 @@ | ||
| aeroplane | ||
| bicycle | ||
| bird | ||
| boat | ||
| bottle | ||
| bus | ||
| car | ||
| cat | ||
| chair | ||
| cow | ||
| diningtable | ||
| dog | ||
| horse | ||
| motorbike | ||
| person | ||
| pottedplant | ||
| sheep | ||
| sofa | ||
| train | ||
| tvmonitor |
| @@ -0,0 +1,5 @@ | ||
| 14.001599, 14.765134 | ||
| 1.691922, 3.070213 | ||
| 6.153262, 13.402029 | ||
| 3.111511, 7.955340 | ||
| 8.375247, 5.988119 |
| @@ -0,0 +1,2 @@ | ||
| person | ||
| vehicle |
| @@ -0,0 +1 @@ | ||
| 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 |
| @@ -0,0 +1,122 @@ | ||
| """ | ||
| Overfit one image with 1000 epochs to test the loss function properly | ||
| """ | ||
| import random | ||
| import h5py | ||
| import os | ||
| import PIL | ||
| import io | ||
| import cv2 | ||
|
|
||
| from argparse import ArgumentParser | ||
| from loss import custom_loss | ||
| from yolo_uav import * | ||
| import numpy as np | ||
| import cfg as CFG | ||
|
|
||
| import keras | ||
| import tensorflow as tf | ||
| from datagen import DataBatchGenerator | ||
| from keras.models import load_model | ||
| import keras.backend as K | ||
| import matplotlib.pyplot as plt | ||
|
|
||
# Command-line interface for the training run.
# NOTE(review): the parser is configured at module level, but parse_args()
# is only called inside _main_(), so importing this module is side-effect free.
parser = ArgumentParser(
    description="Retrain the Yolo-UAV for a dataset")

# HDF5 file holding the train/valid splits (images + boxes).
parser.add_argument('-p',
                    '--data_path',
                    help='path to HDF5 file containing dataset',
                    default='~/data/PascalVOC/VOCdevkit/pascal_voc_07_12_person_vehicle.hdf5')

# Optional weights to warm-start the detector from.
parser.add_argument('-w',
                    '--weights_path',
                    help="Path to pre-trained weight files",
                    type=str, default=None)

parser.add_argument('-e',
                    '--num_epochs',
                    help='Number of epochs for training',
                    type=int, default=100)

parser.add_argument('-b',
                    '--batch_size',
                    help='Number of batch size',
                    type=int, default=CFG.BATCH_SIZE)
|
|
||
|
|
||
def _main_():
    """Build the YOLO detector and train it on the HDF5 dataset given on the CLI.

    Training is split into short loops of ``num_loop_epochs`` epochs; after
    every loop the current weights are saved and recall/precision is computed
    on the validation split.
    """
    args = parser.parse_args()
    data_path = args.data_path
    weights_path = args.weights_path
    batch_size = args.batch_size
    num_epochs = args.num_epochs

    # ###################
    # PREPARE DATA INPUT
    # ###################
    anchors = get_anchors(CFG.ANCHORS_PATH)
    classes = get_classes(CFG.CLASSES_PATH)
    data_path = os.path.expanduser(data_path)

    if CFG.SHALLOW_DETECTOR:
        # The shallow detector's feature map has twice the stride, so the
        # anchor sizes (in grid units) must be doubled.
        anchors = anchors * 2
    # Explicit checks instead of asserts: asserts vanish under `python -O`.
    if CFG.N_ANCHORS != len(anchors):
        raise ValueError('CFG.N_ANCHORS does not match the anchors file')
    if CFG.N_CLASSES != len(classes):
        raise ValueError('CFG.N_CLASSES does not match the classes file')
    if not os.path.exists(data_path):
        raise FileNotFoundError(data_path)
    hdf5_data = h5py.File(data_path, 'r')
    num_training = hdf5_data['train/images'].shape[0]

    print("==========================")
    print('\t anchors:', anchors)
    print('\t classes:', classes)
    print('\t train_path:', data_path)
    print('\t num_training:', num_training)
    print("==========================")

    yolo_detector = YOLODetector(feature_extractor_name=CFG.FEATURE_EXTRACTOR)
    detect_model = yolo_detector.model
    detect_model.summary()
    # #################
    # COMPILE AND RUN
    # #################
    detect_model.compile(optimizer='adam', loss=custom_loss)
    if weights_path is not None:
        # BUGFIX: --weights_path was parsed but never used; warm-start here.
        detect_model.load_weights(weights_path)

    train_batch_gen = DataBatchGenerator(hdf5_data, train='train', jitter=True)
    valid_batch_gen = DataBatchGenerator(hdf5_data, train='valid')

    logging = TensorBoard()
    early_stopping = EarlyStopping(
        monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')
    train_steps_per_epoch = train_batch_gen.training_instances // batch_size
    valid_steps_per_epoch = valid_batch_gen.training_instances // batch_size
    print('train_steps_per_epoch=', train_steps_per_epoch)
    print('valid_steps_per_epoch=', valid_steps_per_epoch)

    num_loop_epochs = 5
    loop = num_epochs // num_loop_epochs
    for i in range(loop):
        weight_name = 'weights/' + 'best_{}{}{}_loop_{}.h5'.format(
            CFG.FEATURE_EXTRACTOR, int(CFG.SHALLOW_DETECTOR), int(CFG.USE_THREE_SCALE_FEATURE), i)

        checkpoint = ModelCheckpoint(
            weight_name, monitor='val_loss', save_weights_only=True, save_best_only=True)
        # BUGFIX: early_stopping was constructed above but never passed to
        # fit_generator, so it had no effect.
        detect_model.fit_generator(generator=train_batch_gen.flow_from_hdf5(),
                                   validation_data=valid_batch_gen.flow_from_hdf5(),
                                   steps_per_epoch=train_steps_per_epoch,
                                   validation_steps=valid_steps_per_epoch,
                                   callbacks=[checkpoint, early_stopping, logging],
                                   epochs=num_loop_epochs,
                                   workers=1,
                                   verbose=1)
        # BUGFIX: wrap the booleans in int(...) here too, so these filenames
        # match the "best_*" checkpoint names above (1/0 vs True/False).
        weight_name = 'weights/' + '{}{}{}_loop_{}.h5'.format(
            CFG.FEATURE_EXTRACTOR, int(CFG.SHALLOW_DETECTOR), int(CFG.USE_THREE_SCALE_FEATURE), i)
        detect_model.save_weights(weight_name)
        compute_recall_precision(
            hdf5_data, yolo_detector, weight_name, train='valid', num_samples=1024)
|
||
|
|
||
| if __name__ == "__main__": | ||
| _main_() |
| @@ -0,0 +1,154 @@ | ||
| """ | ||
| Overfit one image with 1000 epochs to test the loss function properly | ||
| """ | ||
| import random | ||
| import h5py | ||
| import os | ||
| import PIL | ||
| import io | ||
| import cv2 | ||
|
|
||
| from argparse import ArgumentParser | ||
| from loss import custom_loss | ||
| from yolo_uav import * | ||
| import numpy as np | ||
| import cfg as CFG | ||
|
|
||
| import keras | ||
| import tensorflow as tf | ||
| from keras.models import load_model | ||
| import keras.backend as K | ||
| import matplotlib.pyplot as plt | ||
|
|
||
# Command-line interface for the overfit sanity check.
# NOTE(review): arguments are parsed at import time (module level), so merely
# importing this module requires valid CLI args -- confirm this is intended.
parser = ArgumentParser(description="Over-fit one sample to validate YOLOv2 Loss Function")

parser.add_argument('-p', '--path', help="Path to training text file ",
                    type=str, default=None)

# Optional weights to warm-start the detector from.
parser.add_argument('-w', '--weights', help="Path to pre-trained weight files",
                    type=str, default=None)

parser.add_argument('-e', '--epochs', help='Number of epochs for training',
                    type=int, default=1000)

parser.add_argument('-b', '--batch', help='Number of batch size',
                    type=int, default=1)

# HDF5 file holding the train split (images + boxes).
parser.add_argument(
    '-d',
    '--data_path',
    help='path to HDF5 file containing pascal voc dataset',
    default='~/data/PascalVOC/VOCdevkit/pascal_voc_07_12_person_vehicle.hdf5')

args = parser.parse_args()
annotation_path = args.path  # currently unused below
WEIGHTS_FILE = args.weights
BATCH_SIZE = args.batch
EPOCHS = args.epochs
|
|
||
def _main_():
    """Overfit a small random sample of the dataset to sanity-check the loss.

    Loads ``test_size`` random images from the HDF5 train split, builds the
    matching ground-truth feature map, trains the detector on just those
    images, then decodes/draws the predictions and reports recall/precision.
    """
    # ###################
    # PREPARE DATA INPUT
    # ###################
    anchors = get_anchors(CFG.ANCHORS_PATH)
    classes = get_classes(CFG.CLASSES_PATH)

    if CFG.SHALLOW_DETECTOR:
        # Shallow detector -> feature-map stride doubles, so anchors scale by 2.
        anchors = anchors * 2
    print(anchors)
    test_size = 128
    voc_path = os.path.expanduser(args.data_path)
    voc = h5py.File(voc_path, 'r')
    total_test_instances = voc['train/images'].shape[0]

    # Sample without replacement so every picked image is unique.
    test_list = np.random.choice(total_test_instances, test_size, replace=False)

    x_batch = np.zeros((test_size, CFG.IMAGE_HEIGHT, CFG.IMAGE_WIDTH, 3))
    y_batch = np.zeros((test_size, CFG.FEAT_H, CFG.FEAT_W, CFG.N_ANCHORS, 5 + CFG.N_CLASSES))
    b_batch = []  # ground-truth boxes, kept for the recall/precision report

    cur_id = 0
    for test_id in sorted(test_list):
        # Boxes are stored as a flat list of (class, x_min, y_min, x_max, y_max).
        image = PIL.Image.open(io.BytesIO(voc['train/images'][test_id]))
        orig_size = np.array([image.width, image.height])
        orig_size = np.expand_dims(orig_size, axis=0)

        image = image.resize((CFG.IMAGE_WIDTH, CFG.IMAGE_HEIGHT), PIL.Image.BICUBIC)
        # BUGFIX: `np.float` was removed in NumPy 1.24 -- use the builtin float
        # (same meaning: float64).
        image_data = np.array(image, dtype=float)
        image_data /= 255.
        x_batch[cur_id] = image_data

        boxes = voc['train/boxes'][test_id]
        boxes = boxes.reshape((-1, 5))

        # Convert to (x_center, y_center, width, height, class), normalized
        # by the original image size.
        boxes_xy = 0.5 * (boxes[:, 3:5] + boxes[:, 1:3])
        boxes_wh = boxes[:, 3:5] - boxes[:, 1:3]
        boxes_xy = boxes_xy / orig_size
        boxes_wh = boxes_wh / orig_size
        boxes = np.concatenate((boxes_xy, boxes_wh, boxes[:, 0:1]), axis=1)

        for box in boxes:
            label = int(box[-1])
            one_hot = np.eye(CFG.N_CLASSES)[label]
            xc, yc, w, h = box[0:4]
            b_batch.append(BoundBox(xc, yc, w, h, c=1.0, classes=one_hot))

            # One cell of the ground-truth feature map: box, objectness, one-hot.
            object_mask = np.concatenate([[xc, yc, w, h], [1.0], one_hot])

            center_x = xc * CFG.FEAT_W
            center_y = yc * CFG.FEAT_H
            col = int(np.floor(center_x))  # grid column (x axis)
            row = int(np.floor(center_y))  # grid row (y axis)
            fw = w * CFG.FEAT_W
            fh = h * CFG.FEAT_H

            # Find the anchor with the highest IoU against this box
            # (both moved to the origin, so only width/height matter).
            best_anchor = -1
            max_iou = -1
            shifted_box = BoundBox(0, 0, fw, fh)
            for i in range(len(anchors)):
                anchor_bb = BoundBox(0, 0, anchors[i][0], anchors[i][1])
                iou = bbox_iou(shifted_box, anchor_bb)
                if max_iou < iou:
                    best_anchor = i
                    max_iou = iou
            print(row, col, best_anchor, max_iou, object_mask)
            if col < CFG.FEAT_W and row < CFG.FEAT_H:
                # Write the ground truth into the responsible cell/anchor slot.
                y_batch[cur_id, row, col, best_anchor, :] = object_mask

        cur_id += 1

    # Flatten the per-anchor axis so the tensor matches the network output.
    y_batch = y_batch.reshape([test_size, CFG.FEAT_H, CFG.FEAT_W, CFG.N_ANCHORS * (5 + CFG.N_CLASSES)])

    yolo_detector = YOLODetector(feature_extractor_name=CFG.FEATURE_EXTRACTOR)
    # #################
    # COMPILE AND RUN
    # #################
    yolo_detector.model.compile(optimizer='adam', loss=custom_loss)
    if WEIGHTS_FILE is not None:
        # BUGFIX: the --weights CLI option was parsed but never used.
        yolo_detector.model.load_weights(WEIGHTS_FILE)

    # BUGFIX: honor the --epochs / --batch CLI options instead of the
    # hard-coded 200 steps and CFG.BATCH_SIZE.
    yolo_detector.model.fit(x_batch, y_batch, batch_size=BATCH_SIZE, epochs=EPOCHS)
    yolo_detector.model.save_weights('overfit.weights')

    netout = yolo_detector.model.predict(x_batch, batch_size=BATCH_SIZE)
    netouts = netout.reshape(-1, CFG.FEAT_H, CFG.FEAT_W, CFG.N_ANCHORS, (5 + CFG.N_CLASSES))
    boxes_pred = []
    out_dir = '/tmp/output'
    # BUGFIX: cv2.imwrite fails silently when the directory does not exist.
    os.makedirs(out_dir, exist_ok=True)
    for i in range(len(netouts)):
        image_data = x_batch[i]
        boxes = yolo_detector.decode_netout(netouts[i])
        boxes_pred += boxes
        img = draw_boxes(image_data, boxes, classes)
        img = np.array(img * 255, dtype=np.uint8)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(out_dir, 'img_' + str(i) + '.jpg'), img)
    get_recall_precision(boxes_pred, b_batch)
| if __name__ == "__main__": | ||
| _main_() |
| @@ -0,0 +1 @@ | ||
| /home/jzhang/data/weights |