In [1]:
%matplotlib inline

In [6]:
import copy
import cv2
import itertools
import json
import keras
import keras.applications.resnet50
import keras.backend
import keras.callbacks
import keras.engine
import keras.engine.topology
import keras.layers
import keras.layers.core
import keras.models
import keras.objectives
import keras.optimizers
import numpy
import random
import sys
import threading

In [7]:
def get_data(input_path):
    '''Parse a space-separated annotation file into per-image records.

    Each non-blank line must hold exactly six space-separated fields:
    `filename x1 y1 x2 y2 class_name`. Every distinct image is opened once
    with OpenCV to record its width and height, and is randomly assigned to
    the 'trainval' set (5 out of 6 draws) or the 'test' set.

    # Arguments
        input_path: path of the annotation text file.
    # Returns
        Tuple `(all_data, classes_count, class_mapping)`:
        all_data: list with one dict per image, with keys 'filepath',
            'width', 'height', 'bboxes' (list of dicts with keys 'class',
            'x1', 'x2', 'y1', 'y2') and 'imageset'.
        classes_count: dict mapping class name to its number of boxes.
        class_mapping: dict mapping class name to an integer index; when a
            class named 'bg' is present it is given the highest index.
    # Raises
        IOError: if an image listed in the file cannot be read.
        ValueError: if a non-blank line does not have exactly six fields.
    '''
    found_bg = False
    all_imgs = {}
    classes_count = {}
    class_mapping = {}

    with open(input_path, 'r') as f:

        print('Parsing annotation files')

        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. trailing newlines at end of file)
                continue

            (filename, x1, y1, x2, y2, class_name) = line.split(' ')

            classes_count[class_name] = classes_count.get(class_name, 0) + 1

            if class_name not in class_mapping:
                if class_name == 'bg' and not found_bg:
                    print(
                    'Found class name with special name bg. Will be treated as a background region (this is usually for hard negative mining).')
                    found_bg = True
                class_mapping[class_name] = len(class_mapping)

            if filename not in all_imgs:
                img = cv2.imread(filename)
                if img is None:
                    # cv2.imread reports failure by returning None
                    raise IOError(
                        'Could not read image: {}'.format(filename))
                (rows, cols) = img.shape[:2]

                # randint(0, 6) draws from 0..5, so roughly 1 image in 6
                # lands in the test set
                if numpy.random.randint(0, 6) > 0:
                    imageset = 'trainval'
                else:
                    imageset = 'test'

                all_imgs[filename] = {'filepath': filename,
                                      'width': cols,
                                      'height': rows,
                                      'bboxes': [],
                                      'imageset': imageset}

            all_imgs[filename]['bboxes'].append(
                {'class': class_name, 'x1': int(x1), 'x2': int(x2),
                 'y1': int(y1), 'y2': int(y2)})

    all_data = [all_imgs[key] for key in all_imgs]

    # make sure the bg class is last in the list
    if found_bg:
        last_index = len(class_mapping) - 1
        if class_mapping['bg'] != last_index:
            key_to_switch = [key for key in class_mapping.keys() if
                             class_mapping[key] == last_index][0]
            class_mapping[key_to_switch] = class_mapping['bg']
            class_mapping['bg'] = last_index

    return all_data, classes_count, class_mapping

In [8]:
class ROI(keras.engine.topology.Layer):
    '''ROI pooling layer for 2D inputs.
    See Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition,
    K. He, X. Zhang, S. Ren, J. Sun
    # Arguments
        pool_size: int
            Size of pooling region to use. pool_size = 7 will result in a 7x7 region.
        num_rois: number of regions of interest to be used
    # Input shape
        list of two tensors [X_img,X_roi] with shape:
        X_img:
        `(1, channels, rows, cols)` if dim_ordering='th'
        or 4D tensor with shape:
        `(1, rows, cols, channels)` if dim_ordering='tf'.
        X_roi:
        `(1,num_rois,4)` list of rois, with ordering (x,y,w,h)
    # Output shape
        5D tensor with shape:
        `(1, num_rois, channels, pool_size, pool_size)` if dim_ordering='th'
        or `(1, num_rois, pool_size, pool_size, channels)` if dim_ordering='tf'.
    '''

    def __init__(self, pool_size, num_rois, **kwargs):

        self.dim_ordering = keras.backend.image_dim_ordering()
        assert self.dim_ordering in {'tf',
                                     'th'}, 'dim_ordering must be in {tf, th}'

        self.pool_size = pool_size
        self.num_rois = num_rois

        super(ROI, self).__init__(**kwargs)

    def build(self, input_shape):
        # remember the channel count of the image tensor (first input)
        if self.dim_ordering == 'th':
            self.nb_channels = input_shape[0][1]
        elif self.dim_ordering == 'tf':
            self.nb_channels = input_shape[0][3]

    def compute_output_shape(self, input_shape):
        if self.dim_ordering == 'th':
            return None, self.num_rois, self.nb_channels, self.pool_size, self.pool_size
        else:
            return None, self.num_rois, self.pool_size, self.pool_size, self.nb_channels

    def get_config(self):
        '''Return the constructor arguments so the layer can be re-created
        from its config (model serialization).'''
        config = {'pool_size': self.pool_size, 'num_rois': self.num_rois}
        base_config = super(ROI, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @staticmethod
    def _cell_bounds(origin, cell_length, index):
        '''Return the int32 (start, stop) bounds of pooling cell `index`
        along one axis; the cell is forced to span at least one element.'''
        start = origin + index * cell_length
        stop = start + cell_length

        start = keras.backend.cast(start, 'int32')
        stop = keras.backend.cast(stop, 'int32')

        stop = start + keras.backend.maximum(1, stop - start)
        return start, stop

    def call(self, x, mask=None):
        '''Max-pool every ROI of `x[1]` over a pool_size x pool_size grid
        cut out of the image tensor `x[0]`.'''
        assert (len(x) == 2)

        img = x[0]
        rois = x[1]

        input_shape = keras.backend.shape(img)

        outputs = []

        for roi_idx in range(self.num_rois):

            roi_x = rois[0, roi_idx, 0]
            roi_y = rois[0, roi_idx, 1]
            roi_w = rois[0, roi_idx, 2]
            roi_h = rois[0, roi_idx, 3]

            # size of one pooling cell along each axis
            cell_w = roi_w / float(self.pool_size)
            cell_h = roi_h / float(self.pool_size)

            for jy in range(self.pool_size):
                y1, y2 = self._cell_bounds(roi_y, cell_h, jy)

                for ix in range(self.pool_size):
                    x1, x2 = self._cell_bounds(roi_x, cell_w, ix)

                    if self.dim_ordering == 'th':
                        new_shape = [input_shape[0], input_shape[1],
                                     y2 - y1, x2 - x1]
                        x_crop = img[:, :, y1:y2, x1:x2]
                        spatial_axes = (2, 3)
                    else:
                        new_shape = [input_shape[0], y2 - y1,
                                     x2 - x1, input_shape[3]]
                        x_crop = img[:, y1:y2, x1:x2, :]
                        spatial_axes = (1, 2)

                    xm = keras.backend.reshape(x_crop, new_shape)
                    pooled_val = keras.backend.max(xm, axis=spatial_axes)
                    outputs.append(pooled_val)

        # one (batch, channels) entry per cell, stacked ROI-major, then
        # row-major within each ROI
        final_output = keras.backend.concatenate(outputs, axis=0)
        final_output = keras.backend.reshape(final_output, (
            1, self.num_rois, self.pool_size, self.pool_size,
            self.nb_channels))

        if self.dim_ordering == 'th':
            # move channels in front of the two pooled spatial axes
            final_output = keras.backend.permute_dimensions(final_output,
                                                            (0, 1, 4, 2, 3))

        return final_output

In [9]:
def identity_block_td(input_tensor, kernel_size, filters, stage, block, trainable=True):
    '''The identity_block is the block that has no conv layer at shortcut.
    Every conv / batch-norm layer is wrapped in TimeDistributed.
    # Arguments
            input_tensor: input tensor
            kernel_size: default 3, the kernel size of middle conv layer at main path
            filters: list of integers, the nb_filters of 3 conv layer at main path
            stage: integer, current stage label, used for generating layer names
            block: 'a','b'..., current block label, used for generating layer names
            trainable: passed as `trainable` to the three convolution layers
    # Returns
            Output tensor of the block (main path added to `input_tensor`,
            followed by a ReLU).
    '''
    nb_filter1, nb_filter2, nb_filter3 = filters
    if keras.backend.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1

    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    x = keras.layers.TimeDistributed(
        keras.layers.Convolution2D(nb_filter1, (1, 1), trainable=trainable,
                                   kernel_initializer='normal'),
        name=conv_name_base + '2a')(input_tensor)
    x = keras.layers.TimeDistributed(
        keras.layers.BatchNormalization(axis=bn_axis),
        name=bn_name_base + '2a')(x)
    x = keras.layers.Activation('relu')(x)

    x = keras.layers.TimeDistributed(
        keras.layers.Convolution2D(nb_filter2, (kernel_size, kernel_size),
                                   trainable=trainable,
                                   kernel_initializer='normal',
                                   padding='same'),
        name=conv_name_base + '2b')(x)
    x = keras.layers.TimeDistributed(
        keras.layers.BatchNormalization(axis=bn_axis),
        name=bn_name_base + '2b')(x)
    x = keras.layers.Activation('relu')(x)

    x = keras.layers.TimeDistributed(
        keras.layers.Convolution2D(nb_filter3, (1, 1), trainable=trainable,
                                   kernel_initializer='normal'),
        name=conv_name_base + '2c')(x)
    x = keras.layers.TimeDistributed(
        keras.layers.BatchNormalization(axis=bn_axis),
        name=bn_name_base + '2c')(x)

    # residual connection; `add` replaces the deprecated legacy
    # `merge(..., mode='sum')` helper
    x = keras.layers.add([x, input_tensor])
    x = keras.layers.Activation('relu')(x)

    return x


def conv_block_td(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), trainable=True):
    '''conv_block is the block that has a conv layer at shortcut.
    Every conv / batch-norm layer is wrapped in TimeDistributed.
    # Arguments
            input_tensor: input tensor
            kernel_size: default 3, the kernel size of middle conv layer at main path
            filters: list of integers, the nb_filters of 3 conv layer at main path
            stage: integer, current stage label, used for generating layer names
            block: 'a','b'..., current block label, used for generating layer names
            strides: strides of the first main-path conv and of the shortcut conv
            trainable: passed as `trainable` to every convolution layer
    # Returns
            Output tensor of the block (main path added to the projected
            shortcut, followed by a ReLU).
    Note that from stage 3, the first conv layer at main path is with strides=(2,2)
    And the shortcut should have strides=(2,2) as well
    '''
    nb_filter1, nb_filter2, nb_filter3 = filters
    if keras.backend.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1

    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    x = keras.layers.TimeDistributed(keras.layers.Convolution2D(nb_filter1, (1, 1), strides=strides, trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2a')(input_tensor)
    x = keras.layers.TimeDistributed(keras.layers.BatchNormalization(axis=bn_axis), name=bn_name_base + '2a')(x)
    x = keras.layers.Activation('relu')(x)
    x = keras.layers.TimeDistributed(keras.layers.Convolution2D(nb_filter2, (kernel_size, kernel_size), padding='same', trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2b')(x)
    x = keras.layers.TimeDistributed(keras.layers.BatchNormalization(axis=bn_axis), name=bn_name_base + '2b')(x)
    x = keras.layers.Activation('relu')(x)
    # `trainable` is set on the convolution itself, like every other conv
    # in this block, rather than on the TimeDistributed wrapper
    x = keras.layers.TimeDistributed(keras.layers.Convolution2D(nb_filter3, (1, 1), trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2c')(x)
    x = keras.layers.TimeDistributed(keras.layers.BatchNormalization(axis=bn_axis), name=bn_name_base + '2c')(x)

    # projection shortcut so its shape matches the main path
    shortcut = keras.layers.TimeDistributed(keras.layers.Convolution2D(nb_filter3, (1, 1), strides=strides, trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '1')(input_tensor)
    shortcut = keras.layers.TimeDistributed(keras.layers.BatchNormalization(axis=bn_axis), name=bn_name_base + '1')(shortcut)

    # `add` replaces the deprecated legacy `merge(..., mode='sum')` helper
    x = keras.layers.add([x, shortcut])
    x = keras.layers.Activation('relu')(x)
    return x


def nn_base(input_tensor=None, trainable=False):
    '''Build the shared convolutional trunk (ResNet-50 stem plus stages 2-4).

    # Arguments
        input_tensor: optional tensor to build on; a new Input is created
            when it is None, and it is wrapped in an Input when it is not
            already a Keras tensor.
        trainable: forwarded only to the first convolution ('conv1').
    # Returns
        Output tensor of the last stage-4 identity block.
    '''
    ordering = keras.backend.image_dim_ordering()
    input_shape = (3, None, None) if ordering == 'th' else (None, None, 3)
    bn_axis = 3 if ordering == 'tf' else 1

    if input_tensor is None:
        img_input = keras.layers.Input(shape=input_shape)
    elif keras.backend.is_keras_tensor(input_tensor):
        img_input = input_tensor
    else:
        img_input = keras.layers.Input(tensor=input_tensor, shape=input_shape)

    # stem: pad -> 7x7/2 conv -> batch norm -> relu -> 3x3/2 max pool
    x = keras.layers.ZeroPadding2D((3, 3))(img_input)
    x = keras.layers.Convolution2D(64, (7, 7), strides=(2, 2), name='conv1', trainable=trainable)(x)
    x = keras.layers.BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
    x = keras.layers.Activation('relu')(x)
    x = keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(x)

    resnet = keras.applications.resnet50

    # (stage, filters, labels of the identity blocks, extra conv_block kwargs)
    stage_specs = (
        (2, (64, 64, 256), 'bc', {'strides': (1, 1)}),
        (3, (128, 128, 512), 'bcd', {}),
        (4, (256, 256, 1024), 'bcdef', {}),
    )

    for stage, filters, identity_labels, conv_kwargs in stage_specs:
        x = resnet.conv_block(x, 3, list(filters), stage=stage, block='a', **conv_kwargs)
        for label in identity_labels:
            x = resnet.identity_block(x, 3, list(filters), stage=stage, block=label)

    return x


def classifier_layers(x, trainable=False):
    '''Apply the time-distributed stage-5 residual blocks and 7x7 average pooling.

    # Arguments
        x: input tensor, passed to `conv_block_td`.
        trainable: forwarded to every stage-5 block.
    # Returns
        Output tensor of the 'avg_pool' layer.
    '''
    out = conv_block_td(x, 3, [512, 512, 2048], stage=5, block='a', strides=(1, 1), trainable=trainable)

    for label in ('b', 'c'):
        out = identity_block_td(out, 3, [512, 512, 2048], stage=5, block=label, trainable=trainable)

    pooling = keras.layers.AveragePooling2D((7, 7))
    return keras.layers.TimeDistributed(pooling, name='avg_pool')(out)


def rpn(base_layers, num_anchors):
    '''Attach the region proposal network heads to `base_layers`.

    # Arguments
        base_layers: feature-map tensor the RPN is built on.
        num_anchors: number of anchors per location; the class head has
            `num_anchors` filters and the regression head `4 * num_anchors`.
    # Returns
        List `[x_class, x_regr]` with the sigmoid class output and the
        linear regression output.
    '''
    # shared 3x3 convolution feeding both heads
    shared_conv = keras.layers.Convolution2D(
        512, (3, 3), padding='same', activation='relu',
        kernel_initializer='normal', name='rpn_conv1')
    shared = shared_conv(base_layers)

    class_head = keras.layers.Convolution2D(
        num_anchors, (1, 1), activation='sigmoid',
        kernel_initializer='uniform', name='rpn_out_class')
    scores = class_head(shared)

    regr_head = keras.layers.Convolution2D(
        num_anchors * 4, (1, 1), activation='linear',
        kernel_initializer='normal', name='rpn_out_regress')
    deltas = regr_head(shared)

    return [scores, deltas]


def classifier(base_layers, input_rois, num_rois, nb_classes=21):
    '''Build the per-ROI classification and box-regression heads.

    # Arguments
        base_layers: feature-map tensor the ROIs are pooled from.
        input_rois: tensor of ROIs fed to the ROI pooling layer.
        num_rois: number of ROIs pooled per call.
        nb_classes: number of output classes (default 21).
    # Returns
        List `[out_class, out_regr]`: softmax scores over `nb_classes` and
        `4 * (nb_classes - 1)` linear regression outputs per ROI.
    '''
    # 7x7 ROI pooling
    pooled = ROI(7, num_rois)([base_layers, input_rois])

    features = classifier_layers(pooled, trainable=True)
    features = keras.layers.TimeDistributed(keras.layers.Flatten(), name='td_flatten')(features)

    class_dense = keras.layers.Dense(nb_classes, activation='softmax', kernel_initializer='zero')
    out_class = keras.layers.TimeDistributed(class_dense, name='dense_class_{}'.format(nb_classes))(features)

    # note: no regression target for bg class
    regr_dense = keras.layers.Dense(4 * (nb_classes - 1), activation='linear', kernel_initializer='zero')
    out_regr = keras.layers.TimeDistributed(regr_dense, name='dense_regress_{}'.format(nb_classes))(features)

    return [out_class, out_regr]


In [10]:
# TensorFlow is imported (as `tf`) only when Keras reports 'tf' dimension
# ordering; with 'th' ordering the name `tf` is never defined.
if keras.backend.image_dim_ordering() == 'tf':
    import tensorflow as tf

# Scalar weights multiplied into the RPN regression / classification losses.
lambda_rpn_regr = 1.0
lambda_rpn_class = 1.0

# Scalar weights multiplied into the detector regression / classification losses.
lambda_cls_regr = 1.0
lambda_cls_class = 1.0

# Small constant added inside the loss denominators so they are never zero.
epsilon = 1e-4


def rpn_loss_regr(num_anchors):
    '''Build the RPN box-regression loss (smooth L1) for `num_anchors` anchors.

    # Arguments
        num_anchors: number of anchors per feature-map location.
    # Returns
        A Keras loss function `f(y_true, y_pred)`. Along the channel axis
        (axis 1 for 'th' ordering, axis 3 otherwise) `y_true` holds
        `4 * num_anchors` per-coordinate weights followed by the
        `4 * num_anchors` regression targets.
    '''
    def rpn_loss_regr_fixed_num(y_true, y_pred):
        split = 4 * num_anchors

        if keras.backend.image_dim_ordering() == 'th':
            weights = y_true[:, :split, :, :]
            targets = y_true[:, split:, :, :]
        else:
            weights = y_true[:, :, :, :split]
            targets = y_true[:, :, :, split:]

        x = targets - y_pred
        x_abs = keras.backend.abs(x)
        # cast the comparison to float in BOTH orderings so the arithmetic
        # below is backend-independent and does not need the global `tf`
        x_bool = keras.backend.cast(keras.backend.less_equal(x_abs, 1.0),
                                    'float32')

        # smooth L1: quadratic where |x| <= 1, linear elsewhere
        smooth_l1 = x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5)

        return lambda_rpn_regr * keras.backend.sum(
            weights * smooth_l1) / keras.backend.sum(epsilon + weights)

    return rpn_loss_regr_fixed_num


def rpn_loss_cls(num_anchors):
    '''Build the RPN classification loss (masked binary cross-entropy).

    # Arguments
        num_anchors: number of anchors per feature-map location.
    # Returns
        A Keras loss function `f(y_true, y_pred)`. Along the channel axis
        (axis 3 for 'tf' ordering, axis 1 otherwise) `y_true` holds
        `num_anchors` validity flags followed by the `num_anchors` target
        labels; only anchors with a non-zero flag contribute.
    '''
    def rpn_loss_cls_fixed_num(y_true, y_pred):
        if keras.backend.image_dim_ordering() == 'tf':
            valid = y_true[:, :, :, :num_anchors]
            targets = y_true[:, :, :, num_anchors:]
        else:
            valid = y_true[:, :num_anchors, :, :]
            targets = y_true[:, num_anchors:, :, :]

        # keyword arguments: the positional order of (target, output)
        # differs between Keras releases
        cross_entropy = keras.backend.binary_crossentropy(output=y_pred,
                                                          target=targets)

        return lambda_rpn_class * keras.backend.sum(
            valid * cross_entropy) / keras.backend.sum(epsilon + valid)

    return rpn_loss_cls_fixed_num


def class_loss_regr(num_rois, num_classes):
    '''Build the detector box-regression loss (smooth L1).

    # Arguments
        num_rois: not used by the loss itself.
        num_classes: number of classes with regression targets; the last
            axis of `y_true` holds `4 * num_classes` weights followed by
            the regression targets.
    # Returns
        A Keras loss function `f(y_true, y_pred)`.
    '''
    def class_loss_regr_fixed_num(y_true, y_pred):
        boundary = 4 * num_classes
        weights = y_true[:, :, :boundary]

        residual = y_true[:, :, boundary:] - y_pred
        magnitude = keras.backend.abs(residual)
        is_small = keras.backend.cast(keras.backend.less_equal(magnitude, 1.0), 'float32')

        # quadratic where |residual| <= 1, linear elsewhere
        penalty = is_small * (0.5 * residual * residual) + (1 - is_small) * (magnitude - 0.5)

        weighted_total = keras.backend.sum(weights * penalty)
        normaliser = keras.backend.sum(epsilon + weights)
        return lambda_cls_regr * weighted_total / normaliser

    return class_loss_regr_fixed_num


def class_loss_cls(y_true, y_pred):
    '''Detector classification loss: weighted categorical cross-entropy
    computed on the first element (index 0) of the batch axis only.'''
    first_true = y_true[0, :, :]
    first_pred = y_pred[0, :, :]
    cross_entropy = keras.objectives.categorical_crossentropy(first_true, first_pred)
    return lambda_cls_class * cross_entropy




In [11]:
def augment(img_data):
    '''Load the image of an annotation record with its channels reversed.

    No augmentation is applied: the record is deep-copied unchanged and the
    image is read from disk.

    # Arguments
        img_data: dict with at least a 'filepath' entry.
    # Returns
        Tuple `(img_data_aug, img)`: a deep copy of `img_data` and the image
        array with its last axis reordered (2, 1, 0), i.e. OpenCV's BGR
        channels turned into RGB.
    # Raises
        IOError: if the image cannot be read.
    '''
    img_data_aug = copy.deepcopy(img_data)

    img = cv2.imread(img_data_aug['filepath'])
    if img is None:
        # cv2.imread reports failure by returning None
        raise IOError(
            'Could not read image: {}'.format(img_data_aug['filepath']))

    img = img[:, :, (2, 1, 0)]

    return img_data_aug, img


def get_img_output_length(width, height):
    '''Return the (width, height) of the feature map produced for an input
    of the given size.

    Each dimension is padded by 6 and then passed through four stride-2
    reductions with filter sizes 7, 3, 1 and 1.
    '''
    kernels = (7, 3, 1, 1)
    step = 2

    def _shrink(length):
        # zero padding
        size = length + 6
        # four strided convolutions
        for kernel in kernels:
            size = (size - kernel + step) // step
        return size

    return _shrink(width), _shrink(height)


def union(au, bu):
    '''Return the smallest box enclosing both inputs as (x, y, w, h).

    `au` and `bu` are indexed as (x1, y1, x2, y2).
    '''
    left, top = min(au[0], bu[0]), min(au[1], bu[1])
    right, bottom = max(au[2], bu[2]), max(au[3], bu[3])
    return left, top, right - left, bottom - top


def intersection(ai, bi):
    '''Return the overlap of two (x1, y1, x2, y2) boxes as (x, y, w, h).

    When the boxes do not overlap (negative width or height) the result is
    (0, 0, 0, 0).
    '''
    left = max(ai[0], bi[0])
    top = max(ai[1], bi[1])
    width = min(ai[2], bi[2]) - left
    height = min(ai[3], bi[3]) - top

    disjoint = width < 0 or height < 0
    return (0, 0, 0, 0) if disjoint else (left, top, width, height)


def iou(a, b):
    '''Return the intersection-over-union of two boxes.

    # Arguments
        a, b: boxes given as (x1, y1, x2, y2) with x1 < x2 and y1 < y2.
    # Returns
        float in [0, 1]: overlap area divided by the area of the union
        (area_a + area_b - overlap), 0.0 when the boxes do not overlap.
    '''
    # a and b should be (x1,y1,x2,y2)
    assert a[0] < a[2]
    assert a[1] < a[3]
    assert b[0] < b[2]
    assert b[1] < b[3]

    overlap_w = min(a[2], b[2]) - max(a[0], b[0])
    overlap_h = min(a[3], b[3]) - max(a[1], b[1])
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0

    area_i = overlap_w * overlap_h
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])

    # true union area, not the area of the enclosing bounding box
    area_u = area_a + area_b - area_i
    return float(area_i) / float(area_u)


def get_new_img_size(width, height, img_min_side=600):
    '''Scale (width, height) so that the shorter side equals `img_min_side`.

    The longer side is scaled by the same factor and truncated to an int.
    Returns `(resized_width, resized_height)`.
    '''
    width_is_shorter = width <= height

    if width_is_shorter:
        scale = float(img_min_side) / width
        return img_min_side, int(scale * height)

    scale = float(img_min_side) / height
    return int(scale * width), img_min_side


In [12]:
class Sample:
    '''Round-robin class selector used to decide which images to skip.

    Cycles over the classes whose count is positive; an image is accepted
    only when it contains a box of the class currently selected.
    '''

    def __init__(self, counts):
        # keep only the classes that actually occur
        self.classes = [name for name in counts.keys() if counts[name] > 0]

        self.cycle = itertools.cycle(self.classes)

        self.current = next(self.cycle)

    def skip(self, image):
        '''Return False (and advance to the next class) when `image` has a
        box of the current class, True otherwise.'''
        for box in image["bboxes"]:
            if box["class"] == self.current:
                self.current = next(self.cycle)
                return False

        return True

class Iterator:
    '''Wrap an iterator so that advancing it is serialized by a lock.

    Supports both iteration protocols: `__next__` (Python 3) and `next`
    (Python 2).
    '''

    def __init__(self, iterator):
        self.iterator = iterator

        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        with self.lock:
            # the builtin next() works for both Python 2 and Python 3
            # iterators, unlike calling `.next()` directly
            return next(self.iterator)

    # Python 2 spelling of the iterator protocol
    next = __next__


In [13]:
def calcY(C, class_mapping, img_data, width, height, resized_width, resized_height):
    downscale = float(C.rpn_stride)
    anchor_sizes = C.anchor_box_scales
    anchor_ratios = C.anchor_box_ratios
    num_anchors = len(anchor_sizes) * len(anchor_ratios)

    # calculate the output map size based on the network architecture
    (output_width, output_height) = get_img_output_length(resized_width,
                                                          resized_height)

    n_anchratios = len(anchor_ratios)

    # initialise empty output objectives
    y_rpn_overlap = numpy.zeros((output_height, output_width, num_anchors))
    y_is_box_valid = numpy.zeros((output_height, output_width, num_anchors))
    y_rpn_regr = numpy.zeros((output_height, output_width, num_anchors * 4))

    num_bboxes = len(img_data['bboxes'])

    num_anchors_for_bbox = numpy.zeros(num_bboxes).astype(int)
    best_anchor_for_bbox = -1 * numpy.ones((num_bboxes, 4)).astype(int)
    best_iou_for_bbox = numpy.zeros(num_bboxes).astype(numpy.float32)
    best_x_for_bbox = numpy.zeros((num_bboxes, 4)).astype(int)
    best_dx_for_bbox = numpy.zeros((num_bboxes, 4)).astype(numpy.float32)

    # get the GT box coordinates, and resize to account for image resizing
    gta = numpy.zeros((num_bboxes, 4))
    for bbox_num, bbox in enumerate(img_data['bboxes']):
        # get the GT box coordinates, and resize to account for image resizing
        gta[bbox_num, 0] = bbox['x1'] * (resized_width / float(width))
        gta[bbox_num, 1] = bbox['x2'] * (resized_width / float(width))
        gta[bbox_num, 2] = bbox['y1'] * (resized_height / float(height))
        gta[bbox_num, 3] = bbox['y2'] * (resized_height / float(height))

    # rpn ground truth

    for anchor_size_idx in xrange(len(anchor_sizes)):
        for anchor_ratio_idx in xrange(n_anchratios):
            anchor_x = anchor_sizes[anchor_size_idx] * \
                       anchor_ratios[anchor_ratio_idx][0]
            anchor_y = anchor_sizes[anchor_size_idx] * \
                       anchor_ratios[anchor_ratio_idx][1]

            for ix in xrange(output_width):
                # x-coordinates of the current anchor box
                x1_anc = downscale * (ix + 0.5) - anchor_x / 2
                x2_anc = downscale * (ix + 0.5) + anchor_x / 2

                # ignore boxes that go across image boundaries
                if x1_anc < 0 or x2_anc > resized_width:
                    continue

                for jy in xrange(output_height):

                    # y-coordinates of the current anchor box
                    y1_anc = downscale * (jy + 0.5) - anchor_y / 2
                    y2_anc = downscale * (jy + 0.5) + anchor_y / 2

                    # ignore boxes that go across image boundaries
                    if y1_anc < 0 or y2_anc > resized_height:
                        continue

                    # bbox_type indicates whether an anchor should be a target
                    bbox_type = 'neg'

                    # this is the best IOU for the (x,y) coord and the current anchor
                    # note that this is different from the best IOU for a GT bbox
                    best_iou_for_loc = 0.0

                    for bbox_num in xrange(num_bboxes):

                        # get IOU of the current GT box and the current anchor box
                        curr_iou = iou([gta[bbox_num, 0], gta[bbox_num, 2],
                                        gta[bbox_num, 1], gta[bbox_num, 3]],
                                       [x1_anc, y1_anc, x2_anc, y2_anc])
                        # calculate the regression targets if they will be needed
                        if curr_iou > best_iou_for_bbox[
                            bbox_num] or curr_iou > C.rpn_max_overlap:
                            cx = (gta[bbox_num, 0] + gta[bbox_num, 1]) / 2.0
                            cy = (gta[bbox_num, 2] + gta[bbox_num, 3]) / 2.0
                            cxa = (x1_anc + x2_anc) / 2.0
                            cya = (y1_anc + y2_anc) / 2.0

                            tx = (cx - cxa) / (x2_anc - x1_anc)
                            ty = (cy - cya) / (y2_anc - y1_anc)
                            tw = numpy.log(
                                (gta[bbox_num, 1] - gta[bbox_num, 0]) / (
                                    x2_anc - x1_anc))
                            th = numpy.log(
                                (gta[bbox_num, 3] - gta[bbox_num, 2]) / (
                                    y2_anc - y1_anc))

                        if img_data['bboxes'][bbox_num]['class'] != 'bg':

                            # all GT boxes should be mapped to an anchor box, so we keep track of which anchor box was best
                            if curr_iou > best_iou_for_bbox[bbox_num]:
                                best_anchor_for_bbox[bbox_num] = [jy, ix,
                                                                  anchor_ratio_idx,
                                                                  anchor_size_idx]
                                best_iou_for_bbox[bbox_num] = curr_iou
                                best_x_for_bbox[bbox_num, :] = [x1_anc, x2_anc,
                                                                y1_anc, y2_anc]
                                best_dx_for_bbox[bbox_num, :] = [tx, ty, tw,
                                                                 th]

                            # we set the anchor to positive if the IOU is >0.7 (it does not matter if there was another better box, it just indicates overlap)
                            if curr_iou > C.rpn_max_overlap:
                                bbox_type = 'pos'
                                num_anchors_for_bbox[bbox_num] += 1
                                # we update the regression layer target if this IOU is the best for the current (x,y) and anchor position
                                if curr_iou > best_iou_for_loc:
                                    best_iou_for_loc = curr_iou
                                    best_regr = (tx, ty, tw, th)

                            # if the IOU is >0.3 and <0.7, it is ambiguous and no included in the objective
                            if C.rpn_min_overlap < curr_iou < C.rpn_max_overlap:
                                # gray zone between neg and pos
                                if bbox_type != 'pos':
                                    bbox_type = 'neutral'

                    # turn on or off outputs depending on IOUs
                    if bbox_type == 'neg':
                        y_is_box_valid[
                            jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        y_rpn_overlap[
                            jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                    elif bbox_type == 'neutral':
                        y_is_box_valid[
                            jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                        y_rpn_overlap[
                            jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                    elif bbox_type == 'pos':
                        y_is_box_valid[
                            jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        y_rpn_overlap[
                            jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        start = 4 * (
                            anchor_ratio_idx + n_anchratios * anchor_size_idx)
                        y_rpn_regr[jy, ix, start:start + 4] = best_regr

    # we ensure that every bbox has at least one positive RPN region

    for idx in xrange(num_anchors_for_bbox.shape[0]):
        if num_anchors_for_bbox[idx] == 0:
            # no box with an IOU greater than zero ...
            if best_anchor_for_bbox[idx, 0] == -1:
                continue
            y_is_box_valid[
                best_anchor_for_bbox[idx, 0], best_anchor_for_bbox[idx, 1],
                best_anchor_for_bbox[idx, 2] + n_anchratios *
                best_anchor_for_bbox[idx, 3]] = 1
            y_rpn_overlap[
                best_anchor_for_bbox[idx, 0], best_anchor_for_bbox[idx, 1],
                best_anchor_for_bbox[idx, 2] + n_anchratios *
                best_anchor_for_bbox[idx, 3]] = 1
            start = 4 * (
                best_anchor_for_bbox[idx, 2] + n_anchratios *
                best_anchor_for_bbox[
                    idx, 3])
            y_rpn_regr[
            best_anchor_for_bbox[idx, 0], best_anchor_for_bbox[idx, 1],
            start:start + 4] = best_dx_for_bbox[idx, :]

    y_rpn_overlap = numpy.transpose(y_rpn_overlap, (2, 0, 1))
    y_rpn_overlap = numpy.expand_dims(y_rpn_overlap, axis=0)

    y_is_box_valid = numpy.transpose(y_is_box_valid, (2, 0, 1))
    y_is_box_valid = numpy.expand_dims(y_is_box_valid, axis=0)

    y_rpn_regr = numpy.transpose(y_rpn_regr, (2, 0, 1))
    y_rpn_regr = numpy.expand_dims(y_rpn_regr, axis=0)

    pos_locs = numpy.where(numpy.logical_and(y_rpn_overlap[0, :, :, :] == 1,
                                             y_is_box_valid[0, :, :, :] == 1))
    neg_locs = numpy.where(numpy.logical_and(y_rpn_overlap[0, :, :, :] == 0,
                                             y_is_box_valid[0, :, :, :] == 1))

    num_pos = len(pos_locs[0])

    # one issue is that the RPN has many more negative than positive regions, so we turn off some of the negative
    # regions. We also limit it to 256 regions.
    num_regions = 256
    '''
    if len(pos_locs[0]) > num_regions/2:
        val_locs = random.sample(range(len(pos_locs[0])), len(pos_locs[0]) - num_regions/2)
        y_is_box_valid[0, pos_locs[0][val_locs], pos_locs[1][val_locs], pos_locs[2][val_locs]] = 0
        num_pos = num_regions/2

    if len(neg_locs[0]) + num_pos > num_regions:
        val_locs = random.sample(range(len(neg_locs[0])), len(neg_locs[0]) - num_pos)
        y_is_box_valid[0, neg_locs[0][val_locs], neg_locs[1][val_locs], neg_locs[2][val_locs]] = 0
    '''
    y_rpn_cls = numpy.concatenate([y_is_box_valid, y_rpn_overlap], axis=1)
    y_rpn_regr = numpy.concatenate(
        [numpy.repeat(y_rpn_overlap, 4, axis=1), y_rpn_regr], axis=1)
    # classifier ground truth
    x_rois = []
    y_class_num = numpy.zeros((C.num_rois, len(class_mapping)))
    # regr has 8 values per non-bg class: the first half of each row holds 4 on/off flags per class, the second half 4 targets (tx, ty, tw, th) per class
    num_non_bg_classes = len(class_mapping) - 1
    y_class_regr = numpy.zeros((C.num_rois, 2 * 4 * num_non_bg_classes))

    for i in range(C.num_rois):
        # generate either a bg sample or a class sample, and select acceptable IOUs
        if i < C.num_rois / 2:
            sample_type = 'pos'
            min_iou = C.classifier_max_overlap
            max_iou = 1.0
        else:
            sample_type = 'neg'
            min_iou = C.classifier_min_overlap
            max_iou = C.classifier_max_overlap
        not_valid_gt = True

        num_attempts = 0

        while not_valid_gt:
            min_size = 64
            try:
                x = numpy.random.randint(0, (
                    resized_width - min_size - downscale - 2))
                y = numpy.random.randint(0, (
                    resized_height - min_size - downscale - 2))
                w = numpy.random.randint(min_size,
                                         (resized_width - x - downscale))
                h = numpy.random.randint(min_size,
                                         (resized_height - y - downscale))
            except:
                pass
            largest_iou = 0.0
            bbox_idx = -1

            num_attempts += 1
            if num_attempts > 10000:
                return

            for bbox_num in xrange(num_bboxes):
                # get IOU of the current GT box and the current anchor box
                curr_iou = iou(
                    [gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1],
                     gta[bbox_num, 3]], [x, y, x + w, y + h])
                if curr_iou > largest_iou:
                    largest_iou = curr_iou
                    bbox_idx = bbox_num

            if min_iou < largest_iou <= max_iou:
                not_valid_gt = False
                x_rois.append(
                    [int(round(x / downscale)), int(round(y / downscale)),
                     int(round(w / downscale)), int(round(h / downscale))])
                if sample_type == 'pos':
                    cls_name = img_data['bboxes'][bbox_idx]['class']
                    x1 = x
                    y1 = y

                    cxg = (gta[bbox_idx, 0] + gta[bbox_idx, 1]) / 2.0
                    cyg = (gta[bbox_idx, 2] + gta[bbox_idx, 3]) / 2.0

                    cx = x1 + w / 2.0
                    cy = y1 + h / 2.0

                    tx = (cxg - cx) / float(w)
                    ty = (cyg - cy) / float(h)
                    tw = numpy.log(
                        (gta[bbox_idx, 1] - gta[bbox_idx, 0]) / float(w))
                    th = numpy.log(
                        (gta[bbox_idx, 3] - gta[bbox_idx, 2]) / float(h))
                else:
                    cls_name = 'bg'

                class_num = class_mapping[cls_name]
                y_class_num[i, class_num] = 1
                if class_num != num_non_bg_classes:
                    y_class_regr[i,
                    4 * class_num:4 * class_num + 4] = 1  # set value to 1 if the sample is positive
                    y_class_regr[i,
                    num_non_bg_classes * 4 + 4 * class_num:num_non_bg_classes * 4 + 4 * class_num + 4] = [
                        tx, ty, tw, th]
                break

    x_rois = numpy.array(x_rois)
    y_class_num = numpy.expand_dims(y_class_num, axis=0)
    y_class_regr = numpy.expand_dims(y_class_regr, axis=0)
    x_rois = numpy.expand_dims(x_rois, axis=0)
    return numpy.copy(x_rois), numpy.copy(y_rpn_cls), numpy.copy(y_rpn_regr), numpy.copy(y_class_num), numpy.copy(y_class_regr)


In [14]:
def generator(f):
    """Decorator: wrap whatever ``f`` returns in an ``Iterator``.

    ``Iterator`` is defined elsewhere in this notebook (presumably a
    thread-safe wrapper around a generator -- confirm against its
    definition).

    Args:
        f: callable whose return value is passed to ``Iterator``.

    Returns:
        A function with the same call signature as ``f`` that returns
        ``Iterator(f(*args, **kwargs))``. The wrapper keeps ``f``'s
        ``__name__`` and ``__doc__``.
    """
    # Function-scope import so this cell does not depend on the notebook's
    # import cell being edited.
    import functools

    @functools.wraps(f)
    def g(*args, **kwargs):
        return Iterator(f(*args, **kwargs))

    return g


@generator
def get_anchor_gt(images, classes, counts, configuration, backend, mode="train"):
    """Endlessly yield ``[x_img, x_rois], [y_rpn_cls, y_rpn_regr, y_class_num, y_class_regr]``.

    One item is produced per usable image; the image array gets a leading
    axis of length 1 and every array is copied before it is yielded.
    Because of the ``@generator`` decorator, calling this function returns
    the generator wrapped in an ``Iterator``.

    Args:
        images: list of per-image annotation records. Each record is passed
            to ``augment``; the augmented record must provide ``'width'``
            and ``'height'``.
        classes: class mapping, forwarded unchanged to ``calcY``.
        counts: per-class counts used to construct the ``Sample`` selector.
        configuration: object providing ``balanced_classes``, ``im_size``
            and ``std_scaling``; it is also forwarded to ``calcY``.
        backend: when equal to ``'tf'`` the image and the two RPN targets
            are transposed from channels-first to channels-last.
        mode: ``'train'`` or ``'training'`` reshuffles the images at the
            start of every pass; any other value keeps the sorted order.

    Raises:
        ValueError: if ``images`` is empty (raised on first iteration).

    Images for which ``calcY`` fails are skipped silently. Any other
    per-image error is printed and the image is skipped.
    """
    # sorted() returns a new list, so the in-place shuffle below never
    # reorders the caller's list.
    images = sorted(images)

    # With nothing to iterate over, the endless loop below would spin
    # forever without ever yielding; fail loudly instead.
    if not images:
        raise ValueError('get_anchor_gt: no images to generate samples from')

    sample_selector = Sample(counts)

    # Accept both spellings. Comparing against 'train' only meant that a
    # caller passing mode="training" silently never got its data reshuffled.
    shuffle_each_pass = mode in ('train', 'training')

    while True:
        if shuffle_each_pass:
            random.shuffle(images)

        for img_data in images:
            try:
                if configuration.balanced_classes and sample_selector.skip(img_data):
                    continue

                img_data_aug, x_img = augment(img_data)

                (width, height) = (
                    img_data_aug['width'], img_data_aug['height'])
                (rows, cols, _) = x_img.shape

                # the annotation record must agree with the decoded image
                assert cols == width
                assert rows == height

                # get image dimensions for resizing
                (resized_width, resized_height) = get_new_img_size(
                    width, height, configuration.im_size)

                # resize the image to the size computed above
                x_img = cv2.resize(x_img, (resized_width, resized_height),
                                   interpolation=cv2.INTER_CUBIC)

                # NOTE(review): the output map size is computed but never
                # used below; kept only because the helper is not visible
                # here -- confirm it has no side effects and drop the call.
                (output_width, output_height) = get_img_output_length(
                    resized_width, resized_height)

                try:
                    x_rois, y_rpn_cls, y_rpn_regr, y_class_num, y_class_regr = calcY(
                        configuration, classes, img_data_aug, width, height,
                        resized_width, resized_height)
                except Exception:
                    # Best effort: skip images for which no targets could
                    # be built. `Exception` (not a bare except) so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    continue

                # Zero-center by mean pixel
                x_img = x_img.astype(numpy.float32)
                x_img[:, :, 0] -= 103.939
                x_img[:, :, 1] -= 116.779
                x_img[:, :, 2] -= 123.68

                # (rows, cols, channels) -> (1, channels, rows, cols)
                x_img = numpy.transpose(x_img, (2, 0, 1))
                x_img = numpy.expand_dims(x_img, axis=0)

                # Scale the second half along axis 1 by std_scaling.
                # Floor division keeps the slice bound an integer under
                # both Python 2 and Python 3 division rules.
                y_rpn_regr[:, y_rpn_regr.shape[1] // 2:, :, :] *= configuration.std_scaling
                # NOTE(review): if y_class_regr is (1, num_rois, 8 * classes),
                # axis 1 is the ROI axis, so this scales the second half of
                # the ROIs rather than the regression values on the last
                # axis -- confirm the intended axis against the loss and
                # inference code before changing it.
                y_class_regr[:, y_class_regr.shape[1] // 2:, :] *= configuration.std_scaling

                if backend == 'tf':
                    # channels-first -> channels-last
                    x_img = numpy.transpose(x_img, (0, 2, 3, 1))
                    y_rpn_cls = numpy.transpose(y_rpn_cls, (0, 2, 3, 1))
                    y_rpn_regr = numpy.transpose(y_rpn_regr, (0, 2, 3, 1))

                yield [numpy.copy(x_img), numpy.copy(x_rois)], [
                    numpy.copy(y_rpn_cls),
                    numpy.copy(y_rpn_regr),
                    numpy.copy(y_class_num),
                    numpy.copy(
                        y_class_regr)]

            except Exception as e:
                # keep the endless stream alive: report and move on
                print(e)
                continue

In [15]:
class Configuration:
    """Bundle of tunable settings for building and training the detector.

    Every setting is a plain instance attribute assigned in ``__init__``,
    so it can be overridden on the instance after construction.
    """

    def __init__(self):
        # Data augmentation: every switch starts disabled.
        for switch in ('use_horizontal_flips', 'use_vertical_flips',
                       'scale_augment', 'random_rotate'):
            setattr(self, switch, False)
        self.random_rotate_scale = 180.0

        # Anchor boxes: scales, then [a, b] ratio pairs.
        # (alternative scales left by the author: [128, 256, 512])
        self.anchor_box_scales = [64, 128, 256]
        self.anchor_box_ratios = [list(pair) for pair in ((1, 1), (1, 2), (2, 1))]

        # Size the smallest image side is resized to (alternative: 600).
        self.im_size = 448

        # Number of ROIs processed at once.
        self.num_rois = 2

        # Stride at the RPN (depends on the network configuration).
        self.rpn_stride = 16

        self.balanced_classes = False

        # Scaling of the stdev.
        self.std_scaling = 4.0

        # Overlap thresholds for the RPN ...
        self.rpn_min_overlap, self.rpn_max_overlap = 0.3, 0.7

        # ... and for the classifier ROIs.
        self.classifier_min_overlap, self.classifier_max_overlap = 0.1, 0.5

        # Pretrained weights for the base network; the file name depends on
        # the backend's image dimension ordering. Weight files are at:
        # https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_th_dim_ordering_th_kernels_notop.h5
        # https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
        theano_ordering = keras.backend.image_dim_ordering() == 'th'
        self.base_net_weights = (
            'resnet50_weights_th_dim_ordering_th_kernels_notop.h5'
            if theano_ordering
            else 'resnet50_weights_tf_dim_ordering_tf_kernels.h5'
        )

        self.model_path = 'model_frcnn.hdf5'

In [28]:
# --- run settings ----------------------------------------------------------
# NOTE(review): absolute, user-specific paths; consider making them
# configurable before sharing this notebook.
pathname = '/home/jhung0/training.txt'  # annotation file parsed by get_data
weights = None  # optional weights file, loaded by layer name before training
output = '/home/jhung0/scratch.h5'  # where the model is saved after training
# presumably needed because of the depth of the model graph -- confirm
sys.setrecursionlimit(40000)

configuration = Configuration()

configuration.num_rois = 2

configuration.use_vertical_flips = True

# --- data ------------------------------------------------------------------
images, counts, classes = get_data(pathname)

# make sure a background class exists even when the annotations have none
if 'bg' not in counts:
    counts['bg'] = 0
    classes['bg'] = len(classes)

with open('classes.json', 'w') as class_data_json:
    json.dump(classes, class_data_json)

# index -> class name; .items() works on both Python 2 and Python 3
inv_map = {v: k for k, v in classes.items()}

print('Num classes (including bg) = {}'.format(len(counts)))
random.shuffle(images)

train_imgs = [s for s in images if s['imageset'] == 'trainval']
val_imgs = [s for s in images if s['imageset'] == 'test']

print('Num train samples {}'.format(len(train_imgs)))
print('Num val samples {}'.format(len(val_imgs)))

# --- model -----------------------------------------------------------------
if keras.backend.image_dim_ordering() == 'th':
    input_shape_img = (3, None, None)
else:
    input_shape_img = (None, None, 3)

img_input = keras.layers.Input(shape=input_shape_img)

roi_input = keras.layers.Input(shape=(configuration.num_rois, 4))

# define the base network (resnet here, can be VGG, Inception, etc)
shared_layers = nn_base(img_input, trainable=True)

# define the RPN, built on the base layers
num_anchors = len(configuration.anchor_box_scales) * len(configuration.anchor_box_ratios)

r_p_n = rpn(shared_layers, num_anchors)

# the classifier is built on top of the base layers + the ROI pooling layer
# + extra layers
_classifier = classifier(shared_layers, roi_input, configuration.num_rois,
                         nb_classes=len(counts))

# define the full model
model = keras.models.Model([img_input, roi_input], r_p_n + _classifier)

# one loss per model output, in output order
# NOTE(review): the metrics key must match an output layer name created by
# `classifier`; confirm a layer named 'dense_class_<n>_loss' really exists.
model.compile(
    optimizer=keras.optimizers.Adam(1e-5, decay=0.0),
    loss=[
        rpn_loss_cls(num_anchors),
        rpn_loss_regr(num_anchors),
        class_loss_cls,
        class_loss_regr(configuration.num_rois, len(counts) - 1)
    ],
    metrics={
        'dense_class_{}_loss'.format(len(counts)): 'accuracy'
    }
)
if weights:
    model.load_weights(weights, by_name=True)

# NOTE(review): checkpoints go to "model.hdf5", not configuration.model_path.
model_checkpoint = keras.callbacks.ModelCheckpoint("model.hdf5")

# --- training --------------------------------------------------------------
# mode must be exactly "train" for get_anchor_gt to reshuffle the training
# images on every pass; the previous value "training" did not match its
# check, so the data was never reshuffled.
model.fit_generator(
    generator=get_anchor_gt(
        train_imgs,
        classes,
        counts,
        configuration,
        keras.backend.image_dim_ordering(),
        mode="train"
    ),
    steps_per_epoch=1000,
    epochs=2,
    validation_data=get_anchor_gt(
        val_imgs,
        classes,
        counts,
        configuration,
        keras.backend.image_dim_ordering(),
        mode="validation"
    ),
    validation_steps=1000,
    callbacks=[
        model_checkpoint
    ],
    max_q_size=100,
    workers=8
)
if output:
    model.save(output)

Parsing annotation files
Num classes (including bg) = 7
Num train samples 9213
Num val samples 1885




Epoch 1/2

KeyboardInterrupt: 

In [27]:
# Manually save the in-memory `model` to the same path the training cell
# assigns to `output`. Requires `model` to exist, i.e. the training cell
# must have run at least up to model construction.
# NOTE(review): duplicates the `model.save(output)` at the end of the
# training cell and hard-codes an absolute, user-specific path.
model.save('/home/jhung0/scratch.h5')