## Regional Proposal Network

In [1]:
import tensorflow as tf
import keras
import keras.backend as K
import numpy as np
import cv2

Using TensorFlow backend.


### Backbone of the Network
  > RESNET50

In [2]:
print(keras.__version__)

2.2.4


### Importing ResNet50 Source code
(From Actual Keras Implementation of ResNet50)

In [3]:
from keras.layers import Input, Add, Dense, Activation, Flatten, Convolution2D, MaxPooling2D, ZeroPadding2D, \
    AveragePooling2D, TimeDistributed, BatchNormalization

In [4]:
def identity_block(input_tensor, kernel_size, filters, stage, block, trainable=True):
    nb_filter1, nb_filter2, nb_filter3 = filters
    if K.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'
    x = Convolution2D(nb_filter1, (1, 1), name=conv_name_base + '2a', trainable=trainable)(input_tensor)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
    x = Activation('relu')(x)
    x = Convolution2D(nb_filter2, (kernel_size, kernel_size), padding='same', name=conv_name_base + '2b', trainable=trainable)(x)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
    x = Activation('relu')(x)
    x = Convolution2D(nb_filter3, (1, 1), name=conv_name_base + '2c', trainable=trainable)(x)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
    x = Add()([x, input_tensor])
    x = Activation('relu')(x)
    return x

In [5]:
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), trainable=True):
    nb_filter1, nb_filter2, nb_filter3 = filters
    if K.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'
    x = Convolution2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', trainable=trainable)(input_tensor)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
    x = Activation('relu')(x)
    x = Convolution2D(nb_filter2, (kernel_size, kernel_size), padding='same', name=conv_name_base + '2b', trainable=trainable)(x)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
    x = Activation('relu')(x)
    x = Convolution2D(nb_filter3, (1, 1), name=conv_name_base + '2c', trainable=trainable)(x)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
    shortcut = Convolution2D(nb_filter3, (1, 1), strides=strides, name=conv_name_base + '1', trainable=trainable)(input_tensor)
    shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
    x = Add()([x, shortcut])
    x = Activation('relu')(x)
    return x

In [6]:
def nn_base(input_tensor=None, trainable=False):
    # Determine proper input shape
    if K.image_dim_ordering() == 'th':
        input_shape = (3, None, None)
    else:
        input_shape = (None, None, 3)
        
    if input_tensor is None:
        img_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            print("Not Keras tensor")
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor
            print("Keras tensor")

    if K.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1

    x = ZeroPadding2D((3, 3))(img_input)
    print(x.shape)
    x = Convolution2D(64, (7, 7), strides=(2, 2), name='conv1', trainable = trainable)(x)
    print(x.shape)
    x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
    print(x.shape)
    x = Activation('relu')(x)
    print(x.shape)
    x = MaxPooling2D((3, 3), strides=(2, 2))(x)
    print(x.shape)

    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', trainable = trainable)
    print(x.shape)

    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', trainable = trainable)
    
    print(x.shape)
    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d', trainable = trainable)
    print(x.shape)

    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e', trainable = trainable)
    print(x.shape)
    x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f', trainable = trainable)
    print(x.shape)
    
    return x

#### Sampling Ratio = $1024/64$ = $16$

### Generating Anchors

#### Bounding Boxes, Input Image

In [7]:
width, height = 1024, 1024
subsampling_ratio = 16

img = np.zeros((height, width, 3))

### Bounding Box Format ####
# Origin - top-left corner.
# [xmin, ymin, xmaxm, ymax]
############################

bbox = np.asarray([[20, 40, 400, 100], [400, 800, 800, 1000]], dtype = np.int32)

#### Creating Anchor Boxes of Different Scales & Aspect Ratios

In [8]:
feature_map = nn_base(K.expand_dims(K.variable(img), axis=0))
print(feature_map.shape[1:3])

W1204 23:09:05.038197 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1204 23:09:05.044710 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1204 23:09:05.090514 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1204 23:09:05.091513 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\backend\tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.

Not Keras tensor
(1, 1030, 1030, 3)
(1, 512, 512, 64)


W1204 23:09:11.587612 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\backend\tensorflow_backend.py:1834: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.

W1204 23:09:11.803311 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\backend\tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.



(1, 512, 512, 64)
(1, 512, 512, 64)
(1, 255, 255, 64)
(1, 255, 255, 256)
(1, 255, 255, 256)
(1, 255, 255, 256)
(1, 128, 128, 512)
(1, 128, 128, 512)
(1, 128, 128, 512)
(1, 128, 128, 512)
(1, 64, 64, 1024)
(1, 64, 64, 1024)
(1, 64, 64, 1024)
(1, 64, 64, 1024)
(1, 64, 64, 1024)
(1, 64, 64, 1024)
(64, 64)


In [9]:
scales = [8, 16, 32]
ratios = [0.5, 1, 2]

k = len(scales) * len(ratios)
feature_map_size = feature_map.shape[1]

anchors = np.zeros((k * feature_map_size * feature_map_size, 4))
print(anchors.shape)

(36864, 4)


#### Anchor Centers

In [10]:
x = np.arange(subsampling_ratio/2, width, subsampling_ratio, dtype = np.int32)

anchor_centres = np.zeros((anchors.shape[0], 2))

anchor_centres[:, 0] = np.tile(np.repeat(x, k), feature_map_size) #XCOORDINATES
anchor_centres[:, 1] = np.repeat(x, k * feature_map_size) #YCOORDINATES


##### ANCHOR CENTRES ######
# A1,1, A1,2, A1,3 .... A1,K, A2,1, A2,2, ... A2,K, ........ AFEATUREMAP_SIZE*FEATUREMAP_SIZE,K 
###########################

#### Anchor Box Coordinates

In [19]:
## Checking Numpy Indexing

i = np.array([1,2,3])
x = np.array([1,2,4,5,5,6,7,8,9,9])
y = np.array([0,0,0,0,0,0,0,0,0,0])

x[i] = y[i]
print(x)

[1 0 0 0 5 6 7 8 9 9]


In [11]:
start_box_no = 0
for scale in scales:
    for ratio in ratios:
        w = subsampling_ratio * scale * np.sqrt(ratio)
        h = subsampling_ratio * scale * (1/np.sqrt(ratio))
        
        ### ANCHOR BOX COORDINATES WITH SCALEi, RATIOj###
        anchor_coords = np.arange(start_box_no, anchor_centres.shape[0], step = k)
        
        anchors[anchor_coords, 0] = anchor_centres[anchor_coords, 0] - w/2 # XMIN
        anchors[anchor_coords, 2] = anchor_centres[anchor_coords, 0] + w/2 # YMIN
        anchors[anchor_coords, 1] = anchor_centres[anchor_coords, 1] - h/2 # XMAX
        anchors[anchor_coords, 3] = anchor_centres[anchor_coords, 1] + h/2 # YMAX
        
        start_box_no += 1
        

In [43]:
print(anchors[:k])

[[ -37.254834    -82.50966799   53.254834     98.50966799]
 [ -56.          -56.           72.           72.        ]
 [ -82.50966799  -37.254834     98.50966799   53.254834  ]
 [ -82.50966799 -173.01933598   98.50966799  189.01933598]
 [-120.         -120.          136.          136.        ]
 [-173.01933598  -82.50966799  189.01933598   98.50966799]
 [-173.01933598 -354.03867197  189.01933598  370.03867197]
 [-248.         -248.          264.          264.        ]
 [-354.03867197 -173.01933598  370.03867197  189.01933598]]


### Assigning Labels to Anchor Boxes

Positive Labels

> 1. For a Ground Truth, the anchor with the max IOU.
> 2. For an Anchor Box, if it has IOU > 0.7 with any of the ground truths.

Negative Labels

> If the anchor box has IOU < 0.3 with every ground truth, then its a negative label.

#### Calculating IoU

In [12]:
def IoU(a, b):
    
    (xa1, ya1, xa2, ya2) = a
    (xb1, yb1, xb2, yb2) = b
    
    ## CALCULATE INTERSECTION, UNION.
    
    x1 = max(xa1, xb1)
    y1 = max(ya1, yb1)
    x2 = min(xa2, xb2)
    y2 = min(ya2, yb2)
    
    ## IF INTERSECTION IS ONE POINT, THEN AREA IS ONE PIXEL.
    intersection = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)

    area1 = (xa2 - xa1 + 1) * (ya2 - ya1 + 1)
    area2 = (xb2 - xb1 + 1) * (yb2 - yb1 + 1)
    
    union = area1 + area2 - intersection
    
    return intersection/union
    
    ## UNION = A + B - (A^B)

In [23]:
## ROUGH WORK ##

a = [20, 30, 400, 500]
b = [30, 40, 300, 400]

print(IoU(a, b))

0.5451683189282869


#### Filtering out Anchors whose Coordinates lie outside the Image

In [24]:
## ROUGH WORK ##

temp = np.asarray([
    [1, 2, 3, 4],
    [5, 0, 0, 0],
    [9, 8, 7, 0]
])

print(temp.shape)

## FIRST COLUMN FILTER
print(temp[temp[:,0] > 0])
temp = temp[temp[:,0] > 0]

## SECOND COLUMN FILTER
print(temp[temp[:,1] > 0])
temp = temp[temp[:,1] > 0]

## FOURTH COLUMN FILTER
print(temp[temp[:,3] > 0])
temp = temp[temp[:,3] > 0]



## APPLYING ALL FILTERS TOGETHER
temp = np.asarray([
    [1, 2, 3, 4],
    [5, 0, 0, 0],
    [9, 8, 7, 0]
])

print((temp[:,0] > 0) & (temp[:,1] > 0))
temp = temp[(temp[:,0] > 0) & (temp[:,1] > 0) & (temp[:,3] > 0)]
print(temp)

(3, 4)
[[1 2 3 4]
 [5 0 0 0]
 [9 8 7 0]]
[[1 2 3 4]
 [9 8 7 0]]
[[1 2 3 4]]
[ True False  True]
[[1 2 3 4]]


In [25]:
## ROUGH WORK ##
temp = anchors
print(temp.shape)
temp = temp[(temp[:,0] >= 0) & (temp[:,1] >= 0) & (temp[:,2] <= width) & (temp[:,3] <= height)]
print(temp.shape)

(36864, 4)
(18376, 4)


In [26]:
## ROUGH WORK ##
## USING NUMPY ##

# BRACKETS IMPORTANT #
index_inside = np.where(
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= width) &
        (anchors[:, 3] <= height)
    )[0]

print(len(index_inside))

18376


In [13]:
## USING NUMPY INDEXING ##

filtered_anchors = anchors[
    (anchors[:, 0] >= 0) &
    (anchors[:, 1] >= 0) &
    (anchors[:, 2] <= width) &
    (anchors[:, 3] <= height)
]

print("No. of Valid Anchors: {}".format(filtered_anchors.shape))

No. of Valid Anchors: (18376, 4)


#### What is required to label anchor boxes ??
> - For each ground truth, find all anchors that have IOU equal to the max IOU. (directly can be labelled positive.)
> - Find all anchors that have max IOU with respect to a groud truth. (positive > 0.7, negative < 0.3)

In [14]:
ious = np.zeros((bbox.shape[0], filtered_anchors.shape[0]), dtype = np.float64)

for i,a in enumerate(bbox):
    for j,b in enumerate(filtered_anchors):
        ious[i,j] = IoU(a, b)

print(ious.shape)
print(ious[0])

(2, 18376)
[0.36675443 0.38560272 0.38560272 ... 0.         0.         0.        ]


In [29]:
## ROUGH WORK ##


print(np.argmax(ious, axis=0))
print(np.argmax(ious, axis=1))

best_groundtruth_for_each_anchor = np.argmax(ious, axis=0)
best_anchor_for_each_groundtruth = np.argmax(ious, axis=1)


print(ious.max(axis = 0))
print(ious.max(axis = 1))



## CALCUALTING NO OF ANCHORS WITH IOU > 0.7 ##

print(ious[ious > 0.7].shape)
print(ious[ious < 0.3].shape)

[0 0 0 ... 1 0 0]
[    1 17815]
[0.36675443 0.38560272 0.38560272 ... 0.00283193 0.         0.        ]
[0.38560272 0.81984166]
(15,)
(36366,)


In [30]:
## ROUGH WORK ##

a = np.asarray([
    [1, 2, 3, 4],
    [5, 5, 6, 7],
])

## MAX ALONG AXIS 1 ##
x = a[np.arange(a.shape[0]),a.argmax(axis = 1)]
print(x)

## MAX ALONG AXIS 0 ##
x = a[a.argmax(axis = 0), np.arange(a.shape[1])]
print(x)


### BELOW METHOD NOT WORKING OUT ###
### GETTING BROADCASTED UNNECESSARILY ###

# print(a.argmax(axis = 0))
# print(a[[1,1]])


[4 7]
[5 5 6 7]


In [15]:
### CONDITION 1 ###

'''
What do we need??
For each groundtruth, we need anchors with the max IOU.
'''

## BEST ANCHOR FOR EACH GROUND TRUTH ##
gt_best_arg_anchors = ious.argmax(axis = 1)

## BEST ANCHOR IOUS ##
gt_best_anchors_ious = ious.max(axis = 1)

## ANCHORS THAT HAVE IOU EQUAL TO BEST ANCHOR IOU ##
gt_best_anchors = np.where(np.isin(ious, gt_best_anchors_ious))[1]

'''
#### THIS IS NOT WORKING FOR SOME REASON ####
## ious[ious == gt_best_anchors_ious] ##
#############################################

### FOR LOOP METHOD OF OBTAINING ANCHORS EQUAL TO MAX IOU ###
### NOT ADVISABLE! USE NUMPY!! ###

for i in range(ious.shape[0]):
    print(ious[i].shape, gt_best_anchors_ious.shape)
    print(np.where(ious[i] == gt_best_anchors_ious[i]))

'''

print(gt_best_arg_anchors)
print(gt_best_anchors_ious)
print(gt_best_anchors)

[    1 17815]
[0.38560272 0.81984166]
[    1     2     3     4     5     6     7     8     9    10    11    12
    57    59    61    63    65    67    69    71    73    75    77    79
 17815 17819 17823]


In [16]:
### CONDITION 2 ###

"""
What do we need?
For each anchor, find for which ground truth it has highest IOU.
"""

anchor_best_arg_gt = ious.argmax(axis = 0)
anchor_best_gt_ious = ious.max(axis = 0)


### Here we dont have to find which ground truth box the respective anchor has hihgest IOU. ###
### Reference Paper says so. ###

print(anchor_best_arg_gt)
print(anchor_best_gt_ious)

[0 0 0 ... 1 0 0]
[0.36675443 0.38560272 0.38560272 ... 0.00283193 0.         0.        ]


#### Creating the Labels

Label Values

> - -1 = ignore label
> - 1 = positive label
> - 0 = negative label


IOU Thresholds

> - Positive Label = > 0.7
> - Negative Label = < 0.3

In [17]:
labels = np.full((filtered_anchors.shape[0],), fill_value = -1, dtype = np.int32)
positive_threshold = 0.7
negative_threshold = 0.3

In [18]:
### POSITIVE LABELS ###
## CONDITION 1

labels[gt_best_anchors] = 1

## CONDITION 2

labels[anchor_best_gt_ious >= positive_threshold] = 1

In [19]:
### NGEATIVE LABELS ###

labels[anchor_best_gt_ious < negative_threshold] = 0

### Training the RPN
> #### Sampling
> #### Parameterizing

#### Sampling
- Each minibatch comes from one single image.
- From that image, we will use 256 anchors as sample.
- In each sample, the ratio of positive to negative anchors is $1:1$.
- If this is not so, we will make it so by disabling certain anchors.

In [20]:
sample_size = 256
pos_ratio = 0.5

pos_size = sample_size * pos_ratio
neg_size = sample_size - pos_size


print("No. of Ideal Positive Samples: {}.".format(pos_size))
print("No. of Ideal Negative Samples: {}.".format(neg_size))

No. of Ideal Positive Samples: 128.0.
No. of Ideal Negative Samples: 128.0.


In [21]:
pos_labels = labels[labels == 1]
neg_labels = labels[labels == 0]
disabled_labels = labels[labels == -1]


print("No. of Positive Labels: {}".format(len(pos_labels)))
print("No. of Negative Labels: {}".format(len(neg_labels)))
print("No. of Disabled Labels: {}".format(len(disabled_labels)))

No. of Positive Labels: 39
No. of Negative Labels: 17990
No. of Disabled Labels: 347


In [38]:
## SHORT ROUTE ##
print(np.where(labels == 1)[0])

## LONG ROUTE ##
print(np.where((labels == 1) == True)[0])

[    1     2     3     4     5     6     7     8     9    10    11    12
    57    59    61    63    65    67    69    71    73    75    77    79
 17583 17588 17593 17598 17603 17807 17811 17815 17819 17823 17827 17831
 18023 18027 18031]
[    1     2     3     4     5     6     7     8     9    10    11    12
    57    59    61    63    65    67    69    71    73    75    77    79
 17583 17588 17593 17598 17603 17807 17811 17815 17819 17823 17827 17831
 18023 18027 18031]


In [22]:
### DISABLING ANCHORS ###

if len(pos_labels) > pos_size:
    pos_indices = np.where(labels == 1)[0]
    disabled_indices = np.random.choice(pos_indices, len(pos_labels) - pos_size, replace = False)
    labels[disabled_indices] = -1

## UPDATE NO. OF POSITIVE LABELS ##
pos_size = len(pos_labels)


if len(neg_labels) > sample_size - pos_size:
    neg_indices = np.where(labels == 0)[0]
    disabled_indices = np.random.choice(neg_indices, len(neg_indices) - pos_size, replace = False)
    labels[disabled_indices] = -1

## UPDATE NO. OF NEGATIVE LABELS ##
neg_size = pos_size


## RUN PREVIOUS CELL TO SEE THAT NO OF -VE LABELS == +VE LABELS ##

#### Parameterizing Anchor Box Coordiantes

> $x' = (x - xa) / wa $ \
$y' = (y - ya) / ha $ \
$w' = log(w/wa) $ \
$h' = log(h/ha) $

From this we can see that we need,

- groundtruth_box = \[xc, yc, w, h] for which that anchor has max IOU.


We need to format the anchor targets in the same form before parameterizing them.

In [23]:
## TAKING ADVANTAGE OF NUMPY BROADCASTING TO GENERATE GT BOXES FOR EACH ANHOR ##

anchor_best_gt_boxes_coords = bbox[anchor_best_arg_gt]
print(anchor_best_gt_boxes_coords)

[[  20   40  400  100]
 [  20   40  400  100]
 [  20   40  400  100]
 ...
 [ 400  800  800 1000]
 [  20   40  400  100]
 [  20   40  400  100]]


In [24]:
## CALCULATING ANCHOR BOX XCENTER, YCENTER, WIDTH, HEIGHT ##

anchor_w = filtered_anchors[:,2] - filtered_anchors[:,0]
anchor_h = filtered_anchors[:,3] - filtered_anchors[:,1]

anchor_x_c = filtered_anchors[:,0] + 0.5 * anchor_w
anchor_y_c = filtered_anchors[:,1] + 0.5 * anchor_h

print(anchor_x_c)
print(anchor_y_c)
print(anchor_w)
print(anchor_h)

print(anchor_x_c.shape)

[104. 120. 136. ... 888. 904. 920.]
[ 56.  56.  56. ... 968. 968. 968.]
[181.01933598 181.01933598 181.01933598 ... 181.01933598 181.01933598
 181.01933598]
[90.50966799 90.50966799 90.50966799 ... 90.50966799 90.50966799
 90.50966799]
(18376,)


In [25]:
## CALCULATING GROUND TRUTH XCENTER, YCENTER, WIDTH, HEIGHT ##

gt_w = anchor_best_gt_boxes_coords[:,2] - anchor_best_gt_boxes_coords[:,0]
gt_h = anchor_best_gt_boxes_coords[:,3] - anchor_best_gt_boxes_coords[:,1]

gt_x_c = anchor_best_gt_boxes_coords[:,0] + 0.5 * gt_w
gt_y_c = anchor_best_gt_boxes_coords[:,1] + 0.5 * gt_h

print(gt_x_c)
print(gt_y_c)
print(gt_w)
print(gt_h)


print(gt_x_c.shape)


[210. 210. 210. ... 600. 210. 210.]
[ 70.  70.  70. ... 900.  70.  70.]
[380 380 380 ... 400 380 380]
[ 60  60  60 ... 200  60  60]
(18376,)


In [27]:
## APPLYING ABOVE FORMULA ##

t_x = (gt_x_c - anchor_x_c)/anchor_w
t_y = (gt_y_c - anchor_y_c)/anchor_h
t_w = np.log(gt_w/anchor_w)
t_h = np.log(gt_h/anchor_h)

print(t_x)
print(t_y)
print(t_w)
print(t_h)


[ 0.5855728   0.49718446  0.40879611 ... -1.59099026 -3.83384458
 -3.92223293]
[ 0.15467961  0.15467961  0.15467961 ... -0.75130096 -9.92159202
 -9.92159202]
[0.7415674  0.7415674  0.7415674  ... 0.79286069 0.7415674  0.7415674 ]
[-0.41111211 -0.41111211 -0.41111211 ...  0.79286069 -0.41111211
 -0.41111211]


In [28]:
## CONVERTING TO [XC, YC, W, H] FORMAT ##


t_anchors = np.zeros((filtered_anchors.shape[0], 4))
t_anchors[:,0] = t_x
t_anchors[:,1] = t_y
t_anchors[:,2] = t_w
t_anchors[:,3] = t_h


print(t_anchors)
print(t_anchors.shape)

[[ 0.5855728   0.15467961  0.7415674  -0.41111211]
 [ 0.49718446  0.15467961  0.7415674  -0.41111211]
 [ 0.40879611  0.15467961  0.7415674  -0.41111211]
 ...
 [-1.59099026 -0.75130096  0.79286069  0.79286069]
 [-3.83384458 -9.92159202  0.7415674  -0.41111211]
 [-3.92223293 -9.92159202  0.7415674  -0.41111211]]
(18376, 4)


In [29]:
'''
1. We have calculated the anchors for only valid locations.
2. We need to do it for invalid locations as well.

And pass the whole thing to the RPN.
These are the regression targets.
'''

anchor_targets = np.zeros((anchors.shape[0], 4))
print(anchor_targets.shape)

anchor_targets[
    (anchors[:, 0] >= 0) &
    (anchors[:, 1] >= 0) &
    (anchors[:, 2] <= width) &
    (anchors[:, 3] <= height)
] = t_anchors

print( (anchors[:, 0] >= 0) & (anchors[:, 1] >= 0) & (anchors[:, 2] <= width) & (anchors[:, 3] <= height))
print(anchor_targets)

(36864, 4)
[False False False ... False False False]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 ...
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [30]:
'''
We need to prepare the anchor labels.
These are the classification targets.
'''

anchor_labels = np.full((anchors.shape[0],), fill_value = -1, dtype = np.int32)
anchor_labels[
    (anchors[:, 0] >= 0) &
    (anchors[:, 1] >= 0) &
    (anchors[:, 2] <= width) &
    (anchors[:, 3] <= height)
] = labels

print(anchor_labels)
print(anchor_labels[anchor_labels == 1].shape)
print(anchor_labels[anchor_labels == 0].shape)
print(anchor_labels[anchor_labels == -1].shape)

[-1 -1 -1 ... -1 -1 -1]
(39,)
(39,)
(36786,)


#### Network Architecture

> Sliding a network over the feature map.
> Use a Conv2D for regression & classification.

- Regression No. of Output Channels: 4 * k
- Classification No. of Output Channels: k

In [31]:
from keras.layers import Reshape

In [32]:
def rpn(base_layers,num_anchors):
    x = Convolution2D(512, (3, 3), padding='same', activation='relu', kernel_initializer='normal', name='rpn_conv1')(base_layers)
    
    x_class = Convolution2D(num_anchors, (1, 1), activation='sigmoid', kernel_initializer='uniform', name='rpn_out_class')(x)
    x_regr = Convolution2D(num_anchors * 4, (1, 1), activation='linear', kernel_initializer='zero', name='rpn_out_regress')(x)
    
    x_class = Reshape((-1, 1))(x_class)
    x_regr = Reshape((-1, 4))(x_regr)
    
    return [x_class, x_regr, base_layers]

In [17]:
print(rpn(feature_map, k))

[<tf.Tensor 'reshape_1/Reshape:0' shape=(1, 36864, 1) dtype=float32>, <tf.Tensor 'reshape_2/Reshape:0' shape=(1, 36864, 4) dtype=float32>, <tf.Tensor 'activation_40/Relu:0' shape=(1, 64, 64, 1024) dtype=float32>]


In [33]:
pred_class_scores, pred_reg_scores, base_layers = rpn(feature_map, k)

In [67]:
print(type(pred_class_scores))

<class 'tensorflow.python.framework.ops.Tensor'>


In [34]:
#### RESHAPING REGRESSION OUTPUT TO [NO_ANCHORS,X,Y,W,H] ####
#### RESHAPING CLASSIFICATION OUTPUT TO [NO_ANCHORS, OBJECTNESS_SCORE]

print(pred_class_scores.shape)
# print(K.reshape(pred_class_scores, (1, -1)).shape)
print(pred_reg_scores.shape)
# print(K.reshape(pred_reg_scores, (1, -1, 4)).shape)

(1, 36864, 1)
(1, 36864, 4)


### Loss Functions for RPN

1. Classification Loss. (Binary Cross Entropy)
2. Regression Loss. (Smooth L1 Loss)

In [35]:
pred_class = K.get_value(K.reshape(pred_class_scores, (1, -1))[0])
pred_box = K.get_value(K.reshape(pred_reg_scores, (1, -1, 4))[0])

print(pred_class[:10])
print(pred_box[:10])

[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [53]:
### ROUGH WORK ####

print(np.multiply([1,2,3], np.log([1, 2, 3])))
print(np.add(np.multiply([1,2,3], np.log([1, 2, 3])), [1, 2, 3]))
print(np.multiply([1, 2, 3], [1, 2, 3]))

[0.         1.38629436 3.29583687]
[1.         3.38629436 6.29583687]
[1 4 9]


In [36]:
######## BINARY CROSS ENTROPY ###########

def binary_crossentropy(y_pred, y_true):
    n = y_true.shape[0]
    x = np.multiply(y_true, np.log(y_pred))
    xhat = np.multiply((1 - y_true), np.log(1 - y_pred))
    return np.sum(x + xhat) * (-1/n)

In [55]:
### ROUGH WORK ###
#### TESTING BINARY CROSSENTROPY ######

pred = np.asarray([0.75, 0.25])
true = np.asarray([0, 1])
print(binary_crossentropy(y_pred = pred, y_true = true))

1.3862943611198906


In [37]:
######## SMOOTH L1 LOSS ############

def smooth_l1_loss(y_pred, y_true, delta = 1):
    x = np.abs(y_pred - y_true)
    return np.square(np.sum(x[x <= delta]))*0.5 + np.sum(delta * (x[x > delta] - 0.5 * delta))

In [57]:
### ROUGH WORK ###
#### TESTING SMOOTHL1 LOSS #####

pred = np.asarray([[1, 2, 3]])
true = np.asarray([[1.1, 2, 3.3]])

print(smooth_l1_loss(pred, true))

0.07999999999999996


### TO-DO

1. Convert all NumPy vectors into Keras Tensors for Loss calculations.
2. Define Data Generators.
3. Start model training.
4. Check the sampling done for loss calculations.

#### Creating the Region Proposal Network

In [38]:
from keras.models import Model
from keras.layers import Input
from keras.optimizers import Adam

In [39]:
input = Input(shape = (None, None, 3))
feature_map = nn_base(input, trainable = True)
print(feature_map)

Keras tensor
(?, ?, ?, 3)
(?, ?, ?, 64)
(?, ?, ?, 64)
(?, ?, ?, 64)
(?, ?, ?, 64)
(?, ?, ?, 256)
(?, ?, ?, 256)
(?, ?, ?, 256)
(?, ?, ?, 512)
(?, ?, ?, 512)
(?, ?, ?, 512)
(?, ?, ?, 512)
(?, ?, ?, 1024)
(?, ?, ?, 1024)
(?, ?, ?, 1024)
(?, ?, ?, 1024)
(?, ?, ?, 1024)
(?, ?, ?, 1024)
Tensor("activation_80/Relu:0", shape=(?, ?, ?, 1024), dtype=float32)


In [40]:
pred_class_scores, pred_reg_scores, features = rpn(feature_map, k)
print(pred_class_scores, pred_reg_scores, features)
# rpn_output = rpn(feature_map, k)
# print(rpn_output[:2])

Tensor("reshape_3_1/Reshape:0", shape=(?, ?, 1), dtype=float32) Tensor("reshape_4/Reshape:0", shape=(?, ?, 4), dtype=float32) Tensor("activation_80/Relu:0", shape=(?, ?, ?, 1024), dtype=float32)


In [41]:
#### USING KERAS FUNCTIONAL API ####

model_rpn = Model(input, [pred_class_scores, pred_reg_scores])
# model_rpn = Model(input, rpn_output[:2])
print(model_rpn)

<keras.engine.training.Model object at 0x000001AAB82467C8>


In [42]:
### LOADING WEIGHTS FOR PRETRAINED RESNET ON IMAGENET DATASET ###

weights_path = "C:\\Users\\Dyanesh\\Deep Learning\\keras_frcnn\\weights\\resnet50_weights_tf_dim_ordering_tf_kernels.h5"
print("Getting weights from {}".format(weights_path))
model_rpn.load_weights(weights_path, by_name = True)

Getting weights from C:\Users\Dyanesh\Deep Learning\keras_frcnn\weights\resnet50_weights_tf_dim_ordering_tf_kernels.h5


In [43]:
optimizer = Adam(lr = 1e-5, clipnorm = 0.001)

In [44]:
def rpn_loss_box_reg(y_true, y_pred):
#     def smooth_l1_sample():
    print("Regression Loss")
    print(y_true.shape, y_pred.shape)
#     return 0
#     K
    loss = K.abs(K.sum(y_true) - K.sum(y_pred))
    return loss
#     return K.placeholder(shape=(1,))
#     return smooth_l1_sample

HUBER_DELTA = 1
def smoothL1(y_true, y_pred):
   x   = K.abs(y_true - y_pred)
   x   = K.switch(x < HUBER_DELTA, 0.5 * x ** 2, HUBER_DELTA * (x - 0.5 * HUBER_DELTA))
   return  K.sum(x)

def binary_crossentropy(y_true, y_pred):
    return K.mean(K.binary_crossentropy(y_true, y_pred))

def rpn_loss_cls(y_true, y_pred):
#     def binary_crossentropy_sample(y_true, y_pred):
    print("Classification Loss")
    print(y_true.shape, y_pred.shape)
    loss = K.abs(K.sum(y_true) - K.sum(y_pred))
    return loss
#     return K.constant(1)
#     return K.placeholder(shape=(1,))
#     return binary_crossentropy_sample

In [45]:
# model_rpn.compile(optimizer=optimizer, loss=[rpn_loss_cls, rpn_loss_box_reg])
model_rpn.compile(optimizer=optimizer, loss=[binary_crossentropy, smoothL1])

W1204 23:12:55.113102 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1204 23:12:55.137044 17700 deprecation.py:323] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [46]:
Callbacks=keras.callbacks.ModelCheckpoint("./models/rpn/rpn.resnet.weights.{epoch:02d}-{loss:.2f}.hdf5", monitor='loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=4)
callback=[Callbacks]

In [47]:
model_rpn.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
zero_padding2d_2 (ZeroPadding2D (None, None, None, 3 0           input_2[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9472        zero_padding2d_2[0][0]           
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None, None, 6 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

#### Defining the Data Generator

In [48]:
img_data = []

### SAMPLE IMAGE ###

data = {}
data['img'] = img
data['bboxs'] = bbox

img_data.append(data)

In [49]:
def get_targets(bboxs, anchor_scales, anchor_ratios, feature_map_size, subsampling_ratio):
    
    k = len(anchor_scales) * len(anchor_ratios)
    anchors = np.zeros((k * feature_map_size * feature_map_size, 4))
    
    x = np.arange(subsampling_ratio/2, width, subsampling_ratio, dtype = np.int32)
    
    ### ANCHOR CENTRES ###
    
    anchor_centres = np.zeros((anchors.shape[0], 2))
    anchor_centres[:, 0] = np.tile(np.repeat(x, k), feature_map_size) #XCOORDINATES
    anchor_centres[:, 1] = np.repeat(x, k * feature_map_size) #YCOORDINATES
    
    ### ANCHOR BOX COORDINATES ###
    
    start_box_no = 0
    for scale in anchor_scales:
        for ratio in anchor_ratios:
            w = subsampling_ratio * scale * np.sqrt(ratio)
            h = subsampling_ratio * scale * (1/np.sqrt(ratio))

            ### ANCHOR BOX COORDINATES WITH SCALEi, RATIOj###
            anchor_coords = np.arange(start_box_no, anchor_centres.shape[0], step = k)

            anchors[anchor_coords, 0] = anchor_centres[anchor_coords, 0] - w/2 # XMIN
            anchors[anchor_coords, 2] = anchor_centres[anchor_coords, 0] + w/2 # YMIN
            anchors[anchor_coords, 1] = anchor_centres[anchor_coords, 1] - h/2 # XMAX
            anchors[anchor_coords, 3] = anchor_centres[anchor_coords, 1] + h/2 # YMAX

            start_box_no += 1
    
    ### REMOVING OUT OF BOX ANCHORS ####
    
    filtered_anchors = anchors[
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= width) &
        (anchors[:, 3] <= height)
    ]

    ### CALCULATING IOUS ###
    ious = np.zeros((bbox.shape[0], filtered_anchors.shape[0]), dtype = np.float64)

    for i,a in enumerate(bbox):
        for j,b in enumerate(filtered_anchors):
            ious[i,j] = IoU(a, b)

    ## BEST ANCHOR FOR EACH GROUND TRUTH ##
    gt_best_arg_anchors = ious.argmax(axis = 1)

    ## BEST ANCHOR IOUS ##
    gt_best_anchors_ious = ious.max(axis = 1)

    ## ANCHORS THAT HAVE IOU EQUAL TO BEST ANCHOR IOU ##
    gt_best_anchors = np.where(np.isin(ious, gt_best_anchors_ious))[1]
    
    anchor_best_arg_gt = ious.argmax(axis = 0)
    anchor_best_gt_ious = ious.max(axis = 0)
    
    ### LABELS ###
    
    labels = np.full((filtered_anchors.shape[0],), fill_value = -1, dtype = np.int32)
    positive_threshold = 0.7
    negative_threshold = 0.3

    ### POSITIVE LABELS ###
    ## CONDITION 1

    labels[gt_best_anchors] = 1

    ## CONDITION 2

    labels[anchor_best_gt_ious >= positive_threshold] = 1
    ### NGEATIVE LABELS ###

    labels[anchor_best_gt_ious < negative_threshold] = 0    
    
    ### SAMPLING ###
    
    sample_size = 256
    pos_ratio = 0.5

    pos_size = sample_size * pos_ratio
    neg_size = sample_size - pos_size
    
    pos_labels = labels[labels == 1]
    neg_labels = labels[labels == 0]
    disabled_labels = labels[labels == -1]
    
    ### DISABLING ANCHORS ###

    if len(pos_labels) > pos_size:
        pos_indices = np.where(labels == 1)[0]
        disabled_indices = np.random.choice(pos_indices, len(pos_labels) - pos_size, replace = False)
        labels[disabled_indices] = -1

    ## UPDATE NO. OF POSITIVE LABELS ##
    pos_size = len(pos_labels)

    if len(neg_labels) > sample_size - pos_size:
        neg_indices = np.where(labels == 0)[0]
        disabled_indices = np.random.choice(neg_indices, len(neg_indices) - pos_size, replace = False)
        labels[disabled_indices] = -1

    ## UPDATE NO. OF NEGATIVE LABELS ##
    neg_size = pos_size
    
    anchor_best_gt_boxes_coords = bbox[anchor_best_arg_gt]

    ## CALCULATING ANCHOR BOX XCENTER, YCENTER, WIDTH, HEIGHT ##

    anchor_w = filtered_anchors[:,2] - filtered_anchors[:,0]
    anchor_h = filtered_anchors[:,3] - filtered_anchors[:,1]

    anchor_x_c = filtered_anchors[:,0] + 0.5 * anchor_w
    anchor_y_c = filtered_anchors[:,1] + 0.5 * anchor_h
    
    ## CALCULATING GROUND TRUTH XCENTER, YCENTER, WIDTH, HEIGHT ##

    gt_w = anchor_best_gt_boxes_coords[:,2] - anchor_best_gt_boxes_coords[:,0]
    gt_h = anchor_best_gt_boxes_coords[:,3] - anchor_best_gt_boxes_coords[:,1]

    gt_x_c = anchor_best_gt_boxes_coords[:,0] + 0.5 * gt_w
    gt_y_c = anchor_best_gt_boxes_coords[:,1] + 0.5 * gt_h
    
    ## PARAMETERISING ##

    t_x = (gt_x_c - anchor_x_c)/anchor_w
    t_y = (gt_y_c - anchor_y_c)/anchor_h
    t_w = np.log(gt_w/anchor_w)
    t_h = np.log(gt_h/anchor_h)

    t_anchors = np.zeros((filtered_anchors.shape[0], 4))
    t_anchors[:,0] = t_x
    t_anchors[:,1] = t_y
    t_anchors[:,2] = t_w
    t_anchors[:,3] = t_h
    
    anchor_targets = np.zeros((anchors.shape[0], 4))

    anchor_targets[
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= width) &
        (anchors[:, 3] <= height)
    ] = t_anchors

    anchor_labels = np.full((anchors.shape[0],), fill_value = -1, dtype = np.int32)
    anchor_labels[
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= width) &
        (anchors[:, 3] <= height)
    ] = labels

    return anchor_labels, anchor_targets

In [50]:
def train_generator(img_data, anchor_scales, anchor_ratios, feature_map_size, subsampling_ratio):
    while True:
        for data in img_data:
            img = np.expand_dims(data['img'], axis = 0)
            bboxs= data['bboxs']
            anchor_cls_targets, anchor_reg_targets = get_targets(bboxs, anchor_scales, anchor_ratios, feature_map_size=feature_map_size, subsampling_ratio=subsampling_ratio)
            print(anchor_cls_targets.shape, anchor_reg_targets.shape, img.shape)
            anchor_cls_targets = np.expand_dims(anchor_cls_targets, axis = 0).reshape((1, -1, 1))
            anchor_reg_targets = np.expand_dims(anchor_reg_targets, axis = 0).reshape((1, -1, 4))
            yield np.copy(img), [np.copy(anchor_cls_targets), np.copy(anchor_reg_targets)]

In [34]:
history = model_rpn.fit_generator(train_generator(img_data, scales, ratios, feature_map_size, subsampling_ratio), epochs=1, steps_per_epoch = 1, callbacks=callback)
loss_history = history.history["loss"]

Epoch 1/1
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)
(36864,) (36864, 4) (1, 1024, 1024, 3)


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted: OOM when allocating tensor with shape[1,255,255,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training/Adam/gradients/zeros_141-0-1-TransposeNCHWToNHWC-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[loss/reshape_6_loss/Mean_2/_3999]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted: OOM when allocating tensor with shape[1,255,255,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training/Adam/gradients/zeros_141-0-1-TransposeNCHWToNHWC-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored.

#### Loss Function
> There seems to be an issue in the loss function. This is because the tensors used belong to the Tensorflow Framework. \
To mitigate this, I need to learn how to use those tensors. That's what this section will be about.

##### Regression Loss

In [51]:
a = K.constant([1,2,3,4,5,6,7,8,9,10,11,12])
print(a)

b = K.reshape(a, (1,-1, 4))
print(b)

### Predicted.
c = K.ones((1, 64, 64, 36))
print(K.reshape(c, (1, -1, 4)))


### Targets.

d = K.ones((1, 36864, 4))


print("Predicted Shape = {}".format(c.shape))
print("Targets Shape = {}".format(d.shape))


###  NOW WE HAVE TO CALCULATE HUBER'S LOSS ###
c = K.reshape(c, (1, -1, 4))

x = K.abs(c - d)
print(x)

less_than_delta = K.less_equal(x, 1)
print(K.square(x[less_than_delta] * 0.5))

print(less_than_delta)

# print(c[c < 1])

Tensor("Const:0", shape=(12,), dtype=float32)
Tensor("Reshape_5:0", shape=(1, 3, 4), dtype=float32)
Tensor("Reshape_6:0", shape=(1, 36864, 4), dtype=float32)
Predicted Shape = (1, 64, 64, 36)
Targets Shape = (1, 36864, 4)
Tensor("Abs:0", shape=(1, 36864, 4), dtype=float32)
Tensor("Square:0", shape=(?,), dtype=float32)
Tensor("LessEqual:0", shape=(1, 36864, 4), dtype=bool)


In [52]:
HUBER_DELTA = 1
def smoothL1(y_true, y_pred):
   x   = K.abs(y_true - y_pred)
   x   = K.switch(x < HUBER_DELTA, 0.5 * x ** 2, HUBER_DELTA * (x - 0.5 * HUBER_DELTA))
   return  K.sum(x)

In [53]:
c = np.asarray([1, 2, 3])
d = np.asarray([1.1, 2, 3.3])
K.eval(smoothL1(c, d))

0.04999999999999996

##### Binary Crossentropy

In [54]:
a = K.constant([1, 1, 1e-5, 1, 0, 1, 0, 1], dtype = 'float32')
b = K.constant([0.5, 1, 1e-7, 0, 1, 1, 1, 0], dtype = 'float32')

In [55]:
print(K.eval(K.mean(K.binary_crossentropy(a,b))))

8.101784


In [56]:
K.eval(K.log(b))

array([ -0.6931472,   0.       , -16.118095 ,        -inf,   0.       ,
         0.       ,   0.       ,        -inf], dtype=float32)

In [57]:
def binary_crossentropy(y_true, y_pred):
    y_true = K.switch(y_true <= 0, 1e-15, y_true)
    y_pred = K.switch(y_pred <= 0, 1e-15, y_pred)
#     y_true = K.cast(y_true, 'float32') + 1e-7
#     y_pred = K.cast(y_pred, 'float32') + 1e-7
    a = y_true * K.log(y_pred)
    print(K.eval(a))
    b = (1 - y_true) * K.log(1 - y_pred) 
    print(K.eval(b))
#     sum = K.sum(a + b)

#     n = K.mean()
#     return sum
    return -1*K.mean(a + b)

In [125]:
print(K.eval(binary_crossentropy(a, b)))

AttributeError: 'float' object has no attribute 'get_shape'

### Generating Proposals for Faster RCNN
> If we have successfully trained the network, we would have the following -
    > -  Objectness Scores or Classification Scores.
    > -  Predicted Boxes Coordinates.

> To reduce the no. of proposals generated, we are applying Non-Maximum Suppression (NMS).
> After applying NMS, we will be taking only a number of top proposals sorted according to their classification scores.

- Researchers say that the reduction in the number of proposals doesnot affect the results. For the moment, let's keep that belief and work on the problem.

#### SINCE WE ARE LACKING THE RESOURCES TO RUN THE RPN, WE WILL BE GENERATING RANDOM SCORES AND LOCATIONS TO CONTINUE OUR WORK. ITS REALLY SAD AND DEPRESSING.STOP COMPLAINING AND START WORKING.

In [58]:
import tensorflow as tf
import numpy as np
import cv2
import keras.backend as K
import keras

##### Random Classification Scores

In [59]:
pred_cls_scores = np.random.normal(size=(1, 36864, 1))

## PROBABILITY CANT BE NEGATIVE ##

pred_cls_scores[(pred_cls_scores < 0) | (pred_cls_scores > 1)] = 0
print(pred_cls_scores.shape)

(1, 36864, 1)


##### Random Regression Scores

In [60]:
pred_reg_scores = np.random.normal(size=(1, 36864, 4))
print(pred_reg_scores.shape)

(1, 36864, 4)


#### Thresholds for Non-Maximum Supression

- Whether we are training or testing.
- Minimum IOU threshold for combining proposals. Redundancy is reduced majorly here.
- No. of proposals before NMS in testing/training.
- No. of proposals after NMS in testing/training.
- Minimum height/width of proposal.

In [61]:
nms_thresh = 0.7
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

#### Converting Location Predictions from RPN to Bounding Boxes

- Unparameterize the Predictions.
- Convert dxc, dxy, dh, dw into xmin, ymin, xmax, ymax

In [62]:
### formula ###

'''
x = w(anchor) * dxc(p) + xc(anchor)
y = h(anchor) * dxc(p) + yc(anchor)
h = exp(dh(p)) * h(anchor)
w = exp(dw(p)) * w(anchor)

'''

##  CALCULATING ANCHOR PARAMETERS

anchors_height = anchors[:, 3] - anchors[:, 1]
anchors_width = anchors[:, 2] - anchors[:, 0]
anchors_xc = anchors[:, 0] + 0.5 * anchors_width
anchors_yc = anchors[:, 1] + 0.5 * anchors_height


print(anchors_height.shape)
print(anchors_yc.shape)



## EXTRACTING PREDICTIONS FROM REGRESSION OUTPUT ##

pred_dxc = pred_reg_scores[0,:,0]
pred_dyc = pred_reg_scores[0,:,1]
pred_dh = pred_reg_scores[0,:,2]
pred_dw = pred_reg_scores[0,:,3]

print(pred_dxc.shape)
print(pred_dw.shape)


## EXTRACTING OBJECTNESS SCORES FROM CLASSIFICATION OUTPUT ## 

objectness_scores = pred_cls_scores[0, :, 0]


## APPLYING FORMULA FOR UNPARAMETERISATION ##


pred_xc = pred_dxc * anchors_width + anchors_xc
pred_yc = pred_dyc * anchors_height + anchors_yc
pred_h = np.exp(pred_dh) * anchors_height
pred_w = np.exp(pred_dw) * anchors_width

print(pred_xc.shape)
print(pred_w.shape)


## CONVERTING TO BOUDNING BOX COORDINATES ##


pred_xmin = pred_xc - 0.5 * pred_w
pred_xmax = pred_xc + 0.5 * pred_w
pred_ymin = pred_yc - 0.5 * pred_h
pred_ymax = pred_yc + 0.5 * pred_h


print(pred_ymax.shape)
print(pred_xmin.shape)

rois = np.zeros((anchors.shape[0], 4), dtype = np.float32)
rois[:, 0] = pred_xmin
rois[:, 1] = pred_ymin
rois[:, 2] = pred_xmax
rois[:, 3] = pred_ymax

print(rois[:3])

(36864,)
(36864,)
(36864,)
(36864,)
(36864,)
(36864,)
(36864,)
(36864,)
[[ -99.520775    38.29225     25.52587    154.25854  ]
 [   2.3767023   42.496452    92.419754   236.17397  ]
 [ -66.855316  -365.69348     -9.587412   391.69382  ]]


In [63]:
rois[:,[1, 3]]

array([[  38.29225 ,  154.25854 ],
       [  42.496452,  236.17397 ],
       [-365.69348 ,  391.69382 ],
       ...,
       [1726.7717  , 2144.5     ],
       [1011.49396 , 1472.5171  ],
       [ 341.45135 , 1796.6119  ]], dtype=float32)

#### As we can see, they are out of image proposals generated. We need to keep them inside by clipping.

In [64]:
## CLIPPING X-COORDINATES ##

rois[:, (0, 2)] = np.clip(rois[:, (0, 2)], 0, width)

## CLIPPING Y-COORDINATES ##

rois[:, (1, 3)] = np.clip(rois[:, (1, 3)], 0, height)

print(rois[:10])

[[   0.          38.29225     25.52587    154.25854  ]
 [   2.3767023   42.496452    92.419754   236.17397  ]
 [   0.           0.           0.         391.69382  ]
 [   0.           0.         369.47177     26.06432  ]
 [  71.26292      6.3297963  256.02853     81.87051  ]
 [   0.          39.532433   157.40517    224.73961  ]
 [   0.           0.        1024.         893.2947   ]
 [   0.           0.         870.039     1024.       ]
 [ 441.94522      0.        1024.         162.75804  ]
 [  38.84046      0.         110.970535   195.47208  ]]


#### Remove Regions whose height or width is lesser than the Threshold.

In [65]:
rois_h = rois[:, 3] - rois[:, 1]
rois_w = rois[:, 2] - rois[:, 0]

rois = rois[(rois_h >= min_size) & (rois_w >= min_size)]
objectness_scores = objectness_scores[(rois_h >= min_size) & (rois_w >= min_size)]


print(rois.shape)
print(objectness_scores.shape)

(28085, 4)
(28085,)


#### Sorting and Choosing the top N Scores

In [66]:
sorted_indices = np.argsort(objectness_scores)[::-1][:n_train_pre_nms]

print(objectness_scores.shape)
print(sorted_indices.shape)


rois = rois[sorted_indices]
objectness_scores = objectness_scores[sorted_indices]

print(objectness_scores.shape)
print(sorted_indices.shape)
print(rois[:5])

(28085,)
(12000,)
(12000,)
(12000,)
[[ 953.44745    318.6558    1024.         430.48746  ]
 [ 369.32147      2.3751333  584.84283    131.1767   ]
 [ 347.8285     220.29532    599.2389     394.81778  ]
 [ 562.86804     46.53373    599.707      382.1605   ]
 [ 157.35573     63.79439    247.46269    255.02077  ]]


### Non-Maximum Suppression

> This is where regions with high overlaps are merged. This reduces the overall number of regions proposed by RPN. 

In [67]:
count = 0

final_rois = []

ordered_indices = np.argsort(objectness_scores)[::-1].astype(np.int32)

print(ordered_indices[:10])
print(objectness_scores.shape)

roi_x1, roi_x2, roi_y1, roi_y2 = rois[:,0], rois[:, 1], rois[:, 2], rois[:, 3]
areas = (roi_x2 - roi_x1 + 1) * (roi_y2 - roi_y1 + 1)

while ordered_indices.size > 0:
    
    top = ordered_indices[0]
    
    final_rois.append(rois[top])

    ## CALCULATE IOUS WITH OTHER REGIONS ##

    x1 = np.maximum(roi_x1[top], roi_x1[ordered_indices[1:]])
    x2 = np.maximum(roi_x2[top], roi_x2[ordered_indices[1:]])
    y1 = np.minimum(roi_y1[top], roi_y1[ordered_indices[1:]])
    y2 = np.minimum(roi_y2[top], roi_y2[ordered_indices[1:]])
    
    w = np.maximum(0.0, x2 - x1 + 1)
    h = np.maximum(0.0, y2 - y1 + 1)
    
    intersection = h * w
    union = areas[top] + areas[ordered_indices[1:]] - intersection
    
    ious = intersection / union
    
    ## REMOVE OVERLAPPING REGIONS
    
    valid_indicdes = np.where(ious <= nms_thresh)[0]
    
    ordered_indices = ordered_indices[valid_indicdes + 1]
    
    
print(len(final_rois))

final_rois = np.asarray(final_rois[:n_train_post_nms], dtype = np.float32)

print(final_rois.shape)

[0 1 2 3 4 5 6 7 8 9]
(12000,)
5806
(2000, 4)


### Generating Targets for R-CNN Network.

> It is from this output that we have to take 128 samples. My earlier confusion should get cleared up here. Let's see. Okay, now, let's see what we are supposed to do here - 
> - No. of Regions to sample - 128
> - Positive Ratio - 0.5
> - Minimum IOU Threshold for Positive Sample - 0.5
> - IOU < 0.5 => Negative Sample or Background Sample.

> Algorithm - 
> - For each possible combination of Region and Ground-truth, find IOU.
> > - For each Region, find which Ground-truth has max IOU. If its greater than threshold, assign the class label.
> > - Then, we randomly sample samples * ratio and consider them as positive labels. If the IOU is lesser than threshold, we assign negative label.
> > - We sample randomly again, the remaining for the 128 - pos regions and assign negative labels.

That's it. We are officialy done with sampling.

In [68]:
## THRESHOLDS ##

n_sample = 128
pos_ratio = 0.25
pos_iou_threshold = 0.5
neg_iou_threshold_low, neg_iou_threshold_high = 0.0, 0.5

In [69]:
## CALCULATING IOUs ##


gt_region_ious = np.empty((bbox.shape[0], final_rois.shape[0]), dtype = np.float32)



for i, gtbox in enumerate(bbox):
    for j, region in enumerate(final_rois):
        gt_region_ious[i, j] = IoU(a = gtbox, b = region)
        
        
print(gt_region_ious.shape)

(2, 2000)


In [70]:
## FINDING GROUNDTRUTH FOR WHICH REGION HAS MAX IOU ##

gt_region_max_ious = np.max(gt_region_ious, axis = 0)
gt_region_argmax_ious = np.argmax(gt_region_ious, axis = 0)

print(gt_region_max_ious.shape)


## GREATER THAN THRESHOLD ##
print(gt_region_max_ious[gt_region_max_ious < pos_iou_threshold].shape)


(2000,)
(2000,)


In [71]:
## CLASS LABELS FOR GROUND TRUTHS ##

labels = np.asarray([6, 8])

In [72]:
## INCASE U FORGET THE POWER OF BROADCASTING ## LOOK HERE

print(labels[[0,0,0,0]])


[6 6 6 6]


In [73]:
## ASSIGNING LABELS TO MAX GROUNDTRUTH PROPOSALS ##

print(labels[gt_region_argmax_ious].shape)

gt_region_labels = labels[gt_region_argmax_ious]

(2000,)


In [74]:
## ASSIGNING POSITIVE LABELS ##

positive_samples = np.where(gt_region_max_ious > pos_iou_threshold)[0]
print(positive_samples)

positive_samples_size = min(positive_samples.size, n_sample * pos_ratio)

if positive_samples.size > 0:
    positive_samples = np.random.choice(positive_samples, size = positive_samples_size, replace = False)

print(positive_samples_size)
print(positive_samples)

[]
0
[]


In [75]:
## ASSIGNING NEGATIVE LABELS ##

negative_samples = np.where((gt_region_max_ious >= neg_iou_threshold_low) & (gt_region_max_ious < neg_iou_threshold_high))[0]
negative_samples_size = min(negative_samples.size, n_sample - positive_samples_size)


if negative_samples.size > 0:
    negative_samples = np.random.choice(negative_samples, size = negative_samples_size, replace = False)
    

print(negative_samples)
print(negative_samples.shape)

[  45  191 1127 1627 1410 1394 1346 1191 1231   73  727 1498  107  495
  792 1967  789 1480 1787  298  846  892 1766 1747 1953  895  591 1465
 1580  934 1433  857 1366  578  865  595  920 1815  289 1610  589  995
 1179 1422  471  390  694  948 1969  521  467  373  695 1417  478  167
 1656  126 1819  563 1604  401 1200  491 1187  742 1710  224  959  312
 1279 1328 1599 1440  828  566 1858  514  898   44 1917 1565  853 1163
  165 1099 1082 1752 1857  481  308 1212  371 1934   88 1980 1224 1458
 1682  177 1421  184  523 1240  212   55  663 1025 1205 1484  185  112
  891 1020  347 1667 1706 1223 1412  425  132  749 1232 1722 1132  771
 1355  950]
(128,)


In [76]:
## COMBINING NEGATIVE AND POSITIVE SAMPLES ##

samples_indices = np.append(positive_samples, negative_samples)
print(samples_indices.shape)


## SAMPLES LABELS ##
samples_labels = np.append(gt_region_labels[positive_samples], np.zeros(negative_samples_size))
print(samples_labels.shape)


## SAMPLE BBOXS ##
samples_rois = final_rois[samples_indices]
print(samples_rois.shape)

(128,)
(128,)
(128, 4)


In [77]:
## GROUNDTRUTH BOXES FOR SAMPLES ROIS ##

gt_samples_rois = bbox[gt_region_argmax_ious[samples_indices]]
print(gt_samples_rois.shape)

print(gt_samples_rois[:10])

(128, 4)
[[  20   40  400  100]
 [  20   40  400  100]
 [  20   40  400  100]
 [ 400  800  800 1000]
 [  20   40  400  100]
 [  20   40  400  100]
 [  20   40  400  100]
 [  20   40  400  100]
 [  20   40  400  100]
 [  20   40  400  100]]


> Now, we have the sample regions, their labels. Before we feed them into the ROI netowrk, we need to parameterize them, like we did before for RPN. I mean, the same formulas.

In [78]:
## PARAMETERISING ##

## XC,YX,H,W Format ##


## SAMPLES ##
samples_rois_h = samples_rois[:, 3] - samples_rois[:, 1]
samples_rois_w = samples_rois[:, 2] - samples_rois[:, 0]

samples_rois_xc = samples_rois[:, 0] + 0.5 * samples_rois_w
samples_rois_yc = samples_rois[:, 1] + 0.5 * samples_rois_h


## GTBBOXS ##
gt_samples_rois_h = gt_samples_rois[:, 3] - gt_samples_rois[:, 1]
gt_samples_rois_w = gt_samples_rois[:, 2] - gt_samples_rois[:, 0]

gt_samples_rois_xc = gt_samples_rois[:, 0] + 0.5 * gt_samples_rois_w
gt_samples_rois_yc = gt_samples_rois[:, 1] + 0.5 * gt_samples_rois_h

In [79]:
## APPLYING FORMULAS ##

samples_rois_h = np.maximum(samples_rois_h, np.finfo(samples_rois_h.dtype).eps)
samples_rois_w = np.maximum(samples_rois_w, np.finfo(samples_rois_w.dtype).eps)

dx = (gt_samples_rois_xc - samples_rois_xc) / samples_rois_w
dy = (gt_samples_rois_yc - samples_rois_yc) / samples_rois_h
dh = np.log(gt_samples_rois_h / samples_rois_h)
dw = np.log(gt_samples_rois_w / samples_rois_w)


samples_locations = np.empty((dx.shape[0], 4), dtype = dx.dtype)

samples_locations[:, 0] = dx
samples_locations[:, 1] = dy
samples_locations[:, 2] = dh
samples_locations[:, 3] = dw

print(samples_locations.shape)
print(samples_locations[:10])

(128, 4)
[[ -3.58641375  -1.09752929  -1.31051314   0.91668593]
 [ -1.69518557  -0.76091868  -2.02585788   1.55021638]
 [ -0.56782545  -0.29478242  -1.73783517   0.22439626]
 [ -0.60517788   0.37890625  -1.63315444   0.04173746]
 [-12.62924466  -0.14045737  -0.82726739   1.82672821]
 [ -0.28279866  -2.89182425  -0.38426139  -0.93386678]
 [ -1.01505184  -0.25612565  -1.56525281   1.49135153]
 [ -1.37374779  -0.12998739  -1.14836886  -0.13384852]
 [ -7.27569092  -1.28879363  -1.44851384   1.46662023]
 [ -0.78387659  -7.32905482  -0.15694393  -0.36477341]]


In [80]:
print(samples_labels.shape)
print(samples_locations.shape)

(128,)
(128, 4)


### ROI Pooling

> We have to extract fix-sized feature maps from the feature maps. This is what that will be fed to the feed-forward network.
> To do this, take a proposal and using the coordinates, crop the corresponding feature map and for the moment, resize it to the desired size.
> In our case, the size is (7,7).

In [84]:
## These are the region proposals in the x1, y1, x2, y2

samples_rois


## We need to resize them to the desired size.
#
#
#
##
# Wait, how do we do that? Okay we have the region proposals. Which is in coordinate system of the entire image.
# So, we need to subsample it to the feauture map size. Now, it will be in terms of feature map extracted from the backbone.
# From this, we need to extract the proposal feature map. 
## This proposal feature map should be resized to the desired pool size.
# Sounds easy... Let's try.
#
#
##

print((samples_rois)[:10])
print((samples_rois/subsampling_ratio)[:10])



downsampled_samples_rois = samples_rois/subsampling_ratio
print(downsampled_samples_rois[downsampled_samples_rois > 64].shape)

[[ 678.9495    202.94104   830.88947   425.4256  ]
 [ 306.37592   188.70674   387.0127    643.6635  ]
 [ 230.59311     0.        534.2125    341.10138 ]
 [ 640.3514      0.       1024.       1024.      ]
 [ 951.78723    20.662256 1012.94415   157.8859  ]
 [   0.        280.74783   966.84485   368.8596  ]
 [ 254.0503      0.        339.57623   287.03305 ]
 [ 589.57654     0.       1024.        189.18274 ]
 [ 804.00775   271.4634    891.67523   526.8704  ]
 [ 365.35754   549.371     912.62885   619.5668  ]]
[[42.434345 12.683815 51.93059  26.5891  ]
 [19.148495 11.794171 24.188293 40.22897 ]
 [14.412069  0.       33.388283 21.318836]
 [40.02196   0.       64.       64.      ]
 [59.486702  1.291391 63.30901   9.867868]
 [ 0.       17.54674  60.427803 23.053724]
 [15.878143  0.       21.223515 17.939566]
 [36.848534  0.       64.       11.823921]
 [50.250484 16.966463 55.729702 32.9294  ]
 [22.834846 34.335686 57.039303 38.722923]]
(0,)


In [85]:
fixed_feature_map_size = (14,14)

In [86]:
final_feature_maps = []

for roi in downsampled_samples_rois:
    x1,y1,x2,y2 = roi.astype(np.int32)
    h,w = y2-y1, x2-x1
    
    ## Extracting feature map for each proposal.
    
    proposal_feature_map = feature_map[:,y1:y2,x1:x2,:]
    print("Original Proposal Feauture Map size: {}.".format(proposal_feature_map.shape))
    
    ## Reisze each map to the pool size.
    
    fixed_roi_feature_map = tf.image.resize_images(proposal_feature_map, (fixed_feature_map_size[0], fixed_feature_map_size[1]))

    print("Fixed Shape Feature Map Size: {}.".format(fixed_roi_feature_map.shape))
    
    final_feature_maps.append(fixed_roi_feature_map)
    
    
final_feature_maps = K.reshape(final_feature_maps, (1, downsampled_samples_rois.shape[0], fixed_feature_map_size[0], fixed_feature_map_size[1], 1024))
print(final_feature_maps.shape)

Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature

Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feauture Map size: (?, ?, ?, 1024).
Fixed Shape Feature Map Size: (?, 14, 14, 1024).
Original Proposal Feautu

In [105]:
7*7*512

25088

In [87]:
def identity_block_td(input_tensor, kernel_size, filters, stage, block, trainable=True):
    # identity block time distributed
    nb_filter1, nb_filter2, nb_filter3 = filters
    
    if K.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1

    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    x = TimeDistributed(Convolution2D(nb_filter1, (1, 1), trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2a')(input_tensor)
    x = TimeDistributed(BatchNormalization(axis=bn_axis), name=bn_name_base + '2a')(x)
    x = Activation('relu')(x)
    x = TimeDistributed(Convolution2D(nb_filter2, (kernel_size, kernel_size), trainable=trainable, kernel_initializer='normal',padding='same'), name=conv_name_base + '2b')(x)
    x = TimeDistributed(BatchNormalization(axis=bn_axis), name=bn_name_base + '2b')(x)
    x = Activation('relu')(x)
    x = TimeDistributed(Convolution2D(nb_filter3, (1, 1), trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2c')(x)
    x = TimeDistributed(BatchNormalization(axis=bn_axis), name=bn_name_base + '2c')(x)
    x = Add()([x, input_tensor])
    x = Activation('relu')(x)
    
    return x

def conv_block_td(input_tensor, kernel_size, filters, stage, block, input_shape, strides=(2, 2), trainable=True):
    # conv block time distributed
    
    nb_filter1, nb_filter2, nb_filter3 = filters
    if K.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1

    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    x = TimeDistributed(Convolution2D(nb_filter1, (1, 1), strides=strides, trainable=trainable, kernel_initializer='normal'), input_shape=input_shape, name=conv_name_base + '2a')(input_tensor)
    x = TimeDistributed(BatchNormalization(axis=bn_axis), name=bn_name_base + '2a')(x)
    x = Activation('relu')(x)
    x = TimeDistributed(Convolution2D(nb_filter2, (kernel_size, kernel_size), padding='same', trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2b')(x)
    x = TimeDistributed(BatchNormalization(axis=bn_axis), name=bn_name_base + '2b')(x)
    x = Activation('relu')(x)
    x = TimeDistributed(Convolution2D(nb_filter3, (1, 1), kernel_initializer='normal'), name=conv_name_base + '2c', trainable=trainable)(x)
    x = TimeDistributed(BatchNormalization(axis=bn_axis), name=bn_name_base + '2c')(x)
    
    shortcut = TimeDistributed(Convolution2D(nb_filter3, (1, 1), strides=strides, trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '1')(input_tensor)
    shortcut = TimeDistributed(BatchNormalization(axis=bn_axis), name=bn_name_base + '1')(shortcut)
    
    x = Add()([x, shortcut])
    x = Activation('relu')(x)
    return x

def classifier_layers(x, input_shape, trainable=False):
    if K.backend() == 'tensorflow':
        x = conv_block_td(x, 3, [512, 512, 2048], stage=5, block='a', input_shape=input_shape, strides=(2, 2), trainable=trainable)
    elif K.backend() == 'theano':
        x = conv_block_td(x, 3, [512, 512, 2048], stage=5, block='a', input_shape=input_shape, strides=(1, 1), trainable=trainable)
    x = identity_block_td(x, 3, [512, 512, 2048], stage=5, block='b', trainable=trainable)
    x = identity_block_td(x, 3, [512, 512, 2048], stage=5, block='c', trainable=trainable)
    x = TimeDistributed(AveragePooling2D((7, 7)), name='avg_pool')(x)
    return x

In [88]:
input_shape = (n_sample,14,14,1024)
out = classifier_layers(final_feature_maps, input_shape=input_shape, trainable=True)

W1204 23:38:17.959216 17700 deprecation_wrapper.py:119] From C:\Users\Dyanesh\Anaconda3\envs\tensorflow_gpu\lib\site-packages\keras\backend\tensorflow_backend.py:3980: The name tf.nn.avg_pool is deprecated. Please use tf.nn.avg_pool2d instead.



In [89]:
print(out)

Tensor("avg_pool/transpose_1:0", shape=(1, ?, 1, 1, 2048), dtype=float32)


In [90]:
out = TimeDistributed(Flatten())(out)

nb_classes = len(labels) + 1
out_class = TimeDistributed(Dense(nb_classes, activation='softmax', kernel_initializer='zero'), name='dense_class_{}'.format(nb_classes))(out)
out_regr = TimeDistributed(Dense(4 * (nb_classes-1), activation='linear', kernel_initializer='zero'), name='dense_regress_{}'.format(nb_classes))(out)

In [91]:
print(out)
print(out_class)
print(out_regr)

Tensor("time_distributed_1/transpose_1:0", shape=(1, ?, 2048), dtype=float32)
Tensor("dense_class_3/transpose_1:0", shape=(1, ?, 3), dtype=float32)
Tensor("dense_regress_3/transpose_1:0", shape=(1, ?, 8), dtype=float32)


#### Defining ROI Pooling as a Layer

In [92]:
from keras.engine.topology import Layer

### DEFINING A KERASS LAYER ###

'''
Define the follwoing functions and wrap it in a class.
1. Call()
2. Build()
3. ComputeShape()
'''
###############################

class RoiPoolingConv(Layer):

    def __init__(self, pool_size, num_rois, **kwargs):
        self.pool_size = pool_size
        self.num_rois = num_rois
        super(RoiPoolingConv, self).__init__(**kwargs)

    def build(self, input_shape):
        self.nb_channels = input_shape[0][3]

    def compute_output_shape(self, input_shape):
        return None, self.num_rois, self.pool_size, self.pool_size, self.nb_channels

    def call(self, x, mask=None):
        assert(len(x) == 2)
        img = x[0]
        rois = x[1]
        input_shape = K.shape(img)
        outputs = []

        for roi_idx in range(self.num_rois):
            x = rois[0, roi_idx, 0]
            y = rois[0, roi_idx, 1]
            w = rois[0, roi_idx, 2]
            h = rois[0, roi_idx, 3]
            
            row_length = w / float(self.pool_size)
            col_length = h / float(self.pool_size)

            num_pool_regions = self.pool_size

            x = K.cast(x, 'int32')
            y = K.cast(y, 'int32')
            w = K.cast(w, 'int32')
            h = K.cast(h, 'int32')

            rs = tf.image.resize_images(img[:, y:y+h, x:x+w, :], (self.pool_size, self.pool_size))
            outputs.append(rs)

        final_output = K.concatenate(outputs, axis=0)
        final_output = K.reshape(final_output, (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels))

        # final_output = K.permute_dimensions(final_output, (0, 1, 2, 3, 4))
        return final_output

    def get_config(self):
        config = {'pool_size': self.pool_size,
                  'num_rois': self.num_rois}

        base_config = super(RoiPoolingConv, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [93]:
def classifier(base_layers, input_rois, num_rois, nb_classes = 21, trainable=False):
    if K.backend() == 'tensorflow':
        pooling_regions = 14
        input_shape = (num_rois,14,14,1024)

    out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([base_layers, input_rois])
    out = classifier_layers(out_roi_pool, input_shape=input_shape, trainable=True)
    out = TimeDistributed(Flatten())(out)
    out_class = TimeDistributed(Dense(nb_classes, activation='softmax', kernel_initializer='zero'), name='dense_class_{}'.format(nb_classes))(out)
    out_regr = TimeDistributed(Dense(4 * (nb_classes-1), activation='linear', kernel_initializer='zero'), name='dense_regress_{}'.format(nb_classes))(out)
    return [out_class, out_regr]

In [94]:
feature_map_input = Input(shape=(None, None, 1024))
roi_input = Input(shape=(n_sample, 4))
classifier_network = classifier(base_layers=feature_map_input, input_rois=roi_input, num_rois=128, nb_classes=nb_classes)

In [104]:
out_sample_rois = RoiPoolingConv(pool_size=14, num_rois=samples_rois.shape[0])([feature_map, tf.expand_dims(samples_rois, axis=0)])
print(out_sample_rois.shape)
print(out_sample_rois[0,:10])

(1, 128, 14, 14, 1024)
Tensor("strided_slice_258:0", shape=(10, 14, 14, 1024), dtype=float32)
