In [2]:
import torch
import numpy as np
import torchvision
import torch.nn as nn

In [3]:
image = torch.zeros((1, 3, 800, 800)).float()

bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]])
# [y1, x1, y2, x2] format
labels = torch.LongTensor([6, 8])  # 0 represents background
sub_sample = 16

In [4]:
#Using a dummy image to pick enough layers to get required dimension
dummy_img = torch.zeros((1, 3, 800, 800)).float()

#Using pretrained VGG16-Net as the head network for feature extraction
model = torchvision.models.vgg16(pretrained=True)
fe = list(model.features)

In [7]:
#Using the dummy image to pick out enough layers to reduce the image dimensions
#to w/16 and h/16 as per the original paper
req_features = []
k = dummy_img.clone()
out_channels = None
for i in fe:
    k = i(k)
    if k.size()[2] < 800 // 16:
        break
    req_features.append(i)
    out_channels = k.size()[1]

print(len(req_features))
print(out_channels)

faster_rcnn_fe_extractor = nn.Sequential(*req_features)
out_map = faster_rcnn_fe_extractor(image)
print(out_map.size())

#the map is as per the dimensions required

30
512
torch.Size([1, 512, 50, 50])


In [10]:
#Sub sample of 16 used since the input img was reduced by 16th dimension
#Every pixel in the output feature map corresponds to a 16x16 region in the input image
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), dtype=np.float32)

print(anchor_base)
# test bounding boxes creation from single anchor
center_y = 50.
center_x = 120.

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [11]:
#Generate the four coordinates of the 9 anchor boxes for one pixel in feature map
#This is just a check to see if its working, not used anywhere finally
for i in range(len(ratios)):
    print(f'ratio {ratios[i]}:')
    for j in range(len(anchor_scales)):
        h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
        w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratios[i])

        index = i * len(anchor_scales) + j

        anchor_base[index, 0] = center_y - h / 2.
        anchor_base[index, 1] = center_x - w / 2.
        anchor_base[index, 2] = center_y + h / 2.
        anchor_base[index, 3] = center_x + w / 2.
        print(f'\tscale {anchor_scales[j]}: {anchor_base[index]}')

ratio 0.5:
	scale 8: [  4.745166  29.490332  95.25484  210.50967 ]
	scale 16: [-40.509666 -61.019337 140.50967  301.01935 ]
	scale 32: [-131.01933 -242.03867  231.01933  482.03867]
ratio 1:
	scale 8: [-14.  56. 114. 184.]
	scale 16: [-78.  -8. 178. 248.]
	scale 32: [-206. -136.  306.  376.]
ratio 2:
	scale 8: [-40.509666  74.74516  140.50967  165.25484 ]
	scale 16: [-131.01933    29.490332  231.01933   210.50967 ]
	scale 32: [-312.03867   -61.019337  412.03867   301.01935 ]


In [12]:
#generating the centers for all of the pixels
fe_size = 800 // 16
center_x = np.arange(16, (fe_size + 1) * 16, 16)
center_y = np.arange(16, (fe_size + 1) * 16, 16)

centers = np.zeros((len(center_x) * len(center_x), 2))

index = 0
for x in range(len(center_x)):
    for y in range(len(center_y)):
        centers[index, 0] = center_y[y] - 8
        centers[index, 1] = center_x[x] - 8
        index += 1

anchors = np.zeros((len(centers) * 9, 4), dtype=np.float32)

In [13]:
index = 0
for c in centers:
    ctr_y, ctr_x = c
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratios[i])

            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1

print(anchors.shape)
print(anchors)

(22500, 4)
[[ -37.254833  -82.50967    53.254833   98.50967 ]
 [ -82.50967  -173.01933    98.50967   189.01933 ]
 [-173.01933  -354.03867   189.01933   370.03867 ]
 ...
 [ 701.49036   746.7452    882.50964   837.2548  ]
 [ 610.98065   701.49036   973.01935   882.50964 ]
 [ 429.96133   610.98065  1154.0387    973.01935 ]]


In [15]:
#First finding the index of all the valid anchor boxes, ie, the ones whose coordinates are inside the image
inside_indexes = np.where(
    (anchors[:, 0] >= 0) &
    (anchors[:, 1] >= 0) &
    (anchors[:, 2] <= 800) &
    (anchors[:, 3] <= 800)
)[0]
print(inside_indexes.shape)
n_box = len(inside_indexes)

#initially all the boxes are taken to be inconsequential
bbox_labels = np.empty((n_box,), dtype=np.int32)
bbox_labels.fill(-1)
print(bbox_labels.shape)

(8940,)
(8940,)


In [16]:
#pick out the anchor coordinates corresponding to the correct boxes
valid_anchor_boxes = anchors[inside_indexes]
print(valid_anchor_boxes.shape)

(8940, 4)


In [17]:
ious = np.empty((len(valid_anchor_boxes), 2), dtype=np.float32)
ious.fill(0)

for num1, i in enumerate(valid_anchor_boxes):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)

        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])

        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
            iou = iter_area / (anchor_area + box_area - iter_area)
        else:
            iou = 0.

        ious[num1, num2] = iou
print(ious.shape)

(8940, 2)


In [13]:
#Assigning labels to the objects according to the paper guideline
#1. The anchors which have the highest IoU overlap with the ground truth box
#2. Acnchors with an IoU overlap higher than the positive threshold

#3. Negative label to the anchors with IoU lesser than the negative threshold
#4. Ignore the anchors which are neither positive nor negative

In [19]:
#This is for the anchors with the highest IoU overlap with the ground truth box
gt_argmax_ious = ious.argmax(axis=0)
print(gt_argmax_ious)

gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
print(gt_max_ious)

#This in a way the reverse of the above condition, 
#it tells which ground truth box has the highest IoU overlap with an anchor box
argmax_ious = ious.argmax(axis=1)
print(argmax_ious.shape)
print(argmax_ious)

max_ious = ious[np.arange(len(ious)), argmax_ious]
print(max_ious)

#Finding the IoU with this max_ious values
gt_argmax_ious = np.where(ious == gt_max_ious)[0]
print(gt_argmax_ious)

[2262 5620]
[0.68130493 0.61035156]
(8940,)
[0 0 0 ... 0 0 0]
[0.06811669 0.07083762 0.07083762 ... 0.         0.         0.        ]
[2262 2508 5620 5628 5636 5644 5866 5874 5882 5890 6112 6120 6128 6136
 6358 6366 6374 6382]


In [20]:
#Condition 2 and 3, IoU(s) above and below the thresholds
pos_iou_threshold = 0.7
neg_iou_threshold = 0.3

bbox_labels[max_ious < neg_iou_threshold] = 0
bbox_labels[max_ious >= pos_iou_threshold] = 1


#Condition 1, assigning positive labels to all IoU with max overlap with the ground truth box
bbox_labels[gt_argmax_ious] = 1

In [21]:
#The paper says that each mini batch arises from a single image with many positive and negative samples 
#but has a bias towards negative samples, due to their higher numbers
#To sort this problem, we have to randomly sample 256 anchors in the image to compute the loss function
#and keep the positive and negavite samples in a 1:1 ratio

pos_ratio = 0.5
n_sample = 256
n_pos = pos_ratio * n_sample

pos_index = np.where(bbox_labels == 1)[0]

#If there are enough positive labels, then we sample n_positive and ignore the remaining ones
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
    bbox_labels[disable_index] = -1
    
n_neg = n_sample - np.sum(bbox_labels == 1)
neg_index = np.where(bbox_labels == 0)[0]
print(len(neg_index), n_neg)
#Ignoring the extra negative boxes
if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace=False)
    bbox_labels[disable_index] = -1

7690 238


In [22]:
#for each anchor box, finding the ground truth box which has max_iou
max_iou_bbox = bbox[argmax_ious]
print(max_iou_bbox)
#This should be of the shape len(valid_anchor_boxes, 4)

#Location of the anchor boxes wrt the ground truth box
height = valid_anchor_boxes[:, 2] - valid_anchor_boxes[:, 0]
width = valid_anchor_boxes[:, 3] - valid_anchor_boxes[:, 1]
ctr_y = valid_anchor_boxes[:, 0] + 0.5 * height
ctr_x = valid_anchor_boxes[:, 1] + 0.5 * width

base_height = (max_iou_bbox[:, 2] - max_iou_bbox[:, 0]).cpu().numpy()
base_width = (max_iou_bbox[:, 3] - max_iou_bbox[:, 1]).cpu().numpy()
base_ctr_y = max_iou_bbox[:, 0].cpu().numpy() + 0.5 * base_height
base_ctr_x = max_iou_bbox[:, 1].cpu().numpy() + 0.5 * base_width

eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(anchor_locs)

tensor([[ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.],
        ...,
        [ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.]])
[[ 0.5855727   2.3091455   0.7415673   1.647276  ]
 [ 0.49718437  2.3091455   0.7415673   1.647276  ]
 [ 0.40879607  2.3091455   0.7415673   1.647276  ]
 ...
 [-2.50802    -5.292254    0.7415677   1.6472763 ]
 [-2.5964084  -5.292254    0.7415677   1.6472763 ]
 [-2.6847968  -5.292254    0.7415677   1.6472763 ]]


In [28]:
#Mapping the anchor locations and the labels to the original anchor boxes
#This is to all the anchor boxes, the invalid ones are marked
anchor_labels = np.empty((len(anchors),), dtype=bbox_labels.dtype)
anchor_labels.fill(-1)
anchor_labels[inside_indexes] = bbox_labels

anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[inside_indexes, :] = anchor_locs

print(anchor_locations.shape)
print(anchor_labels.shape)

#INPUTS TO THE RPN NETWORK ARE OBTAINED FINALLY!!!

(22500, 4)
(22500,)


In [34]:
#Faster RCNN uses deep learning to obtain the region proposals
#Using n=3 for the conv network as noted in the original network and keeping the number of channels same

mid_channels = 512
in_channels = 512  # depends on the output feature map. in vgg 16 it is equal to 512
n_anchor = len(ratios) * len(anchor_scales)
conv_rpn = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)

#These layers were intialized with zero mean and 0.01 standard deviation as per the paper
#Convolutional Layer
conv_rpn.weight.data.normal_(0, 0.01)
conv_rpn.bias.data.zero_()
#regression layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()
#Classification layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

outputs_rpn = conv_rpn(out_map)
pred_anchor_locs = reg_layer(outputs_rpn)
pred_cls_scores = cls_layer(outputs_rpn)

print(pred_cls_scores.shape, pred_anchor_locs.shape)


torch.Size([1, 18, 50, 50]) torch.Size([1, 36, 50, 50])


In [35]:
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)

pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
print(pred_cls_scores.shape)

objectness_score = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)

pred_cls_scores = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)


torch.Size([1, 22500, 4])
torch.Size([1, 50, 50, 18])
torch.Size([1, 22500])
torch.Size([1, 22500, 2])


In [36]:
#The RoI network
#First applying the NMS algorithm to reduce the number of redundant proposals
#minimum size here is the minimum height of the object required to create a proposal

nms_thresh = 0.7
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

#Doing the reverse transformation from the absolute point coordinates to the anchors
anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

#Transformations are easier using numpy arrays
pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
objectness_score_numpy = objectness_score[0].data.numpy()

dy = pred_anchor_locs_numpy[:, 0::4]
dx = pred_anchor_locs_numpy[:, 1::4]
dh = pred_anchor_locs_numpy[:, 2::4]
dw = pred_anchor_locs_numpy[:, 3::4]

#Decoding the predictions by unparameterizing them
ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

#Now reconverting them back to the point coordinates format
roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=anchor_locs.dtype)
roi[:, 0::4] = ctr_y - 0.5 * h
roi[:, 1::4] = ctr_x - 0.5 * w
roi[:, 2::4] = ctr_y + 0.5 * h
roi[:, 3::4] = ctr_x + 0.5 * w

print(roi)

[[ -34.0528    -80.697914   56.19092    97.80956 ]
 [ -78.48703  -162.73393   102.09204   190.97133 ]
 [-180.16808  -337.2321    183.98601   357.8995  ]
 ...
 [ 702.1114    742.87775   887.80695   836.73004 ]
 [ 607.1709    700.6685    973.23914   883.0776  ]
 [ 442.34058   600.0958   1159.5668    962.56335 ]]


In [37]:
#Clipping the predicted boxes to the original image
img_size = (800, 800)  # Image size
roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])
print(roi)

[[  0.        0.       56.19092  97.80956]
 [  0.        0.      102.09204 190.97133]
 [  0.        0.      183.98601 357.8995 ]
 ...
 [702.1114  742.87775 800.      800.     ]
 [607.1709  700.6685  800.      800.     ]
 [442.34058 600.0958  800.      800.     ]]


In [38]:
#Removing the anchor boxes which do not satisfy the threshold criteria
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :]
scores = objectness_score_numpy[keep]
print(scores.shape)
print(roi.shape)

(22500,)
(22500, 4)


In [39]:
#Sorting the pairs from the highest to lowest
ordered_scores = scores.ravel().argsort()[::-1]
print(ordered_scores)

#Taking the top pre_nms_topN
ordered_scores = ordered_scores[:n_train_pre_nms]
roi = roi[ordered_scores, :]
print(roi.shape)

[21607   461     6 ...   868   877   886]
(12000, 4)


In [40]:
#The NMS algorithm
y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)

order = ordered_scores.argsort()[::-1]
keep = []

while order.size > 0:
    i = order[0]
    keep.append(i)
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (areas[i] + areas[order[1:]] - inter)
    inds = np.where(ovr <= nms_thresh)[0]
    order = order[inds + 1]

#Check the training and testing variables
keep = keep[:n_train_post_nms]
roi = roi[keep]

#Final region proposals to be used as inputs to the fast_rcnn !!!

In [41]:
#Most of this is redundant and analogous to the one used for anchor boxes
#defining the parameters to the detection network
n_samples = 128
pos_ratio = 0.25
pos_iou_thresh = 0.5
neg_iou_thresh_hi = 0.5
neg_iou_thresh_lo = 0.0

#Find the IoU of ground truth objects with the region proposals, copy code of anchor boxes
ious = np.empty((len(roi), 2), dtype=np.float32)
ious.fill(0)
for num1, i in enumerate(roi):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)
        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])
        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
            iou = iter_area / (anchor_area + box_area - iter_area)
        else:
            iou = 0.
        ious[num1, num2] = iou
print(ious.shape)

(2000, 2)


In [42]:
#Find the ground truth proposal which has the max IoU for each proposal, and assign the label to it
gt_assignment = ious.argmax(axis=1)
max_ious = ious.max(axis=1)
print(gt_assignment)
print(max_ious)

gt_roi_label = labels[gt_assignment]
print(gt_roi_label)

[0 0 0 ... 0 0 0]
[0.         0.         0.         ... 0.03279276 0.09336255 0.        ]
tensor([6, 6, 6,  ..., 6, 6, 6])


In [43]:
#Select the foreground and background RoI with the threshold, and sample till the max value
pos_roi_per_image = int(n_samples * pos_ratio)
pos_index = np.where(max_ious >= pos_iou_thresh)[0]
pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
if pos_index.size > 0:
    pos_index = np.random.choice(pos_index, size=pos_roi_per_this_image, replace=False)
print(pos_roi_per_this_image)
print(pos_index)

neg_index = np.where((max_ious < neg_iou_thresh_hi) &
                     (max_ious >= neg_iou_thresh_lo))[0]
neg_roi_per_this_image = n_sample - pos_roi_per_this_image
neg_roi_per_this_image = int(min(neg_roi_per_this_image, neg_index.size))

if neg_index.size > 0:
    neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=False)
print(neg_roi_per_this_image)
print(neg_index)

18
[1901 1501  839 1632  844 1373 1148 1251 1783 1659 1275 1062 1779 1535
 1741 1492 1491 1409]
238
[ 734 1712 1975 1147  773 1681 1908  638 1292 1102 1058  676 1810 1328
  786 1224  994 1804  187 1939  892  720 1933  927 1836 1084  386  945
 1173  862 1862 1932  866 1385   78 1007  692   31 1584 1846  635  978
 1520  883  890  886  832 1142  614 1511  286  733 1566 1896   35   66
  482   73 1909  836  229  908 1154  319  825 1949  668  268 1288 1634
 1952  742 1919 1912 1196 1544 1452 1341 1803  748 1841  452 1972  986
  133  897  813  242  544  364  407 1718 1315 1192  215  487 1702 1046
  338 1571 1904 1664 1516 1679  601 1745  409 1290 1891  449 1252  750
 1653  685 1066   83 1186 1451 1766 1699 1019 1583 1554 1479  316 1657
 1938  900 1958 1781  817 1343 1014 1000 1140  171 1011  303  585 1255
  254  656  658 1320 1393 1221 1171 1139  983 1993 1950  117 1788  489
 1167  233 1426  390 1588  593  299  941  341  736  230 1240 1792  625
  578 1179 1400  223 1777 1964 1707  135  616  8

In [44]:
#Gather the indices, the respective labels and the RoI
keep_index = np.append(pos_index, neg_index)
gt_roi_labels = gt_roi_label[keep_index]
gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0
sample_roi = roi[keep_index]
print(sample_roi.shape)

#Pick the ground truth objects for the RoI and parameterize like anchor boxes
bbox_for_sampled_roi = bbox[gt_assignment[keep_index]]
print(bbox_for_sampled_roi.shape)

height = sample_roi[:, 2] - sample_roi[:, 0]
width = sample_roi[:, 3] - sample_roi[:, 1]
ctr_y = sample_roi[:, 0] + 0.5 * height
ctr_x = sample_roi[:, 1] + 0.5 * width
base_height = bbox_for_sampled_roi[:, 2] - bbox_for_sampled_roi[:, 0]
base_width = bbox_for_sampled_roi[:, 3] - bbox_for_sampled_roi[:, 1]
base_ctr_y = (bbox_for_sampled_roi[:, 0] + 0.5 * base_height).cpu().numpy()
base_ctr_x = (bbox_for_sampled_roi[:, 1] + 0.5 * base_width).cpu().numpy()

eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height.cpu().numpy() / height)
dw = np.log(base_width.cpu().numpy() / width)
gt_roi_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(gt_roi_locs)


(256, 4)
torch.Size([256, 4])
[[ 0.23342858  0.09424514  0.28303862  0.05253953]
 [ 0.19494893 -0.06281741  0.22914681 -0.25440148]
 [-0.11636309  0.06441679 -0.5916455   0.10253166]
 ...
 [-2.603347    1.419128    0.7366678   1.6449996 ]
 [-5.2580996  -3.1489475   1.4121658   1.1649048 ]
 [-1.381585    1.3034242   0.19216321  1.16269   ]]


In [45]:
#Fast RNN end network for classification
#RoI indices is for the image index, since there is only one image, its a vector of 256 zeros for all RoIs
rois = torch.from_numpy(sample_roi).float()
roi_indices = 0 * np.ones((len(rois),), dtype=np.int32)
roi_indices = torch.from_numpy(roi_indices).float()
print(rois.shape, roi_indices.shape)

#Concatinating and rearranging
indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
indices_and_rois = xy_indices_and_rois.contiguous()
print(xy_indices_and_rois.shape)

torch.Size([256, 4]) torch.Size([256])
torch.Size([256, 5])


In [47]:
#RoI pooling layer, max pool of size 7x7
size = 7
adaptive_max_pool = nn.AdaptiveMaxPool2d(size)
output = []
rois = indices_and_rois.data.float()
rois[:, 1:].mul_(1 / 16.0)  # Subsampling ratio skipping the index
rois = rois.long()
num_rois = rois.size(0)
for i in range(num_rois):
    roi = rois[i]
    im_idx = roi[0]
    im = out_map.narrow(0, im_idx, 1)[..., roi[2]:(roi[4] + 1), roi[1]:(roi[3] + 1)]
    output.append(adaptive_max_pool(im))

output = torch.cat(output, 0)
print(output.size())

#Reshape the output according to the dimensions required in the feed forward layer
k = output.view(output.size(0), -1)
print(k.shape)

torch.Size([256, 512, 7, 7])
torch.Size([256, 25088])


In [49]:
#This is the input to a classifier layer, which branches out as another classifier/regression layer
roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096), nn.Linear(4096, 4096)])
cls_loc = nn.Linear(4096, 21 * 4)
cls_loc.weight.data.normal_(0, 0.01)
cls_loc.bias.data.zero_()
score = nn.Linear(4096, 21)  # (VOC 20 classes + 1 background)

#Passing the output of the RoI pooling layer to the network defined above
k = roi_head_classifier(k)
roi_cls_loc = cls_loc(k)
roi_cls_score = score(k)
print(roi_cls_loc.shape, roi_cls_score.shape)

torch.Size([256, 84]) torch.Size([256, 21])


In [51]:
#Computing the different types of losses
print(pred_anchor_locs.shape)
print(pred_cls_scores.shape)
print(anchor_locations.shape)
print(anchor_labels.shape)

#Rearranging to remove errors and unnecessary dimensions
rpn_loc = pred_anchor_locs[0]
rpn_score = pred_cls_scores[0]
gt_rpn_loc = torch.from_numpy(anchor_locations)
gt_rpn_score = torch.from_numpy(anchor_labels)
print(rpn_loc.shape, rpn_score.shape, gt_rpn_loc.shape, gt_rpn_score.shape)

torch.Size([1, 22500, 4])
torch.Size([1, 22500, 2])
(22500, 4)
(22500,)
torch.Size([22500, 4]) torch.Size([22500, 2]) torch.Size([22500, 4]) torch.Size([22500])


In [54]:
import torch.nn.functional as F

rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_score.long(), ignore_index=-1)
print(rpn_cls_loss)

# RPN LOSS

#The regression loss is only applicable to bounding boxes with positive labels
pos = gt_rpn_score > 0
mask = pos.unsqueeze(1).expand_as(rpn_loc)
print(mask.shape)
mask_loc_preds = rpn_loc[mask].view(-1, 4)
mask_loc_targets = gt_rpn_loc[mask].view(-1, 4)
print(mask_loc_preds.shape, mask_loc_preds.shape)
#Using smooth L1 loss for regression, with sigma =1,
x = torch.abs(mask_loc_targets - mask_loc_preds)
rpn_loc_loss = ((x < 1).float() * 0.5 * x**2) + ((x >= 1).float() * (x-0.5))
print(rpn_loc_loss.sum())

#Lambda is a hyperparameter which is used to calculate the total rpn loss
rpn_lambda = 5
N_reg = (gt_rpn_score > 0).float().sum()
rpn_loc_loss = rpn_loc_loss.sum() / N_reg
rpn_loss = rpn_cls_loss + (rpn_lambda * rpn_loc_loss)
print(rpn_loss)

tensor(0.6946, grad_fn=<NllLossBackward>)
torch.Size([22500, 4])
torch.Size([18, 4]) torch.Size([18, 4])
tensor(1.1147, grad_fn=<SumBackward0>)
tensor(1.0042, grad_fn=<AddBackward0>)


In [55]:
#Fast RCNN Loss

print(roi_cls_loc.shape)
print(roi_cls_score.shape)
print(gt_roi_locs.shape)
print(gt_roi_labels.shape)

gt_roi_loc = torch.from_numpy(gt_roi_locs)
gt_roi_label = torch.from_numpy(np.float32(gt_roi_labels)).long()
print(gt_roi_loc.shape, gt_roi_label.shape)

#Classification loss
roi_cls_loss = F.cross_entropy(roi_cls_score, gt_roi_label, ignore_index=-1)
print(roi_cls_loss)

#Regression loss
n_sample = roi_cls_loc.shape[0]
roi_loc = roi_cls_loc.view(n_sample, -1, 4)
print(roi_loc.shape)
roi_loc = roi_loc[torch.arange(0, n_sample).long(), gt_roi_label]
print(roi_loc.shape)
x_roi = torch.abs(gt_roi_loc - roi_loc)
roi_loc_loss = ((x_roi < 1).float() * 0.5 * x_roi ** 2) + ((x_roi >= 1).float() * (x_roi - 0.5))
print(roi_loc_loss.sum())

#Total loss
roi_lambda = 10.
N_reg_roi = (gt_rpn_score > 0).float().sum()
roi_loc_loss = roi_loc_loss.sum() / N_reg_roi
roi_loss = roi_cls_loss + (roi_lambda * roi_loc_loss)
print(roi_loss)

total_loss = rpn_loss + roi_loss
print(total_loss)

torch.Size([256, 84])
torch.Size([256, 21])
(256, 4)
torch.Size([256])
torch.Size([256, 4]) torch.Size([256])
tensor(3.0175, grad_fn=<NllLossBackward>)
torch.Size([256, 21, 4])
torch.Size([256, 4])
tensor(900.5510, grad_fn=<SumBackward0>)
tensor(503.3236, grad_fn=<AddBackward0>)
tensor(504.3279, grad_fn=<AddBackward0>)
