# Faster RCNN Model

I will be using the Pascal Visual Object Classes (VOC) 2007 dataset:

https://www.kaggle.com/datasets/zaraks/pascal-voc-2007?select=VOCtrainval_06-Nov-2007

This model is based on the guidelines outlined in this instructional video:
https://www.youtube.com/watch?v=Qq1yfWDdj5Y&list=WL&index=1 

timestamp 42:41

## Implement a simple Faster R-CNN model

In [7]:
#import the libraries
import torch
import torch.nn as nn
import torchvision
import math

In [3]:
#pick the compute device: Apple's Metal backend (mps) when available, cpu otherwise
if torch.backends.mps.is_available():
    device= torch.device('mps')
else:
    device= torch.device('cpu')
print(device)

mps


In [16]:
def get_iou(boxes1, boxes2):
    '''
    Pairwise intersection-over-union between two sets of boxes.

    Boxes are read as (x1, y1, x2, y2) from columns 0..3.

    boxes1: (N x 4)
    boxes2: (M x 4)

    return: IOU matrix of shape (N x M)
    '''

    #per-box width/height, then area = w * h
    size1= boxes1[:, 2:4] - boxes1[:, 0:2] #(N, 2)
    size2= boxes2[:, 2:4] - boxes2[:, 0:2] #(M, 2)
    area1= size1[:, 0] * size1[:, 1] #(N,)
    area2= size2[:, 0] * size2[:, 1] #(M,)

    #corners of the overlap rectangle for every (box1, box2) pair
    top_left= torch.max(boxes1[:, None, 0:2], boxes2[None, :, 0:2]) #(N, M, 2)
    bottom_right= torch.min(boxes1[:, None, 2:4], boxes2[None, :, 2:4]) #(N, M, 2)

    #negative extents mean the boxes do not overlap along that axis
    overlap= (bottom_right - top_left).clamp(min=0) #(N, M, 2)
    intersection_area= overlap[..., 0] * overlap[..., 1] #(N, M)

    union= area1[:, None] + area2 - intersection_area
    return intersection_area/union #(N, M)

In [11]:
def apply_regression_pred_to_anchors_or_proposals(
    box_transform_pred, anchors_or_proposals):

    """
    Decode (dx, dy, dw, dh) regression outputs into (x1, y1, x2, y2) boxes.

    box_transform_pred: (num_anchors_or_proposals, num_classes, 4)
        (any trailing layout is reshaped to (num_anchors_or_proposals, -1, 4))
    anchors_or_proposals: (num_anchors_or_proposals, 4) as (x1, y1, x2, y2)

    return > pred_boxes: (num_anchors_or_proposals, num_classes, 4)
    """

    box_transform_pred= box_transform_pred.reshape(
        box_transform_pred.size(0), -1, 4
    )

    #get cx, cy, w, h, from x1, y1, x2, y2
    w= anchors_or_proposals[:, 2] - anchors_or_proposals[:, 0]
    h= anchors_or_proposals[:, 3] - anchors_or_proposals[:, 1]
    center_x= anchors_or_proposals[:, 0] + 0.5*w
    center_y= anchors_or_proposals[:, 1] + 0.5*h

    dx= box_transform_pred[..., 0]
    dy= box_transform_pred[..., 1]
    dw= box_transform_pred[..., 2]
    dh= box_transform_pred[..., 3]
    #dh > (num_anchors_or_proposals, num_classes)

    #cap the log-scale factors so torch.exp cannot overflow to inf
    #(an untrained or diverging head can emit very large dw/dh)
    max_log_scale= math.log(1000.0 / 16)
    dw= torch.clamp(dw, max= max_log_scale)
    dh= torch.clamp(dh, max= max_log_scale)

    pred_center_x= dx*w[:, None] + center_x[:, None]
    pred_center_y= dy*h[:, None] + center_y[:, None]
    pred_w= torch.exp(dw)*w[:, None]
    pred_h= torch.exp(dh)*h[:, None]
    #pred_center_x > (num_anchors_or_proposals, num_classes)

    pred_box_x1= pred_center_x - 0.5*pred_w
    pred_box_y1= pred_center_y - 0.5*pred_h
    pred_box_x2= pred_center_x + 0.5*pred_w
    pred_box_y2= pred_center_y + 0.5*pred_h

    pred_boxes= torch.stack((
        pred_box_x1,
        pred_box_y1,
        pred_box_x2,
        pred_box_y2
    ), dim= 2)
    #pred_boxes > (num_anchors_or_proposals, num_classes, 4)

    return pred_boxes
    

In [17]:
def clamp_boxes_to_image_boundary(boxes, image_shape):
    """
    Clip box coordinates so every box lies inside the image.

    boxes: (..., 4) as (x1, y1, x2, y2)
    image_shape: shape whose last two entries are (height, width)

    return > boxes of the same shape, still ordered (x1, y1, x2, y2)
    """
    boxes_x1= boxes[..., 0]
    boxes_y1= boxes[..., 1]
    boxes_x2= boxes[..., 2]
    boxes_y2= boxes[..., 3]

    height, width= image_shape[-2:]

    #x coordinates are limited by the width, y coordinates by the height
    boxes_x1= boxes_x1.clamp(min= 0, max= width)
    boxes_x2= boxes_x2.clamp(min= 0, max= width)
    boxes_y1= boxes_y1.clamp(min= 0, max= height)
    boxes_y2= boxes_y2.clamp(min= 0, max= height)

    #re-assemble in the same (x1, y1, x2, y2) order the boxes came in
    boxes= torch.stack((
        boxes_x1,
        boxes_y1,
        boxes_x2,
        boxes_y2
    ), dim= -1)
    return boxes

def boxes_to_transformation_targets(ground_truth_boxes, anchors_or_proposals):
    """
    Encode ground-truth boxes relative to anchors/proposals.

    Both inputs are read as (x1, y1, x2, y2) from columns 0..3 and are
    paired row by row.

    return > regression targets (dx, dy, dw, dh) stacked along dim 1
    """
    def _center_form(boxes):
        #(x1, y1, x2, y2) -> (center_x, center_y, w, h)
        box_w= boxes[:, 2] - boxes[:, 0]
        box_h= boxes[:, 3] - boxes[:, 1]
        return boxes[:, 0] + 0.5*box_w, boxes[:, 1] + 0.5*box_h, box_w, box_h

    ref_cx, ref_cy, ref_w, ref_h= _center_form(anchors_or_proposals)
    gt_cx, gt_cy, gt_w, gt_h= _center_form(ground_truth_boxes)

    #centre offsets are normalised by the reference size,
    #size changes are expressed as log ratios
    offsets= [
        (gt_cx - ref_cx) / ref_w,
        (gt_cy - ref_cy) / ref_h,
        torch.log(gt_w / ref_w),
        torch.log(gt_h / ref_h),
    ]
    return torch.stack(offsets, dim= 1)

def sample_positive_negative(labels, positive_count, total_count):
    """
    Randomly pick a subset of positive (label >= 1) and negative
    (label == 0) entries; any other label value is never picked.

    At most positive_count positives are taken, then negatives fill the
    remainder up to total_count (limited by how many negatives exist).

    return > (sampled_neg_idx_mask, sampled_pos_idx_mask), boolean masks
             shaped like labels
    """
    fg_candidates= (labels >= 1).nonzero(as_tuple= True)[0]
    bg_candidates= (labels == 0).nonzero(as_tuple= True)[0]

    n_fg= min(fg_candidates.numel(), positive_count)
    n_bg= min(bg_candidates.numel(), total_count - n_fg)

    #shuffle positives first, then negatives, and keep the leading entries
    fg_order= torch.randperm(fg_candidates.numel(), device= fg_candidates.device)
    bg_order= torch.randperm(bg_candidates.numel(), device= bg_candidates.device)
    chosen_fg= fg_candidates[fg_order[:n_fg]]
    chosen_bg= bg_candidates[bg_order[:n_bg]]

    sampled_pos_idx_mask= torch.zeros_like(labels, dtype= torch.bool)
    sampled_pos_idx_mask[chosen_fg]= True
    sampled_neg_idx_mask= torch.zeros_like(labels, dtype= torch.bool)
    sampled_neg_idx_mask[chosen_bg]= True

    return sampled_neg_idx_mask, sampled_pos_idx_mask

In [12]:
#create Region Proposal Network class
class RegionProposalNetwork(nn.Module):
    """
    Region Proposal Network (RPN).

    For Training and Inference:
        Call RPN Layers
        Generate Anchors
        Convert Anchors to Proposals using Box Transformation Prediction
        Filter Proposals

    For Training Only:
        Assign Ground Truth Boxes to Anchors
        Compute Labels and Regression Targets for Anchors
        Sample Positive and Negative Anchors
        Compute Classification Loss Using Sampled Anchors
        Compute Localization Loss Using Sampled Positive Anchors

    Example shapes for a single 600x800 image:
        image              [1x3x600x800]
        feat               [1x512x37x50]
        target['bboxes']   [1x6x4]
        target['labels']   [1x6]
        cls_scores         [1x9x37x50]
        box_transform_pred [1x36x37x50]
        stride_h/w         [16]
        base_anchors       [9x4]
        anchors            [16650x4]
        proposals          [16650x4]
        scores             [2000,]
        gt_boxes           [6x4]
        iou_matrix         [6x16650]
        matched_gt_boxes   [16650x4]
        labels             [16650,]
        regression_targets [16650x4]

    NOTE: anchors are generated once per call and only target['bboxes'][0]
    is used, so the forward pass assumes a batch size of 1.
    """

    #512 is the channel count of the feature map produced by the backbone
    def __init__(self, in_channels= 512):
        super().__init__()
        #specify scales and aspect ratios for the anchor boxes
        self.scales= [128, 256, 512]
        self.aspect_ratios= [0.5, 1, 2]
        self.num_anchors= len(self.scales) * len(self.aspect_ratios)

        # 3 layers used
        #Layer1: 3x3 convolutional layer
        self.rpn_conv= nn.Conv2d(in_channels,
                                 in_channels,
                                 kernel_size= 3,
                                 stride= 1,
                                 padding= 1)
        #Layer2: 1x1 classification layer (one objectness logit per anchor)
        self.cls_layer= nn.Conv2d(in_channels,
                                  self.num_anchors,
                                  kernel_size= 1,
                                  stride= 1)
        #Layer3: 1x1 regression layer (dx, dy, dw, dh per anchor)
        self.bbox_reg_layer= nn.Conv2d(in_channels,
                                       self.num_anchors * 4,
                                       kernel_size= 1,
                                       stride= 1)

    def assign_targets_to_anchors(self, anchors, gt_boxes):
        """
        Label every anchor and pair it with a ground-truth box.

        anchors: (num_anchors, 4)
        gt_boxes: (num_gt, 4)

        return > labels (num_anchors,) float32 with 1.0 foreground,
                 0.0 background, -1.0 ignored;
                 matched_gt_boxes (num_anchors, 4)
        """
        #get (gt_boxes, num_anchors) IOU matrix
        iou_matrix= get_iou(gt_boxes, anchors)

        #for each anchor box get best gt box index
        best_match_iou, best_match_gt_index= iou_matrix.max(dim= 0)

        #this copy is needed later to add back low quality boxes
        best_match_gt_idx_pre_threshold= best_match_gt_index.clone()

        #element-wise & is required here; python `and` raises on tensors
        below_low_threshold= best_match_iou < 0.3
        between_threshold= (best_match_iou >= 0.3) & (best_match_iou < 0.7)
        best_match_gt_index[below_low_threshold]= -1
        best_match_gt_index[between_threshold]= -2

        #low quality anchor boxes: for every gt box, the anchor(s) with the
        #highest IOU keep their match even if it is below the thresholds
        best_anchor_iou_for_gt, _= iou_matrix.max(dim= 1)
        gt_pred_pair_with_highest_iou= torch.where(
            iou_matrix == best_anchor_iou_for_gt[:, None]
        )

        #get all the anchor indexes to update
        pred_inds_to_update= gt_pred_pair_with_highest_iou[1]
        best_match_gt_index[pred_inds_to_update]= (
            best_match_gt_idx_pre_threshold[pred_inds_to_update]
        )

        #best match index is either valid or -1(background) or -2(to ignore);
        #clamping makes background/ignored anchors index gt box 0 as a dummy
        matched_gt_boxes= gt_boxes[best_match_gt_index.clamp(min= 0)]

        #set all foreground anchor labels as 1
        labels= best_match_gt_index >= 0
        labels= labels.to(dtype= torch.float32)

        #set all background labels as 0
        background_anchors= best_match_gt_index == -1
        labels[background_anchors]= 0.0

        #set all to-be-ignored labels as -1
        ignored_anchors= best_match_gt_index == -2
        labels[ignored_anchors]= -1.0

        #later for classification we pick labels which are >= 0
        return labels, matched_gt_boxes

    def filter_proposals(self, proposals, cls_scores, image_shape):
        """
        Keep the best proposals: top-10000 by score, clamp to the image,
        NMS at IOU 0.7, then the top-2000 survivors.

        proposals: (num_anchors, 4)
        cls_scores: objectness logits, any shape with num_anchors elements
        image_shape: shape whose last two entries are (height, width)

        return > (proposals, cls_scores) after filtering; scores are
                 sigmoid probabilities
        """
        #pre NMS filtering
        cls_scores= cls_scores.reshape(-1)
        cls_scores= torch.sigmoid(cls_scores)
        #topk raises when k exceeds the number of scores, so cap it
        _, top_n_idx= cls_scores.topk(min(10000, cls_scores.numel()))
        cls_scores= cls_scores[top_n_idx]
        proposals= proposals[top_n_idx]

        #clamp boxes to image boundary
        proposals= clamp_boxes_to_image_boundary(proposals, image_shape)

        #NMS based on objectness
        keep_indices= torchvision.ops.nms(proposals, cls_scores, 0.7)

        #make the descending-score order explicit before truncating
        post_nms_keep_indices= keep_indices[
            cls_scores[keep_indices].sort(descending= True)[1]
        ]

        #post NMS top k filtering
        post_nms_keep_indices= post_nms_keep_indices[:2000]
        proposals= proposals[post_nms_keep_indices]
        cls_scores= cls_scores[post_nms_keep_indices]
        return proposals, cls_scores

    def generate_anchors(self, image, feat):
        """
        Build all anchors for one feature map.

        The stride is image size // feature-map size; every feature-map
        cell gets len(scales) * len(aspect_ratios) anchors centred on the
        cell's top-left corner in image coordinates.

        return > anchors (H_feat * W_feat * num_anchors_per_location, 4)
                 as (x1, y1, x2, y2)
        """
        grid_h, grid_w= feat.shape[-2:]
        image_h, image_w= image.shape[-2:]

        stride_h= torch.tensor(image_h // grid_h,
                               dtype= torch.int32, #int32 rather than int64 for mps
                               device= feat.device)

        stride_w= torch.tensor(image_w // grid_w,
                               dtype= torch.int32,
                               device= feat.device)

        scales= torch.as_tensor(self.scales,
                                dtype= feat.dtype,
                                device= feat.device)

        aspect_ratios= torch.as_tensor(self.aspect_ratios,
                                       dtype= feat.dtype,
                                       device= feat.device)

        #ensure h/w= aspect_ratios and h*w= 1
        h_ratios= torch.sqrt(aspect_ratios)
        w_ratios= 1/h_ratios

        ws= (w_ratios[:, None] * scales[None, :]).view(-1)
        hs= (h_ratios[:, None] * scales[None, :]).view(-1)

        #zero-centred anchors, one per (aspect ratio, scale) pair
        base_anchors= torch.stack([-ws, -hs, ws, hs], dim= 1) / 2
        base_anchors= base_anchors.round()

        #convert all base anchors to grid of all anchors through
        #shifts in x axis (0,1,...,W_feat-1) * stride_w
        shifts_x= torch.arange(0, grid_w,
                               dtype= torch.int32,
                               device= feat.device) * stride_w

        #shifts in y axis (0,1,...,H_feat-1) * stride_h
        shifts_y= torch.arange(0, grid_h,
                               dtype= torch.int32,
                               device= feat.device) * stride_h

        shifts_y, shifts_x= torch.meshgrid(shifts_y, shifts_x,
                                           indexing= 'ij')

        #(H_feat, W_feat)
        shifts_x= shifts_x.reshape(-1)
        shifts_y= shifts_y.reshape(-1)
        shifts= torch.stack((shifts_x,
                             shifts_y,
                             shifts_x,
                             shifts_y), dim= 1)
        #shifts > (H_feat * W_feat, 4)

        #base_anchors > (num_anchors_per_location, 4)
        #shifts > (H_feat * W_feat, 4)
        anchors= (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4))
        #(H_feat * W_feat, num_anchors_per_location, 4)

        anchors= anchors.reshape(-1, 4)
        #anchors > (H_feat * W_feat * num_anchors_per_location, 4)
        return anchors

    #forward pass
    def forward(self, image, feat, target= None):
        """
        Run the RPN on one image.

        image: input image tensor, only its last two dims (H, W) are used
        feat: backbone feature map (Batch, in_channels, H_feat, W_feat)
        target: dict with 'bboxes' (1, num_gt, 4); needed only for training

        return > dict with 'proposals' and 'scores'; in training mode with
                 a target it also holds 'rpn_classification_loss' and
                 'rpn_localization_loss'
        """
        #call RPN layers
        rpn_feat= nn.functional.relu(self.rpn_conv(feat))
        cls_scores= self.cls_layer(rpn_feat)
        box_transform_pred= self.bbox_reg_layer(rpn_feat)

        #generate anchors
        anchors= self.generate_anchors(image, feat)

        #cls_scores > (Batch, number_of_anchors_per_location, H_feat, W_feat)
        number_of_anchors_per_location= cls_scores.size(1)
        cls_scores= cls_scores.permute(0, 2, 3, 1)
        cls_scores= cls_scores.reshape(-1, 1)
        #cls_scores > (Batch*H_feat*W_feat*number_of_anchors_per_location, 1)

        #box_transform_pred > (Batch, number_of_anchors_per_location*4, H_feat, W_feat)
        box_transform_pred= box_transform_pred.view(
            box_transform_pred.size(0),
            number_of_anchors_per_location,
            4,
            rpn_feat.shape[-2],
            rpn_feat.shape[-1]
        )
        #move the spatial dims in front so rows line up with the anchor order
        #(H_feat, W_feat, anchor) produced by generate_anchors
        box_transform_pred= box_transform_pred.permute(0, 3, 4, 1, 2)
        box_transform_pred= box_transform_pred.reshape(-1, 4)
        #box_transform_pred > (Batch*H_feat*W_feat*number_of_anchors_per_location, 4)

        #transform generated anchors according to box_transform_pred
        #(detached: proposals are not trained through)
        proposals= apply_regression_pred_to_anchors_or_proposals(
            box_transform_pred.detach().reshape(-1, 1, 4),
            anchors
        )

        proposals= proposals.reshape(proposals.size(0), 4)
        proposals, scores= self.filter_proposals(proposals,
                                                 cls_scores.detach(),
                                                 image.shape)

        rpn_output= {
            'proposals': proposals,
            'scores': scores
        }

        if not self.training or target is None:
            return rpn_output

        #in training
        #assign gt box and label for each anchor
        labels_for_anchors, matched_gt_boxes_for_anchors= self.assign_targets_to_anchors(
            anchors,
            target['bboxes'][0]
        )

        #based on gt assignment above, get regression targets for anchors
        #matched_gt_boxes_for_anchors -> (number of anchors in image, 4)
        #anchors -> (number of anchors in image, 4)
        regression_targets= boxes_to_transformation_targets(
            matched_gt_boxes_for_anchors,
            anchors
        )

        #sample positive and negative anchors for training
        sampled_neg_idx_mask, sampled_pos_idx_mask= sample_positive_negative(
            labels_for_anchors,
            positive_count= 128,
            total_count= 256
        )

        sampled_idxs= torch.where(sampled_pos_idx_mask | sampled_neg_idx_mask)[0]

        #localization loss uses positives only but is normalised by the
        #total number of sampled anchors
        localization_loss= (
            torch.nn.functional.smooth_l1_loss(
                box_transform_pred[sampled_pos_idx_mask],
                regression_targets[sampled_pos_idx_mask],
                beta= 1/9,
                reduction= 'sum'
            ) / (sampled_idxs.numel())
        )

        cls_loss= torch.nn.functional.binary_cross_entropy_with_logits(
            cls_scores[sampled_idxs].flatten(),
            labels_for_anchors[sampled_idxs].flatten()
        )

        rpn_output['rpn_classification_loss']= cls_loss
        rpn_output['rpn_localization_loss']= localization_loss

        return rpn_output
                

In [None]:
class ROIHead(nn.Module): 
    def __init__(self, num_classes= 21, in_channels= 512):
        super(ROIHead, slef).__init__()
        self.num_classes= num_classes
        self.pool_size= 7
        self.fc_inner_dim= 1024

        self.fc6= nn.Linear(in_channels * self.pool_size *self.pool_size, 
                            self.fc_inner_dim)

        self.fc7= nn.Linear(self.fc_inner_dim, self.fc_inner_dim)
        self.cls_layer= nn.Linear(self.fc_inner_dim, self.num_classes)
        self.bbox_reg_layer= nn.Linear(self.fc_inner_dim, self.num_classes * 4)

    """
    Shapes

    feat                 [1x512x37x50]
    proposals            [<=2000x4]
    image_shape          [600, 800]
    target['labels']     [1x6]
    target['bboxes']     [1x6x4]
    gt_boxes             [6x4]
    gt_labels            [6,]

    """

    def assign_target_to_proposals(self, proposals, gt_boxes, gt_labels):
        """
        Give every proposal a class label and a ground-truth box.

        A proposal whose best IOU with any gt box is below 0.5 is treated
        as background and gets label 0.

        proposals: (num_proposals, 4)
        gt_boxes: (num_gt, 4)
        gt_labels: (num_gt,)

        return > labels (num_proposals,) int32,
                 matched_gt_boxes_for_proposals (num_proposals, 4)
        """
        iou_matrix= get_iou(gt_boxes, proposals)
        best_match_iou, best_match_gt_idx= iou_matrix.max(dim= 0)
        below_low_threshold= best_match_iou < 0.5

        best_match_gt_idx[below_low_threshold]= -1
        #clamping makes background proposals index gt box 0 as a dummy
        matched_gt_boxes_for_proposals= gt_boxes[best_match_gt_idx.clamp(min= 0)]

        labels= gt_labels[best_match_gt_idx.clamp(min= 0)]
        labels= labels.to(dtype= torch.int32)

        #overwrite the dummy label of background proposals with class 0
        background_proposals= best_match_gt_idx == -1
        labels[background_proposals]= 0

        return labels, matched_gt_boxes_for_proposals
    
    def forwars(self, feat, proposals, image_shape, target):
        if self.training and target is not None:
            gt_boxes= target['bboxes'][0]
            gt_labels= target['labels'][0]

            #assign labels and gt boxes for proposals 
            