# Import Libraries

In [4]:
import torch
import torch.nn as nn

# Model Architecture

In [2]:
# Convolutional backbone of YOLOv1, listed in forward order.
# Entry kinds:
#   tuple: (kernel_size, number_of_filters, stride, padding) -> one conv block
#   "M":   max-pool layer
#   list:  [(tuple), (tuple), repeats] -> the two conv blocks, repeated `repeats` times
# The fully connected layers are not part of this list.
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    # 1x1 convolutions use padding 0; any padding would grow the feature map.
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

In [3]:
class CNNBlock(nn.Module):
    """
    Convolution -> batch normalisation -> LeakyReLU(0.1) building block.

    Parameters:
        in_channels (int): number of channels of the input feature map
        out_channels (int): number of filters (output channels) of the convolution
        **kwargs: forwarded unchanged to nn.Conv2d (e.g. kernel_size, stride, padding)
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # The convolution is created without a bias term; it is followed
        # directly by batch normalisation.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply conv, then batch norm, then the leaky ReLU to x."""
        return self.leakyrelu(self.batchnorm(self.conv(x)))

class YoloV1(nn.Module):
    """
    YOLOv1 network: a convolutional backbone built from architecture_config
    followed by a fully connected prediction head.

    Parameters:
        in_channels (int): number of channels of the input image
        **kwargs: forwarded to _create_fcs; must provide split_size,
            num_boxes and num_classes
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """
        Run the backbone and the head.

        Returns:
            tensor: shape (batch, S*S*(C + B*5)), as fixed by the last linear layer
        """
        x = self.darknet(x)
        # The head starts with nn.Flatten as well; flattening here is kept so
        # the head always receives a 2-D tensor.
        return self.fcs(torch.flatten(x, start_dim=1))

    def _create_conv_layers(self, architecture):
        """
        Translate the architecture description into an nn.Sequential.

        Parameters:
            architecture (list): entries are either a tuple
                (kernel_size, filters, stride, padding), a string (max pool),
                or a list [tuple, tuple, repeats]
        """
        layers = []
        in_channels = self.in_channels

        for entry in architecture:
            if isinstance(entry, tuple):
                kernel_size, filters, stride, padding = entry
                layers.append(
                    CNNBlock(in_channels, filters, kernel_size=kernel_size, stride=stride, padding=padding)
                )
                in_channels = filters
            elif isinstance(entry, str):
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            elif isinstance(entry, list):
                conv1, conv2, repeats = entry[0], entry[1], entry[2]

                for _ in range(repeats):
                    layers.append(
                        CNNBlock(in_channels, conv1[1], kernel_size=conv1[0], stride=conv1[2], padding=conv1[3])
                    )
                    layers.append(
                        CNNBlock(conv1[1], conv2[1], kernel_size=conv2[0], stride=conv2[2], padding=conv2[3])
                    )
                    # The next repetition consumes the output of the second conv.
                    in_channels = conv2[1]

        return nn.Sequential(*layers)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """
        Build the fully connected head.

        Parameters:
            split_size (int): S, the grid size; the head expects 1024*S*S input features
            num_boxes (int): B, number of boxes predicted per cell
            num_classes (int): C, number of classes

        Returns:
            nn.Sequential: maps (batch, 1024*S*S) to (batch, S*S*(C + B*5))
        """
        S, B, C = split_size, num_boxes, num_classes
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 496),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5)),
        )

# Utility Functions

## Intersection over Union

In [5]:
def intersection_over_union(boxes_preds, boxes_labels, box_format = 'midpoint'):
    """
    Calculates Intersection over Union between predicted and target boxes.

    Parameters:
        boxes_preds (tensor): predicted bounding boxes, shape (..., 4)
        boxes_labels (tensor): target bounding boxes, shape (..., 4)
        box_format (str): "midpoint" if boxes are (x, y, w, h),
            "corners" if boxes are (x1, y1, x2, y2)

    Returns:
        tensor: IoU for every box pair, shape (..., 1)

    Raises:
        ValueError: if box_format is neither "midpoint" nor "corners"
    """
    # Slices such as 0:1 (rather than plain index 0) keep the last dimension,
    # so every coordinate tensor has shape (..., 1).
    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2

        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]

        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        raise ValueError(f"Unknown box_format: {box_format!r}")

    # The intersection rectangle starts at the larger of the two top-left
    # corners and ends at the smaller of the two bottom-right corners.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # When the boxes do not overlap, at least one side length is negative;
    # clamp(0) turns that into an empty intersection.
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small constant avoids division by zero for degenerate boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)

## Non-Maximum Suppression

In [None]:
def non_max_supression(bboxes, iou_threshold, thresold, box_format="corners"):
    """
    Performs non-maximum suppression on a list of bounding boxes.

    Parameters:
        bboxes (list): list of lists, each box specified as
            [class_pred, prob_score, x1, y1, x2, y2]
        iou_threshold (float): a remaining box of the same class as the chosen
            box is dropped unless its IoU with the chosen box is below this value
        thresold (float): boxes whose prob_score is not above this value are
            removed up front, independent of IoU (the parameter name is
            misspelled; it is kept so existing keyword callers keep working)
        box_format (str): "midpoint" if boxes are (x, y, w, h),
            "corners" if boxes are (x1, y1, x2, y2)

    Returns:
        bboxes_after_nms (list): boxes kept after NMS, highest score first
    """

    assert isinstance(bboxes, list)

    # Drop low-confidence boxes, then visit the rest from most to least confident.
    candidates = sorted(
        (box for box in bboxes if box[1] > thresold),
        key=lambda box: box[1],
        reverse=True,
    )
    bboxes_after_nms = []

    while candidates:
        chosen_box, *remaining = candidates
        # Keep a box if it belongs to a different class, or if it overlaps the
        # chosen box by less than iou_threshold.
        candidates = [
            box
            for box in remaining
            if box[0] != chosen_box[0]
            or intersection_over_union(
                torch.tensor(chosen_box[2:]),
                torch.tensor(box[2:]),
                box_format=box_format,
            )
            < iou_threshold
        ]
        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms

## Mean Average Precision

In [None]:
def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20):
    """
    Calculates Mean Average Precision

    Parameters:
        pred_boxes (list): list of lists, each box specified as
            [train_idx, class_pred, prob_score, x1, y1, x2, y2]
        true_boxes (list): same layout as pred_boxes, holding the ground truth boxes
        iou_threshold (float): IoU above which a predicted box counts as correct
        box_format (str): "midpoint" if boxes are (x, y, w, h),
            "corners" if boxes are (x1, y1, x2, y2)
        num_classes (int): number of classes

    Returns:
        tensor: mAP over all classes that have at least one ground truth box,
            for the given IoU threshold; a zero tensor if no class has any

    """

    # One average-precision value per class that has ground truths.
    average_precisions = []

    # Used for numerical stability in the divisions below.
    epsilon = 1e-6

    for c in range(num_classes):
        # Only boxes of the current class take part in this iteration.
        detections = [box for box in pred_boxes if box[1] == c]
        ground_truths = [box for box in true_boxes if box[1] == c]

        total_true_bboxes = len(ground_truths)

        # If no ground truth exists for this class we can safely skip it.
        if total_true_bboxes == 0:
            continue

        # Group the ground truths by training example so each detection only
        # has to look at the boxes of its own image.
        gts_by_image = {}
        for gt in ground_truths:
            gts_by_image.setdefault(gt[0], []).append(gt)

        # For every image, a zero tensor with one slot per ground truth box;
        # a slot is set to 1 once that box has been matched by a detection.
        matched = {img: torch.zeros(len(gts)) for img, gts in gts_by_image.items()}

        # Sort by box probability, which is index 2.
        detections.sort(key=lambda box: box[2], reverse=True)
        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))

        for detection_idx, detection in enumerate(detections):
            image_gts = gts_by_image.get(detection[0], [])

            best_iou = 0
            best_gt_idx = None

            for idx, gt in enumerate(image_gts):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            # A detection is a true positive only if it overlaps enough AND
            # its best ground truth has not already been claimed by a more
            # confident detection; everything else is a false positive.
            if best_iou > iou_threshold and matched[detection[0]][best_gt_idx] == 0:
                TP[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        # Prepend the point (recall=0, precision=1) so the curve starts on the y-axis.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))
        # torch.trapz integrates precision over recall numerically.
        average_precisions.append(torch.trapz(precisions, recalls))

    # No class had any ground truth: avoid dividing by zero.
    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)
