<a href="https://colab.research.google.com/github/Dawn-2-Winter/MachineLearning/blob/master/Object%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# /content/deeplearning_assignment_1_dataset

In [None]:
# ! unzip /content/drive/MyDrive/Colab_Notebooks/deeplearning_assignment_1_dataset.zip

In [2]:
!git clone https://github.com/Liang-ZX/HKU-DASC7606-A1.git

Cloning into 'HKU-DASC7606-A1'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 37 (delta 10), reused 23 (delta 2), pack-reused 0[K
Unpacking objects: 100% (37/37), 1.05 MiB | 6.39 MiB/s, done.


## PreDefine

In [25]:
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
import os
import tqdm
import numpy as np
import argparse
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.utils.data as data
import json
import random
import cv2
import torchvision
import pickle
from collections import defaultdict

In [4]:
def redefine_conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)

def redefine_conv1x1(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
                     padding=0, bias=False)


## ResNet Backbone

In [5]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = redefine_conv3x3(in_planes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = redefine_conv3x3(planes, planes * BasicBlock.expansion)
        self.bn2 = nn.BatchNorm2d(planes * BasicBlock.expansion)
        self.downsample = downsample

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

In [6]:
class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        ##############################################################
        # TODO: Please define your layers with the BottleNeck from the paper "Deep Residual Learning for Image Recognition"
        #
        # Note: You **must not** use the nn.Conv2d here but use **redefine_conv3x3** and **redefine_conv1x1** in this script instead
        ##############################################################
        self.conv1 = redefine_conv1x1(in_planes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = redefine_conv3x3(planes, planes)
        self.conv3 = redefine_conv1x1(planes, planes * Bottleneck.expansion)
        self.bn2 = nn.BatchNorm2d(planes * Bottleneck.expansion)
        ###############################################################
        self.downsample = downsample

        if stride != 1 or in_planes != planes * Bottleneck.expansion:
          self.downsample = nn.Sequential(
              nn.Conv2d(in_planes, planes * Bottleneck.expansion, stride=stride, kernel_size=1, bias=False),
              nn.BatchNorm2d(planes * Bottleneck.expansion)
          )

    def forward(self, x):
        residual = x

        ##############################################################
        # TODO: Please write the forward function with your defined layers
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn2(out)
        # print('out shape', out.shape)
        ##############################################################
        # out = x    you can delete this line if it's not needed

        ###############################################################
        if self.downsample is not None:
            # print('downsample is not None')
            residual = self.downsample(x)
        # print('residual shape', residual.shape)
        out += residual
        out = self.relu(out)

        return out


## mmdetection_head

In [7]:
class mmdetection_head(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, do_downsample=False):
        super(mmdetection_head, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=2, bias=True, dilation=2)  # dilation
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=True)

        ####################################################################
        # TODO: Please complete the downsample module
        # Hint: Use a "kernel_size=1"'s convolution layer to align the dimension
        # Hint: We don't suggest using any batch normalization on detection head.
        #####################################################################
        self.downsample = nn.Sequential()
        if do_downsample or stride != 1 or in_planes != self.expansion * planes:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride = stride)
            )

        ##################################################################

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu(out)
        out = self.conv2(out)
        out = F.relu(out)
        out = self.conv3(out)
        out += self.downsample(x)
        out = F.relu(out)
        return out

In [8]:
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model

## ResNet

In [9]:
class ResNet(nn.Module):

    def __init__(self, block, layers, args): 
        super(ResNet, self).__init__()
        self.in_planes = 64
        
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        ###################################################################
        # TODO: Please fill the codes below with the *self._make_layer()* function
        ##################################################################
        self.conv2 = self._make_layer(block, 64, layers[0], 1)
        self.conv3 = self._make_layer(block, 128, layers[1], 1)
        self.conv4 = self._make_layer(block, 256, layers[2], 1)
        self.conv5 = self._make_layer(block, 512, layers[3], 1)

        # adjust = redefine_conv1x1()
        ##################################################################


        ###################################################################
        # TODO: Please substitute the "?" with specific numbers
        ###################################################################
        # change ResNet to suit the detection requirement
        yolo_S, yolo_B, yolo_C = args['yolo_S'], args['yolo_B'], args['yolo_C']

        self.det_head = self._make_detection_head(in_channels=512*block.expansion, out_channels=yolo_B*5+yolo_C)
        ###################################################################
                
        def _weights_init(m):
            """ kaiming init (https://arxiv.org/abs/1502.01852v1)"""
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
        
        self.apply(_weights_init)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_planes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_planes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = [block(self.in_planes, planes, stride, downsample)]
        self.in_planes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.in_planes, planes))

        return nn.Sequential(*layers)

    def _make_detection_head(self, in_channels, out_channels):
        layers = [
            mmdetection_head(in_planes=in_channels, planes=256, do_downsample=True),
            mmdetection_head(in_planes=256, planes=256),
            mmdetection_head(in_planes=256, planes=256),
            nn.Conv2d(256, out_channels, kernel_size=3, stride=1, padding=1, bias=True),
            nn.Sigmoid()
        ]
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        ###################################################################
        # TODO: Please fill the codes below
        ##################################################################
        # print('after conv1', x.shape)
        x = self.conv2(x)
        x = self.maxpool(x)
        x = self.conv3(x)
        x = self.maxpool(x)
        x = self.conv4(x)
        x = self.maxpool(x)
        x = self.conv5(x)
        
        ##################################################################

        x = self.det_head(x)

        x = x.permute(0, 2, 3, 1)

        return x
        ##################################################################

## Model define

In [10]:
def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return model

In [11]:
def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model

In [12]:
def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model

In [13]:
def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model

In [14]:
args = dict()
args['yolo_S'] = 14
args['yolo_B'] = 2
args['yolo_C'] = 5
args['num_epochs'] = 1
args['batch_size'] = 20
args['learning_rate'] = 1e-5
args['seed'] = 666
args['dataset_root'] = '/content/deeplearning_assignment_1_dataset'
args['output_dir'] = '/content/checkpoints'
args['l_coord'] = 5.
args['l_noobj'] = 0.5
args['nms_threshold'] = 0.5
args['image_path'] = '/content/deeplearning_assignment_1_dataset/val/image/000001.jpg'
args['model_path'] = "/content/checkpoints/hku_mmdetector_best.pth"
args['unsave_img'] = False
args['vis_dir'] = '/content/vis_results'
args['nms_threshold'] =0.5
args['split'] = 'val'     # val/test
args['output_file'] = "/content/checkpoints/result.pkl"
args['pos_threshold'] = 0.3

CAR_CLASSES = ['Pedestrian', 'Cyclist', 'Car', 'Truck','Tram']

COLORS = {'Pedestrian': (0, 0, 0),
          'Cyclist': (128, 0, 0),
          'Car': (0, 128, 0),
          'Truck': (128, 128, 0),
          'Tram': (0, 0, 128)}

In [15]:
hku_mmdetector34 = resnet34(args = args)
hku_mmdetector50 = resnet50(args = args)
hku_mmdetector101 = resnet101(args = args)

## DataSet

In [16]:
def BGR2RGB(img):
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


def BGR2HSV(img):
    return cv2.cvtColor(img, cv2.COLOR_BGR2HSV)


def HSV2BGR(img):
    return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)


def RandomBrightness(bgr):
    if random.random() < 0.5:
        hsv = BGR2HSV(bgr)
        h, s, v = cv2.split(hsv)
        adjust = random.choice([0.5, 1.5])
        v = v * adjust
        v = np.clip(v, 0, 255).astype(hsv.dtype)
        hsv = cv2.merge((h, s, v))
        bgr = HSV2BGR(hsv)
    return bgr


def RandomSaturation(bgr):
    if random.random() < 0.5:
        hsv = BGR2HSV(bgr)
        h, s, v = cv2.split(hsv)
        adjust = random.choice([0.5, 1.5])
        s = s * adjust
        s = np.clip(s, 0, 255).astype(hsv.dtype)
        hsv = cv2.merge((h, s, v))
        bgr = HSV2BGR(hsv)
    return bgr


def RandomHue(bgr):
    if random.random() < 0.5:
        hsv = BGR2HSV(bgr)
        h, s, v = cv2.split(hsv)
        adjust = random.choice([0.5, 1.5])
        h = h * adjust
        h = np.clip(h, 0, 255).astype(hsv.dtype)
        hsv = cv2.merge((h, s, v))
        bgr = HSV2BGR(hsv)
    return bgr


def randomBlur(bgr):
    if random.random() < 0.5:
        bgr = cv2.blur(bgr, (5, 5))
    return bgr


def randomShift(bgr, boxes, labels):
    center = (boxes[:, 2:] + boxes[:, :2]) / 2
    if random.random() < 0.5:
        height, width, c = bgr.shape
        after_shfit_image = np.zeros((height, width, c), dtype=bgr.dtype)
        after_shfit_image[:, :, :] = (104, 117, 123)  # bgr
        shift_x = random.uniform(-width * 0.2, width * 0.2)
        shift_y = random.uniform(-height * 0.2, height * 0.2)

        if shift_x >= 0 and shift_y >= 0:
            after_shfit_image[int(shift_y):, int(shift_x):, :] = bgr[:height - int(shift_y), :width - int(shift_x),
                                                                 :]
        elif shift_x >= 0 and shift_y < 0:
            after_shfit_image[:height + int(shift_y), int(shift_x):, :] = bgr[-int(shift_y):, :width - int(shift_x),
                                                                          :]
        elif shift_x < 0 and shift_y >= 0:
            after_shfit_image[int(shift_y):, :width + int(shift_x), :] = bgr[:height - int(shift_y), -int(shift_x):,
                                                                         :]
        elif shift_x < 0 and shift_y < 0:
            after_shfit_image[:height + int(shift_y), :width + int(shift_x), :] = bgr[-int(shift_y):,
                                                                                  -int(shift_x):, :]

        shift_xy = torch.FloatTensor([[int(shift_x), int(shift_y)]]).expand_as(center)
        center = center + shift_xy
        mask1 = (center[:, 0] > 0) & (center[:, 0] < width)
        mask2 = (center[:, 1] > 0) & (center[:, 1] < height)
        mask = (mask1 & mask2).view(-1, 1)
        boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)
        if len(boxes_in) == 0:
            return bgr, boxes, labels
        box_shift = torch.FloatTensor([[int(shift_x), int(shift_y), int(shift_x), int(shift_y)]]).expand_as(
            boxes_in)
        boxes_in = boxes_in + box_shift
        labels_in = labels[mask.view(-1)]
        return after_shfit_image, boxes_in, labels_in
    return bgr, boxes, labels


def randomScale(bgr, boxes):
    if random.random() < 0.5:
        scale = random.uniform(0.8, 1.2)
        height, width, c = bgr.shape
        bgr = cv2.resize(bgr, (int(width * scale), height))
        scale_tensor = torch.FloatTensor([[scale, 1, scale, 1]]).expand_as(boxes)
        boxes = boxes * scale_tensor
        return bgr, boxes
    return bgr, boxes


def randomCrop(bgr, boxes, labels):
    if random.random() < 0.5:
        center = (boxes[:, 2:] + boxes[:, :2]) / 2
        height, width, c = bgr.shape
        h = random.uniform(0.6 * height, height)
        w = random.uniform(0.6 * width, width)
        x = random.uniform(0, width - w)
        y = random.uniform(0, height - h)
        x, y, h, w = int(x), int(y), int(h), int(w)

        center = center - torch.FloatTensor([[x, y]]).expand_as(center)
        mask1 = (center[:, 0] > 0) & (center[:, 0] < w)
        mask2 = (center[:, 1] > 0) & (center[:, 1] < h)
        mask = (mask1 & mask2).view(-1, 1)

        boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)
        if len(boxes_in) == 0:
            return bgr, boxes, labels
        box_shift = torch.FloatTensor([[x, y, x, y]]).expand_as(boxes_in)

        boxes_in = boxes_in - box_shift
        boxes_in[:, 0] = boxes_in[:, 0].clamp_(min=0, max=w)
        boxes_in[:, 2] = boxes_in[:, 2].clamp_(min=0, max=w)
        boxes_in[:, 1] = boxes_in[:, 1].clamp_(min=0, max=h)
        boxes_in[:, 3] = boxes_in[:, 3].clamp_(min=0, max=h)

        labels_in = labels[mask.view(-1)]
        img_croped = bgr[y:y + h, x:x + w, :]
        return img_croped, boxes_in, labels_in
    return bgr, boxes, labels


def subMean(bgr, mean):
    mean = np.array(mean, dtype=np.float32)
    bgr = bgr - mean
    return bgr


def subMeanDividedStd(rgb, mean, std):
    mean = np.array(mean, dtype=np.float32)
    std = np.array(std, dtype=np.float32)
    rgb = (rgb - mean) / std
    return rgb


def random_flip(im, boxes):
    if random.random() < 0.5:
        im_lr = np.fliplr(im).copy()
        h, w, _ = im.shape
        xmin = w - boxes[:, 2]
        xmax = w - boxes[:, 0]
        boxes[:, 0] = xmin
        boxes[:, 2] = xmax
        return im_lr, boxes
    return im, boxes


def random_bright(im, delta=16):
    alpha = random.random()
    if alpha > 0.3:
        im = im * alpha + random.randrange(-delta, delta)
        im = im.clip(min=0, max=255).astype(np.uint8)
    return im


def load_json(path):
    with open(path, mode="r") as f:
        data = json.load(f)
    return data

In [17]:
class Dataset(data.Dataset):
    image_size = 448

    def __init__(self, args, split, transform):
        print('DATASET INITIALIZATION')
        self.args = args
        root = args
        self.root_images = os.path.join(root, split, 'image')
        if split == "train":
            self.train = True
        else:
            self.train = False

        self.transform = transform
        self.f_names, self.boxes, self.labels = [], [], []
        self.mean = [123.675, 116.280, 103.530]  # RGB
        self.std = [58.395, 57.120, 57.375]
        annotation_path = os.path.join(root, 'annotations', 'instance_' + split + '.json')
        annotations = load_json(annotation_path)

        for annotation in annotations['annotations']:
            if annotation['image_name'] not in self.f_names:
                if len(self.f_names) != 0:
                    self.boxes.append(torch.Tensor(box))
                    self.labels.append(torch.LongTensor(label))
                box, label = [], []
                self.f_names.append(annotation['image_name'])

            bbox = annotation['bbox']
            x1, y1, x2, y2 = float(bbox[0]), float(bbox[1]), float(bbox[0] + bbox[2]), float(bbox[1] + bbox[3])
            box.append([x1, y1, x2, y2])
            label.append(int(annotation['category_id']))

        self.num_samples = len(self.boxes)

    def __getitem__(self, idx):
        f_name = self.f_names[idx]
        img = cv2.imread(os.path.join(self.root_images, f_name))
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx].clone()

        if self.train:
            # img = self.random_bright(img)
            img, boxes = random_flip(img, boxes)
            img, boxes = randomScale(img, boxes)
            img = randomBlur(img)
            img = RandomBrightness(img)
            img = RandomHue(img)
            img = RandomSaturation(img)
            img, boxes, labels = randomShift(img, boxes, labels)
            img, boxes, labels = randomCrop(img, boxes, labels)

        h, w, _ = img.shape
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)
        img = BGR2RGB(img)
        img = subMeanDividedStd(img, self.mean, self.std)
        img = cv2.resize(img, (self.image_size, self.image_size))
        target = self.encoder(boxes, labels)  # S*S*(B*5+C)
        for t in self.transform:
            img = t(img)

        return img, target

    def __len__(self):
        return self.num_samples

    def encoder(self, boxes, labels):
        S, B, C = 14, 2, 5
        grid_num = S
        target = torch.zeros((grid_num, grid_num, B * 5 + C))
        cell_size = 1.0 / grid_num
        wh = boxes[:, 2:] - boxes[:, :2]
        cxcy = (boxes[:, 2:] + boxes[:, :2]) / 2
        for i in range(cxcy.size()[0]):
            cxcy_sample = cxcy[i]
            ij = (cxcy_sample / cell_size).ceil() - 1
            target[int(ij[1]), int(ij[0]), 4] = 1
            target[int(ij[1]), int(ij[0]), 9] = 1
            target[int(ij[1]), int(ij[0]), int(labels[i]) + 9] = 1
            xy = ij * cell_size
            delta_xy = (cxcy_sample - xy) / cell_size
            target[int(ij[1]), int(ij[0]), 2:4] = wh[i]
            target[int(ij[1]), int(ij[0]), :2] = delta_xy
            target[int(ij[1]), int(ij[0]), 7:9] = wh[i]
            target[int(ij[1]), int(ij[0]), 5:7] = delta_xy
        return target


## YOLO Loss

In [18]:
# criterion = yololoss(args, l_coord=args.l_coord, l_noobj=args.l_noobj)
class yololoss(nn.Module):
    def __init__(self, args, l_coord, l_noobj):
        super(yololoss, self).__init__()
        self.S = args['yolo_S']
        self.B = args['yolo_B']
        self.C = args['yolo_C']
        self.len_pred = (5 * self.B) + self.C
        self.l_coord = l_coord
        self.l_noobj = l_noobj

    def compute_iou(self, box1, box2):
        """ compute IOU between boxes
            - box1 (bs, 4)  4: [x1, y1, x2, y2]  left top and right bottom
            - box2 (bs, 4)  4: [x1, y1, x2, y2]  left top and right bottom
        """
        N = box1.size(0)
        M = box2.size(0)
        
        lt = torch.max(
            box1[:, :2].unsqueeze(1).expand(N, M, 2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:, :2].unsqueeze(0).expand(N, M, 2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )

        rb = torch.min(
            box1[:, 2:].unsqueeze(1).expand(N, M, 2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:, 2:].unsqueeze(0).expand(N, M, 2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )

        wh = rb - lt  # [N,M,2]
        wh[wh < 0] = 0  # clip at 0
        inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

        area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])  # [N,]
        area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])  # [M,]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]

        iou = inter / (area1 + area2 - inter)
        return iou

    def forward(self, prediction, target):
        """   [1, 14, 14, 15]
            - prediction: (bs, S, S, B*5+C)    [x1, y1, w1, h1, c1, x2, y2, w2, h2, c2, confidence for C classes]
            - target: (bs, S, S, B*5+C)    [x, y, w, h, c, x, y, w, h, c, confidence for C classes]
        """        

        '''
        coo_mask torch.Size([12, 14, 14])
        noo_mask torch.Size([12, 14, 14])
        coo_mask torch.Size([12, 14, 14, 15])
        coo_pred torch.Size([74, 15])
        box_pred torch.Size([148, 5])
        class_pred torch.Size([74, 5])
        '''
        N = prediction.size()[0]
        coo_mask = target[:, :, :, 4] > 0     # 有目标[1, 14, 14] [bool type]
        noo_mask = target[:, :, :, 4] == 0    # 无目标[1, 14, 14] 
        
        coo_mask = coo_mask.unsqueeze(-1).expand_as(target)   # [1, 14, 14, 15]
        noo_mask = noo_mask.unsqueeze(-1).expand_as(target)   # [1, 14, 14, 15]

        coo_pred = prediction[coo_mask].view(-1, self.len_pred)             # [bs中所有的the prediction # grid that contain the object的和 , 15]
        box_pred = coo_pred[:, :self.B * 5].contiguous().view(-1, 5)        # [bs*the prediction # grid that contain the object*2, 5] grids [[x1,y1,w1,h1,c1],[x2,y2,w2,h2,c2], [],[], [],[]]
        class_pred = coo_pred[:, self.B * 5:]                               # [bs*the prediction # grid that contain the object, 5]
        
        coo_target = target[coo_mask].view(-1, self.len_pred)               # [the truth # grid that contain the object, 15]
        box_target = coo_target[:, :self.B * 5].contiguous().view(-1, 5)    # [the truth # grid that contain the object*2, 5] boxes [[x1,y1,w1,h1,c1],[x2,y2,w2,h2,c2], [],[], [],[]]
        class_target = coo_target[:, self.B * 5:]                           # [the truth # grid that contain the object, 5]
        
        """Non Maximum Suppression"""
        coo_response_mask = torch.cuda.BoolTensor(box_target.size())        # [[F,F,F,F,F],[F,F,F,F,F], [],[], [],[]]
        coo_response_mask.zero_()                                             # 全变成False

        coo_not_response_mask = torch.cuda.BoolTensor(box_target.size())    # coo_response_mask 和 coo_not_response_mask 是含有目标的box
        coo_not_response_mask.zero_()
        box_target_iou = torch.zeros(box_target.size()).cuda()              # [[0,0,0,0,0],[0,0,0,0,0], [],[], [],[]]

        for i in range(0, box_target.size()[0], self.B):   # (0, # truth that contain*2, 2)
            box1 = box_pred[i:i + self.B]                  # [[x1,y1,w1,h1,c1],[x2,y2,w2,h2,c2]]
            box1_xyxy = torch.FloatTensor(box1.size())     # [2, 5]
            """ from [x,y,w,h] to [x1,y1,x2,y2]"""
            box1_xyxy[:, :2] = box1[:, :2] / self.S - 0.5 * box1[:, 2:4]           # [[x1_left, y1_left, x1_right, y1_right, c1]
                                                                                   #  [x2_left, y2_left, x2_right, y2_right, c2]]
            box1_xyxy[:, 2:4] = box1[:, :2] / self.S + 0.5 * box1[:, 2:4]
            
            box1 = box1_xyxy[:,:4]
            
            # box2 = box_target[i].view(-1, 5)             # ？？？？？？？？？？这里我不理解？？？？？？？？？？？？？？？？？
            # box2_xyxy = torch.FloatTensor(box2.size())   # ？？？？？？？？？这两行原码未注释？？？？？？？？？？？？？？？？
            ###################################################################
            # TODO: Please fill the codes below to calculate the iou of the two boxes and substite the "?"
            # Note: return variable: iou_res (self.B, 1)
            ##################################################################
            box2 = box_target[i:i + self.B]                  # [[x1,y1,w1,h1,c1],[x2,y2,w2,h2,c2]]
            box2_xyxy = torch.FloatTensor(box2.size())       # [2, 5]
            """ from [x,y,w,h] to [x1,y1,x2,y2]"""
            box2_xyxy[:, :2] = box2[:, :2] / self.S - 0.5 * box2[:, 2:4]  # [[x1_left, y1_left, x1_right, y1_right, c1]
                                                                          #  [x2_left, y2_left, x2_right, y2_right, c2]]

            box2_xyxy[:, 2:4] = box2[:, :2] / self.S + 0.5 * box2[:, 2:4]
            box2 = box2_xyxy[:, 0:4]
            iou_res = self.compute_iou(box2, box1)[0]
            ##################################################################
            max_iou, max_index = iou_res.max(0)
            max_index = max_index.data.cuda()

            coo_response_mask[i + max_index] = True
            
            for j in range(self.B):
                if j == max_index:
                    continue
                else:
                    coo_not_response_mask[i + j] = True

            box_target_iou[i + max_index, torch.LongTensor([4]).cuda()] = max_iou.data.cuda()
        
        box_target_iou = box_target_iou.cuda()
        
        """Compute Term1 + Term2: Location Loss"""
        box_pred = box_pred.cuda()                                              # [[], [], [], ..., []]
        box_target = box_target.cuda()
        box_pred_response = box_pred[coo_response_mask].view(-1, 5)
        box_target_response = box_target[coo_response_mask].view(-1, 5)
        ###################################################################
        # TODO: Please fill the codes below to calculate the location loss
        ##################################################################
        loc_loss = 0
        x_2 = torch.square(box_pred[:,0] - box_target[:, 0])
        y_2 = torch.square(box_pred[:,1] - box_target[:, 1])
        loc_loss += self.l_coord*(torch.sum(x_2 + y_2))

        w = torch.square(torch.sqrt(box_pred[:,2]) - torch.sqrt(box_target[:, 2]))
        h = torch.square(torch.sqrt(box_pred[:,3]) - torch.sqrt(box_target[:, 3]))
        loc_loss += self.l_coord*(torch.sum(w+h))
        ##################################################################
        
        """Compute the 3rd Term: IOU loss for boxes containing the objects"""
        box_target_response_iou = box_target_iou[coo_response_mask].view(-1, 5)
        contain_loss = F.mse_loss(box_pred_response[:, 4], box_target_response_iou[:, 4], reduction='sum')
        
        """Compute the 4th Term (Part I): Not Response Loss"""
        ###################################################################
        # TODO: Please fill the codes below to calculate the Not Response Loss   
        ##################################################################
        box_pred_not_response = box_pred[coo_not_response_mask].view(-1, 5)
        box_target_not_response = box_target[coo_not_response_mask].view(-1, 5)
        not_response_loss = F.mse_loss(box_pred_not_response[:, 4], box_target_not_response[:, 4], reduction = 'sum')
        ##################################################################
        
        """Compute the 4th Term (Part II): No Object Contain Loss"""
        noo_pred = prediction[noo_mask].view(-1, self.len_pred).cuda()
        noo_target = target[noo_mask].view(-1, self.len_pred).cuda()
        noo_pred_mask = torch.cuda.BoolTensor(noo_pred.size()).cuda()
        noo_pred_mask.zero_()
        noo_pred_mask[:, 4] = 1
        noo_pred_mask[:, 9] = 1
        noo_pred_c = noo_pred[noo_pred_mask]
        noo_target_c = noo_target[noo_pred_mask]
        nooobj_loss = F.mse_loss(noo_pred_c, noo_target_c, reduction='sum')
        
        """Compute the 5th Term: Class Loss"""
        class_loss = F.mse_loss(class_pred, class_target, reduction='sum')
        
        """Summarize the five terms"""
        loss = self.l_coord * loc_loss + 2 * contain_loss + not_response_loss + self.l_noobj * nooobj_loss + class_loss

        return loss / N
loss = yololoss(args, 5, 0.5)

## train

In [20]:
file_root = '/content/deeplearning_assignment_1_dataset'
train_dataset = Dataset(file_root, split='train',
                        transform=[transforms.ToTensor()])
train_loader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True, num_workers=os.cpu_count() - 2)

val_dataset = Dataset(args['dataset_root'], split='val', transform=[transforms.ToTensor()])
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args['batch_size'], shuffle=False, num_workers=4)

DATASET INITIALIZATION
DATASET INITIALIZATION




In [32]:
len(train_dataset[0])

2

In [18]:
def load_pretrained(net):
    resnet = torchvision.models.resnet50(pretrained=True)
    resnet_state_dict = resnet.state_dict()

    net_dict = net.state_dict()
    for k in resnet_state_dict.keys():
        if k in net_dict.keys() and not k.startswith('fc'):
            net_dict[k] = resnet_state_dict[k]
    net.load_state_dict(net_dict)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('NUMBER OF CUDA DEVICES:', torch.cuda.device_count())

# Other settings
args['load_pretrain'] = False
print(args)

output_dir = args['output_dir']
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

np.random.seed(args['seed'])
torch.manual_seed(args['seed'])

####################################################################
criterion = yololoss(args, l_coord=args['l_coord'], l_noobj=args['l_noobj'])

hku_mmdetector = resnet50(args=args)
if args['load_pretrain']:
    load_pretrained(hku_mmdetector)
hku_mmdetector = hku_mmdetector.to(device)

hku_mmdetector.train()

# initialize optimizer
optimizer = torch.optim.AdamW(hku_mmdetector.parameters(), betas=(0.9, 0.999), lr=args['learning_rate'])

# initialize dataset
train_dataset = Dataset(args['dataset_root'], split='train', transform=[transforms.ToTensor()])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True, num_workers=4)

###################################################################
# TODO: Please fill the codes below to initialize the validation dataset
##################################################################
val_dataset = Dataset(args['dataset_root'], split='val', transform=[transforms.ToTensor()])
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args['batch_size'], shuffle=False, num_workers=4)
##################################################################
bs = args['batch_size']
print(f'NUMBER OF DATA SAMPLES: {len(train_dataset)}')
print(f'BATCH SIZE: {bs}')

train_dict = dict(iter=[], loss=[])
best_val_loss = np.inf
lossList = []
for epoch in range(args['num_epochs']):
    hku_mmdetector.train()

    # training
    total_loss = 0.
    print(('\n' + '%10s' * 3) % ('epoch', 'loss', 'gpu'))
    progress_bar = tqdm.tqdm(enumerate(train_loader), total=len(train_loader))
    for i, (images, target) in progress_bar:
        images = images.to(device)
        target = target.to(device)

        pred = hku_mmdetector(images)
        loss = criterion(pred, target)

        total_loss += loss.data
        lossList.append(loss.data)
        ###################################################################
        # TODO: Please fill the codes here to complete the gradient backward
        ##################################################################
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ##################################################################

        mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)
        s = ('%10s' + '%10.4g' + '%10s') % ('%g/%g' % (epoch + 1, args['num_epochs']), total_loss / (i + 1), mem)
        progress_bar.set_description(s)

    # validation
    validation_loss = 0.0
    hku_mmdetector.eval()
    progress_bar = tqdm.tqdm(enumerate(val_loader), total=len(val_loader))
    for i, (images, target) in progress_bar:
        images = images.to(device)
        target = target.to(device)

        prediction = hku_mmdetector(images)
        loss = criterion(prediction, target)
        validation_loss += loss.data
    validation_loss /= len(val_loader)
    print("validation loss:", validation_loss.item())

    if best_val_loss > validation_loss:
        best_val_loss = validation_loss

        save = {'state_dict': hku_mmdetector.state_dict()}
        torch.save(save, os.path.join(output_dir, 'hku_mmdetector_best.pth'))

    save = {'state_dict': hku_mmdetector.state_dict()}
    torch.save(save, os.path.join(output_dir, 'hku_mmdetector_epoch_'+str(epoch+1)+'.pth'))

    torch.cuda.empty_cache()


NUMBER OF CUDA DEVICES: 1
{'yolo_S': 14, 'yolo_B': 2, 'yolo_C': 5, 'num_epochs': 1, 'batch_size': 20, 'learning_rate': 1e-05, 'seed': 666, 'dataset_root': '/content/deeplearning_assignment_1_dataset', 'output_dir': '/content/checkpoints', 'l_coord': 5.0, 'l_noobj': 0.5, 'nms_threshold': 0.5, 'image_path': '/content/deeplearning_assignment_1_dataset/val/image/000001.jpg', 'model_path': '/content/checkpoints/hku_mmdetector_best.pth', 'unsave_img': False, 'vis_dir': '/content/vis_results', 'split': 'val', 'output_file': '/content/checkpoints/result.pkl', 'pos_threshold': 0.3, 'load_pretrain': False}
DATASET INITIALIZATION




DATASET INITIALIZATION
NUMBER OF DATA SAMPLES: 4964
BATCH SIZE: 20

     epoch      loss       gpu


       1/1     146.4     8.81G: 100%|██████████| 249/249 [09:52<00:00,  2.38s/it]
  2%|▏         | 3/125 [00:07<05:07,  2.52s/it]


OutOfMemoryError: ignored

## Non_maximum_suppression

In [34]:
# boxes.shape = [367, 4], cls_indexes.shape = [367], confidences.shape = [367]
def non_maximum_suppression(boxes, scores, threshold=0.5):
    """
    Input:
        - boxes: (bs, 4)  4: [x1, y1, x2, y2] left top and right bottom
        - scores: (bs, )   confidence score
        - threshold: int    delete bounding box with IoU greater than threshold
    Return:
        - A long int tensor whose size is (bs, )
    """
    ###################################################################
    # TODO: Please fill the codes below to calculate the iou of the two boxes
    # Hint: You can refer to the nms part implemented in loss.py but the input shapes are different here
    ##################################################################
    if len(boxes) == 0:
        return [], []

    # coordinates of bounding boxes
    start_x = boxes[:, 0]
    start_y = boxes[:, 1]
    end_x = boxes[:, 2]
    end_y = boxes[:, 3]

    # Confidence scores of bounding boxes

    # Picked bounding boxes
    picked_index = []

    # Compute areas of bounding boxes
    areas = (end_x - start_x + 1) * (end_y - start_y + 1)

    # Sort by confidence score of bounding boxes
    order = np.argsort(scores)

    # Iterate bounding boxes
    while order.shape[0] > 0:
        # The index of largest confidence score
        index = order[-1]

        # Pick the bounding box with largest confidence score
        picked_index.append(index)

        # Compute ordinates of intersection-over-union(IOU)
        x1 = np.maximum(start_x[index], start_x[order[:-1]])
        x2 = np.minimum(end_x[index], end_x[order[:-1]])
        y1 = np.maximum(start_y[index], start_y[order[:-1]])
        y2 = np.minimum(end_y[index], end_y[order[:-1]])

        # Compute areas of intersection-over-union
        w = np.maximum(0.0, x2 - x1 + 1)
        h = np.maximum(0.0, y2 - y1 + 1)
        intersection = w * h

        # Compute the ratio between intersection and union
        ratio = intersection / (areas[index] + areas[order[:-1]] - intersection)

        left = np.where(ratio < threshold)
        order = order[left]

    return torch.tensor(picked_index)
    ##################################################################


def pred2box(args, prediction):     # 1xSxSx(B*5+C)
    """
    This function calls non_maximum_suppression to transfer predictions to predicted boxes.
    """
    S, B, C = args['yolo_S'], args['yolo_B'], args['yolo_C']
    
    boxes, cls_indexes, confidences = [], [], []
    prediction = prediction.data.squeeze(0)  # [14, 14, 15]
    
    contain = [] 
    for b in range(B):
        tmp_contain = prediction[:, :, b * 5 + 4].unsqueeze(2) # confidences: [14, 14, 1]
        contain.append(tmp_contain)

    contain = torch.cat(contain, 2) # [14, 14, 2] 每个位置上有两个 confidence
    mask1 = contain > 0.1
    mask2 = (contain == contain.max())
    mask = mask1 + mask2           # 同时为真是2，只有一个为真是1，同时为假为0
    
    for i in range(S):
        for j in range(S):
            for b in range(B):
                if mask[i, j, b] == 1:
                    box = prediction[i, j, b * 5:b * 5 + 4]             # 若只有一个为真，那就把对应prediction里的box取出来 -> [x1, y1, w1, h1]
                    contain_prob = torch.FloatTensor([prediction[i, j, b * 5 + 4]]) # 把 confidence 取出来
                    xy = torch.FloatTensor([j, i]) * 1.0 / S
                    box[:2] = box[:2] * 1.0 / S + xy
                    box_xy = torch.FloatTensor(box.size())
                    box_xy[:2] = box[:2] - 0.5 * box[2:]
                    box_xy[2:] = box[:2] + 0.5 * box[2:]
                    max_prob, cls_index = torch.max(prediction[i, j, B*5:], 0)
                    cls_index = torch.LongTensor([cls_index])
                    if float((contain_prob * max_prob)[0]) > 0.1:
                        boxes.append(box_xy.view(1, 4))
                        cls_indexes.append(cls_index)
                        confidences.append(contain_prob * max_prob)

    if len(boxes) == 0:
        boxes = torch.zeros((1, 4))
        confidences = torch.zeros(1)
        cls_indexes = torch.zeros(1)
    else:
        boxes = torch.cat(boxes, 0)
        confidences = torch.cat(confidences, 0)
        cls_indexes = torch.cat(cls_indexes, 0)
    # print('boxes.shape, cls_indexes.shape, confidences.shape',boxes.shape, cls_indexes.shape, confidences.shape)
    #     !!!!!!!!!!!!!!!!!!!!!!!!!      # 
    keep = non_maximum_suppression(boxes, confidences, threshold=args['nms_threshold'])
    # print(keep.shape)
    return boxes[keep], cls_indexes[keep], confidences[keep]


def inference(args, model, img_path):
    """
    Inference the image with trained model to get the predicted bounding boxes
    """
    results = []
    img = cv2.imread(img_path)
    h, w, _ = img.shape
    img = cv2.resize(img, (448, 448))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    mean = (123.675, 116.280, 103.530)  # RGB
    std = (58.395, 57.120, 57.375)
    ###################################################################
    # TODO: Please fill the codes here to do the image normalization
    ##################################################################
    # Inew = (I - I.mean) / I.std
    img[0, :, :] = (img[0, :, :] - mean[0]) / std[0]
    img[1, :, :] = (img[1, :, :] - mean[1]) / std[1]
    img[2, :, :] = (img[2, :, :] - mean[2]) / std[2]
    ##################################################################

    transform = transforms.Compose([transforms.ToTensor(), ])
    img = transform(img).unsqueeze(0)
    img = img.cuda()
    model.cuda()

    # [1, 3, 448, 448]
    with torch.no_grad():
        prediction = model(img).cpu()  # 1xSxSx(B*5+C)
        boxes, cls_indices, confidences = pred2box(args, prediction)
    # !!!!!!!!!!!!!!!!!
    # print('boxed.shape',boxes.shape)
    # print('cls_indices.shape',cls_indices.shape)
    # print('confidences.shape',confidences.shape)
    for i, box in enumerate(boxes):
        x1 = int(box[0] * w)
        x2 = int(box[2] * w)
        y1 = int(box[1] * h)
        y2 = int(box[3] * h)
        cls_index = cls_indices[i]
        cls_index = int(cls_index)  # convert LongTensor to int
        conf = confidences[i]
        conf = float(conf)
        results.append([(x1, y1), (x2, y2), CAR_CLASSES[cls_index], img_path.split('/')[-1], conf])
    # print(results)
    return results

In [47]:
results = inference(args, hku_mmdetector50, '/content/deeplearning_assignment_1_dataset/val/image/000002.jpg')

In [48]:
results

[[(289, 221), (819, 1028), 'Car', '000002.jpg', 0.9999924898147583],
 [(1371, 190), (1371, 425), 'Car', '000002.jpg', 0.9999922513961792],
 [(775, -209), (872, 530), 'Car', '000002.jpg', 0.9999704360961914],
 [(1858, -222), (1859, 376), 'Car', '000002.jpg', 0.9998140335083008],
 [(685, 745), (687, 850), 'Car', '000002.jpg', 0.999272346496582],
 [(1245, 847), (1245, 887), 'Car', '000002.jpg', 0.999169111251831],
 [(415, 520), (416, 548), 'Car', '000002.jpg', 0.9968346357345581],
 [(144, -461), (162, 614), 'Car', '000002.jpg', 0.9948241710662842],
 [(1650, 526), (1656, 774), 'Car', '000002.jpg', 0.9931192398071289],
 [(-633, 868), (999, 1141), 'Car', '000002.jpg', 0.8075276017189026],
 [(1753, 907), (1943, 1137), 'Car', '000002.jpg', 0.6508580446243286],
 [(1088, -7), (2751, 469), 'Car', '000002.jpg', 0.6457266807556152],
 [(1222, 601), (2341, 1404), 'Car', '000002.jpg', 0.4876706898212433]]

## Valid

In [23]:
hku_mmdetector = resnet50(args=args)

In [51]:
class Evaluation:
    def __init__(self, predictions, targets, threshold):
        super(Evaluation, self).__init__()
        self.predictions = predictions
        self.targets = targets
        self.threshold = threshold

    @staticmethod
    def compute_ap(recall, precision):
        # average precision calculation
        recall = np.concatenate(([0.], [recall], [1.]))
        precision = np.concatenate(([0.], [precision], [0.]))

        for i in range(precision.size - 1, 0, -1):
            precision[i - 1] = max(precision[i - 1], precision[i])

        ap = 0.0  # average precision (AUC of the precision-recall curve).
        for i in range(precision.size - 1):
            ap += (recall[i + 1] - recall[i]) * precision[i + 1]

        return ap

    # targets = {('00001.jpg', 'Cyclist'):[[x1, y1, x2, y2], [], []], (,):[[], []]}
    # predictions:{'Cyclist':[['00001.jpg', conf, x1, y1, x2, y2], [], []], ''}
    # aps = Evaluation(predictions, targets, threshold=args['pos_threshold']).evaluate()
    def evaluate(self):
        aps = []
        print('CLASS'.ljust(25, ' '), 'AP')
        for class_name in CAR_CLASSES:
            class_preds = self.predictions[class_name]  # [[image_name,confidence,x1,y1,x2,y2],...]
            if len(class_preds) == 0:
                ap = 0
                print(f'{class_name}'.ljust(25, ' '), f'{ap:.2f}')
                aps.append(ap)
                continue
            image_ids = [x[0] for x in class_preds]                        # ['00001.jpg', '']
            confidence = np.array([float(x[1]) for x in class_preds])      # [0.99, 0.98]
            BB = np.array([x[2:] for x in class_preds])                    # [[x1, y1, x2, y2], [], []]
            # sort by confidence
            sorted_ind = np.argsort(-confidence)                           # 降序index
            sorted_scores = np.sort(-confidence)                           
            BB = BB[sorted_ind, :]                                         # 降序BB
            image_ids = [image_ids[x] for x in sorted_ind]                 # 降序id

            # go down dets and mark TPs and FPs
            npos = 0.          # target里属于当前类的所有的BB的个数
            truth_images_num = 0
            for (key1, key2) in self.targets:
                if key2 == class_name:
                    npos += len(self.targets[(key1, key2)])
                    truth_images_num += 1
            nd = len(image_ids) # 图片个数
            tp = np.zeros(nd)
            fp = np.zeros(nd)

            for d, image_id in enumerate(image_ids):       # 当前预测predict里认为是当前类别的所有image
                bb = BB[d]                                 # predict的bb
                if (image_id, class_name) in self.targets: # 如果target里有这个组合，说明预测的也许是对的
                    print('have probability')
                    BBGT = self.targets[(image_id, class_name)]
                    for x1y1_x2y2 in BBGT:
                        # compute overlaps
                        # intersection
                        x_min = np.maximum(x1y1_x2y2[0], bb[0])
                        y_min = np.maximum(x1y1_x2y2[1], bb[1])
                        x_max = np.minimum(x1y1_x2y2[2], bb[2])
                        y_max = np.minimum(x1y1_x2y2[3], bb[3])
                        w = np.maximum(x_max - x_min + 1., 0.)
                        h = np.maximum(y_max - y_min + 1., 0.)
                        intersection = w * h

                        union = (bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + (x1y1_x2y2[2] - x1y1_x2y2[0] + 1.) * (
                                x1y1_x2y2[3] - x1y1_x2y2[1] + 1.) - intersection
                        if union == 0:
                            print(bb, x1y1_x2y2)

                        overlaps = intersection / union
                        if overlaps > self.threshold:
                            tp[d] = 1
                            BBGT.remove(x1y1_x2y2)
                            if len(BBGT) == 0:
                                del self.targets[(image_id, class_name)]
                            break
                    fp[d] = 1 - tp[d]
                else:     # target没有这个组合，一定是错的，所以fp+1
                    fp[d] = 1
            ###################################################################
            # TODO: Please fill the codes to compute recall and precision
            ##################################################################
            print('tp',tp)
            print('fp', fp)
            truth_images_num
            recall = 0.
            precision = 0.
            recall = (np.sum(tp) / truth_images_num)
            precision = (np.sum(tp) / nd)
            print('recall', recall)
            print('precision', precision)
            ##################################################################
            ap = self.compute_ap(recall, precision)
            print(f'{class_name}'.ljust(25, ' '), f'{ap*100:.2f}')
            aps.append(ap)
        print('aps', aps)
        return aps



In [36]:
targets = defaultdict(list)
predictions = defaultdict(list)
image_list = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('DATA PREPARING...')
annotation_path = os.path.join(args['dataset_root'], 'annotations', 'instance_%s.json' % args['split'])
annotations = load_json(annotation_path)

for annotation in annotations['annotations']:
    image_name = annotation['image_name']
    if image_name not in image_list:
        image_list.append(image_name)
    bbox = annotation['bbox']
    x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3])
    c = int(annotation['category_id'])
    class_name = CAR_CLASSES[c-1]
    targets[(image_name, class_name)].append([x1, y1, x2, y2])    # targets = {('00001.jpg', 'Cyclist'):[[x1, y1, x2, y2], [], []], (,):[[], []]}
print('DONE.')
print('START EVALUATION...')
print('len(targets)',len(targets))
model = resnet50(args=args).to(device)

# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model)

# model.load_state_dict(torch.load(args.model_path)['state_dict'])
model.eval()

for image_name in image_list: # 原来是tqdm(image_list)
    image_path = os.path.join(args['dataset_root'], args['split'], 'image', image_name)
    result = inference(args, model, image_path)
    # # [(x1, y1), (x2, y2), CAR_CLASSES[cls_index], image_name, conf]
    for (x1, y1), (x2, y2), class_name, image_name, conf in result:
        predictions[class_name].append([image_name, conf, x1, y1, x2, y2]) # predictions:{'Cyclist':[['00001.jpg', conf, x1, y1, x2, y2], [], []], ''}
aps = Evaluation(predictions, targets, threshold=args['pos_threshold']).evaluate()
print(f'mAP: {np.mean(aps):.2f}')
# write the prediction result
f = open(args['output_file'], 'wb')
pickle.dump(args, f)
pickle.dump(predictions, f)
f.close()
print(predictions.shape)
print('BEGIN CALCULATE MAP...')
# aps = Evaluation(predictions, targets, threshold=args['pos_threshold']).evaluate()
# print(f'mAP: {np.mean(aps):.2f}')
print('DONE.')

DATA PREPARING...
DONE.
START EVALUATION...
len(targets) 5577
CLASS                     AP
recall 0.0
precision 0.0


ValueError: ignored

In [None]:
aps = Evaluation(predictions, targets, threshold=args['pos_threshold']).evaluate()

## Predict

In [None]:
def predict(args, model):
    image_path = args.image_path
    image = cv2.imread(image_path)

    print('PREDICTING...')
    result = inference(args, model, image_path)

    for x1y1, x2y2, class_name, _, prob in result:
        color = COLORS[class_name]
        cv2.rectangle(image, x1y1, x2y2, color, 2)

        label = class_name + str(round(prob, 2))
        text_size, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)

        p1 = (x1y1[0], x1y1[1] - text_size[1])
        cv2.rectangle(image, (p1[0] - 2 // 2, p1[1] - 2 - baseline), (p1[0] + text_size[0], p1[1] + text_size[1]),
                      color, -1)
        cv2.putText(image, label, (p1[0], p1[1] + baseline), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1, 8)

    if not args.unsave_img:
        vis_dir = args.vis_dir
        if not os.path.exists(vis_dir):
            os.makedirs(vis_dir)
        save_path = os.path.join(vis_dir, image_path.split('/')[-1])
        cv2.imwrite(save_path, image)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--yolo_S', default=14, type=int, help='YOLO grid num')
    parser.add_argument('--yolo_B', default=2, type=int, help='YOLO box num')
    parser.add_argument('--yolo_C', default=5, type=int, help='detection class num')

    parser.add_argument('--image_path', default="./ass1_dataset/val/image/000001.jpg", help='Path to Image file')
    parser.add_argument('--model_path', default="./checkpoints/hku_mmdetector_best.pth", help='Pretrained Model Path')
    parser.add_argument('--unsave_img', action='store_true', help='Do not save the image after detection')
    parser.add_argument('--vis_dir', default="./vis_results", help='Dir for Visualization')

    parser.add_argument('--nms_threshold', default=0.5, type=float, help='Threshold for non maximum suppression')
    args = parser.parse_args()

    ####################################################################
    # Prediction
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = resnet50(args=args).to(device)

    print('LOADING MODEL...')
    #     if torch.cuda.device_count() > 1:
    #         model = nn.DataParallel(model)

    # If you have single gpu then please modify model loading process
    model.load_state_dict(torch.load(args.model_path)['state_dict'])
    model.eval()
    predict(args, model)

In [6]:
torch.cat?