# Object Detection
* 2 people in a group
* Deadline: 10/13

## Dataset

- PASCAL VOC 2007
  - Number of class: 20
  - The data list is provided in the google drive. However, you have to download the training/testing data from http://host.robots.ox.ac.uk/pascal/VOC/voc2007/. 
    - Train/Val data: 5011
        - Each row contains one image and its bounding boxes.
        - filename ($x_{min}$, $y_{min}$, $x_{max}$, $y_{max}$, $label$) $\times$ object_num
        - class idx starts from 1
    - Test data: 4951
        - filename ($x_{min}$, $y_{min}$, $x_{max}$, $y_{max}$, $label$) $\times$ object_num
        - class idx starts from 0
    


### Loading your data into Google Colab with Google Drive

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Resnet50 backbone

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models.resnet import BasicBlock, Bottleneck
from torchvision.models.resnet import model_urls
from torchsummary import summary

class classify_bottleneck(nn.Module):
  expansion = 1

  def __init__(self, inplanes, planes, stride=1, block_type='A'):
    super(classify_bottleneck, self).__init__()
    self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
    self.bn1 = nn.BatchNorm2d(planes)
    self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=2, bias=False,dilation=2)
    self.bn2 = nn.BatchNorm2d(planes)
    self.conv3 = nn.Conv2d(planes, planes, kernel_size=1, bias=False)
    self.bn3 = nn.BatchNorm2d(planes)

    self.downsample = nn.Sequential()
    if stride != 1 or block_type=='B':
        self.downsample = nn.Sequential(
            nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm2d(self.expansion*planes)
        )

  def forward(self, x):
    out = F.relu(self.bn1(self.conv1(x)))
    out = F.relu(self.bn2(self.conv2(out)))
    out = self.bn3(self.conv3(out))
    out += self.downsample(x)
    out = F.relu(out)
    return out

class ResNetYoloV1(nn.Module):

    def __init__(self, resnet_type):
	
        resnet_spec = {18: (BasicBlock, [2, 2, 2, 2], [64, 64, 128, 256, 512], 'resnet18'),
		       34: (BasicBlock, [3, 4, 6, 3], [64, 64, 128, 256, 512], 'resnet34'),
		       50: (Bottleneck, [3, 4, 6, 3], [64, 256, 512, 1024, 2048], 'resnet50'),
		       101: (Bottleneck, [3, 4, 23, 3], [64, 256, 512, 1024, 2048], 'resnet101'),
		       152: (Bottleneck, [3, 8, 36, 3], [64, 256, 512, 1024, 2048], 'resnet152')}
        block, layers, channels, name = resnet_spec[resnet_type]
        
        self.name = name
        self.inplanes = 64
        super(ResNetYoloV1, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.layer5 = self._make_classify_layer(in_channels=2048) #2048*14*14

        self.conv_end = nn.Conv2d(256, 30, kernel_size=3, stride=2, padding=1, bias=False)#30*7*7
        self.bn_end = nn.BatchNorm2d(30)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.normal_(m.weight, mean=0, std=0.001)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
    
    def _make_classify_layer(self,in_channels):
        layers = []
        layers.append(classify_bottleneck(inplanes=in_channels, planes=256, block_type='B'))
        layers.append(classify_bottleneck(inplanes=256, planes=256))
        layers.append(classify_bottleneck(inplanes=256, planes=256))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x1 = self.layer1(x)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)
        # x4 layer output size: (B, 2048, 7, 7)
        x5 = self.layer5(x4)
        x = self.conv_end(x5)
        x = self.bn_end(x)
        x = torch.sigmoid(x) #归一化到0-1
        # x = x.view(-1,7,7,30)
        x = x.permute(0,2,3,1) #(-1,7,7,30)
        return x

    def init_weights(self):
        org_resnet = torch.utils.model_zoo.load_url(model_urls[self.name])
        # drop orginal resnet fc layer, add 'None' in case of no fc layer, that will raise error
        org_resnet.pop('fc.weight', None)
        org_resnet.pop('fc.bias', None)

        self.load_state_dict(org_resnet)
        print("Initialize resnet from model zoo")

def load_change_weights(model, model_name):
  
  org_resnet = torch.utils.model_zoo.load_url(model_urls[model_name])
  org_resnet.pop('fc.weight', None)
  org_resnet.pop('fc.bias', None)

  dd = model.state_dict()
  for k in org_resnet.keys():
      # print(k)
      if k in dd.keys() and not k.startswith('fc'):
          # print('yes')
          dd[k] = org_resnet[k]
  model.load_state_dict(dd)
  return model

### Assignment
You are required to build a model to perform object detection on the provided Pascal VOC dataset in this project.
Here are some hints that help you to accomplish the project successfully.

### Hints
- YOLOv1 is the simplest and suggested model to be implemented.
- Be careful of the normalization techniques on bounding boxes.
    1. normalize the height and width with image size to fall into 0 and 1
    2. x and y coordinates are parameterized to be the offsets of a particular grid cell and also bounded by 0 and 1
- Loss function has a great impact on training stability.
    1. loss function is the most important in this project, especially in calculating IOU
    2. only one bounding box predictor is responsible for each object
    3. weights for different types of losses
    4. predict the square root of height and width instead of predicting them directly
- Data augmentation.
    1. It contains only 5011 images in total. Furthermore, the labels are highly imbalanced.
    2. Random scaling and translations are applied when training YOLO.
    3. Note that the bounding box coordinates have to be changed accordingly if the image was transformed.

### Evaluation Metric
- Please evaluate your model on Pascal VOC testing set using Mean Average Precision (mAP).
- Write a brief report including your implementation, performance and  qualitative results(visualize bounding box on some images). 
- For more detailed explanation of mAP, please follow https://github.com/rafaelpadilla/Object-Detection-Metrics

In [6]:
from torch.utils.data import DataLoader
from torch.autograd import Variable
import torchvision.transforms as transforms
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Dataset & data augmentation

In [10]:
import os.path
import random
import numpy as np
import torch
import torch.utils.data as data
import torchvision.transforms as transforms
import cv2
import matplotlib.pyplot as plt

class yoloDataset(data.Dataset):
    image_size = 448 # Size to be aligned

    # Parsing data list
    def __init__(self,root,list_file,train,transform):
        self.root = root
        self.train = train
        self.transform = transform
        self.fnames = []
        self.boxes = []
        self.labels = []
        self.mean = (123,117,104) # RGB

        # Cat multiple list files together.
        '''if isinstance(list_file, list):
            # This is especially useful for voc07/voc12 combination.
            tmp_file = '/tmp/listfile.txt'
            os.system('cat %s > %s' % (' '.join(list_file), tmp_file))
            list_file = tmp_file'''

        with open(list_file) as f:
            lines  = f.readlines()

        # format of each line: filename (x_min, y_min, x_max, y_max, label) * object_num
        for line in lines:
            splited = line.strip().split() # .strip(): reomove space, tab from the end of each line
            self.fnames.append(splited[0])
            num_boxes = (len(splited) - 1) // 5
            box=[]
            label=[]
            for i in range(num_boxes):
                x = float(splited[1+5*i])
                y = float(splited[2+5*i])
                x2 = float(splited[3+5*i])
                y2 = float(splited[4+5*i])
                c = splited[5+5*i]
                box.append([x,y,x2,y2])
                label.append(int(c)+1) # +1: since the idx start from 0
            self.boxes.append(torch.Tensor(box))
            self.labels.append(torch.LongTensor(label))
        self.num_samples = len(self.boxes)

    # Getting single transformed, preprocessed image and its target
    def __getitem__(self,idx):
        fname = self.fnames[idx]
        img = cv2.imread(os.path.join(self.root+fname))
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx].clone()

        # Randomly transforming image
        if self.train:
            #img = self.random_bright(img)
            img, boxes = self.random_flip(img, boxes)
            img,boxes = self.randomScale(img,boxes)
            img = self.randomBlur(img)
            img = self.RandomBrightness(img)
            img = self.RandomHue(img)
            img = self.RandomSaturation(img)
            img,boxes,labels = self.randomShift(img,boxes,labels)
            img,boxes,labels = self.randomCrop(img,boxes,labels)

        # #debug: showing the transformed image
        # box_show = boxes.numpy().reshape(-1)
        # # print(box_show)
        # img_show = self.BGR2RGB(img)
        # pt1=(int(box_show[0]),int(box_show[1])); pt2=(int(box_show[2]),int(box_show[3]))
        # cv2.rectangle(img_show,pt1=pt1,pt2=pt2,color=(0,255,0),thickness=1)
        # plt.figure()
        
        # plt.imshow(img_show)
        # plt.show()
        # #debug

        h,w,_ = img.shape
        boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 
        # .expand_as(other): expand this tensor as other
        # [w, h, w, h] (1, 4) will be expanded to (#box, 4)

        img = self.BGR2RGB(img) # because pytorch pretrained model use RGB
        img = self.subMean(img,self.mean)
        img = cv2.resize(img,(self.image_size,self.image_size))
        target = self.encoder(boxes,labels) # 7x7x30, where 30 = 5*2(xywh+confidence for 2 boxes) + 20(classes)
        for t in self.transform:
            img = t(img)

        return img,target

    def __len__(self):
        return self.num_samples

    # Utils
    # Encoding the boxes, labels for single image
    def encoder(self,boxes,labels):
        grid_num = 7
        target = torch.zeros((grid_num,grid_num,30))
        cell_size = 1./grid_num
        wh = boxes[:,2:]-boxes[:,:2]
        cxcy = (boxes[:,2:]+boxes[:,:2])/2
        for i in range(cxcy.size()[0]):
            cxcy_sample = cxcy[i]
            ij = (cxcy_sample/cell_size).ceil()-1 #
            target[int(ij[1]),int(ij[0]),4] = 1
            target[int(ij[1]),int(ij[0]),9] = 1
            target[int(ij[1]),int(ij[0]),int(labels[i])+9] = 1
            xy = ij*cell_size # upper left coordinates of corresponding grid
            delta_xy = (cxcy_sample -xy)/cell_size
            target[int(ij[1]),int(ij[0]),2:4] = wh[i]
            target[int(ij[1]),int(ij[0]),:2] = delta_xy
            target[int(ij[1]),int(ij[0]),7:9] = wh[i]
            target[int(ij[1]),int(ij[0]),5:7] = delta_xy
        return target

    def BGR2RGB(self,img):
        return cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    def BGR2HSV(self,img):
        return cv2.cvtColor(img,cv2.COLOR_BGR2HSV)
    def HSV2BGR(self,img):
        return cv2.cvtColor(img,cv2.COLOR_HSV2BGR)
    
    def subMean(self,bgr,mean):
        mean = np.array(mean, dtype=np.float32)
        bgr = bgr - mean
        return bgr
    
    def RandomBrightness(self,bgr):
        if random.random() < 0.5:
            hsv = self.BGR2HSV(bgr)
            h,s,v = cv2.split(hsv)
            adjust = random.choice([0.5,1.5])
            v = v*adjust
            v = np.clip(v, 0, 255).astype(hsv.dtype)
            hsv = cv2.merge((h,s,v))
            bgr = self.HSV2BGR(hsv)
        return bgr

    def RandomSaturation(self,bgr):
        if random.random() < 0.5:
            hsv = self.BGR2HSV(bgr)
            h,s,v = cv2.split(hsv)
            adjust = random.choice([0.5,1.5])
            s = s*adjust
            s = np.clip(s, 0, 255).astype(hsv.dtype)
            hsv = cv2.merge((h,s,v))
            bgr = self.HSV2BGR(hsv)
        return bgr

    def RandomHue(self,bgr):
        if random.random() < 0.5:
            hsv = self.BGR2HSV(bgr)
            h,s,v = cv2.split(hsv)
            adjust = random.choice([0.5,1.5])
            h = h*adjust
            h = np.clip(h, 0, 255).astype(hsv.dtype)
            hsv = cv2.merge((h,s,v))
            bgr = self.HSV2BGR(hsv)
        return bgr

    def randomBlur(self,bgr):
        if random.random()<0.5:
            bgr = cv2.blur(bgr,(5,5))
        return bgr

    def randomShift(self,bgr,boxes,labels):
        center = (boxes[:,2:]+boxes[:,:2])/2
        if random.random() <0.5:
            height,width,c = bgr.shape
            after_shfit_image = np.zeros((height,width,c),dtype=bgr.dtype)
            after_shfit_image[:,:,:] = (104,117,123) #bgr
            shift_x = random.uniform(-width*0.2,width*0.2)
            shift_y = random.uniform(-height*0.2,height*0.2)
            #print(bgr.shape,shift_x,shift_y)
            #原图像的平移
            if shift_x>=0 and shift_y>=0:
                after_shfit_image[int(shift_y):,int(shift_x):,:] = bgr[:height-int(shift_y),:width-int(shift_x),:]
            elif shift_x>=0 and shift_y<0:
                after_shfit_image[:height+int(shift_y),int(shift_x):,:] = bgr[-int(shift_y):,:width-int(shift_x),:]
            elif shift_x <0 and shift_y >=0:
                after_shfit_image[int(shift_y):,:width+int(shift_x),:] = bgr[:height-int(shift_y),-int(shift_x):,:]
            elif shift_x<0 and shift_y<0:
                after_shfit_image[:height+int(shift_y),:width+int(shift_x),:] = bgr[-int(shift_y):,-int(shift_x):,:]

            shift_xy = torch.FloatTensor([[int(shift_x),int(shift_y)]]).expand_as(center)
            center = center + shift_xy
            mask1 = (center[:,0] >0) & (center[:,0] < width)
            mask2 = (center[:,1] >0) & (center[:,1] < height)
            mask = (mask1 & mask2).view(-1,1)
            boxes_in = boxes[mask.expand_as(boxes)].view(-1,4)
            if len(boxes_in) == 0:
                return bgr,boxes,labels
            box_shift = torch.FloatTensor([[int(shift_x),int(shift_y),int(shift_x),int(shift_y)]]).expand_as(boxes_in)
            boxes_in = boxes_in+box_shift
            labels_in = labels[mask.view(-1)]
            return after_shfit_image,boxes_in,labels_in
        return bgr,boxes,labels

    def randomScale(self,bgr,boxes):
        #固定住高度，以0.8-1.2伸缩宽度，做图像形变
        if random.random() < 0.5:
            scale = random.uniform(0.8,1.2)
            height,width,c = bgr.shape
            bgr = cv2.resize(bgr,(int(width*scale),height))
            scale_tensor = torch.FloatTensor([[scale,1,scale,1]]).expand_as(boxes)
            boxes = boxes * scale_tensor
            return bgr,boxes
        return bgr,boxes

    def randomCrop(self,bgr,boxes,labels):
        if random.random() < 0.5:
            center = (boxes[:,2:]+boxes[:,:2])/2
            height,width,c = bgr.shape
            h = random.uniform(0.6*height,height)
            w = random.uniform(0.6*width,width)
            x = random.uniform(0,width-w)
            y = random.uniform(0,height-h)
            x,y,h,w = int(x),int(y),int(h),int(w)

            center = center - torch.FloatTensor([[x,y]]).expand_as(center)
            mask1 = (center[:,0]>0) & (center[:,0]<w)
            mask2 = (center[:,1]>0) & (center[:,1]<h)
            mask = (mask1 & mask2).view(-1,1)

            boxes_in = boxes[mask.expand_as(boxes)].view(-1,4)
            if(len(boxes_in)==0):
                return bgr,boxes,labels
            box_shift = torch.FloatTensor([[x,y,x,y]]).expand_as(boxes_in)

            boxes_in = boxes_in - box_shift
            boxes_in[:,0]=boxes_in[:,0].clamp_(min=0,max=w)
            boxes_in[:,2]=boxes_in[:,2].clamp_(min=0,max=w)
            boxes_in[:,1]=boxes_in[:,1].clamp_(min=0,max=h)
            boxes_in[:,3]=boxes_in[:,3].clamp_(min=0,max=h)

            labels_in = labels[mask.view(-1)]
            img_croped = bgr[y:y+h,x:x+w,:]
            return img_croped,boxes_in,labels_in
        return bgr,boxes,labels

    def random_flip(self, im, boxes):
        if random.random() < 0.5:
            im_lr = np.fliplr(im).copy()
            h,w,_ = im.shape
            xmin = w - boxes[:,2]
            xmax = w - boxes[:,0]
            boxes[:,0] = xmin
            boxes[:,2] = xmax
            return im_lr, boxes
        return im, boxes

    def random_bright(self, im, delta=16): # unused
        alpha = random.random()
        if alpha > 0.3:
            im = im * alpha + random.randrange(-delta,delta)
            im = im.clip(min=0,max=255).astype(np.uint8)
        return im

## Yolov1 Loss


In [8]:
# Loss function
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class yolov1Loss(nn.Module):
    def __init__(self, S, B, C, lambda_coord, lambda_noobj):
        # Args:
        #    S: size of grid
        #    B: number of box
        #    C: number of class
        super(yolov1Loss, self).__init__()
        self.S = S 
        self.B = B 
        self.C = C 
        self.l_coord = lambda_coord
        self.l_noobj = lambda_noobj
    
    def calculateIoU(self, box1, box2):
        # calculate the intersection over the union of two sets of boxes, each box contains [xmin,ymin,xmax,ymax]
        # Args:
        #    size of box1 = [n,4]
        #    size of box2 = [m,4]
        # Return:
        #    size of Iou of two sets of boxes = [n,m]
        n = box1.size(0)
        m = box2.size(0)
        
        # take the max of left-bottom point and the min of right-top point 
        # to calculate the left-top point and the right-bottom point of the intersection
        lt = torch.max(
            box1[:,:2].unsqueeze(1).expand(n,m,2), # take [xmin,ymin]: [n,2] -> [n,1,2] -> [n,m,2]
            box2[:,:2].unsqueeze(0).expand(n,m,2)  # take [xmin,ymin]: [m,2] -> [1,m,2] -> [n,m,2]
        )
        
        rb = torch.min(
            box1[:,2:].unsqueeze(1).expand(n,m,2), # take [xmax,ymax]: [n,2] -> [n,1,2] -> [n,m,2]
            box2[:,2:].unsqueeze(0).expand(n,m,2)  # take [xmax,ymax]: [m,2] -> [1,m,2] -> [n,m,2]
        )
        
        # calculate weight and height of intersection areas and check if intersection area is 0
        wh = rb - lt # [n,m,2]
        wh[wh<0] = 0 # if max_left >= min_right or max_bottom >= min_top, then there is no intersection
        intersection = wh[:,:,0] * wh[:,:,1] # [n,m]
        
        area1 = (box1[:,2]-box1[:,0])*(box1[:,3]-box1[:,1])  #[n,]     
        area2 = (box2[:,2]-box2[:,0])*(box2[:,3]-box2[:,1])  #[m,]
        area1 = area1.unsqueeze(1).expand(n,m) # [n,] -> [n,1] -> [n,m]
        area2 = area2.unsqueeze(0).expand(n,m) # [m,] -> [1,m] -> [n,m]
        
        iou = intersection / (area1 + area2 - intersection)
        return iou
    def forward(self, preds, targets):
        # Args:
        #    size of preds = [batchsize, S, S, Bx5+20]: Bx5 means each box has [x,y,w,h,c] 5 values
        #    size of targets = [batchsize, S, S, Bx5+20]
        S, B, C = self.S, self.B, self.C
        N = B * 5 + C 
        batchsize = preds.size(0)
        coord_mask = targets[:,:,:,4] > 0        
        noobj_mask = targets[:,:,:,4] == 0
        coord_mask = coord_mask.unsqueeze(-1).expand(batchsize, S, S, N)        
        noobj_mask = noobj_mask.unsqueeze(-1).expand(batchsize, S, S, N)
        
        coord_pred = preds[coord_mask].view(-1, N)
        box_pred = coord_pred[:,:5*B].contiguous().view(-1, 5)
        class_pred = coord_pred[:,5*B:]
        
        coord_target = targets[coord_mask].view(-1, N)
        box_target = coord_target[:,:5*B].contiguous().view(-1, 5)
        class_target = coord_target[:,5*B:]
        
        # compute noobj_loss: only calculate confidence loss
        noobj_pred = preds[noobj_mask].view(-1, N)
        noobj_target = targets[noobj_mask].view(-1, N)
        noobj_pred_mask = torch.cuda.BoolTensor(noobj_pred.size())
        noobj_pred_mask.zero_()
        for b in range(B):
            noobj_pred_mask[:, 4+b*5] = 1
        noobj_pred_conf = noobj_pred[noobj_pred_mask]
        noobj_target_conf = noobj_target[noobj_pred_mask]  
        loss_noobj = F.mse_loss(noobj_pred_conf, noobj_target_conf, reduction = 'sum')
        
        # compute coord_loss
        coord_response_mask = torch.cuda.BoolTensor(box_target.size()).fill_(0) # only compute the loss of the box containing the center of object
        box_target_iou = torch.zeros(box_target.size()).cuda()
        
        # Choose the pred box having the highest IoU for each target boxes
        for i in range(0, box_target.size(0), B):
            # take all predict boxes at i-th cell
            pred_boxes = box_pred[i:i+B]
            pred_xyxy = Variable(torch.FloatTensor(pred_boxes.size()))
            pred_xyxy[:, :2] = pred_boxes[:, :2]/float(S) - 0.5*pred_boxes[:,2:4]
            pred_xyxy[:, 2:4] = pred_boxes[:, :2]/float(S) + 0.5*pred_boxes[:,2:4]   
            
            # take all target boxes at i-th cell
            # Since target boxes contained by each cell are identical in current implement,thus just take the first one
            target_boxes = box_target[i].view(-1, 5)
            target_xyxy = Variable(torch.FloatTensor(target_boxes.size()))
            target_xyxy[:, :2] = target_boxes[:, :2]/float(S) - 0.5*target_boxes[:,2:4]
            target_xyxy[:, 2:4] = target_boxes[:, :2]/float(S) + 0.5*target_boxes[:,2:4]
                                   
            iou = self.calculateIoU(pred_xyxy[:,:4], target_xyxy[:,:4]) # [B,1]
            max_iou, max_index = iou.max(0)
            max_index = max_index.data.cuda()
            
            coord_response_mask[i+max_index] = 1
            box_target_iou[i+max_index, torch.LongTensor([4]).cuda()] = (max_iou).data.cuda()
        
        # calculate the loss of the response boxes
        box_target_iou = Variable(box_target_iou).cuda()
        box_pred_response = box_pred[coord_response_mask].view(-1, 5)
        box_target_response = box_target[coord_response_mask].view(-1, 5)
        target_iou = box_target_iou[coord_response_mask].view(-1, 5)
        loss_xy = F.mse_loss(box_pred_response[:,:2], box_target_response[:,:2], reduction = 'sum')
        loss_wh = F.mse_loss(torch.sqrt(box_pred_response[:,2:4]), torch.sqrt(box_target_response[:,2:4]), reduction = 'sum')                
        loss_obj = F. mse_loss(box_pred_response[:,4], target_iou[:,4], reduction = 'sum')
        
        # calculate the class probability loss of cells containing objects
        loss_class = F.mse_loss(class_pred, class_target, reduction = 'sum')
        
        # total loss
        loss = self.l_coord * (loss_xy + loss_wh) + loss_obj + self.l_noobj*loss_noobj + loss_class
        loss = loss/float(batchsize)
                                   
        return loss

## Training Process


In [8]:
device = torch.device("cuda" if torch.cuda.is_available else "cpu")

In [11]:
batch_size = 8 #64
momentum = 0.9
decay = 0.0005
epochs = 50

file_root = 'VOCdevkit/VOC2007/JPEGImages/'

def train():
  ## model initiate
  learning_rate = 0.001
  yolov1 = ResNetYoloV1(50)
  yolov1 = load_change_weights(yolov1, 'resnet50')

  net = yolov1
  net.cuda()
  optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum = momentum, weight_decay=decay)
  # load data
  train_dataset = yoloDataset(root=file_root,list_file=file_root+'voc2007.txt', train=True,transform = [transforms.ToTensor()] )
  train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,num_workers=4)
  test_dataset = yoloDataset(root=file_root,list_file=file_root+'voc2007test.txt',train=False,transform = [transforms.ToTensor()] )
  test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False,num_workers=4)
  print('the dataset has %d images' % (len(train_dataset)))
  print('the batch_size is %d' % (batch_size))
  # training process
  net.train()
  criterion = yolov1Loss(7,2,20,5,0.5)
  for ep in range(epochs):
    if ep >= 2:
      learning_rate = 0.01
    if ep >= 30:
      learning_rate = 0.001
    if ep >= 45:
      learning_rate = 0.0001
    for param_group in optimizer.param_groups:
      param_group['lr'] = learning_rate
    total_loss = 0.

    for i, (images, target) in enumerate(train_loader):
        images = Variable(images)
        target = Variable(target)
        images,target = images.cuda(),target.cuda()
        
        pred = net(images)
        loss = criterion(pred,target)
#         total_loss += loss.data[0]
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 5 == 0:
            print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f' 
            %(ep+1, epochs, i+1, len(train_loader), loss.item(), total_loss / (i+1)))
    #validation process
    validation_loss = 0.0
    net.eval()
    for (images, target) in enumerate(test_loader):
      images = Variable(images)
      target = Variable(target)
      images, target = images.cuda(), target.cuda()

      pred = net(images)
      loss = criterion(pred, target)
      validation_loss += loss.item()
    validation_loss/=len(test_loader)
    print('Test epoch [%d/%d], average_loss: %.4f', ep+1, epochs, validation_loss)

train()

the dataset has 5010 images
the batch_size is 8
Epoch [1/50], Iter [5/627] Loss: 20.0711, average_loss: 23.9822
Epoch [1/50], Iter [10/627] Loss: 21.8621, average_loss: 22.4023
Epoch [1/50], Iter [15/627] Loss: 20.8807, average_loss: 21.1255
Epoch [1/50], Iter [20/627] Loss: 14.5793, average_loss: 19.8461
Epoch [1/50], Iter [25/627] Loss: 15.6770, average_loss: 19.0664
Epoch [1/50], Iter [30/627] Loss: 16.4223, average_loss: 18.2948
Epoch [1/50], Iter [35/627] Loss: 16.9417, average_loss: 17.6521
Epoch [1/50], Iter [40/627] Loss: 14.5529, average_loss: 16.8268
Epoch [1/50], Iter [45/627] Loss: 25.2479, average_loss: 16.4145
Epoch [1/50], Iter [50/627] Loss: 9.7385, average_loss: 15.7722
Epoch [1/50], Iter [55/627] Loss: 13.8839, average_loss: 15.2486
Epoch [1/50], Iter [60/627] Loss: 8.6060, average_loss: 14.6320
Epoch [1/50], Iter [65/627] Loss: 9.0687, average_loss: 14.2661
Epoch [1/50], Iter [70/627] Loss: 12.9680, average_loss: 14.0402
Epoch [1/50], Iter [75/627] Loss: 12.2042, ave

[ WARN:0@1643.273] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('VOCdevkit/VOC2007/JPEGImages/002824.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1643.274] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('VOCdevkit/VOC2007/JPEGImages/005505.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1643.274] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('VOCdevkit/VOC2007/JPEGImages/007962.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1643.275] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('VOCdevkit/VOC2007/JPEGImages/001912.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1643.275] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('VOCdevkit/VOC2007/JPEGImages/003490.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1643.275] global /io/opencv/modules/imgco

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/andrea/anaconda3/envs/Lab_env/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/andrea/anaconda3/envs/Lab_env/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/andrea/anaconda3/envs/Lab_env/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_298275/510945355.py", line 83, in __getitem__
    h,w,_ = img.shape
AttributeError: 'NoneType' object has no attribute 'shape'


## Evaluation on VOC


In [20]:
'''import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"'''
import numpy as np
VOC_CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat',
        'bottle', 'bus', 'car', 'cat', 'chair',
        'cow', 'diningtable', 'dog', 'horse',
        'motorbike', 'person', 'pottedplant',
        'sheep', 'sofa', 'train', 'tvmonitor')

def voc_ap(rec,prec,use_07_metric=False):
    if use_07_metric:
        # 11 point metric
        ap = 0.
        for t in np.arange(0.,1.1,0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec>=t])
            ap = ap + p/11.

    else:
        # correct ap caculation
        mrec = np.concatenate(([0.],rec,[1.]))
        mpre = np.concatenate(([0.],prec,[0.]))

        for i in range(mpre.size -1, 0, -1):
            mpre[i-1] = np.maximum(mpre[i-1],mpre[i])

        i = np.where(mrec[1:] != mrec[:-1])[0]

        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])

    return ap

def voc_eval(preds,target,VOC_CLASSES=VOC_CLASSES,threshold=0.5,use_07_metric=False,):
    '''
    preds {'cat':[[image_id,confidence,x1,y1,x2,y2],...],'dog':[[],...]}
    target {(image_id,class):[[],]}
    '''
    aps = []
    for i,class_ in enumerate(VOC_CLASSES):
        pred = preds[class_] #[[image_id,confidence,x1,y1,x2,y2],...]
        if len(pred) == 0: #如果这个类别一个都没有检测到的异常情况
            ap = -1
            print('---class {} ap {}---'.format(class_,ap))
            aps += [ap]
            break
        #print(pred)
        image_ids = [x[0] for x in pred]
        confidence = np.array([float(x[1]) for x in pred])
        BB = np.array([x[2:] for x in pred])
        # sort by confidence
        sorted_ind = np.argsort(-confidence)
        sorted_scores = np.sort(-confidence)
        BB = BB[sorted_ind, :]
        image_ids = [image_ids[x] for x in sorted_ind]

        # go down dets and mark TPs and FPs
        npos = 0.
        for (key1,key2) in target:
            if key2 == class_:
                npos += len(target[(key1,key2)]) #统计这个类别的正样本，在这里统计才不会遗漏
        nd = len(image_ids)
        tp = np.zeros(nd)
        fp = np.zeros(nd)
        for d,image_id in enumerate(image_ids):
            bb = BB[d] #预测框
            if (image_id,class_) in target:
                BBGT = target[(image_id,class_)] #[[],]
                for bbgt in BBGT:
                    # compute overlaps
                    # intersection
                    ixmin = np.maximum(bbgt[0], bb[0])
                    iymin = np.maximum(bbgt[1], bb[1])
                    ixmax = np.minimum(bbgt[2], bb[2])
                    iymax = np.minimum(bbgt[3], bb[3])
                    iw = np.maximum(ixmax - ixmin + 1., 0.)
                    ih = np.maximum(iymax - iymin + 1., 0.)
                    inters = iw * ih

                    union = (bb[2]-bb[0]+1.)*(bb[3]-bb[1]+1.) + (bbgt[2]-bbgt[0]+1.)*(bbgt[3]-bbgt[1]+1.) - inters
                    if union == 0:
                        print(bb,bbgt)
                    
                    overlaps = inters/union
                    if overlaps > threshold:
                        tp[d] = 1
                        BBGT.remove(bbgt) #这个框已经匹配到了，不能再匹配
                        if len(BBGT) == 0:
                            del target[(image_id,class_)] #删除没有box的键值
                        break
                fp[d] = 1-tp[d]
            else:
                fp[d] = 1
        fp = np.cumsum(fp)
        tp = np.cumsum(tp)
        rec = tp/float(npos)
        prec = tp/np.maximum(tp + fp, np.finfo(np.float64).eps)
        #print(rec,prec)
        ap = voc_ap(rec, prec, use_07_metric)
        print('---class {} ap {}---'.format(class_,ap))
        aps += [ap]
    print('---map {}---'.format(np.mean(aps)))