In [1]:
import os

import cv2
import torch
import torchvision

from dataset import PASCALVOC

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Class names in alphabetical order; the position of a name in this tuple is
# the class index that the prediction code uses to look the name up.
VOC_CLASSES = (
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
)

In [4]:
def nms(bboxes, scores, threshold=0.5):
    '''
    Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it is greater than `threshold`.

    bboxes(tensor) [N, 4] as (x1, y1, x2, y2)
    scores(tensor) [N, ]
    threshold(float) IoU above which a lower-scoring box is suppressed
    return(LongTensor) [K, ] indices into bboxes of the kept boxes,
        highest score first
    '''
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    # reshape(-1) guarantees `order` stays 1-D, so no 0-d special case is
    # needed inside the loop.
    order = scores.sort(0, descending=True)[1].reshape(-1)
    keep = []
    while order.numel() > 0:
        # Store plain ints so `keep` is a homogeneous list.
        i = order[0].item()
        keep.append(i)
        if order.numel() == 1:
            break
        rest = order[1:]
        # Intersection rectangle between box i and every remaining box.
        xx1 = x1[rest].clamp(min=x1[i])
        yy1 = y1[rest].clamp(min=y1[i])
        xx2 = x2[rest].clamp(max=x2[i])
        yy2 = y2[rest].clamp(max=y2[i])
        w = (xx2 - xx1).clamp(min=0)
        h = (yy2 - yy1).clamp(min=0)
        inter = w * h
        ovr = inter / (areas[i] + areas[rest] - inter)
        # Positions (within `rest`) of the boxes that survive this round.
        ids = (ovr <= threshold).nonzero().reshape(-1)
        if ids.numel() == 0:
            break
        order = rest[ids]
    return torch.tensor(keep, dtype=torch.long)

In [5]:
def decoder(pred, conf_threshold=0.9):
    '''
    Turn a raw grid prediction into NMS-filtered boxes.

    pred (tensor) 1 x S x S x 30 (S = 7 for the model in this notebook).
        Per cell: [0:5] and [5:10] are two boxes as (cx, cy, w, h, confidence),
        with (cx, cy) relative to the cell; [10:] are the class scores.
        A square grid is assumed. `pred` is not modified.
    conf_threshold (float) a box is selected when its confidence exceeds this
        value; the single most confident box of the whole grid is always
        selected, even below the threshold.
    return (tensor) box[[x1, y1, x2, y2]] label[...] prob[...]
        boxes are [K, 4] in the same normalised units as the grid (the cell
        offset is expressed as a fraction of the grid), labels are [K] class
        indices, probs are [K] box confidences. All three are empty
        (K = 0) when no box is selected.
    '''
    pred = pred.data.squeeze(0).float()
    grid_size = pred.size(0)
    cell_size = 1. / grid_size

    # Confidence of the two boxes predicted by each cell: S x S x 2.
    contain = torch.stack((pred[:, :, 4], pred[:, :, 9]), 2)
    mask = (contain > conf_threshold) | (contain == contain.max())

    # Keep at most one box per cell: un-select the *less confident* of the
    # two. The minimum must be taken over the confidences, not over the mask,
    # otherwise an arbitrary box is dropped when both pass the threshold.
    _, min_index = torch.min(contain, 2)
    slot = torch.arange(2, device=pred.device).view(1, 1, 2)
    mask = mask & (slot != min_index.unsqueeze(2))

    boxes = []
    cls_indexs = []
    probs = []
    # nonzero() enumerates selected entries in (row, col, box) order.
    for i, j, b in mask.nonzero().tolist():
        box = pred[i, j, b * 5 : b * 5 + 4]
        # Top-left corner of cell (row i, col j) in normalised image units.
        xy = pred.new_tensor([j, i]) * cell_size
        # Computed out of place so the caller's tensor is left untouched.
        center = box[:2] * cell_size + xy
        half_size = 0.5 * box[2:]
        boxes.append(torch.cat((center - half_size, center + half_size)).view(1, 4))
        probs.append(pred[i, j, b * 5 + 4].reshape(1))
        _, cls_index = torch.max(pred[i, j, 10:], 0)
        cls_indexs.append(cls_index.reshape(1))

    if not boxes:
        # torch.cat rejects an empty list; hand back empty results instead.
        return (torch.zeros((0, 4)),
                torch.zeros(0, dtype=torch.long),
                torch.zeros(0))

    boxes = torch.cat(boxes, 0)
    probs = torch.cat(probs, 0)
    cls_indexs = torch.cat(cls_indexs, 0)
    keep = nms(boxes, probs)
    return boxes[keep], cls_indexs[keep], probs[keep]

In [6]:
# ResNet-50 whose classification head is replaced by a linear layer + sigmoid
# producing 1470 values per image (reshaped to 7 x 7 x 30 at prediction time).
model = torchvision.models.resnet50(pretrained=False)
in_features = model.fc.in_features
model.fc = torch.nn.Sequential(torch.nn.Linear(in_features, 1470), 
                               torch.nn.Sigmoid())
# map_location remaps the checkpoint's tensors onto `device`, so weights saved
# on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('YOLOv1_Resnet50.pth', map_location=device))
model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [18]:
def predictImage(img_path):
    '''
    Run the detector on one image file and save an annotated copy.

    The image is resized to 448 x 448 for the network; the decoded boxes are
    scaled back to the original resolution, drawn with a "<class>: <prob>"
    label, and written to "<img_path without extension>_result.jpg".

    img_path (str) path of the image to read
    raises FileNotFoundError if the image cannot be read
    raises IOError if the annotated image cannot be written
    '''
    print('Start predict!')
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError('Could not read image: {}'.format(img_path))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    h, w = image.shape[:2]

    model.eval()
    with torch.no_grad():
        img_to_model = cv2.resize(image, (448, 448))
        img_to_model = torchvision.transforms.ToTensor()(img_to_model)
        img_to_model = img_to_model.unsqueeze(0).to(device)
        pred = model(img_to_model).view(-1, 7, 7, 30).cpu()
    boxes, cls_indexs, probs = decoder(pred)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1
    thickness = 2
    for box, cls_index, prob in zip(boxes, cls_indexs, probs):
        # Boxes are normalised; scale them to pixel coordinates.
        left_up = (int(box[0] * w), int(box[1] * h))
        right_bottom = (int(box[2] * w), int(box[3] * h))
        cls_index = int(cls_index)
        color = (10 * cls_index, 255 - 10 * cls_index, 255 - 10 * cls_index)
        cv2.rectangle(image, left_up, right_bottom, color, 5)

        text = '{0}: {1:.2f}'.format(VOC_CLASSES[cls_index], float(prob))
        # The 4th argument of getTextSize is the stroke thickness (not a line
        # type); it must match putText so the background fits the label.
        (text_w, text_h), _ = cv2.getTextSize(text, font, font_scale, thickness)
        # Filled label background; y decreases upwards in image coordinates.
        text_corner = (left_up[0] + text_w, left_up[1] - text_h)
        cv2.rectangle(image, left_up, text_corner, color, cv2.FILLED)
        cv2.putText(image, text, left_up, font, font_scale, (0, 0, 0),
                    thickness, cv2.LINE_AA)

    # splitext copes with extensions of any length ('.jpeg', '.png', none).
    out_path = os.path.splitext(img_path)[0] + '_result.jpg'
    if not cv2.imwrite(out_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)):
        raise IOError('Could not write result image: {}'.format(out_path))
    print('Finish predict!')

In [19]:
def predictVideo(video_path, output_path=None):
    '''
    Run the detector on every frame of a video and write an annotated video.

    Each frame is resized to 448 x 448 for the network; the decoded boxes are
    scaled back to the frame resolution and drawn with a "<class>: <prob>"
    label. The output is MJPG-encoded at the source frame rate (30 fps when
    the source does not report one).

    video_path (str) path of the video to read
    output_path (str) where to write the result; defaults to
        "<video_path without extension>_result.avi"
    raises IOError if the input video cannot be opened
    '''
    print('Start predict!')
    if output_path is None:
        output_path = os.path.splitext(video_path)[0] + '_result.avi'

    vid = cv2.VideoCapture(video_path)
    if not vid.isOpened():
        vid.release()
        raise IOError('Could not open video: {}'.format(video_path))
    fps = vid.get(cv2.CAP_PROP_FPS)
    if not fps or fps != fps:  # 0 or NaN: frame rate unknown
        fps = 30

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1
    thickness = 2
    writer = None
    model.eval()
    try:
        with torch.no_grad():
            while True:
                grabbed, frame = vid.read()
                if not grabbed:
                    break
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                h, w = image.shape[:2]

                img_to_model = cv2.resize(image, (448, 448))
                img_to_model = torchvision.transforms.ToTensor()(img_to_model)
                img_to_model = img_to_model.unsqueeze(0).to(device)
                pred = model(img_to_model).view(-1, 7, 7, 30).cpu()
                boxes, cls_indexs, probs = decoder(pred)

                for box, cls_index, prob in zip(boxes, cls_indexs, probs):
                    # Boxes are normalised; scale them to pixel coordinates.
                    left_up = (int(box[0] * w), int(box[1] * h))
                    right_bottom = (int(box[2] * w), int(box[3] * h))
                    cls_index = int(cls_index)
                    color = (10 * cls_index, 255 - 10 * cls_index, 255 - 10 * cls_index)
                    cv2.rectangle(image, left_up, right_bottom, color, 5)

                    text = '{0}: {1:.2f}'.format(VOC_CLASSES[cls_index], float(prob))
                    # getTextSize's 4th argument is the stroke thickness; it
                    # must match putText so the background fits the label.
                    (text_w, text_h), _ = cv2.getTextSize(text, font, font_scale, thickness)
                    # Filled label background; y decreases upwards.
                    text_corner = (left_up[0] + text_w, left_up[1] - text_h)
                    cv2.rectangle(image, left_up, text_corner, color, cv2.FILLED)
                    cv2.putText(image, text, left_up, font, font_scale, (0, 0, 0),
                                thickness, cv2.LINE_AA)

                if writer is None:
                    # Created lazily because the frame size is only known
                    # once the first frame has been read.
                    fourcc = cv2.VideoWriter_fourcc(*"MJPG")
                    writer = cv2.VideoWriter(output_path, fourcc, fps, (w, h), True)
                # Drawing was done in RGB; VideoWriter expects BGR frames.
                writer.write(cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
    finally:
        # Release handles even on error; writer is None if no frame was read.
        if writer is not None:
            writer.release()
        vid.release()
    print('Finish predict!')

In [20]:
# Run the detector on the sample image; the annotated copy is written to
# 'testImage_result.jpg' alongside the input.
predictImage('testImage.JPG')

Start predict!
Finish predict!
