In [12]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
import os

As a starting point, this reproduces the example from the documentation in the torchvision Faster R-CNN source: https://github.com/pytorch/vision/blob/master/torchvision/models/detection/faster_rcnn.py

In [13]:
# Load MobileNetV2 with pretrained weights and keep only its `.features`
# sub-module to serve as the detection backbone.
mobilenet = torchvision.models.mobilenet_v2(pretrained=True)
backbone = mobilenet.features

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /home/jovyan/.cache/torch/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:02<00:00, 5.31MB/s]


In [14]:
# Attach the backbone's output channel count as an attribute on the module.
# NOTE(review): per the torchvision Faster R-CNN docs linked above, FasterRCNN
# expects `backbone.out_channels`; 1280 is presumably the depth of MobileNetV2's
# final feature map -- confirm if the backbone is ever swapped.
backbone.out_channels = 1280

In [15]:
# Anchor configuration for the RPN: five anchor sizes and three aspect ratios.
# Each setting is a tuple containing a single inner tuple.
# NOTE(review): presumably one inner tuple per feature map, and the backbone
# yields a single map -- confirm against the AnchorGenerator docs.
anchor_sizes = ((32, 64, 128, 256, 512),)
anchor_aspect_ratios = ((0.5, 1.0, 2.0),)
anchor_generator = AnchorGenerator(
    sizes=anchor_sizes,
    aspect_ratios=anchor_aspect_ratios,
)

In [16]:
# RoI pooling layer that crops regions from the feature map keyed by 0, with an
# output size of 7 and a sampling ratio of 2.
# NOTE(review): newer torchvision releases appear to key a single feature map
# by the string '0' rather than the int 0 -- confirm which form the installed
# version expects before upgrading.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=[0],
    output_size=7,
    sampling_ratio=2,
)

In [17]:
# Assemble the detector from the custom backbone, the RPN anchor generator and
# the RoI pooler defined in the cells above.
# NOTE(review): num_classes=2 presumably counts the background class plus one
# object class -- confirm against the torchvision docs.
model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

In [18]:
# Switch the detector to evaluation mode. `eval()` returns the module itself,
# so its (long) repr is what this cell displays below.
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm

In [19]:
# Dummy input batch: two random 3-channel images of different heights
# (300x400 and 500x400), passed to the model as a list of tensors.
image_shapes = [(3, 300, 400), (3, 500, 400)]
x = [torch.rand(*shape) for shape in image_shapes]

In [20]:
# Run the detector on the dummy batch. This is pure inference (the model was put
# in eval mode above), so autograd is disabled: no graph is recorded and the
# returned tensors carry no grad history, which saves memory.
with torch.no_grad():
    predictions = model(x)

In [21]:
# Display the raw detections: a list of dicts (the 'boxes' tensor of the first
# dict is visible in the truncated output below).
predictions

[{'boxes': tensor([[2.0258e+02, 1.6481e+02, 2.9642e+02, 2.6432e+02],
          [1.2599e+02, 1.7854e+02, 2.1802e+02, 2.7416e+02],
          [1.6261e+02, 2.7027e+02, 1.7587e+02, 2.8203e+02],
          [3.5494e+02, 1.6246e+01, 3.6576e+02, 2.9443e+01],
          [1.7495e+02, 2.6996e+02, 1.8771e+02, 2.8160e+02],
          [2.4046e+02, 1.7766e+02, 3.3554e+02, 2.7730e+02],
          [3.5375e+02, 2.9104e+01, 3.6523e+02, 4.0802e+01],
          [1.1422e+02, 2.7017e+02, 1.2610e+02, 2.8176e+02],
          [2.4967e+01, 1.8144e+02, 1.1828e+02, 2.7574e+02],
          [6.8676e+01, 1.7599e+02, 1.6313e+02, 2.7132e+02],
          [1.0125e+02, 2.6995e+02, 1.1349e+02, 2.8182e+02],
          [1.7886e+02, 1.4481e+02, 2.7737e+02, 2.4658e+02],
          [2.9166e+01, 2.5777e+02, 4.1780e+01, 2.7188e+02],
          [2.7408e+02, 1.6937e+02, 3.6453e+02, 2.7175e+02],
          [3.5432e+02, 2.0827e+02, 3.6583e+02, 2.2126e+02],
          [8.7464e+01, 2.6538e+02, 1.9352e+02, 2.9995e+02],
          [1.2627e+02, 2.7025e+

# Testing Code

In [30]:
import torchvision.models.detection as models

In [31]:
# Mirrors how the train script obtains its model: the constructor is looked up
# by name in the `torchvision.models.detection` namespace and then called.
# NOTE(review): even with pretrained=False the output below shows the ResNet-50
# checkpoint being downloaded -- presumably the backbone weights are governed by
# a separate argument; confirm if a fully random init is intended.
model_name = 'fasterrcnn_resnet50_fpn'
model_test = models.__dict__[model_name](pretrained=False)


Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /home/jovyan/.cache/torch/checkpoints/resnet50-19c8e357.pth
100%|██████████| 97.8M/97.8M [00:18<00:00, 5.63MB/s]


In [32]:
# Switch `model_test` to training mode. `train()` returns the module itself, so
# its repr is what this cell displays below.
model_test.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

# Use a real loader

In [69]:
# Locate the COCO train2017 images and instance annotations relative to this
# notebook, then wrap them in a CocoDetection dataset that converts every image
# with ToTensor().
coco_root = os.path.join('..', '..', 'external_data', 'coco')
train_images_dir = os.path.join(coco_root, 'train2017')
train_annotations_file = os.path.join(coco_root, 'annotations', 'instances_train2017.json')

coco_detect = torchvision.datasets.CocoDetection(
    root=train_images_dir,
    annFile=train_annotations_file,
    transform=torchvision.transforms.ToTensor(),
)

loading annotations into memory...
Done (t=13.70s)
creating index...
index created!


In [77]:
# Take just the first (image, annotations) sample from the dataset; indexing
# replaces the original `for ... break` idiom.
image, ann = coco_detect[0]
print(image.shape)

# Add the tensor fields used by the torchvision detection models to every
# annotation dict (in place, alongside the original COCO keys).
# A distinct loop variable is used so the outer sample is no longer shadowed.
for obj in ann:
    # COCO stores boxes as [x, y, width, height]; torchvision detection targets
    # expect corner format [x1, y1, x2, y2], so convert before storing.
    x_min, y_min, width, height = obj["bbox"]
    obj["boxes"] = torch.tensor(
        [[x_min, y_min, x_min + width, y_min + height]],
        dtype=torch.float32,
    )  # shape (1, 4), same as the previous `unsqueeze(dim=0)` result
    obj["labels"] = torch.tensor(obj["category_id"], dtype=torch.int64)

torch.Size([3, 480, 640])


In [71]:
# Inspect the keys of the first annotation dict; the output below shows the
# original COCO fields plus the added 'boxes' and 'labels' entries.
ann[0].keys()

dict_keys(['segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id', 'boxes', 'labels'])

In [60]:
# Turn `test` into a tensor with a leading dimension of 1 and split it along
# dim 1 into four tensors.
# NOTE(review): `test` is not defined in any visible cell and this cell's
# execution count (60) is out of order, so it will fail on Restart & Run All --
# define `test` explicitly or delete the cell.
# NOTE(review): if `test` was a raw COCO `bbox` ([x, y, width, height]), the
# names xmax/ymax would actually hold width/height -- confirm.
xmin, ymin, xmax, ymax = torch.Tensor(test).unsqueeze(dim=0).unbind(1)

In [79]:
# Call the detector on the single COCO image, passing the annotation list as
# the second argument.
# NOTE(review): `model` was switched to eval mode earlier (`model.eval()`), and
# the output below is a list of detection dicts -- so despite its name,
# `losses` holds predictions here, not training losses.
# NOTE(review): to get losses the model presumably has to be in train mode and
# receive one target dict per image (boxes of shape [N, 4], labels of shape
# [N]); `ann` is a list with one dict per object, and `model` was built with
# num_classes=2 while COCO category ids are larger -- confirm before training.
losses = model([image], ann)

In [80]:
# Display the value returned by the model call above; the output below is a
# list of dicts whose first visible key is 'boxes'.
losses

[{'boxes': tensor([[4.7334e+02, 4.4308e+02, 5.6012e+02, 4.8000e+02],
          [5.4846e+00, 4.3334e+02, 8.3532e+01, 4.8000e+02],
          [1.2855e-01, 2.9866e+02, 4.0051e+01, 3.7722e+02],
          [5.4488e+02, 3.6696e+02, 5.9027e+02, 4.1332e+02],
          [1.5916e+02, 3.9369e+02, 1.7936e+02, 4.1313e+02],
          [2.9344e+02, 2.7035e+02, 3.2296e+02, 3.2734e+02],
          [4.3760e+02, 4.7735e+01, 5.3550e+02, 3.0116e+02],
          [2.8659e+02, 2.5870e+02, 3.7492e+02, 3.1441e+02],
          [3.0971e+02, 2.7131e+02, 3.3867e+02, 3.2716e+02],
          [1.9720e+01, 4.1657e+02, 9.1155e+01, 4.7046e+02],
          [5.0471e+02, 3.4916e+02, 6.3904e+02, 4.2661e+02],
          [3.1885e+02, 2.0392e+01, 4.6940e+02, 2.4183e+02],
          [4.4555e+02, 8.2796e+01, 5.9008e+02, 2.6153e+02],
          [5.0943e+01, 4.1498e+02, 9.9152e+01, 4.8000e+02],
          [2.7464e+02, 1.0366e+00, 4.3367e+02, 1.6959e+02],
          [1.8343e+02, 5.1767e+01, 5.5760e+02, 3.2291e+02],
          [3.7317e+02, 7.3893e+