In [1]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

Starting point: a copy of the usage example from the docstring of torchvision's Faster R-CNN implementation, https://github.com/pytorch/vision/blob/master/torchvision/models/detection/faster_rcnn.py

In [2]:
# Keep only the `.features` sub-module of the pretrained MobileNetV2 to use
# as the detector's backbone.
backbone = (
    torchvision.models
    .mobilenet_v2(pretrained=True)
    .features
)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /home/jovyan/.cache/torch/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 14212972/14212972 [00:03<00:00, 4280940.43it/s]


In [6]:
# FasterRCNN reads the backbone's channel count from an `out_channels`
# attribute; 1280 is the value the torchvision example uses for the
# MobileNetV2 feature extractor.
MOBILENET_V2_OUT_CHANNELS = 1280
backbone.out_channels = MOBILENET_V2_OUT_CHANNELS

In [3]:
# Five anchor sizes and three aspect ratios. Each outer tuple holds a single
# inner tuple -- presumably one entry per feature map produced by the
# backbone (confirm against the AnchorGenerator documentation).
anchor_sizes = ((32, 64, 128, 256, 512),)
anchor_aspect_ratios = ((0.5, 1.0, 2.0),)

anchor_generator = AnchorGenerator(
    sizes=anchor_sizes,
    aspect_ratios=anchor_aspect_ratios,
)

In [4]:
# RoI pooling over the feature map named 0, with output_size=7 and
# sampling_ratio=2.
# NOTE(review): newer torchvision releases appear to key feature maps by
# string ('0') rather than int -- confirm against the installed version.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=[0],
    output_size=7,
    sampling_ratio=2,
)

In [7]:
# Assemble the detector from the custom backbone, anchor generator and RoI
# pooler defined in the cells above.
# num_classes=2 -- presumably background plus one object class (TODO confirm).
model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

In [8]:
# Switch the detector to evaluation mode; eval() returns the module itself,
# so its repr is what this cell displays.
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps

In [9]:
# Seed the global RNG first so the random images -- and therefore the
# detections computed from them -- are identical on every
# Restart Kernel -> Run All.
torch.manual_seed(0)

# Two random 3-channel images of different heights (300 and 500) and equal
# width (400), passed to the model as a list of tensors.
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]

In [10]:
# Pure inference: run the forward pass with autograd disabled so no graph is
# recorded and the returned tensors do not track gradients.
with torch.no_grad():
    predictions = model(x)

In [11]:
# Display the raw detections: a list of dicts, each with a 'boxes' tensor
# of 4-value rows.
predictions

[{'boxes': tensor([[2.7975e+02, 2.5828e+02, 3.0582e+02, 2.9121e+02],
          [2.8239e+02, 2.3245e+02, 4.0000e+02, 3.0000e+02],
          [2.6799e+02, 2.5738e+02, 2.9378e+02, 2.9119e+02],
          [2.9146e+02, 2.5766e+02, 3.1815e+02, 2.9131e+02],
          [2.6033e+02, 2.4960e+02, 3.0069e+02, 3.0000e+02],
          [2.5449e+02, 2.5847e+02, 2.8041e+02, 2.9114e+02],
          [2.8226e+02, 2.4842e+02, 3.2325e+02, 3.0000e+02],
          [2.8013e+02, 2.7288e+02, 3.0592e+02, 3.0000e+02],
          [3.0371e+02, 2.6159e+02, 3.2951e+02, 2.9410e+02],
          [3.1558e+02, 0.0000e+00, 3.3527e+02, 3.1109e+01],
          [1.8394e+02, 2.5760e+02, 2.1017e+02, 2.9467e+02],
          [3.5094e+02, 2.7630e+02, 3.7853e+02, 2.9995e+02],
          [3.2247e+02, 2.7523e+02, 3.8014e+02, 3.0000e+02],
          [1.7102e+02, 2.5902e+02, 1.9796e+02, 2.9472e+02],
          [3.1616e+02, 9.4699e+00, 3.3727e+02, 4.2231e+01],
          [2.7336e+02, 2.7492e+02, 3.3331e+02, 3.0000e+02],
          [1.7123e+02, 0.0000e+

# Testing Code

In [13]:
import torchvision.models.detection as models

In [14]:
# Build torchvision's stock Faster R-CNN (ResNet-50 + FPN) with
# pretrained=False. It is bound to `model_test` only: the previous chained
# assignment `model_test = model = ...` also rebound `model`, silently
# replacing the MobileNetV2-based detector built earlier in the notebook.
model_test = models.fasterrcnn_resnet50_fpn(pretrained=False)

In [15]:
# Put the detector in training mode, where the forward pass requires targets
# in addition to the images. train() returns the module itself, so its repr
# is what this cell displays.
model_test.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
        )
  

In [22]:
x = [torch.rand(3, 300, 400), torch.rand(3,300,400)]
targets = [torch.rand(4), torch.rand(4)]

In [25]:
model_test({'data': x, 'boxes': targets})

ValueError: In training mode, targets should be passed