In [2]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

Clone of the docu in the model: https://github.com/pytorch/vision/blob/master/torchvision/models/detection/faster_rcnn.py to start

In [3]:
backbone = torchvision.models.mobilenet_v2(pretrained=True).features

In [4]:
backbone.out_channels = 1280

In [5]:
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                  aspect_ratios=((0.5, 1.0, 2.0),))

In [6]:
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                               output_size=7,
                                               sampling_ratio=2)

In [7]:
model = FasterRCNN(backbone,
                  num_classes=2,
                  rpn_anchor_generator=anchor_generator,
                  box_roi_pool=roi_pooler)

In [8]:
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps

In [9]:
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]

In [10]:
predictions = model(x)

In [11]:
predictions

[{'boxes': tensor([[4.2522e+01, 7.1565e+01, 1.5967e+02, 3.0000e+02],
          [8.8877e+01, 1.6303e+02, 1.4490e+02, 2.0598e+02],
          [9.8495e+01, 1.8497e+02, 3.2946e+02, 2.9968e+02],
          [1.1411e+02, 1.7738e+02, 1.6891e+02, 2.1967e+02],
          [3.2906e+02, 8.4160e+01, 3.9212e+02, 1.2755e+02],
          [2.6292e+02, 1.9917e+02, 3.1383e+02, 2.4256e+02],
          [3.4371e+02, 2.3097e+02, 3.9991e+02, 2.8258e+02],
          [3.6215e+01, 1.7210e+02, 1.3562e+02, 2.9979e+02],
          [2.6292e+02, 1.5870e+02, 3.6698e+02, 2.9953e+02],
          [2.9414e+02, 1.0284e+02, 3.4738e+02, 1.4619e+02],
          [2.8364e+02, 4.4561e+01, 3.8788e+02, 2.9834e+02],
          [1.8336e+02, 3.4239e+01, 2.5347e+02, 5.8656e+01],
          [2.7571e+02, 0.0000e+00, 3.8947e+02, 1.1987e+02],
          [3.3261e+02, 1.0625e+02, 3.9171e+02, 1.4909e+02],
          [7.2807e+01, 1.7374e+02, 1.7207e+02, 2.9978e+02],
          [1.2069e+02, 1.7047e+02, 2.2110e+02, 2.9957e+02],
          [0.0000e+00, 8.5944e+

# Testing Code

In [12]:
import torchvision.models.detection as models

In [13]:
# how I call the model inside my train script
model_test = model = models.__dict__['fasterrcnn_resnet50_fpn'](pretrained=False)

In [14]:
model_test.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
        )
  

In [19]:
x = [torch.rand(3, 300, 400), torch.rand(3,300,400)]
targets = [{'boxes': torch.tensor([[100,100,200,200]], dtype=torch.int32), 
            'labels': torch.ones(1,) }, 
           {'boxes': torch.tensor([[50,75,150,140]], dtype=torch.int32), 
            'labels': torch.ones(1,) }]

In [23]:
test = torch.tensor([[100,100,200,200]], dtype=torch.int32)
test[:, None, 2:]

tensor([[[200, 200]]], dtype=torch.int32)

In [16]:
#targets = {'boxes': torch.tensor([[100,100,200,200], [50,75,150,140]], dtype=torch.int32),
#          'labels': torch.tensor([1,0], dtype=torch.long)  }

In [17]:
#targets[0]['boxes'].unbind(1)

KeyError: 0

In [None]:
#res_2 = targets[0]['boxes'].view(1, 4)
#res_2 #.unbind(1)

In [20]:
losses = model_test(images = x, targets=targets)

RuntimeError: Expected object of scalar type Int but got scalar type Float for argument #2 'other'

In [None]:
losses