In [1]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
import os

Clone of the docu in the model: https://github.com/pytorch/vision/blob/master/torchvision/models/detection/faster_rcnn.py to start

In [2]:
backbone = torchvision.models.mobilenet_v2(pretrained=True).features

In [3]:
backbone.out_channels = 1280

In [4]:
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                  aspect_ratios=((0.5, 1.0, 2.0),))

In [5]:
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                               output_size=7,
                                               sampling_ratio=2)

In [36]:
model = FasterRCNN(backbone,
                  num_classes=80,
                  rpn_anchor_generator=anchor_generator,
                  box_roi_pool=roi_pooler)

In [37]:
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm

In [38]:
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]

In [39]:
predictions = model(x)

In [40]:
predictions

[{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward>),
  'labels': tensor([], dtype=torch.int64),
  'scores': tensor([], grad_fn=<IndexBackward>)},
 {'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward>),
  'labels': tensor([], dtype=torch.int64),
  'scores': tensor([], grad_fn=<IndexBackward>)}]

# Testing Code

In [11]:
import torchvision.models.detection as models

In [12]:
# how I call the model inside my train script
model_test = models.__dict__['fasterrcnn_resnet50_fpn'](pretrained=False)


In [13]:
model_test.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

# Use a real loader

In [14]:
coco_root = os.path.join('..','..','external_data','coco')
coco_detect = torchvision.datasets.CocoDetection(root=os.path.join(coco_root, 'train2017'), 
                               annFile=os.path.join(coco_root, 'annotations', 'instances_train2017.json'),
                              transform = torchvision.transforms.ToTensor())

loading annotations into memory...
Done (t=10.69s)
creating index...
index created!


In [33]:
for item in coco_detect:
    image, ann = item
    print(image.shape)
    print(len(ann))
    for item in ann:
        item["boxes"] = item["bbox"]
        item["labels"] = item["category_id"]
        item["boxes"] = torch.Tensor(item["boxes"]).unsqueeze(dim=0)
        item["labels"] = torch.tensor(item["labels"], dtype=torch.int64)
        item["labels"] = item["labels"]
    break

torch.Size([3, 480, 640])
8


In [55]:
torch.Tensor([51,]).shape

torch.Size([1])

In [31]:
ann[0]['bbox']

[1.08, 187.69, 611.59, 285.84]

In [49]:
model_ = FasterRCNN(backbone,
                  num_classes=20,
                  rpn_anchor_generator=anchor_generator,
                  box_roi_pool=roi_pooler)

In [50]:
model_.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm

I think we need to look at the number of categories that the model was initialised to train

In [51]:
losses = model_([image], ann)

IndexError: too many indices for tensor of dimension 0

In [19]:
len(losses)

1

In [22]:
losses[0].keys()

dict_keys(['boxes', 'labels', 'scores'])

In [24]:
losses[0]['scores']

tensor([0.5602, 0.5592, 0.5589, 0.5582, 0.5582, 0.5579, 0.5571, 0.5558, 0.5541,
        0.5526, 0.5505, 0.5458, 0.5456, 0.5452, 0.5440, 0.5438, 0.5434, 0.5430,
        0.5429, 0.5422, 0.5421, 0.5415, 0.5411, 0.5408, 0.5407, 0.5404, 0.5402,
        0.5401, 0.5400, 0.5397, 0.5396, 0.5392, 0.5391, 0.5390, 0.5389, 0.5385,
        0.5384, 0.5383, 0.5378, 0.5378, 0.5377, 0.5376, 0.5373, 0.5369, 0.5364,
        0.5364, 0.5363, 0.5362, 0.5360, 0.5359, 0.5358, 0.5357, 0.5355, 0.5354,
        0.5354, 0.5353, 0.5352, 0.5351, 0.5349, 0.5349, 0.5348, 0.5347, 0.5345,
        0.5344, 0.5344, 0.5342, 0.5341, 0.5341, 0.5340, 0.5337, 0.5336, 0.5335,
        0.5335, 0.5334, 0.5334, 0.5332, 0.5330, 0.5330, 0.5330, 0.5327, 0.5324,
        0.5324, 0.5324, 0.5322, 0.5321, 0.5320, 0.5319, 0.5317, 0.5317, 0.5314,
        0.5313, 0.5311, 0.5311, 0.5310, 0.5310, 0.5309, 0.5309, 0.5308, 0.5306,
        0.5304], grad_fn=<IndexBackward>)