In [1]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
import os

Clone of the docu in the model: https://github.com/pytorch/vision/blob/master/torchvision/models/detection/faster_rcnn.py to start

In [2]:
backbone = torchvision.models.mobilenet_v2(pretrained=True).features

In [3]:
backbone.out_channels = 1280

In [4]:
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                  aspect_ratios=((0.5, 1.0, 2.0),))

In [5]:
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                               output_size=7,
                                               sampling_ratio=2)

In [6]:
model = FasterRCNN(backbone,
                  num_classes=80,
                  rpn_anchor_generator=anchor_generator,
                  box_roi_pool=roi_pooler)

In [7]:
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm

In [8]:
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]

In [9]:
predictions = model(x)

In [10]:
predictions

[{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward>),
  'labels': tensor([], dtype=torch.int64),
  'scores': tensor([], grad_fn=<IndexBackward>)},
 {'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward>),
  'labels': tensor([], dtype=torch.int64),
  'scores': tensor([], grad_fn=<IndexBackward>)}]

# Testing Code

In [11]:
import torchvision.models.detection as models

In [12]:
# how I call the model inside my train script
model_auto = models.__dict__['fasterrcnn_resnet50_fpn'](pretrained=False)


In [13]:
model_auto.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

# Explore the RPN

In [14]:
anchor_generator

AnchorGenerator()

# Use a real loader

In [15]:
coco_root = os.path.join('..','..','external_data','coco')
#coco_detect = torchvision.datasets.CocoDetection(root=os.path.join(coco_root, 'train2017'), 
#                               annFile=os.path.join(coco_root, 'annotations', 'instances_train2017.json'),
#                              transform = torchvision.transforms.ToTensor())

loading annotations into memory...
Done (t=10.92s)
creating index...
index created!


In [None]:
# from https://github.com/pytorch/vision/blob/master/references/detection/transforms.py
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

In [16]:
# processes coco labels for detection task deployed as transform
# adapted from https://github.com/pytorch/vision/blob/master/references/detection/coco_utils.py
class CocoDetectProcessor(object):
    def __call__(self, image, target):
        
        w, h = image.size
        
        image_id = target["image_id"]
        image_id = torch.tensor([image_id])

        anno = target["annotations"]
        
        # strip crowd scenes
        anno = [obj for obj in anno if obj['iscrowd'] == 0]
        
        boxes = [obj["bbox"] for obj in anno]
        # guard against no boxes via resizing
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)

        classes = [obj["category_id"] for obj in anno]
        classes = torch.tensor(classes, dtype=torch.int64)
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["image_id"] = image_id
        
        return image, target

In [None]:
class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, ann_file, transforms):
        super(CocoDetection, self).__init__(img_folder, ann_file)
        self._transforms = transforms

    def __getitem__(self, idx):
        img, target = super(CocoDetection, self).__getitem__(idx)
        image_id = self.ids[idx]
        target = dict(image_id=image_id, annotations=target)
        if self._transforms is not None:
            img, target = self._transforms(img, target)
        return img, target

In [None]:
# need a custom one to to return the target too
class ToTensor(object):
    def __call__(self, image, target):
        image = F.to_tensor(image)
        return image, target

In [None]:
class RandomHorizontalFlip(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image, target):
        if random.random() < self.prob:
            height, width = image.shape[-2:]
            image = image.flip(-1)
            bbox = target["boxes"]
            bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
            target["boxes"] = bbox
        return image, target

In [None]:
train_transforms = Compose([CocoDetectProcessor(), ToTensor(), RandomHorizontalFlip(0.5)])
val_transforms = Compose([CocoDetectProcessor(), ToTensor()])

In [None]:
train_set = CocoDetection(os.path.join(coco_root, 'train2017'), os.path.join(coco_root, 'annotations', 'instances_train2017.json'), train_transforms)
val_set = CocoDetection(os.path.join(coco_root, 'val2017'), os.path.join(coco_root, 'annotations', 'instances_val2017.json'), val_transforms)

In [None]:
# samplers from the dataset
train_sampler = torch.utils.data.RandomSampler(train_set)
test_sampler = torch.utils.data.SequentialSampler(val_set)

In [None]:
train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, 12, drop_last=True)

In [None]:
def collate_fn(batch):
    return tuple(zip(*batch))

In [None]:
data_loader = torch.utils.data.DataLoader(
        train_set, batch_sampler=train_batch_sampler, num_workers=4,
        collate_fn=collate_fn)

data_loader_test = torch.utils.data.DataLoader(
        val_set, batch_size=1,
        sampler=test_sampler, num_workers=4,
        collate_fn=collate_fn)

# Simple Train Loop

In [20]:
model_ = FasterRCNN(backbone,
                  num_classes=20,
                  rpn_anchor_generator=anchor_generator,
                  box_roi_pool=roi_pooler)

In [21]:
model_auto.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

I think we need to look at the number of categories that the model was initialised to train

In [23]:
losses = model_auto([image], ann)

IndexError: too many indices for tensor of dimension 0

In [None]:
len(losses)

In [None]:
losses[0].keys()

In [None]:
losses[0]['scores']