In [1]:
import os
import random

import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# Testing Code

In [2]:
import torchvision.models.detection as models

In [3]:
# Same lookup the train script performs: the constructor is fetched by its
# string name from the detection module's namespace, then called.
model_auto = vars(models)['fasterrcnn_resnet50_fpn'](pretrained=False)
# kept as the cell's last expression so the model repr is echoed below
model_auto.to('cuda')

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

# Use a real loader

In [4]:
# COCO root, relative to the working directory: ../../external_data/coco
coco_root = os.path.join(
    '..', '..',
    'external_data', 'coco',
)
# Stock torchvision dataset call, left disabled for reference:
#coco_detect = torchvision.datasets.CocoDetection(root=os.path.join(coco_root, 'train2017'), 
#                               annFile=os.path.join(coco_root, 'annotations', 'instances_train2017.json'),
#                              transform = torchvision.transforms.ToTensor())

In [5]:
# from https://github.com/pytorch/vision/blob/master/references/detection/transforms.py
class Compose(object):
    """Chain several paired transforms into one callable.

    Every transform receives ``(image, target)`` and must return a new
    ``(image, target)`` pair; the pair is threaded through the transforms
    in the order they were given.
    """

    def __init__(self, transforms):
        # sequence of callables, applied first to last
        self.transforms = transforms

    def __call__(self, image, target):
        """Run ``image`` and ``target`` through every transform and return the final pair."""
        sample = (image, target)
        for step in self.transforms:
            image, target = step(*sample)
            sample = (image, target)
        return sample

In [6]:
# processes coco labels for detection task deployed as transform
# adapted from https://github.com/pytorch/vision/blob/master/references/detection/coco_utils.py
class CocoDetectProcessor(object):
    """Convert a raw COCO sample into the target dict used for detection training.

    The incoming ``target`` is a dict with ``"image_id"`` and ``"annotations"``
    (a list of COCO annotation dicts). Crowd annotations are discarded, boxes are
    converted from ``[x, y, width, height]`` to ``[x1, y1, x2, y2]``, clamped to the
    image bounds, and boxes that end up with zero (or negative) width or height are
    dropped together with their labels.
    """

    def __call__(self, image, target):
        """Return ``(image, new_target)``.

        Args:
            image: object exposing ``.size`` as ``(width, height)``; returned untouched.
            target: dict with keys ``"image_id"`` and ``"annotations"``.

        Returns:
            The unchanged image and a dict with
            ``"boxes"`` (float32, shape ``(N, 4)``),
            ``"labels"`` (int64, shape ``(N,)``) and
            ``"image_id"`` (tensor holding the single id).
        """
        w, h = image.size

        image_id = torch.tensor([target["image_id"]])

        # strip crowd scenes
        anno = [obj for obj in target["annotations"] if obj['iscrowd'] == 0]

        boxes = [obj["bbox"] for obj in anno]
        # reshape keeps a well-formed (0, 4) tensor when there are no boxes at all
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # (x, y, w, h) -> (x1, y1, x2, y2)
        boxes[:, 2:] += boxes[:, :2]
        # strided slices are views, so the in-place clamp edits `boxes` itself
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)

        classes = [obj["category_id"] for obj in anno]
        classes = torch.tensor(classes, dtype=torch.int64)

        # Drop degenerate boxes (x2 <= x1 or y2 <= y1), e.g. zero-sized annotations
        # or boxes collapsed by the clamp above; keep labels aligned with boxes.
        keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
        boxes = boxes[keep]
        classes = classes[keep]

        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["image_id"] = image_id

        return image, target

In [7]:
class CocoDetection(torchvision.datasets.CocoDetection):
    """torchvision's CocoDetection with paired (image, target) transforms.

    The parent dataset is built from the image folder and annotation file only;
    the joint transforms are stored here and applied in ``__getitem__``.
    """

    def __init__(self, img_folder, ann_file, transforms):
        super().__init__(img_folder, ann_file)
        self._transforms = transforms

    def __getitem__(self, idx):
        """Return ``(image, target)`` for ``idx``, with the image id bundled into the target."""
        image, raw_annotations = super().__getitem__(idx)
        # wrap the parent's annotation list together with the id stored at this index
        sample_target = {"image_id": self.ids[idx], "annotations": raw_annotations}
        if self._transforms is None:
            return image, sample_target
        image, sample_target = self._transforms(image, sample_target)
        return image, sample_target

In [8]:
# need a custom one to return the target too
from torchvision.transforms import functional as F
class ToTensor(object):
    """Paired-transform wrapper around ``F.to_tensor``."""

    def __call__(self, image, target):
        """Convert ``image`` with ``F.to_tensor``; ``target`` is passed through as-is."""
        return F.to_tensor(image), target

In [9]:
class RandomHorizontalFlip(object):
    """Mirror an image tensor and its boxes left-to-right with a given probability.

    Uses the standard-library ``random`` module for the coin toss.

    Args:
        prob: probability that a sample is flipped (``random.random() < prob``).
    """

    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image, target):
        """Return ``(image, target)``, flipped together or both left untouched.

        Args:
            image: tensor whose last dimension is the horizontal axis.
            target: dict whose ``"boxes"`` entry holds ``[x1, y1, x2, y2]`` rows;
                on a flip the box tensor is updated in place.
        """
        if random.random() < self.prob:
            # only the width is needed to mirror the x coordinates
            width = image.shape[-1]
            image = image.flip(-1)
            bbox = target["boxes"]
            # mirrored x1 is W - old x2 and mirrored x2 is W - old x1, so x1 <= x2 still holds
            bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
            target["boxes"] = bbox
        return image, target

In [10]:
# Both pipelines start with label processing and tensor conversion;
# only the training pipeline appends the random flip (p = 0.5).
train_transforms = Compose([
    CocoDetectProcessor(),
    ToTensor(),
    RandomHorizontalFlip(0.5),
])
val_transforms = Compose([
    CocoDetectProcessor(),
    ToTensor(),
])

In [11]:
# Each split pairs its image folder with the matching instances_* annotation file
train_set = CocoDetection(
    img_folder=os.path.join(coco_root, 'train2017'),
    ann_file=os.path.join(coco_root, 'annotations', 'instances_train2017.json'),
    transforms=train_transforms,
)
val_set = CocoDetection(
    img_folder=os.path.join(coco_root, 'val2017'),
    ann_file=os.path.join(coco_root, 'annotations', 'instances_val2017.json'),
    transforms=val_transforms,
)

loading annotations into memory...
Done (t=11.03s)
creating index...
index created!
loading annotations into memory...
Done (t=1.56s)
creating index...
index created!


In [12]:
# samplers from the dataset
train_sampler = torch.utils.data.RandomSampler(train_set)
test_sampler = torch.utils.data.SequentialSampler(val_set)

In [13]:
# group the shuffled indices into batches of 3; a short final batch is discarded
train_batch_sampler = torch.utils.data.BatchSampler(
    sampler=train_sampler,
    batch_size=3,
    drop_last=True,
)

In [14]:
def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    A list like ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``; nothing is stacked or converted.
    """
    columns = zip(*batch)
    return tuple(columns)

In [15]:
# training loader: batching is delegated entirely to the batch sampler
data_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_sampler=train_batch_sampler,
    collate_fn=collate_fn,
    num_workers=1,
)

# validation loader: one image at a time, drawn through test_sampler
data_loader_test = torch.utils.data.DataLoader(
    dataset=val_set,
    batch_size=1,
    sampler=test_sampler,
    collate_fn=collate_fn,
    num_workers=1,
)

# Logger for the Train Loop

In [16]:
# based on https://github.com/pytorch/vision/blob/master/references/detection/utils.py
# but we don't need the full one yet
class Logger(object):
    """Bare-bones placeholder logger: it only relays the items it is given."""

    def __init__(self):
        # placeholder attribute; nothing in this class reads it
        self.filler = 1

    def log(self, iterable):
        """Yield each element of ``iterable`` unchanged, in order."""
        yield from iterable

# Simple Train Loop

In [17]:
# put the model in training mode; left as the last expression so the repr is echoed
model_auto.train(mode=True)

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

I think we need to look at the number of categories that the model was initialised to train on.

In [18]:
import random

In [19]:
device = 'cuda'
logger = Logger()
# push a single batch through the model, then stop
for images, targets in logger.log(iterable=data_loader):
    # collate_fn yields tuples; rebuild them as lists with every tensor moved to `device`
    images = [img.to(device) for img in images]
    targets = [
        {key: value.to(device) for key, value in tgt.items()}
        for tgt in targets
    ]
    losses = model_auto(images, targets)
    break

In [20]:
# show what the model returned for the single batch run in the previous cell
losses

{'loss_classifier': tensor(4.5340, device='cuda:0', grad_fn=<NllLossBackward>),
 'loss_box_reg': tensor(0.0289, device='cuda:0', grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.6945, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>),
 'loss_rpn_box_reg': tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)}