A very quick implementation of object detection.


1. Get the Pascal VOC dataset.

In [1]:
from copy import deepcopy
from os import makedirs
from os.path import join
from typing import Tuple, List
import numpy as np
import cv2
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets.voc import VOCDetection
from torchvision.models import detection
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import transforms
from matplotlib import pyplot as plt
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torch.nn import functional as F
from tqdm import tqdm


In [2]:

# Class names in label-id order: `transform_targets_for_augmentation` turns an
# annotation's name into its position in this list, so index 0 is 'background'
# and the 20 object classes occupy indices 1..20.
VOC_CLASSES = [
    'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
    'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
    'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']


# Output image size passed to the Resize step of the augmentation pipeline.
HEIGHT = 448
WIDTH = 448


# Augmentation pipeline applied to every (image, boxes, labels) sample by
# `transforms_fn`. Step order matters: the image is resized, converted to a
# tensor, and only then rescaled by the Lambda.
albument_transforms = A.Compose(
    [
        # Every image is resized to HEIGHT x WIDTH (aspect ratio is not kept).
        A.Resize(height=HEIGHT, width=WIDTH, always_apply=True),
        # A.RandomCrop(width=WIDTH, height=HEIGHT),
        # A.HorizontalFlip(p=0.5),
        # A.RandomBrightnessContrast(p=0.2),
        ToTensorV2(always_apply=True),
        # Runs after ToTensorV2, so `x` is already a tensor here; dividing by
        # 255.0 presumably maps 8-bit pixel values to [0, 1] -- TODO confirm
        # the input images are uint8.
        A.Lambda(image=lambda x, **kwargs: x / 255.0)
    ],
    bbox_params=A.BboxParams(
        # Boxes are given as (xmin, ymin, xmax, ymax), matching the tuples
        # built by `transform_targets_for_augmentation`.
        format='pascal_voc',
        # NOTE(review): per the albumentations docs, boxes whose visible area
        # ratio drops below this after augmentation are removed, so a sample
        # can end up with zero boxes -- confirm downstream code handles that.
        min_visibility=0.5,
        # Name of the keyword argument that carries the per-box labels.
        label_fields=['class_labels']
    )
)


def transform_targets_for_model(bboxes, classes):
    """
    Pack augmented boxes and labels into the target dict the detection model expects.

    :param bboxes: sequence of N boxes, each (x1, y1, x2, y2); may be empty.
    :param classes: sequence of N integer class ids; may be empty.
    :return: dict with
        boxes: FloatTensor[N, 4] x1y1x2y2
        labels: Int64Tensor[N]
    """
    # reshape(-1, 4) keeps the [N, 4] shape even when N == 0. Without it an
    # empty box list becomes a tensor of shape [0], which torchvision's
    # detection models reject.
    boxes = torch.as_tensor(bboxes, dtype=torch.float32).reshape(-1, 4)
    labels = torch.as_tensor(classes, dtype=torch.int64).reshape(-1)
    return {
        'boxes': boxes,
        'labels': labels,
    }


def transform_targets_for_augmentation(targets: dict) -> Tuple[List[Tuple[int, int, int, int]], List[int]]:
    """
    Extract boxes and class ids from a raw VOC annotation dict.

    :param targets: annotation dict; reads targets['annotation']['object'],
        where each entry has a 'name' and a 'bndbox' with xmin/ymin/xmax/ymax.
    :return: (boxes, classes) -- boxes as (xmin, ymin, xmax, ymax) int tuples,
        classes as indices into VOC_CLASSES, both in annotation order.
    :raises ValueError: if an object name is not in VOC_CLASSES.
    """
    corner_keys = ('xmin', 'ymin', 'xmax', 'ymax')
    boxes, classes = [], []

    for annotated in targets['annotation']['object']:
        # The class id is the position of the name in VOC_CLASSES.
        classes.append(VOC_CLASSES.index(annotated['name']))

        bndbox = annotated['bndbox']
        boxes.append(tuple(int(bndbox[corner]) for corner in corner_keys))

    return boxes, classes


def transforms_fn(image, targets):
    """
    Joint image/target transform: augment the sample and convert it to model format.

    :param image: input image; converted with np.array before augmentation.
    :param targets: raw VOC annotation dict for the image.
    :return: (augmented image, dict with 'boxes' and 'labels' tensors).
    """
    raw_boxes, raw_labels = transform_targets_for_augmentation(targets)

    # Boxes and labels go through the pipeline together with the image so they
    # stay aligned with the resized pixels.
    augmented = albument_transforms(
        image=np.array(image),
        bboxes=raw_boxes,
        class_labels=raw_labels,
    )

    model_image = augmented['image']
    model_targets = transform_targets_for_model(
        augmented['bboxes'], augmented['class_labels']
    )
    return model_image, model_targets


def collate_fn_voc(batch: List[Tuple[torch.Tensor, dict]]) -> Tuple[torch.Tensor, List[dict]]:
    """
    Collate detection samples into one image batch plus a list of targets.

    Targets cannot be stacked because each image may have a different number
    of boxes, so they are kept as a plain list in sample order.

    :param batch: list of (image tensor, target dict) tuples; all images must
        share the same shape so they can be stacked.
    :return: (images, targets) -- images is a single tensor stacked along a new
        leading dimension, targets is the list of per-image dicts.
    """
    images = [image for image, _ in batch]
    targets = [target for _, target in batch]
    return torch.stack(images), targets




Each raw target returned by `VOCDetection` has the following structure:

```yaml

annotation:
  filename: 2009_004972.jpg
  folder: VOC2012
  object:
    - name: bicycle
      bndbox:
        xmax: 471
        xmin: 54
        ymax: 336
        ymin: 39
      difficult: 0
      occluded: 0
      pose: Left
      truncated: 0
  segmented: 0
  size:
    depth: 3
    height: 375
    width: 500
  source:
    annotation: PASCAL VOC2009
    database: The VOC2009 Database
    image: flickr
```

But the model needs targets in the following form (this notebook only fills in `boxes` and `labels`):

```yaml
boxes: FloatTensor[N, 4] x1y1x2y2
labels: Int64Tensor[N]
image_id:
area:
iscrowd:
masks:
keypoints:
```

In [3]:

class DetectionMetrics:
    """
    This class keeps track of all the metrics during training and evaluation.

    Per-batch values are summed by `step_batch`; `step_epoch` turns the sums
    into per-batch averages, resets the accumulator and returns the averages.
    """
    def __init__(self):
        # Metrics that should always be reported, even if never logged.
        # Currently empty: every metric is registered on first use.
        self.metrics_to_track = []
        self.epoch_metrics = {key: 0.0 for key in self.metrics_to_track}
        self.n_batches = 0

    def step_batch(
        self,
        **other,
    ):
        """
        Record the metrics of one batch.

        :param other: metric name -> value; each value must be convertible
            with float() (e.g. a Python number or a one-element tensor).
        :return: the dict of values exactly as passed in.
        """
        batch_metrics = other

        self._log_batch(batch_metrics)
        return batch_metrics

    def step_epoch(self) -> dict:
        """
        Finish the epoch: average every metric over the logged batches.

        The accumulator is cleared afterwards, so the same instance can be
        reused for the next epoch or for an evaluation pass.

        :return: metric name -> mean value per batch; an empty dict when
            nothing was logged.
        """
        # Guard against division by zero when no batch was logged.
        n_batches = max(self.n_batches, 1)
        metrics = {
            key: total / n_batches
            for key, total in self.epoch_metrics.items()
        }
        self.clear()
        return metrics

    def _log_batch(self, batch_metrics: dict):
        """Add one batch's values to the running sums and count the batch."""
        for key in batch_metrics:
            if key not in self.epoch_metrics:
                self.epoch_metrics[key] = 0
            # float() also detaches one-element tensors from the autograd graph.
            self.epoch_metrics[key] += float(batch_metrics[key])
        self.n_batches += 1

    def clear(self):
        """Reset the running sums and the batch counter."""
        self.epoch_metrics = {key: 0.0 for key in self.metrics_to_track}
        self.n_batches = 0


In [4]:
def get_model(num_classes: int):
    """
    Build a Faster R-CNN detector on top of MobileNetV2 features.

    :param num_classes: number of output classes of the box head, including
        the background class.
    :return: a `FasterRCNN` model with a single-feature-map backbone.
    """
    # Only the convolutional feature extractor is used; FasterRCNN reads the
    # channel count of its output from the `out_channels` attribute.
    backbone = torchvision.models.mobilenet_v2(pretrained=True).features
    backbone.out_channels = 1280

    # One feature map -> one tuple of sizes and one tuple of aspect ratios.
    anchor_generator = AnchorGenerator(
        sizes=((64, 128, 256),),
        aspect_ratios=((1.0,),)
    )

    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=7,
        sampling_ratio=2
    )

    # Pass the class count directly so the box predictor is created once with
    # the right size, instead of building a throwaway 2-class head and then
    # replacing it.
    model = FasterRCNN(
        backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )

    return model


In [5]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def _save_model(model, cfg: dict, postfix=None):
    """
    Write the model's state dict to <output_path>/checkpoints/.

    :param model: module whose `state_dict()` is saved.
    :param cfg: configuration dict; only cfg['output_path'] is read.
    :param postfix: optional tag appended to the file name as "__<postfix>";
        with None the file is called "model.pth".
    """
    suffix = "" if postfix is None else f"__{postfix}"
    checkpoint_path = join(cfg['output_path'], "checkpoints", f"model{suffix}.pth")
    torch.save(model.state_dict(), checkpoint_path)


def evaluate_dataset(model, data_loader, metrics, optimizer):
    """
    Compute the average loss of `model` over `data_loader` without training it.

    :param model: detection model that returns a dict of loss tensors when
        called as model(images, targets) in training mode.
    :param data_loader: iterable of (images, targets) batches, where targets
        is a list of dicts of tensors.
    :param metrics: tracker with `step_batch(**values)` and `step_epoch()`.
    :param optimizer: unused; kept so existing callers keep working.
    :return: whatever `metrics.step_epoch()` returns for the logged losses.
    """
    # TODO: Change this to eval(). The model is kept in training mode because
    # that is the mode in which it returns losses.
    # NOTE(review): in training mode layers such as batch norm and dropout
    # keep their training behaviour even under no_grad -- confirm acceptable.
    model.train()

    # Move each batch to wherever the model lives; skip if it has no parameters.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for images, targets in data_loader:
            if model_device is not None:
                if isinstance(images, torch.Tensor):
                    images = images.to(model_device)
                else:
                    images = [image.to(model_device) for image in images]
                targets = [
                    {key: value.to(model_device) for key, value in target.items()}
                    for target in targets
                ]

            # Training mode requires the targets to compute the losses.
            losses = model(images, targets)
            # Builtin sum adds the scalar loss tensors; torch.sum does not
            # accept a dict view.
            loss = sum(losses.values())

            metrics.step_batch(
                loss=float(loss),
            )

    scores = metrics.step_epoch()
    return scores


def log_metrics(logger, scores: dict, prefix: str, step):
    """
    Write every score to the logger as a scalar tagged "<prefix>/<name>".

    :param logger: object with an `add_scalar(tag, value, step)` method.
    :param scores: metric name -> value.
    :param prefix: tag namespace, e.g. 'train' or 'val'.
    :param step: global step the values are recorded at.
    """
    for name in scores:
        tag = f'{prefix}/{name}'
        logger.add_scalar(tag, scores[name], step)

In [6]:
# Run configuration. Keys are flat strings (dots are part of the key name,
# not nesting). The whole dict is also passed to `logger.add_hparams` at the
# end of the run.
cfg = {
    'data_path': './data',  # NOTE(review): not read anywhere in the visible notebook
    'valset_size': 0.1,  # NOTE(review): not read anywhere in the visible notebook
    'epochs': 100,  # number of passes of the training loop
    'batch_size': 16,  # NOTE(review): not read anywhere in the visible notebook
    'learning_rate.initial': 0.0001,  # `lr` of the Adam optimizer
    'learning_rate.decay_every': 30,  # StepLR `step_size`; the scheduler is stepped once per epoch
    'learning_rate.decay_by': 0.3,  # StepLR `gamma`
    'output_path': './output',  # checkpoints are written to <output_path>/checkpoints
    'model_save_frequency': 5,  # a checkpoint is saved every this many epochs
}

# Create the checkpoint directory up front so `_save_model` can write into it.
makedirs(join(cfg['output_path'], "checkpoints"), exist_ok=True)
# TensorBoard writer created with its default log directory.
logger = SummaryWriter()
# One tracker shared by the training loop and `evaluate_dataset`.
metrics = DetectionMetrics()

In [7]:

def _make_voc_loader(image_set: str, shuffle: bool) -> DataLoader:
    """
    Build a DataLoader over one split of the Pascal VOC 2012 detection set.

    The dataset is read from '../data' (no download), every sample goes
    through `transforms_fn`, and batches of 2 are assembled by `collate_fn_voc`.

    :param image_set: VOC split name, e.g. 'train' or 'val'.
    :param shuffle: whether the loader reshuffles the samples.
    """
    dataset = VOCDetection(
        root='../data',
        year='2012',
        image_set=image_set,
        download=False,
        transforms=transforms_fn,
    )
    return DataLoader(
        dataset,
        batch_size=2,
        shuffle=shuffle,
        collate_fn=collate_fn_voc,
    )


train_loader = _make_voc_loader('train', shuffle=True)

test_loader = _make_voc_loader('val', shuffle=False)


In [8]:
# One output per entry of VOC_CLASSES (background included).
model = get_model(len(VOC_CLASSES))

# Optionally resume from a checkpoint. `_save_model` stores only the state
# dict, so it is restored with load_state_dict (nn.Module has no `.load`).
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
if cfg.get('checkpoint'):
    model.load_state_dict(torch.load(cfg['checkpoint'], map_location=device))

model = model.float().to(device)

optimizer = torch.optim.Adam(
    model.parameters(), lr=cfg['learning_rate.initial']
)

# Multiply the learning rate by `decay_by` every `decay_every` scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=cfg['learning_rate.decay_every'],
    gamma=cfg['learning_rate.decay_by'],
)

In [9]:
# Main training loop. Ctrl-C stops training early; the model is still saved
# and evaluated once afterwards.
try:
    for epoch in range(cfg['epochs']):
        model.train()
        with tqdm(train_loader, unit="batch") as batches_bar:
            batches_bar.set_description(f"Epoch {epoch}")

            for images, targets in batches_bar:
                # The batch must be on the same device as the model.
                images = images.to(device)
                targets = [
                    {key: value.to(device) for key, value in target.items()}
                    for target in targets
                ]

                # In training mode the model returns a dict of loss tensors.
                losses = model(images, targets)
                # Builtin sum adds the scalar tensors and keeps the autograd
                # graph; torch.sum does not accept a dict view.
                loss = sum(losses.values())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # -------- Metrics ------------

                batch_metrics = metrics.step_batch(
                    loss=float(loss),
                    learning_rate=float(lr_scheduler.get_last_lr()[0]),
                    **losses
                )
                batches_bar.set_postfix(
                    loss=batch_metrics['loss'],
                )

        # The scheduler is stepped once per epoch.
        lr_scheduler.step()
        if (epoch + 1) % cfg['model_save_frequency'] == 0:
            _save_model(model, cfg, epoch)

        scores = metrics.step_epoch()
        log_metrics(logger, scores, 'train', epoch)

        test_scores = evaluate_dataset(model, test_loader, metrics, optimizer)
        log_metrics(logger, test_scores, 'val', epoch)


except KeyboardInterrupt:
    print("Stopping early.")
    # Drop gradients left over from the interrupted step.
    optimizer.zero_grad()

# Final checkpoint (no postfix) and final evaluation, logged with the run's
# hyper-parameters.
_save_model(model, cfg)
test_scores = evaluate_dataset(model, test_loader, metrics, optimizer)
logger.add_hparams(cfg, test_scores)

Epoch 0:   0%|          | 0/2859 [00:01<?, ?batch/s]


RuntimeError: Couldn't load custom C++ ops. This can happen if your PyTorch and torchvision versions are incompatible, or if you had errors while compiling torchvision from source. For further information on the compatible versions, check https://github.com/pytorch/vision#installation for the compatibility matrix. Please check your PyTorch version with torch.__version__ and your torchvision version with torchvision.__version__ and verify if they are compatible, and if not please reinstall torchvision so that it matches your PyTorch install.