# YOLO v1

[paper](https://arxiv.org/pdf/1506.02640.pdf)

Key points:

- S x S grid. S = 7
- predicts B boxes for each cell. B = 2
- Responsible cell:
    - the cell that contains bbox midpoint.
    - Among B predicted boxes, only the one that has highest IoU will be responsible.
- Predicts a confidence for each box. Target confidence = IoU between the predicted box and the ground-truth box.
- Predicts x, y, w, h for each box:
    - x, y: box midpoint coordinates relative to the origin of the cell, measured in units of the cell size.
        That is, each cell has height and width 1, so x and y lie in [0, 1].
    - h, w: box height and width relative to the whole image.
- predicts C classes each cell.
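- Box coordinates and class probabilities are trained only when the cell is responsible for a bbox; confidence is also trained, with a lower weight, where no object exists (see Losses).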
- Each cell can only predict 1 object, although it predicts B bboxes.
- Predicted tensor is of shape [S, S, (C + 5B)]
- Architecture is simply a CNN followed by a flatten and fully-connected layers.
- During inference, multiply the C class probabilities by the predicted confidence.
- During inference, apply NMS (non-maximum suppression).
- All losses are MSE variations.

Hyperparams:

- leaky relu
- batch size 64
- epochs 135 (with pre-trained)
- momentum 0.9
- decay: 0.0005
- lr:
    - 10^-3 for the first few epochs.
    - 10^-2 for +75 epochs
    - 10^-3 for +30 epochs.
    - 10^-4 for +30 epochs.
- Extensive augmentation:
    - Random scaling and translation up to 20%
    - randomly adjust the exposure and saturation of the image by up to a factor of 1.5 in the HSV color space.
- dropout of 0.5 on last fully-connected

Losses:

- Object exists: lambda_coord * sum((x - xhat)^2 + (y - yhat)^2)
- Object exists: lambda_coord * sum((sqrt(w) - sqrt(w_hat))^2 + (sqrt(h) - sqrt(h_hat))^2)
- Object exists: 1 * sum((confidence - confidence_hat)^2)
- No-object exists: lambda_no_object * sum((confidence - confidence_hat)^2)
- Object exists: sum over classes c of (probability(c) - probability_hat(c))^2

- confidence = IoU
- lambda_coord = 5
- lambda_no_object = 0.5

In [4]:
import numpy as np
from albumentations.pytorch import ToTensorV2
import cv2
from torch.utils.data import DataLoader
from torchvision.datasets import VOCDetection
import albumentations as A
import torch
import torch.nn as nn
from typing import List, Union, Optional, Tuple
import pytorch_lightning as pl


In [None]:
! pip install pytorch-lightning albumentations

In [5]:
# Names of the 20 object classes, in alphabetical order.
# The position of a name in this list is the integer class index used for the
# targets (the data module looks names up with `VOC_CLASSES.index(...)`), so
# the order must not change once targets or checkpoints depend on it.
VOC_CLASSES = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]

In [11]:

class VocYoloDataModule(pl.LightningDataModule):
    """Lightning data module that serves Pascal VOC 2012 as YOLO v1 targets.

    Every sample is an ``(image, target)`` pair where ``image`` is a
    ``(3, 448, 448)`` tensor scaled by ``1 / 255`` and ``target`` is a
    ``(C + 5, S, S)`` tensor laid out along the first axis as:

    - ``[0, C)``      one-hot class of the object owned by the cell,
    - ``C``           1.0 if the cell owns an object, else 0.0,
    - ``C+1 .. C+4``  ``tx, ty, tw, th`` (see ``_convert_boxes_to_yolo``).

    ``C`` is 20 and ``S`` is ``grid_size``.
    """

    def __init__(
        self,
        grid_size: int,
        batch_size: int,
        data_path: str,
        dataloader_num_workers: int = 0,
        data_augment=False,
        **_,
    ):
        """
        :param grid_size: number of cells per image side (S)
        :param batch_size: batch size used by both data loaders
        :param data_path: root directory handed to ``VOCDetection``
        :param dataloader_num_workers: ``num_workers`` for both data loaders
        :param data_augment: enable the random flip / brightness augmentations
        :param _: extra keyword arguments are accepted and ignored so that a
            larger shared config dict can be splatted into the constructor
        """
        super().__init__()

        self.h = 448
        self.w = 448
        self.dims = (3, self.h, self.w)
        self.num_classes = 20

        self.grid_size = grid_size
        self.data_path = data_path
        self.batch_size = batch_size
        self.num_workers = dataloader_num_workers
        self.augment = data_augment

        self.transforms = self._get_transforms()

        self.dataset_train, self.dataset_val = None, None

    def prepare_data(self):
        """Download the VOC 2012 ``trainval`` archive into ``data_path``."""
        VOCDetection(
            root=self.data_path,
            year="2012",
            image_set="trainval",
            download=True,
        )

    def setup(self, stage: Optional[str] = None):
        """Create the train and val datasets; the data must already be on disk."""
        self.dataset_train = VOCDetection(
            root=self.data_path,
            year="2012",
            image_set="train",
            download=False,
            transforms=self.transforms,
        )
        self.dataset_val = VOCDetection(
            root=self.data_path,
            year="2012",
            image_set="val",
            download=False,
            transforms=self.transforms,
        )

    def train_dataloader(self):
        """Return the shuffled training loader (the hook name Lightning calls)."""
        return DataLoader(
            self.dataset_train,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        """Return the un-shuffled validation loader (the hook name Lightning calls)."""
        return DataLoader(
            self.dataset_val,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )

    def train_loader(self):
        """Backward-compatible alias of ``train_dataloader``."""
        return self.train_dataloader()

    def val_loader(self):
        """Backward-compatible alias of ``val_dataloader``."""
        return self.val_dataloader()

    def _get_transforms(self):
        """
        Build the ``transforms`` callable handed to ``VOCDetection``.

        The callable takes a PIL image and the VOC annotation dict, applies
        the albumentations pipeline and returns ``(image, target)`` where
        target is a tensor of shape (C+5, S, S).

        The annotation dict is of the shape (excluding unrelated info):
        ```
        annotation:
          object:
            - name: bicycle
              bndbox:
                xmax: 471
                xmin: 54
                ymax: 336
                ymin: 39
        ```
        :return: Callable function
        """
        albument_transforms = self._get_augmentations(self.h, self.w, self.augment)

        def transforms(image, targets: dict):
            boxes, classes = self._transform_pre_augmentation(targets)

            transformed = albument_transforms(
                image=np.array(image),
                bboxes=boxes,
                class_labels=classes,
            )

            # Boxes (and their labels) may have been moved or dropped by the
            # pipeline, so the targets are built from the transformed values.
            return transformed["image"], self._transform_targets(
                transformed["bboxes"],
                transformed["class_labels"],
                self.num_classes,
                self.grid_size,
            )

        return transforms

    def _transform_targets(
        self, boxes, classes, num_classes, grid_size
    ) -> torch.Tensor:
        """
        - Get responsible pairs (cell that contains each bbox midpoint).
        - Convert coordinates from (xmin, ymin, ...) to yolo style.
        - Put everything in a tensor.

        A cell can own a single object. When several midpoints fall into the
        same cell only the first pair (lowest box index) is written, so the
        class one-hot, the objectness flag and the box always describe the
        same object.

        :param boxes: list of tuples of (xmin, ymin, xmax, ymax) in pixels
        :param classes: list of class indices, parallel to ``boxes``
        :param num_classes: number of classes (C)
        :param grid_size: number of cells per image side (S)
        :return: torch.Tensor of shape (C+5, S, S)
        """
        pairs: List[Tuple[int, int, int]] = self._get_responsible_pairs(
            boxes, grid_size, self.h, self.w
        )
        boxes_yolo = self._convert_boxes_to_yolo(
            boxes, pairs, grid_size, self.h, self.w
        )

        tensor = torch.zeros((num_classes + 5, grid_size, grid_size))
        for (r, c, b), yolo_box in zip(pairs, boxes_yolo):
            if tensor[num_classes, r, c] > 0:
                # Cell already owned by an earlier box.
                continue
            # int(): labels may come back from the augmentation pipeline as
            # floats / numpy scalars, which cannot index a tensor.
            tensor[int(classes[b]), r, c] = 1.0
            tensor[num_classes, r, c] = 1.0
            tensor[num_classes + 1 : num_classes + 5, r, c] = torch.as_tensor(
                yolo_box, dtype=tensor.dtype
            )
        return tensor

    @staticmethod
    def _convert_boxes_to_yolo(
        boxes: List[Tuple[float, float, float, float]],
        pairs: List[Tuple[int, int, int]],
        grid_size: int,
        h: int,
        w: int,
    ) -> List[Tuple[float, float, float, float]]:
        """
        Returns a yolo style bbox (tx, ty, tw, th) for each responsible pair.

        - tx, ty: bbox midpoint measured from the top-left corner of the
          responsible cell, in units of the cell size (so within [0, 1]).
        - tw, th: bbox width and height as a fraction of the image size.

        :param boxes: list of (xmin, ymin, xmax, ymax) in pixels
        :param pairs: list of (row, col, box_index)
        :param grid_size: number of cells per image side (S)
        :param h: image height in pixels
        :param w: image width in pixels
        """
        cell_h = h / grid_size
        cell_w = w / grid_size

        yolo_boxes = []
        for r, c, b in pairs:
            xmin, ymin, xmax, ymax = boxes[b]

            tw = (xmax - xmin) / w
            th = (ymax - ymin) / h

            # Midpoint in cell units, minus the cell index, gives the offset
            # inside the responsible cell.
            tx = (xmin + xmax) / 2 / cell_w - c
            ty = (ymin + ymax) / 2 / cell_h - r

            yolo_boxes.append((tx, ty, tw, th))

        return yolo_boxes

    @staticmethod
    def _get_responsible_pairs(
        boxes: List[Tuple[float, float, float, float]],
        grid_size: int,
        h: int,
        w: int,
    ) -> List[Tuple[int, int, int]]:
        """
        Pair every bbox with the grid cell that contains its midpoint.

        The cell index is computed directly from the midpoint. A midpoint
        lying exactly on a boundary between cells goes to the cell on the
        right / below; one lying exactly on the right or bottom image edge
        goes to the last cell. Midpoints outside the image are skipped.

        :return: list of (row, col, box_index), sorted by row, col, box_index
        """
        cell_h = h / grid_size
        cell_w = w / grid_size
        last_cell = grid_size - 1

        pairs = []
        for b, (xmin, ymin, xmax, ymax) in enumerate(boxes):
            mx = (xmin + xmax) / 2
            my = (ymin + ymax) / 2
            if not (0 <= mx <= w and 0 <= my <= h):
                continue
            r = min(int(my // cell_h), last_cell)
            c = min(int(mx // cell_w), last_cell)
            pairs.append((r, c, b))

        pairs.sort()
        return pairs

    @staticmethod
    def _get_augmentations(h, w, augment: bool):
        """
        Build the albumentations pipeline.

        Images are padded up to (h, w) where needed and then randomly cropped
        to exactly (h, w); they are not resized. With ``augment`` a random
        horizontal flip and a random brightness/contrast change are added.
        Finally the image is converted to a tensor and divided by 255.
        """
        resizing = [
            A.PadIfNeeded(min_height=h, min_width=w, border_mode=cv2.BORDER_CONSTANT),
            A.RandomCrop(h, w),
        ]
        augmentations = []
        if augment:
            augmentations = [
                A.HorizontalFlip(p=0.5),
                A.RandomBrightnessContrast(p=0.2),
            ]
        compatibility = [
            ToTensorV2(always_apply=True),
            A.Lambda(image=lambda x, **kwargs: x / 255.0),
        ]
        return A.Compose(
            resizing + augmentations + compatibility,
            bbox_params=A.BboxParams(
                format="pascal_voc", min_visibility=0.05, label_fields=["class_labels"]
            ),
        )

    @staticmethod
    def _transform_pre_augmentation(targets: dict) -> Tuple[list, list]:
        """
        This converts the targets compatible with albumentations
        The target is of the shape excluding unrelated info:
        ```
        annotation:
          object:
            - name: bicycle
              bndbox:
                xmax: 471
                xmin: 54
                ymax: 336
                ymin: 39
        ```
        Output will be of the form:
        (
            [(xmin, ymin, xmax, ymax), ...],
            [3, ...]
        )
        """
        corner_keys = ("xmin", "ymin", "xmax", "ymax")

        classes = []
        boxes = []
        for annotated in targets["annotation"].get("object", []):
            classes.append(VOC_CLASSES.index(annotated["name"]))

            bndbox = annotated["bndbox"]
            # int(float(...)) also accepts coordinates written as "45.0".
            boxes.append(tuple(int(float(bndbox[key])) for key in corner_keys))

        return boxes, classes


In [12]:
# Smoke test of the data module: build it, create the datasets and print the
# shapes of the first training batch.
config = {
    "grid_size": 7,
    "data_path": "../data",
    "batch_size": 2,
    "dataloader_num_workers": 0,
    "data_augment": True,
}
dataset = VocYoloDataModule(**config)
# Un-comment on the first run: setup() opens the datasets with download=False,
# so the VOC archive must already be present under `data_path`.
# dataset.prepare_data()
dataset.setup()
for images, targets in dataset.train_loader():
    print(images.shape)
    print(targets.shape)
    # One batch is enough to check the shapes.
    break

torch.Size([2, 3, 448, 448])
torch.Size([2, 25, 7, 7])


In [None]:
"""
Information about architecture config:
- Tuple is structured by (kernel_size, filters, stride, padding)
- "M" is simply maxpooling with stride 2x2 and kernel 2x2
- List is structured by tuples and lastly int with number of repeats
"""

# Layer-by-layer description of the convolutional backbone; SimpleCNN walks
# this list in order and YoloV1 passes it in as-is.
# Expanding the repeated groups gives 24 convolution blocks in total.
# Spatial down-sampling comes from the four "M" entries and the two stride-2
# convolutions (2**6 = 64x overall, e.g. 448 -> 7).
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    # The 1x1 -> 3x3 pair below is repeated 4 times.
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    # The 1x1 -> 3x3 pair below is repeated 2 times.
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

In [None]:
class CNNBlock(nn.Module):
    """Convolution followed by batch normalisation and LeakyReLU(0.1).

    Any extra keyword arguments (kernel_size, stride, padding, ...) are
    forwarded to ``nn.Conv2d``. The convolution is created without a bias
    term.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply conv -> batch norm -> leaky ReLU to ``x``."""
        convolved = self.conv(x)
        normalised = self.batchnorm(convolved)
        return self.leakyrelu(normalised)


class SimpleCNN(nn.Module):
    """Sequential CNN assembled from a declarative architecture list.

    Each entry of ``architecture`` is one of:

    - a tuple ``(kernel_size, filters, stride, padding)``: one ``CNNBlock``;
    - the string ``'M'``: a 2x2 max-pool with stride 2;
    - a list of such tuples followed by an int: the tuples, in order,
      repeated that many times.

    Entries of any other kind are ignored.
    """

    def __init__(
            self,
            architecture: List[Union[tuple, str, list]],
            in_channels: int,
    ):
        super().__init__()
        blocks = []
        channels = in_channels  # channel count feeding the next conv block
        for entry in architecture:
            if type(entry) is tuple:
                blocks.append(self._get_cnn_block(entry, channels))
                channels = entry[1]
            elif entry == 'M':
                pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
                blocks.append(pool)
            elif type(entry) is list:
                repeats = entry[-1]
                conv_specs = entry[:-1]
                for _ in range(repeats):
                    for spec in conv_specs:
                        blocks.append(self._get_cnn_block(spec, channels))
                        channels = spec[1]
        self.model = nn.Sequential(*blocks)

    @staticmethod
    def _get_cnn_block(module: tuple, in_channels):
        """Build a ``CNNBlock`` from a (kernel_size, filters, stride, padding) tuple."""
        kernel, out_channels, step, pad = module
        return CNNBlock(
            in_channels,
            out_channels,
            kernel_size=kernel,
            stride=step,
            padding=pad,
        )

    def forward(self, x):
        """Run ``x`` through all layers in order."""
        return self.model(x)


class YoloV1(nn.Module):
    """YOLO v1 network: convolutional backbone plus a fully-connected head.

    The backbone is built from the module-level ``architecture_config``.
    The head maps the flattened backbone output (expected to hold
    ``1024 * S * S`` values per sample) to ``S * S * (C + 5 * B)`` values,
    which ``forward`` reshapes to ``(batch, C + 5 * B, S, S)``.
    """

    def __init__(self, in_channels, split_size, num_boxes, num_classes):
        """
        :param in_channels: number of channels of the input images
        :param split_size: grid size (S)
        :param num_boxes: boxes predicted per cell (B)
        :param num_classes: number of classes (C)
        """
        super().__init__()
        self.darknet = SimpleCNN(architecture_config, in_channels)

        cell_depth = num_classes + num_boxes * 5
        head_layers = [
            nn.Flatten(),
            nn.Linear(1024 * split_size * split_size, 496),
            nn.Dropout(0.1),
            nn.LeakyReLU(0.1),
            nn.Linear(496, split_size * split_size * cell_depth),
        ]
        self.fcs = nn.Sequential(*head_layers)
        self.final_shape = (-1, cell_depth, split_size, split_size)

    def forward(self, x):
        """Return predictions of shape ``(batch, C + 5 * B, S, S)``."""
        features = self.darknet(x)
        # Flattened here already, so the nn.Flatten at the start of `fcs`
        # receives a 2-D tensor.
        flat = torch.flatten(features, start_dim=1)
        return self.fcs(flat).view(self.final_shape)

In [None]:
# Smoke test of the network on a random batch.
# num_classes is 20 (the number of VOC classes) so that the class channels of
# the prediction line up with the (20 + 5)-channel targets from the data module.
model = YoloV1(in_channels=3, split_size=7, num_boxes=2, num_classes=20)
random_batch = torch.rand((2, 3, 448, 448))
random_output = model(random_batch)
# Expected: torch.Size([2, 30, 7, 7]), i.e. (batch, C + 5 * B, S, S).
random_output.shape

In [None]:
class YoloV1Loss(nn.Module):
    """Sum-of-squared-errors YOLO v1 loss.

    Channel layout along dim 1:

    - targets ``(batch, C + 5, S, S)``: ``[0, C)`` one-hot class, ``C``
      object flag (1.0 / 0.0), ``C+1 .. C+4`` = x, y, w, h.
    - preds ``(batch, C + 5 * B, S, S)``: ``[0, C)`` class scores, then for
      each box ``b`` five channels starting at ``C + 5 * b`` holding
      confidence, x, y, w, h.
      NOTE(review): the prediction layout is a convention chosen here to
      mirror the target layout -- confirm before relying on it elsewhere.

    x, y are offsets inside the cell in units of the cell size; w, h are
    fractions of the whole image.

    In every cell that owns an object, the predicted box with the highest
    IoU against the target box is "responsible". The loss is the sum of:

    - ``lambda_coord`` * squared error of x, y (responsible boxes),
    - ``lambda_coord`` * squared error of sqrt(w), sqrt(h) (responsible boxes),
    - squared error between confidence and IoU (responsible boxes),
    - ``lambda_no_object`` * squared confidence (all other boxes),
    - squared error of the class scores (cells that own an object).

    Terms are summed over the batch, not averaged.
    """

    def __init__(
            self,
            num_boxes: int,
            num_classes: int,
            lambda_coord: float = 5.0,
            lambda_no_object: float = 0.5,
    ):
        """
        :param num_boxes: (B)
        :param num_classes: (C)
        :param lambda_coord: weight of the box coordinate terms
        :param lambda_no_object: weight of the confidence term for boxes
            that are not responsible for any object
        """
        super().__init__()

        self.num_boxes = num_boxes
        self.num_classes = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_no_object = lambda_no_object

    def forward(
            self, preds: torch.Tensor, targets: torch.Tensor
    ) -> Tuple[torch.Tensor, dict]:
        """
        :param preds: tensor of shape (batch, (C + B * 5), S, S)
        :param targets: tensor of shape (batch, C+5, S, S)
        :return: (total loss as a scalar tensor, dict of the detached,
            already weighted components; the components add up to the total)
        :raises ValueError: if the shapes do not match the layout above
        """
        self._check_shapes(preds, targets)
        C, B = self.num_classes, self.num_boxes
        batch, _, rows, cols = preds.shape

        # Channels last makes the per-cell slicing below straightforward.
        preds = preds.permute(0, 2, 3, 1)
        targets = targets.permute(0, 2, 3, 1)

        pred_classes = preds[..., :C]
        pred_boxes = preds[..., C:].reshape(batch, rows, cols, B, 5)
        pred_conf = pred_boxes[..., 0]
        pred_xywh = pred_boxes[..., 1:]

        target_classes = targets[..., :C]
        has_object = targets[..., C]
        # Extra axis so the single target box broadcasts against the B boxes.
        target_xywh = targets[..., C + 1:C + 5].unsqueeze(-2)

        # Box assignment and the IoU confidence target are constants with
        # respect to the optimisation, so no gradient is tracked for them.
        with torch.no_grad():
            ious = self._iou(pred_xywh, target_xywh, rows, cols)
            best_box = ious.argmax(dim=-1, keepdim=True)
            responsible = torch.zeros_like(ious).scatter_(-1, best_box, 1.0)
            responsible = responsible * has_object.unsqueeze(-1)
            not_responsible = 1.0 - responsible

        xy_error = (pred_xywh[..., :2] - target_xywh[..., :2]).pow(2).sum(dim=-1)
        wh_error = (
            self._signed_sqrt(pred_xywh[..., 2:])
            - self._signed_sqrt(target_xywh[..., 2:])
        ).pow(2).sum(dim=-1)
        class_error = (pred_classes - target_classes).pow(2).sum(dim=-1)

        loss_xy = self.lambda_coord * (responsible * xy_error).sum()
        loss_wh = self.lambda_coord * (responsible * wh_error).sum()
        loss_obj = (responsible * (pred_conf - ious).pow(2)).sum()
        loss_noobj = self.lambda_no_object * (
            not_responsible * pred_conf.pow(2)
        ).sum()
        loss_class = (has_object * class_error).sum()

        total = loss_xy + loss_wh + loss_obj + loss_noobj + loss_class
        parts = {
            "loss_xy": loss_xy.detach(),
            "loss_wh": loss_wh.detach(),
            "loss_obj": loss_obj.detach(),
            "loss_noobj": loss_noobj.detach(),
            "loss_class": loss_class.detach(),
        }
        return total, parts

    def _check_shapes(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
        """Raise ValueError unless preds/targets follow the documented layout."""
        pred_depth = self.num_classes + 5 * self.num_boxes
        target_depth = self.num_classes + 5
        if preds.dim() != 4 or preds.shape[1] != pred_depth:
            raise ValueError(
                f"preds must have shape (batch, {pred_depth}, S, S), "
                f"got {tuple(preds.shape)}"
            )
        if targets.dim() != 4 or targets.shape[1] != target_depth:
            raise ValueError(
                f"targets must have shape (batch, {target_depth}, S, S), "
                f"got {tuple(targets.shape)}"
            )
        if preds.shape[0] != targets.shape[0] or preds.shape[2:] != targets.shape[2:]:
            raise ValueError(
                "preds and targets disagree on batch or grid size: "
                f"{tuple(preds.shape)} vs {tuple(targets.shape)}"
            )

    @staticmethod
    def _signed_sqrt(values: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        """Square root that tolerates negative raw predictions.

        The sign is kept and ``eps`` keeps the gradient finite near zero.
        """
        return torch.sign(values) * torch.sqrt(values.abs() + eps)

    @staticmethod
    def _iou(
            pred_xywh: torch.Tensor,
            target_xywh: torch.Tensor,
            rows: int,
            cols: int,
            eps: float = 1e-6,
    ) -> torch.Tensor:
        """IoU between boxes that live in the same cell.

        x, y are divided by the grid size to bring them into image units like
        w, h. Both boxes share the same cell origin, so it cancels out and is
        not added. Negative widths/heights are treated as zero.

        :param pred_xywh: (..., B, 4)
        :param target_xywh: (..., 1, 4)
        :return: (..., B)
        """
        def corners(box):
            cx = box[..., 0] / cols
            cy = box[..., 1] / rows
            half_w = box[..., 2].clamp(min=0) / 2
            half_h = box[..., 3].clamp(min=0) / 2
            return cx - half_w, cy - half_h, cx + half_w, cy + half_h

        px1, py1, px2, py2 = corners(pred_xywh)
        tx1, ty1, tx2, ty2 = corners(target_xywh)

        inter_w = (torch.minimum(px2, tx2) - torch.maximum(px1, tx1)).clamp(min=0)
        inter_h = (torch.minimum(py2, ty2) - torch.maximum(py1, ty1)).clamp(min=0)
        intersection = inter_w * inter_h

        pred_area = (px2 - px1) * (py2 - py1)
        target_area = (tx2 - tx1) * (ty2 - ty1)
        return intersection / (pred_area + target_area - intersection + eps)


In [None]:
class YoloV1PL(pl.LightningModule):
    """Lightning wrapper that trains ``YoloV1`` with ``YoloV1Loss``.

    Batches are expected to be ``(images, targets)`` pairs, with targets in
    the ``(batch, C + 5, S, S)`` layout that ``YoloV1Loss`` documents.
    """

    def __init__(
            self,
            in_channels: int = 3,
            split_size: int = 7,
            num_boxes: int = 2,
            num_classes: int = 20,
            learning_rate: float = 1e-3,
    ):
        """
        :param in_channels: number of channels of the input images
        :param split_size: grid size (S)
        :param num_boxes: boxes predicted per cell (B)
        :param num_classes: number of classes (C); 20 matches VOC_CLASSES
        :param learning_rate: learning rate of the Adam optimizer
        """
        super().__init__()
        self.learning_rate = learning_rate
        self.yolo_v1 = YoloV1(
            in_channels=in_channels,
            split_size=split_size,
            num_boxes=num_boxes,
            num_classes=num_classes,
        )
        self.criterion = YoloV1Loss(num_boxes=num_boxes, num_classes=num_classes)

    def forward(self, x):
        """Return the raw network output for a batch of images."""
        return self.yolo_v1(x)

    def configure_optimizers(self):
        """Use Adam over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def _shared_step(self, batch, stage: str):
        """Compute and log the loss for one batch as ``'<stage>_loss'``."""
        images, targets = batch
        preds = self(images)
        result = self.criterion(preds, targets)
        if result is None:
            # Fail with a clear message rather than an unpacking error if the
            # loss module has not been implemented.
            raise NotImplementedError(
                "YoloV1Loss.forward returned None; the loss must be "
                "implemented before training."
            )
        loss, _ = result
        self.log(f"{stage}_loss", loss)
        return loss

    def training_step(self, train_batch, batch_idx):
        """Return the training loss; logged as 'train_loss'."""
        return self._shared_step(train_batch, "train")

    def validation_step(self, val_batch, batch_idx):
        """Return the validation loss; logged as 'val_loss'."""
        return self._shared_step(val_batch, "val")