# License Plate Detector using MobilenetV3Small SSD

### Load Dataset

In [1]:
import os
import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools.coco import COCO

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, root, annotation, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(annotation)
        self.ids = list(sorted(self.coco.imgs.keys()))

    def __getitem__(self, index):
        # Own coco file
        coco = self.coco
        # Image ID
        img_id = self.ids[index]
        # List: get annotation id from coco
        ann_ids = coco.getAnnIds(imgIds=img_id)
        # Dictionary: target coco_annotation file for an image
        coco_annotation = coco.loadAnns(ann_ids)
        # path for input image
        path = coco.loadImgs(img_id)[0]['file_name']
        # open the input image
        img = Image.open(os.path.join(self.root, path))
        img = img.convert("RGB")

        # number of objects in the image
        num_objs = len(coco_annotation)

        # Bounding boxes for objects
        # In coco format, bbox = [xmin, ymin, width, height]
        # In pytorch, the input should be [xmin, ymin, xmax, ymax]
        boxes = []
        for i in range(num_objs):
            xmin = coco_annotation[i]['bbox'][0]
            ymin = coco_annotation[i]['bbox'][1]
            xmax = xmin + coco_annotation[i]['bbox'][2]
            ymax = ymin + coco_annotation[i]['bbox'][3]
            boxes.append([xmin, ymin, xmax, ymax])
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # Labels (In my case, I only one class: target class or background)
        labels = torch.ones(num_objs, dtype=torch.int64)
        # Tensorise img_id
        img_id = torch.tensor(img_id)
        # Size of bbox (Rectangular)
        areas = []
        for i in range(num_objs):
            areas.append(coco_annotation[i]['area'])
        areas = torch.as_tensor(areas, dtype=torch.float32)
        # Iscrowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        if num_objs == 0:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
            areas = torch.zeros((0,), dtype=torch.float32)
            iscrowd = torch.zeros((0,), dtype=torch.int64)

        # Annotation is in dictionary format
        my_annotation = {}
        my_annotation["boxes"] = boxes
        my_annotation["labels"] = labels
        my_annotation["image_id"] = img_id
        my_annotation["area"] = areas
        my_annotation["iscrowd"] = iscrowd

        if self.transforms is not None:
            if not isinstance(img, Image.Image):
                img = Image.fromarray(img)
            img = self.transforms(img)

        return img, my_annotation

    def __len__(self):
        return len(self.ids)

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [3]:
from torchvision.transforms import v2 as T


def get_transform(train):
    transforms = []
    # if train:
    #     transforms.append(T.RandomHorizontalFlip(0.5))
    # rescale to 0-1
    transforms.append(T.ToDtype(torch.float, scale=1/255.0))
    # transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(torchvision.transforms.ToTensor())
    return T.Compose(transforms)

In [4]:
# path to your own data and coco file
import utils
train_data_dir = 'data/train'
train_coco = 'data/train/_annotations.coco.json'

test_data_dir = 'data/test'
test_coco = 'data/test/_annotations.coco.json'

valid_data_dir = 'data/valid'
valid_coco = 'data/valid/_annotations.coco.json'

# create own Dataset
train_ds = CustomDataset(root=train_data_dir,
                          annotation=train_coco,
                          transforms=get_transform(train=True)
                          )

test_ds = CustomDataset(root=test_data_dir,
                          annotation=test_coco,
                          transforms=get_transform(train=False)
                          )

valid_ds = CustomDataset(root=valid_data_dir,
                          annotation=valid_coco,
                          transforms=get_transform(train=False)
                          )

# collate_fn needs for batch
def collate_fn(batch):
    return tuple(zip(*batch))

# Batch size
train_batch_size = 4
test_batch_size = 4
valid_batch_size = 4

# own DataLoader
train_loader = torch.utils.data.DataLoader(train_ds,
                                          batch_size=train_batch_size,
                                          shuffle=True,
                                          collate_fn=utils.collate_fn)

test_loader = torch.utils.data.DataLoader(test_ds,
                                            batch_size=test_batch_size,
                                            shuffle=False,
                                            collate_fn=utils.collate_fn)

val_loader = torch.utils.data.DataLoader(valid_ds,
                                            batch_size=valid_batch_size,
                                            shuffle=False,
                                            collate_fn=utils.collate_fn)

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


### Baseline model

In [5]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(weights="DEFAULT")
model_resnet = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(weights="DEFAULT")

# replace the classifier with a new one, that has
# num_classes which is user-defined
num_classes = 2  # 1 class (person) + background
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
in_features_resnet = model_resnet.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model_resnet.roi_heads.box_predictor = FastRCNNPredictor(in_features_resnet, num_classes)

In [10]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# load a pre-trained model for classification and return
# only the features
backbone_large = torchvision.models.mobilenet_v3_large(weights="DEFAULT").features
backbone_large.out_channels = 960

backbone = torchvision.models.mobilenet_v3_small(weights="DEFAULT").features

# ``FasterRCNN`` needs to know the number of
# so we need to add it here
backbone.out_channels = 576 

anchor_generator_old = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=['0'],
    output_size=7,
    sampling_ratio=2
)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# put the pieces together inside a Faster-RCNN model
model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler
).to(device)

In [7]:
from torchmetrics.detection.mean_ap import MeanAveragePrecision

def getIoU(bbox, gt):
    x1, y1, w1, h1 = bbox
    x2, y2, w2, h2 = gt
    xA = max(x1, x2)
    yA = max(y1, y2)
    xB = min(x1+w1, x2+w2)
    yB = min(y1+h1, y2+h2)
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    boxAArea = w1 * h1
    boxBArea = w2 * h2
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

# mean average precision
def get_mAP(pred_boxes, pred_labels, pred_scores, gt_boxes, gt_labels):
    # get mAP
    pred = [{'boxes': pred_boxes, 'labels': pred_labels, 'scores': pred_scores}]
    gt = [{'boxes': gt_boxes, 'labels': gt_labels}]
    map_metric = MeanAveragePrecision(iou_thresholds=[0.5], class_metrics=True)
    map_metric.update(pred, gt)
    mAP = map_metric.compute()
    return mAP['map']


def validate(model, data_loader, device):
    model.eval()
    average_IoU = 0

    mAP = 0
    with torch.no_grad():
        for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            pred = model(images)
            pred_boxes = pred[0]['boxes'].cpu()
            pred_labels = pred[0]['labels'].cpu()
            pred_scores = pred[0]['scores'].cpu()
            
            gt_boxes = targets[0]['boxes'].cpu()
            gt_labels = targets[0]['labels'].cpu()
            # get mAP
            if(len(pred_boxes) == 0):
                continue
            mAP += get_mAP(pred_boxes, pred_labels, pred_scores, gt_boxes, gt_labels)
            pred_boxes = pred[0]['boxes'].cpu().numpy()
            pred_labels = pred[0]['labels'].cpu().numpy()
            pred_scores = pred[0]['scores'].cpu().numpy()

            gt_boxes = targets[0]['boxes'].cpu().numpy()
            gt_labels = targets[0]['labels'].cpu().numpy()

            # get IoU
            if(len(pred_boxes) == 0):
                continue
            iou = getIoU(pred_boxes[0], gt_boxes[0])
            average_IoU += iou

    average_IoU /= len(data_loader)
    mAP /= len(data_loader)

    return average_IoU, mAP

In [12]:
small_resnet_model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator_old,
    box_roi_pool=roi_pooler
).to(device)
small_resnet_model.load_state_dict(torch.load('FasterRCNN_MobileNetV3_small_0.86.pth'))
small_resnet_model.eval()
print("Model small loaded")

large_resnet_model = FasterRCNN(
    backbone_large,
    num_classes=2,
    rpn_anchor_generator=anchor_generator_old,
    box_roi_pool=roi_pooler
).to(device)
large_resnet_model.load_state_dict(torch.load('FasterRCNN_MobileNetV3_large_0.877.pth'))
large_resnet_model.eval()
print("Model large loaded")

small_fpn_model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler
).to(device)
small_fpn_model.load_state_dict(torch.load('FasterRCNN_MobileNetV3_320FPN_2_large_0.902.pth'))
small_fpn_model.eval()
print("Model fpn loaded")

  small_resnet_model.load_state_dict(torch.load('FasterRCNN_MobileNetV3_small_0.86.pth'))


Model small loaded


  large_resnet_model.load_state_dict(torch.load('FasterRCNN_MobileNetV3_large_0.877.pth'))


Model large loaded


  small_fpn_model.load_state_dict(torch.load('FasterRCNN_MobileNetV3_320FPN_2_large_0.902.pth'))


Model fpn loaded


In [20]:
import time

print("Evaluate small")
ds_length = len(train_loader.dataset)
vit_tic = time.time()
for i, (images, targets) in enumerate(train_loader):
    with torch.no_grad():
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        small_resnet_model(images)
vit_toc = time.time()

print("Evaluate large")
evit_tic = time.time()
for i, (images, targets) in enumerate(train_loader):
    with torch.no_grad():
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        large_resnet_model(images)
evit_toc = time.time()

print("Evaluate fpn")
fpn_tic = time.time()
for i, (images, targets) in enumerate(train_loader):
    with torch.no_grad():
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        small_fpn_model(images)
fpn_toc = time.time()

print("Evaluate large")
fpn_large_tic = time.time()
for i, (images, targets) in enumerate(train_loader):
    with torch.no_grad():
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        small_fpn_model(images)
fpn_large_toc = time.time()

Evaluate small
Evaluate large
Evaluate fpn


In [22]:
vit_seconds = int(vit_toc - vit_tic)
evit_seconds = int(evit_toc - evit_tic)
fpn_seconds = int(fpn_toc - fpn_tic)

vit_img_per_second = ds_length / float(vit_seconds)
evit_img_per_second = ds_length / float(evit_seconds)
fpn_img_per_second = ds_length / float(fpn_seconds)

print("Total images: ", ds_length)
print("ViT Inference Time: ", int(vit_toc - vit_tic), "Seconds")
print("EfficientViT Inference Time: ", int(evit_toc - evit_tic), "Seconds")
print("FPN Inference Time: ", int(fpn_toc - fpn_tic), "Seconds")

# img/seconds
print("Resnet50 Mobilenet Small Inference Speed: {0:.1f} images/second".format(ds_length / float(vit_toc - vit_tic)))
print("Resnet50 Mobilenet Large Inference Speed: {0:.1f} images/second".format(ds_length / float(evit_toc - evit_tic)))
print("FPN Mobilenet Small Inference Speed: {0:.1f} images/second".format(ds_length / float(fpn_toc - fpn_tic)))
print("-----------------------------------------------------------")
if vit_seconds < evit_seconds and vit_seconds < fpn_seconds:
    print("ViT is the fastest model")
elif evit_seconds < vit_seconds and evit_seconds < fpn_seconds:
    print("EfficientViT is the fastest model")
else:
    print("FPN is the fastest model")

Total images:  1203
ViT Inference Time:  28 Seconds
EfficientViT Inference Time:  55 Seconds
FPN Inference Time:  27 Seconds
Resnet50 Mobilenet Small Inference Speed: 42.1 images/second
Resnet50 Mobilenet Large Inference Speed: 21.6 images/second
FPN Mobilenet Small Inference Speed: 44.0 images/second
-----------------------------------------------------------
FPN is the fastest model


In [None]:
vit_seconds = int(vit_toc - vit_tic)
evit_seconds = int(evit_toc - evit_tic)
fpn_seconds = int(fpn_toc - fpn_tic)

vit_img_per_second = ds_length / float(vit_seconds)
evit_img_per_second = ds_length / float(evit_seconds)
fpn_img_per_second = ds_length / float(fpn_seconds)

print("Total images: ", ds_length)
print("ViT Inference Time: ", int(vit_toc - vit_tic), "Seconds")
print("EfficientViT Inference Time: ", int(evit_toc - evit_tic), "Seconds")
print("FPN Inference Time: ", int(fpn_toc - fpn_tic), "Seconds")

# img/seconds
print("ViT Inference Speed: {0:.1f} images/second".format(ds_length / float(vit_toc - vit_tic)))
print("EfficientViT Inference Speed: {0:.1f} images/second".format(ds_length / float(evit_toc - evit_tic)))
print("FPN Inference Speed: {0:.1f} images/second".format(ds_length / float(fpn_toc - fpn_tic)))
print("-----------------------------------------------------------")
if vit_seconds < evit_seconds and vit_seconds < fpn_seconds:
    print("ViT is the fastest model")
elif evit_seconds < vit_seconds and evit_seconds < fpn_seconds:
    print("EfficientViT is the fastest model")
else:
    print("FPN is the fastest model")

In [24]:
from thop import profile, clever_format

input = torch.randn(1, 3, 224, 224).to(device)
macs_small_resnet, params_small_resnet = profile(small_resnet_model, inputs=(input,))
macs_large_resnet, params_large_resnet = profile(large_resnet_model, inputs=(input,))
macs_small_fpn, params_small_fpn = profile(small_fpn_model, inputs=(input,))

macs_small_resnet, params_small_resnet = clever_format([macs_small_resnet, params_small_resnet], "%.3f")
macs_large_resnet, params_large_resnet = clever_format([macs_large_resnet, params_large_resnet], "%.3f")
macs_small_fpn, params_small_fpn = clever_format([macs_small_fpn, params_small_fpn], "%.3f")

print(f"Resnet50 Mobilenet Small: {macs_small_fpn}, {params_small_resnet}")
print(f"Resnet50 Mobilenet Large: {macs_large_resnet}, {params_large_resnet}")
print(f"FPN Mobilenet Small: {macs_small_resnet}, {params_small_fpn}")

[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.activation.ReLU'>.
[INFO] Register count_adap_avgpool() for <class 'torch.nn.modules.pooling.AdaptiveAvgPool2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.container.Sequential'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.activation.ReLU'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.container.Sequential'>.
[INFO] Register count_adap_avgpool() for <class 'torch.nn.modules.pooling.AdaptiveAvgPool2d'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Regis