<img src="https://futurejobs.my/wp-content/uploads/2021/05/d-min-1024x297.png" width="300"> </img>

> **Copyright &copy; 2021 Skymind Education Group Sdn. Bhd.**<br>
> <br>
> This program and the accompanying materials are made available under the
> terms of the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). \
> Unless required by applicable law or agreed to in writing, software
> distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
> WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
> License for the specific language governing permissions and limitations
> under the License. <br>
> <br>**SPDX-License-Identifier: Apache-2.0**


# Object Detection


## Introduction


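Object detection is the computer-vision task of locating every object of interest in an image and assigning each one a class label. A detector therefore outputs, for each object it finds, a bounding box, a class, and a confidence score.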

<img src=""></img>

This hands-on will guide you through building a pipeline that automatically detects objects in images from the Pascal VOC dataset, using a pretrained Single Shot Detector (SSD) model.

_Authored by: [Scotrraaj Gopal](http://github.com/scotgopal)_


In [None]:
import torch
from torchvision import datasets
import matplotlib.pyplot as plt
import cv2 as cv
import numpy as np
from pathlib import Path

In [None]:
from albumentations import Compose, Resize, BboxParams
from torchvision.transforms.functional import to_tensor

def process_and_transform(image, target, new_image_size=(400,400)):
    """Resize one VOC sample and flatten its annotation.

    Args:
        image: Image that ``np.array`` can convert; the resulting array is fed
            to the albumentations pipeline.
        target: Annotation dict. Objects are read from
            ``target['annotation']['object']``; each object supplies a
            ``'name'`` and a ``'bndbox'`` holding ``xmin``/``ymin``/``xmax``/
            ``ymax`` values that ``int`` can parse.
        new_image_size: Two values unpacked positionally into ``Resize``.

    Returns:
        A ``(image_tensor, annotations)`` pair: the resized image passed
        through ``to_tensor``, and a list of ``{'name': ..., 'bbox': ...}``
        dicts built from the transformed labels and boxes.
    """
    corner_keys = ('xmin', 'ymin', 'xmax', 'ymax')
    annotated_objects = target['annotation']['object']

    raw_boxes = [entry['bndbox'] for entry in annotated_objects]
    class_names = [entry['name'] for entry in annotated_objects]
    pixel_boxes = [[int(raw_box[key]) for key in corner_keys] for raw_box in raw_boxes]

    # Boxes are declared as pascal_voc so albumentations rescales them together
    # with the image; the labels travel along through the 'class_labels' field.
    box_settings = BboxParams(format='pascal_voc', label_fields=['class_labels'])
    pipeline = Compose([Resize(*new_image_size)], bbox_params=box_settings)
    result = pipeline(image=np.array(image), bboxes=pixel_boxes, class_labels=class_names)

    resized_tensor = to_tensor(result['image'])
    resized_annotations = []
    for class_name, box in zip(result['class_labels'], result['bboxes']):
        resized_annotations.append({'name': class_name, 'bbox': box})
    return resized_tensor, resized_annotations

In [None]:
# Look for an already-extracted VOCdevkit folder under the dataset root and
# only ask torchvision to download the data when that folder is missing.
DATASET_BASE_PATH = Path("../datasets").resolve()
VOC_DATASET_DIR = DATASET_BASE_PATH / "VOCdevkit"

trainval_ds = datasets.VOCDetection(
    root="../datasets",
    image_set="trainval",
    transforms=process_and_transform,
    download=not VOC_DATASET_DIR.exists(),
)

trainval_ds

In [None]:
# Pull one sample (index 4) from the dataset; the bare `target` expression
# makes the notebook display that sample's annotation.
image, target = trainval_ds[4]
target

In [None]:
# Display the shape of the sample image fetched above.
image.shape

In [None]:
from torchvision.transforms.functional import to_pil_image

# Preview sample 200: draw each annotated box and its class name on the image.
feature_tensor, target_annotation = trainval_ds[200]
feature_image = to_pil_image(feature_tensor)

# Drawing settings shared by every box and label.
colour = (255,0,0)
line_thickness = 2
font = cv.FONT_HERSHEY_SIMPLEX
font_scale = 0.5
font_colour = (255,0,0)
text_line_type = cv.LINE_AA

for annotated_object in target_annotation:
    name = annotated_object['name']
    xmin, ymin, xmax, ymax = annotated_object['bbox']

    upper_left_point = (int(xmin), int(ymin))
    lower_right_point = (int(xmax), int(ymax))
    print(upper_left_point, lower_right_point)

    # Anchor the label 5 px above the box's top-left corner.
    org = (int(xmin), int(ymin)-5)

    feature_image = cv.rectangle(
        np.array(feature_image),
        upper_left_point,
        lower_right_point,
        colour,
        line_thickness,
    )
    feature_image = cv.putText(
        img=feature_image,
        text=name,
        org=org,
        fontFace=font,
        fontScale=font_scale,
        color=font_colour,
        lineType=text_line_type,
    )

plt.figure(figsize=(19,7))
plt.imshow(feature_image)

In [None]:
import torchvision
# Build torchvision's ssd300_vgg16 detection model, requesting pretrained weights.
model = torchvision.models.detection.ssd300_vgg16(pretrained=True)
model.eval() # Put to eval mode

In [None]:
# Print the model's module structure for inspection.
print(model)

In [None]:
# Ref: https://pytorch.org/vision/stable/models.html#object-detection-instance-segmentation-and-person-keypoint-detection
# Class names looked up by list position: `inference` indexes this list with
# the integer labels the model predicts.
# NOTE(review): the 'N/A' entries look like placeholders that keep positions
# aligned with label ids that have no class name — confirm against the
# reference above.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [None]:
from sklearn.model_selection import ShuffleSplit
from torch.utils.data import Subset

# One shuffled split holding out 20% of the samples; random_state is fixed so
# the same split is produced on every run.
shuffle_split = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
indice_range = range(len(trainval_ds))
# split() yields (train, test) index arrays; with n_splits=1 only the first
# pair is needed.
train_indices, val_indices = next(shuffle_split.split(indice_range))

# Index-based views over trainval_ds for the two partitions.
voc_train_ds = Subset(trainval_ds, train_indices)
voc_val_ds = Subset(trainval_ds, val_indices)

In [None]:
def display_dataset(feature_tensor, target_annotation):
    """Draw labelled bounding boxes on an image and show it with matplotlib.

    Args:
        feature_tensor: Image tensor accepted by ``to_pil_image``.
        target_annotation: Iterable of dicts, each with a ``'name'`` string and
            a ``'bbox'`` of four values ``(xmin, ymin, xmax, ymax)``.

    Side effects:
        Prints the name and integer corner points of every object, and renders
        the annotated image with ``plt.imshow`` (no new figure is created here).
    """
    # Convert to an array once; every box and label is then drawn on this
    # same canvas instead of re-copying the image for each object.
    canvas = np.array(to_pil_image(feature_tensor))

    # Drawing settings shared by every box and label.
    colour = (255,0,0)
    line_thickness = 2
    font = cv.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_colour = (255,0,0)

    for annotated_object in target_annotation:
        name = annotated_object['name']
        xmin, ymin, xmax, ymax = annotated_object['bbox']

        upper_left_point = (int(xmin), int(ymin))
        lower_right_point = (int(xmax), int(ymax))
        print(f"\n{name}, {upper_left_point=}, {lower_right_point=}")

        # Place the label just inside the box, 10 px below its top edge.
        org = (int(xmin), int(ymin)+10)

        canvas = cv.rectangle(canvas, upper_left_point, lower_right_point, colour, line_thickness)
        canvas = cv.putText(img=canvas, text=name, org=org, fontFace=font, fontScale=font_scale,color=font_colour, lineType=cv.LINE_AA, bottomLeftOrigin=False)

    plt.imshow(canvas)

from torch.utils.data import Dataset
def inference(model, ds:Dataset, ds_index: int, det_threshold: float=0.45, max_detections: int=20):
    """Show a sample's own annotation next to the model's detections for it.

    Args:
        model: Detection model called with a list holding one image tensor.
            It must return one dict per image with ``'scores'``, ``'labels'``
            and ``'boxes'`` entries.
        ds: Dataset whose items are ``(image_tensor, annotation)`` pairs, with
            the annotation in the format ``display_dataset`` expects.
        ds_index: Index of the sample to run.
        det_threshold: Minimum score a detection needs in order to be drawn.
        max_detections: Only the first ``max_detections`` detections are
            considered (presumably the highest-scoring ones — confirm that the
            model returns detections sorted by score).

    Side effects:
        Prints summary information about the predictions and shows a
        two-panel matplotlib figure (annotation on the left, model output on
        the right).
    """
    image_tensor_3d, annotation = ds[ds_index]
    plt.figure(figsize=(20,10))
    plt.subplot(1,2,1)
    display_dataset(image_tensor_3d, annotation)
    plt.title("My image/annotation")

    # make inference on image (no gradients are needed)
    with torch.no_grad():
        pred = model([image_tensor_3d])

    print(f"\n{len(pred)=}")
    for image_index, detections_dict in enumerate(pred):
        scores = detections_dict['scores']
        labels = detections_dict['labels']
        boxes = detections_dict['boxes']

        print("\nImage:", image_index)
        print("Total keys:", len(detections_dict))
        print("Keys:", detections_dict.keys())
        print("Total Scores:", len(scores))
        print("Boxes", len(boxes))
        print("First 5 Scores:", scores[:5])
        print("First 5 Labels:", labels[:5])

        # Positions, within the first `max_detections` entries, whose score
        # reaches the threshold. tolist() yields plain ints on any device.
        kept_positions = (scores[:max_detections] >= det_threshold).nonzero(as_tuple=True)[0].tolist()

        # Convert the kept detections to the annotation format that
        # display_dataset understands.
        target_annotation = [
            {
                'name': COCO_INSTANCE_CATEGORY_NAMES[labels[position].item()],
                'bbox': boxes[position].tolist(),
            }
            for position in kept_positions
        ]

        plt.subplot(1,2,2)
        display_dataset(image_tensor_3d, target_annotation)
        plt.title("SSD output image/annotation")

    plt.show()

# Run the comparison on validation sample 404 with a score threshold of 0.2
# in place of the function's default.
inference(model, voc_val_ds, 404, det_threshold=0.2)   

In [None]:
from torch.utils.data import DataLoader


def detection_collate(batch):
    """Group ``(image, annotations)`` samples into two parallel lists.

    Each sample carries a list of per-object annotation dicts whose length
    varies from image to image, which the default collate function cannot
    batch. The samples are therefore only regrouped, never stacked.

    Args:
        batch: Non-empty list of ``(image, annotations)`` pairs.

    Returns:
        ``(images, annotations)`` as two lists of the same length.
    """
    images, annotations = zip(*batch)
    return list(images), list(annotations)


voc_val_dl = DataLoader(voc_val_ds, batch_size=2, collate_fn=detection_collate)

In [None]:
# Fetch the first batch from the validation loader to inspect its structure.
next(iter(voc_val_dl))

In [None]:
import tarfile
import requests

url = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar"
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being saved as the archive.
res = requests.get(url, timeout=60)
res.raise_for_status()

In [None]:
# Write the downloaded bytes inside a with-block so the file is flushed and
# closed before tarfile reads it back.
with open("VOCdevkit_18-May-2011.tar", "wb") as tar_out:
    tar_out.write(res.content)

# tarfile.open is the documented entry point for reading archives; the handle
# stays open because the extraction step uses it.
my_tarfile = tarfile.open("VOCdevkit_18-May-2011.tar")

In [None]:
# Extract into the current working directory and always release the archive
# handle, even when extraction fails.
# NOTE(review): the archive was fetched over plain HTTP; extractall() trusts
# the member paths it contains. On Python versions that support it, consider
# passing filter="data" to reject unsafe members.
try:
    my_tarfile.extractall()
finally:
    my_tarfile.close()

In [None]:
# The 20 object category names used by the VOC annotations, in alphabetical order.
VOC_CATEGORY_NAMES = ['aeroplane','bicycle','bird','boat','bottle','bus','car','cat','chair','cow',
'diningtable','dog','horse','motorbike','person','pottedplant','sheep','sofa','train','tvmonitor']

1. Freeze the parameters of the pretrained model by setting every parameter's `requires_grad` attribute to **False**.
2. Design a new output layer and replace the old output layer with it.
3. Prepare the training pipeline:
    a. loss pipeline for a batch
    b. evaluation metric + loss function for a batch
    c. optimizer and `optimizer.step()`
    d. loss pipeline for an epoch
    e. save the model weights, epoch number, metrics, and the optimizer's `state_dict` (the optimizer's internal state and hyperparameters) every time the model improves
    f. 

