# Object Detection Examples

&nbsp;

<img align="center" src="https://media-exp1.licdn.com/dms/image/C4E1BAQFay3CGU2VmRg/company-background_10000/0/1621019049425?e=2159024400&v=beta&t=dmPCWeJlWvkzmM019V4_oKMluIPkQPX52i0zdgP-x2M" alt="nn" style="width: 1200px;"/>

<img align="left" src="https://media-exp1.licdn.com/dms/image/C4E0BAQHJY2WpOb492w/company-logo_200_200/0/1620816916063?e=2159024400&v=beta&t=U-VvNpjzV2DLp4EBIeqI8ZIWYUekPeJOQyxfXaJBMnU" alt="nn" style="width: 25px;"/>

&nbsp;&nbsp;Please visit us at https://onspecta.com

&nbsp;

## COCO Dataset Overview
<img align="left" src="https://cocodataset.org/images/coco-logo.png" alt="nn" style="width: 200px;"/>

&nbsp;

&nbsp;

These examples use a subset of the COCO object detection validation set from 2014.
COCO is a large-scale object detection, segmentation, and captioning dataset.

More info can be found here: https://cocodataset.org

&nbsp;

In [None]:
import os
import cv2
import time
import subprocess
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow.python.saved_model import tag_constants

from utils.coco import COCODataset
import utils.post_processing as pp
import utils.benchmark as bench_utils

# batch size used for all latency measurements in this notebook; it is passed to
# COCODataset and bounds the per-image loop of every post_process function
LAT_BATCH_SIZE = 1

## YOLO v4 Tiny in fp32 precision using TF2 api

DLS offers a significant speed-up in standard fp32 inference scenarios.
This example shows the performance of the YOLO v4 Tiny model in fp32 precision.
The original YOLO v4 paper can be found here: https://arxiv.org/pdf/2004.10934.pdf

In [None]:
# target shape requested from coco.get_input_array; its first element is also used
# to scale the detected boxes during post-processing
input_shape = (416, 416)
# location of the fp32 model, loaded below with tf.saved_model.load
path_to_model = "yolo_v4_tiny/yolo_v4_tiny_tf_fp32"

In [None]:
# ! DLS_NUM_THREADS should be set prior to launching jupyter notebook !

# setting the configuration - please note that this has to happen before initializing the model
# intra-op thread count comes from the benchmark utilities; inter-op parallelism is fixed to 1
tf.config.threading.set_intra_op_parallelism_threads(bench_utils.get_intra_op_parallelism_threads())
tf.config.threading.set_inter_op_parallelism_threads(1)

In [None]:
# initialization of COCO dataset
coco = COCODataset(
    batch_size=LAT_BATCH_SIZE,
    color_model="RGB",
    images_filename_base="COCO_val2014_000000000000",
    pre_processing="YOLO"
)

# the result of the first call is discarded; the input tensor is built from the second call
# NOTE(review): presumably every get_input_array call advances to the next image, so this
# skips the first one - confirm in utils.coco
_ = coco.get_input_array(input_shape)
input_tensor = tf.constant(coco.get_input_array(input_shape))

# for the purpose of visualizing results let's load the image without pre-processing
# NOTE(review): path_to_latest_image presumably points at the image behind input_tensor - confirm
img = cv2.imread(str(coco.path_to_latest_image))

In [None]:
# let's load the model

# the SavedModel is loaded with the serving tag and its 'serving_default' signature
# becomes the callable used for inference in the cells below
saved_model_loaded = tf.saved_model.load(path_to_model, tags=[tag_constants.SERVING])
yolo = saved_model_loaded.signatures['serving_default']

In [None]:
# running the model with DLS enabled

tf.DLS.force_enable()

# warm-up run - its result is discarded so that one-off initialization is not timed
_ = yolo(input_tensor)

# actual run
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() follows the wall clock and can jump when system time is adjusted
start = time.perf_counter()
output_dls = yolo(input_tensor)
finish = time.perf_counter()

latency_ms = (finish - start) * 1000
print("\nYOLO v4 Tiny FP32 latency with DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# we have to reload the model
# NOTE(review): presumably a signature that has already been executed keeps its DLS
# setting, hence the fresh load before DLS is switched off - confirm with the DLS docs

saved_model_loaded = tf.saved_model.load(path_to_model, tags=[tag_constants.SERVING])
yolo = saved_model_loaded.signatures['serving_default']

In [None]:
# running the model with DLS disabled

tf.DLS.force_disable()

# warm-up run - its result is discarded so that one-off initialization is not timed
_ = yolo(input_tensor)

# actual run
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() follows the wall clock and can jump when system time is adjusted
start = time.perf_counter()
output_no_dls = yolo(input_tensor)
finish = time.perf_counter()

latency_ms = (finish - start) * 1000
print("\nYOLO v4 Tiny FP32 latency without DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# visualizing output

# both runs expose their result under the same output key; along the third axis the
# first four values are taken as box coordinates and everything after them as scores
yolo_raw_dls = output_dls["tf.concat_12"]
bboxes_dls, preds_dls = yolo_raw_dls[:, :, 0:4], yolo_raw_dls[:, :, 4:]

yolo_raw_no_dls = output_no_dls["tf.concat_12"]
bboxes_no_dls, preds_no_dls = yolo_raw_no_dls[:, :, 0:4], yolo_raw_no_dls[:, :, 4:]

# post-processing
def post_process(image, bboxes, preds):
    """Draw the detections that survive non-max suppression onto a copy of an image.

    Args:
        image: original (not pre-processed) image, as loaded with cv2.imread.
        bboxes: box part of the model output (four values per candidate).
        preds: score part of the model output (one value per class and candidate).

    Returns:
        The annotated image. The array passed in as `image` is left untouched.
    """
    # work on a copy: the same source image is post-processed once per run, and if
    # pp.draw_bbox draws in place the boxes of one run would leak into the next picture
    image = image.copy()

    detection_boxes, _, detection_classes, valid_detections = tf.image.combined_non_max_suppression(
            boxes=tf.reshape(bboxes, (tf.shape(bboxes)[0], -1, 1, 4)),
            scores=tf.reshape(
                preds, (tf.shape(preds)[0], -1, tf.shape(preds)[-1])),
            max_output_size_per_class=50,
            max_total_size=50,
            iou_threshold=0.45,
            score_threshold=0.25
        )

    for i in range(LAT_BATCH_SIZE):
        # only the first valid_detections[i] entries of image i are drawn
        for d in range(int(valid_detections[i])):
            # first let's switch order of bbox boundaries from [top left bottom right] to [left top right bottom]
            # (all four values are scaled by input_shape[0], which is fine for the square inputs used here)
            converted_bbox = coco.convert_bbox_to_coco_order(
                detection_boxes[i][d] * input_shape[0],
                1, 0, 3, 2,
                absolute=False
            )

            # then rescale back to original image ratio
            converted_bbox = coco.rescale_bbox(i, converted_bbox)

            # we can now draw bbox on the original input image
            image = pp.draw_bbox(image, converted_bbox, int(detection_classes[i][d]))

    return image

# show the post-processed images
# cv2 works with BGR images while matplotlib expects RGB, hence the conversion before imshow
for shown_bboxes, shown_preds, caption in (
    (bboxes_dls, preds_dls, "YOLO v4 Tiny FP32 output with DLS enabled\n"),
    (bboxes_no_dls, preds_no_dls, "YOLO v4 Tiny FP32 output with DLS disabled\n"),
):
    annotated = post_process(img, shown_bboxes, shown_preds)
    plt.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
    plt.show()
    print(caption)

## SSD Inception v2 in fp16 precision using TF1 api

This example shows the performance of the SSD Inception v2 model converted to fp16 precision.
Models in fp16 precision are expected to offer accuracy on par with their fp32 counterparts and up to a 2x inference speed-up
on compatible hardware when run with DLS. You can read more about the SSD architecture here: https://arxiv.org/pdf/1512.02325.pdf

In [None]:
# target shape requested from coco.get_input_array; its first element is also used
# to scale the detected boxes during post-processing
input_shape = (300, 300)
# detections scoring below this value are skipped during post-processing
threshold = 0.3
# serialized GraphDef files (.pb) of the model in fp16 and fp32 precision
path_to_fp16_model = "ssd_inception_v2/ssd_inception_v2_tf_fp16.pb"
path_to_fp32_model = "ssd_inception_v2/ssd_inception_v2_tf_fp32.pb"
# names of the graph tensors that are fetched on every session run
output_names = ["detection_classes:0", "detection_boxes:0", "detection_scores:0", "num_detections:0"]

In [None]:
# first let's load the model in fp16 precision

# a dedicated graph is created and the GraphDef read from disk is imported into it;
# name="" imports the nodes without a name prefix, so tensors can be looked up by the
# plain names listed in output_names
graph = tf.compat.v1.Graph()
with graph.as_default():
    graph_def = tf.compat.v1.GraphDef()
    with tf.compat.v1.gfile.GFile(path_to_fp16_model, 'rb') as fid:
        serialized_graph = fid.read()
        graph_def.ParseFromString(serialized_graph)
        tf.compat.v1.import_graph_def(graph_def, name="")

In [None]:
# ! DLS_NUM_THREADS should be set prior to launching jupyter notebook !

# creating TF config
# the same config object is passed to every tf.compat.v1.Session opened below
config = tf.compat.v1.ConfigProto()
config.allow_soft_placement = True
# intra-op thread count comes from the benchmark utilities; inter-op parallelism is fixed to 1
config.intra_op_parallelism_threads = bench_utils.get_intra_op_parallelism_threads()
config.inter_op_parallelism_threads = 1

In [None]:
# preparing input and output dictionaries

# creation of output dictionary
# maps every name in output_names to the tensor of that name in the currently loaded graph
output_dict = {output_name: graph.get_tensor_by_name(output_name) for output_name in output_names}

# initialization of COCO dataset
# NOTE(review): unlike the other examples no pre_processing argument is passed here -
# presumably the COCODataset default applies; confirm in utils.coco
coco = COCODataset(
    batch_size=LAT_BATCH_SIZE,
    color_model="BGR",
    images_filename_base="COCO_val2014_000000000000"
)

input_array = coco.get_input_array(target_shape=input_shape)

# assignment of input image to input tensor
feed_dict = {graph.get_tensor_by_name("image_tensor:0"): input_array}

# for the purpose of visualizing results let's load the image without pre-processing
# NOTE(review): path_to_latest_image presumably points at the image behind input_array - confirm
img = cv2.imread(str(coco.path_to_latest_image))

In [None]:
# running the model with DLS enabled in fp16 precision

tf.DLS.force_enable()

with tf.compat.v1.Session(config=config, graph=graph) as sess:
    # warm-up run - its result is discarded so that one-off initialization is not timed
    _ = sess.run(output_dict, feed_dict)

    # actual run
    # perf_counter() is a monotonic, high-resolution clock meant for measuring short
    # durations; time.time() follows the wall clock and can jump when system time is adjusted
    start = time.perf_counter()
    output_dls = sess.run(output_dict, feed_dict)
    finish = time.perf_counter()

latency_ms = (finish - start) * 1000
print("\nSSD Inception v2 FP16 latency with DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# running the model with DLS disabled in fp16 precision

tf.DLS.force_disable()

with tf.compat.v1.Session(config=config, graph=graph) as sess:
    # warm-up run - its result is discarded so that one-off initialization is not timed
    _ = sess.run(output_dict, feed_dict)

    # actual run
    # perf_counter() is a monotonic, high-resolution clock meant for measuring short
    # durations; time.time() follows the wall clock and can jump when system time is adjusted
    start = time.perf_counter()
    output_no_dls_fp16 = sess.run(output_dict, feed_dict)
    finish = time.perf_counter()

latency_ms = (finish - start) * 1000
print("\nSSD Inception v2 FP16 latency without DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# now let's load the model in fp32 precision for validation

# `graph` is rebound to a fresh graph holding the fp32 model; name="" imports the nodes
# without a name prefix so the same tensor names as before can be used
graph = tf.compat.v1.Graph()
with graph.as_default():
    graph_def = tf.compat.v1.GraphDef()
    with tf.compat.v1.gfile.GFile(path_to_fp32_model, 'rb') as fid:
        serialized_graph = fid.read()
        graph_def.ParseFromString(serialized_graph)
        tf.compat.v1.import_graph_def(graph_def, name="")

# creation of output dictionary
# rebuilt because the tensors fetched must belong to the new graph
output_dict = {output_name: graph.get_tensor_by_name(output_name) for output_name in output_names}

# assignment of input image to input tensor
# the input array prepared for the fp16 runs is reused, so all runs see the same input
feed_dict = {graph.get_tensor_by_name("image_tensor:0"): input_array}

In [None]:
# running the model with DLS disabled in fp32 precision

tf.DLS.force_disable()

with tf.compat.v1.Session(config=config, graph=graph) as sess:
    # warm-up run - its result is discarded so that one-off initialization is not timed
    _ = sess.run(output_dict, feed_dict)

    # actual run
    # perf_counter() is a monotonic, high-resolution clock meant for measuring short
    # durations; time.time() follows the wall clock and can jump when system time is adjusted
    start = time.perf_counter()
    output_no_dls_fp32 = sess.run(output_dict, feed_dict)
    finish = time.perf_counter()

latency_ms = (finish - start) * 1000
print("\nSSD Inception v2 FP32 latency without DLS: {:.0f} ms\n".format(latency_ms))


In [None]:
# visualizing output

# post-processing
def post_process(image, output):
    """Draw every sufficiently confident detection onto a copy of an image.

    Args:
        image: original (not pre-processed) image, as loaded with cv2.imread.
        output: result of a session run, keyed by the fetched tensor names
            ("num_detections:0", "detection_scores:0", "detection_boxes:0",
            "detection_classes:0").

    Returns:
        The annotated image. The array passed in as `image` is left untouched.
    """
    # work on a copy: the same source image is post-processed once per run, and if
    # pp.draw_bbox draws in place the boxes of one run would leak into the next picture
    image = image.copy()

    for i in range(LAT_BATCH_SIZE):
        for d in range(int(output["num_detections:0"][i])):

            # if the detected object does not exceed a set threshold we skip it
            if output["detection_scores:0"][i][d] < threshold:
                continue

            # first let's switch order of bbox boundaries from [top left bottom right] to [left top right bottom]
            # (all four values are scaled by input_shape[0], which is fine for the square inputs used here)
            converted_bbox = coco.convert_bbox_to_coco_order(
                output["detection_boxes:0"][i][d] * input_shape[0],
                1, 0, 3, 2,
                absolute=False
            )

            # then rescale back to original image ratio
            converted_bbox = coco.rescale_bbox(i, converted_bbox)

            # we can now draw bbox on the original input image
            image = pp.draw_bbox(image, converted_bbox, int(output["detection_classes:0"][i][d]))

    return image

# show the post-processed images
# cv2 works with BGR images while matplotlib expects RGB, hence the conversion before imshow
for model_output, caption in (
    (output_dls, "SSD Inception v2 FP16 output with DLS enabled\n"),
    (output_no_dls_fp16, "SSD Inception v2 FP16 output with DLS disabled\n"),
    (output_no_dls_fp32, "SSD Inception v2 FP32 output with DLS disabled\n"),
):
    annotated = post_process(img, model_output)
    plt.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
    plt.show()
    print(caption)

## SSD MobileNet v2 in int8 precision using TFLite

This example shows the performance of the SSD MobileNet v2 model quantized to int8 precision using the TFLite converter.
Models in mixed fp32/int8 precision, such as the one described here, are expected to offer a significant speed-up with only a slight degradation in accuracy.
You can read more about the SSD MobileNet architecture here: https://arxiv.org/pdf/1801.04381.pdf

In [None]:
# target shape requested from coco.get_input_array; its first element is also used
# to scale the detected boxes during post-processing
input_shape = (300, 300)
# detections scoring below this value are skipped during post-processing
threshold = 0.3
# int8 TFLite model handed to tf.lite.Interpreter below
path_to_model = "ssd_mobilenet_v2/ssd_mobilenet_v2_tflite_int8.tflite"

In [None]:
# loading the .tflite model and initializing Interpreter

# the interpreter gets the same thread count the TensorFlow examples use for intra-op parallelism
interpreter = tf.lite.Interpreter(
    model_path=path_to_model,
    num_threads=bench_utils.get_intra_op_parallelism_threads()
)

# tensors have to be allocated before inputs can be set or the model invoked
interpreter.allocate_tensors()
# the details carry the tensor indices used with set_tensor / get_tensor below
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [None]:
# initialization of COCO dataset
coco = COCODataset(
    batch_size=LAT_BATCH_SIZE,
    color_model="BGR",
    images_filename_base="COCO_val2014_000000000000",
    pre_processing="SSD"
)

# the results of the first two calls are discarded; the input comes from the third call
# NOTE(review): presumably every get_input_array call advances to the next image, so this
# skips the first two - confirm in utils.coco
for _ in range(2):
    _ = coco.get_input_array(target_shape=input_shape)
input_array = coco.get_input_array(target_shape=input_shape)

# assignment of input image to input tensor
interpreter.set_tensor(input_details[0]["index"], input_array)

# for the purpose of visualizing results let's load the image without pre-processing
# NOTE(review): path_to_latest_image presumably points at the image behind input_array - confirm
img = cv2.imread(str(coco.path_to_latest_image))

In [None]:
# running the model with DLS enabled

tf.DLS.force_enable()

# warm-up run - excluded from timing so that one-off initialization is not measured
interpreter.invoke()

# actual run
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() follows the wall clock and can jump when system time is adjusted
start = time.perf_counter()
interpreter.invoke()
finish = time.perf_counter()

latency_ms = (finish - start) * 1000
print("\nSSD MobileNet v2 INT8 latency with DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# visualizing output

# post-processing
def post_process(image, det_boxes, det_classes, det_scores, num_det):
    """Draw every sufficiently confident detection onto a copy of an image.

    Args:
        image: original (not pre-processed) image, as loaded with cv2.imread.
        det_boxes: detected boxes, indexed as [image][detection].
        det_classes: class ids, indexed as [image][detection].
        det_scores: confidence scores, indexed as [image][detection].
        num_det: number of detections - either a single number or one entry per image.

    Returns:
        The annotated image. The array passed in as `image` is left untouched.
    """
    # work on a copy so that, if pp.draw_bbox draws in place, the caller's image stays clean
    image = image.copy()

    # one count per image; atleast_1d keeps a plain scalar working as before, while
    # indexing avoids int() on a whole array (deprecated by NumPy for size-1 arrays
    # and an error for anything larger)
    detections_per_image = np.atleast_1d(num_det)

    for i in range(LAT_BATCH_SIZE):
        for d in range(int(detections_per_image[i])):

            # if the detected object does not exceed a set threshold we skip it
            if det_scores[i][d] < threshold:
                continue

            # first let's switch order of bbox boundaries from [top left bottom right] to [left top right bottom]
            # (all four values are scaled by input_shape[0], which is fine for the square inputs used here)
            converted_bbox = coco.convert_bbox_to_coco_order(
                det_boxes[i][d] * input_shape[0],
                1, 0, 3, 2,
                absolute=False
            )

            # then rescale back to original image ratio
            converted_bbox = coco.rescale_bbox(i, converted_bbox)

            # we can now draw bbox on the original input image
            image = pp.draw_bbox(image, converted_bbox, int(det_classes[i][d]))

    return image

# NOTE(review): assumes the interpreter lists its outputs in the order
# boxes, classes, scores, number of detections - confirm against the model
detection_boxes = interpreter.get_tensor(output_details[0]["index"])
detection_classes = interpreter.get_tensor(output_details[1]["index"])
detection_classes += 1  # model uses indexing from 0 while COCO dataset starts with idx of 1
detection_scores = interpreter.get_tensor(output_details[2]["index"])
num_detections = interpreter.get_tensor(output_details[3]["index"])

# show the post-processed images
# cv2 works with BGR images while matplotlib expects RGB, hence the conversion before imshow
plt.imshow(cv2.cvtColor(
    post_process(img, detection_boxes, detection_classes, detection_scores, num_detections),
    cv2.COLOR_BGR2RGB
))
plt.show()
print("SSD MobileNet v2 INT8 output with DLS enabled\n")

## More examples can be run like this:

In [None]:
# the command is defined once, so the line echoed to the user cannot drift from
# what is actually executed
command = [
    "numactl", "--cpunodebind=0", "--membind=0",
    "python3", "/model_zoo/object_detection/ssd_mobilenet_v2/run.py",
    "-m", "/model_zoo/object_detection/ssd_mobilenet_v2/ssd_mobilenet_v2_tf_fp32.pb",
    "-p", "fp32",
    "--timeout=5",
]
print(" ".join(command))

# only stdout is captured, stderr is inherited from the notebook process; the context
# manager closes the pipe and waits for the child even if communicate() raises
with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
    stdout = process.communicate()[0]
print(stdout.decode("utf-8"))