# Classification Examples

&nbsp;

<img align="center" src="https://media-exp1.licdn.com/dms/image/C4E1BAQFay3CGU2VmRg/company-background_10000/0/1621019049425?e=2159024400&v=beta&t=dmPCWeJlWvkzmM019V4_oKMluIPkQPX52i0zdgP-x2M" alt="nn" style="width: 1200px;"/>

<img align="left" src="https://media-exp1.licdn.com/dms/image/C4E0BAQHJY2WpOb492w/company-logo_200_200/0/1620816916063?e=2159024400&v=beta&t=U-VvNpjzV2DLp4EBIeqI8ZIWYUekPeJOQyxfXaJBMnU" alt="nn" style="width: 25px;"/>

&nbsp;&nbsp;Please visit us at https://onspecta.com

&nbsp;

## ImageNet Dataset Overview
<img align="left" src="https://www.image-net.org/static_files/index_files/logo.jpg" alt="nn" style="width: 200px;"/>

&nbsp;

These examples use a subset of the ImageNet classification validation set from 2012.
ImageNet is a large-scale classification dataset that has been instrumental in advancing computer vision and deep learning research.

More info can be found here: https://image-net.org/

&nbsp;

In [None]:
import os
import cv2
import time
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt

from utils.imagenet import ImageNet
import utils.post_processing as pp
import utils.benchmark as bench_utils

# batch size handed to the dataset helpers in the latency examples
LAT_BATCH_SIZE = 1
# presumably the batch size for throughput-oriented runs; not referenced in the cells shown here
THROUGHPUT_BATCH_SIZE = 32

## Latency with ResNet-50 v1.5 in fp32 precision

DLS offers a significant speed-up in standard fp32 inference scenarios.
This example shows the performance of the ResNet-50 v1.5 model in fp32 precision.
The original ResNet paper can be found here: https://arxiv.org/pdf/1512.03385.pdf

In [None]:
# passed as target_shape when requesting pre-processed input from the dataset
input_shape = (224, 224)
# serialized TF graph (.pb) of ResNet-50 v1.5 in fp32, parsed as a GraphDef when the model is loaded
path_to_model = "resnet_50_v15/resnet_50_v15_tf_fp32.pb"

In [None]:
# load the serialized GraphDef from disk and import it into a fresh TF graph

graph = tf.compat.v1.Graph()
graph_def = tf.compat.v1.GraphDef()

# read the raw protobuf bytes first; the file is closed before parsing starts
with tf.compat.v1.gfile.GFile(path_to_model, 'rb') as fid:
    serialized_graph = fid.read()
graph_def.ParseFromString(serialized_graph)

# an empty name keeps tensor names unprefixed, so they can be fetched as e.g. "input_tensor:0"
with graph.as_default():
    tf.compat.v1.import_graph_def(graph_def, name="")

In [None]:
# ! DLS_NUM_THREADS should be set prior to launching jupyter notebook !

# creating the TF config that is passed to every tf.compat.v1.Session below
config = tf.compat.v1.ConfigProto()
config.allow_soft_placement = True
# intra-op thread count comes from the benchmark utilities helper
config.intra_op_parallelism_threads = bench_utils.get_intra_op_parallelism_threads()
# a single inter-op thread
config.inter_op_parallelism_threads = 1

In [None]:
# preparing input and output dictionaries

# creation of output dictionary: the softmax tensor is fetched under its own name
output_dict = {"softmax_tensor:0": graph.get_tensor_by_name("softmax_tensor:0")}

# initialization of ImageNet dataset
imagenet = ImageNet(
    batch_size=LAT_BATCH_SIZE,
    color_model="RGB",
    pre_processing="VGG",
    is1001classes=True
)

input_array = imagenet.get_input_array(target_shape=input_shape)

# assignment of input image to input tensor
feed_dict = {graph.get_tensor_by_name("input_tensor:0"): input_array}

# for the purpose of visualizing results let's load the image without pre-processing
img = cv2.imread(str(imagenet.path_to_latest_image))
# cv2.imread reports failure by returning None instead of raising, which would
# otherwise only surface later as an obscure error inside cv2.cvtColor
if img is None:
    raise FileNotFoundError(f"could not read image: {imagenet.path_to_latest_image}")

In [None]:
# running the model with DLS enabled in fp32 precision

tf.DLS.force_enable()

with tf.compat.v1.Session(config=config, graph=graph) as sess:
    # warm-up run, excluded from the measurement
    _ = sess.run(output_dict, feed_dict)

    # actual run; perf_counter is a monotonic, high-resolution clock, so the
    # measured interval cannot be distorted by system clock adjustments
    start = time.perf_counter()
    output_dls = sess.run(output_dict, feed_dict)["softmax_tensor:0"]
    finish = time.perf_counter()

# perf_counter returns seconds; convert to milliseconds for reporting
latency_ms = (finish - start) * 1000
print("\nResNet-50 v1.5 FP32 latency with DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# running the model with DLS disabled in fp32 precision

tf.DLS.force_disable()

with tf.compat.v1.Session(config=config, graph=graph) as sess:
    # warm-up run, excluded from the measurement
    _ = sess.run(output_dict, feed_dict)

    # actual run; perf_counter is a monotonic, high-resolution clock, so the
    # measured interval cannot be distorted by system clock adjustments
    start = time.perf_counter()
    output_no_dls = sess.run(output_dict, feed_dict)["softmax_tensor:0"]
    finish = time.perf_counter()

# perf_counter returns seconds; convert to milliseconds for reporting
latency_ms = (finish - start) * 1000
print("\nResNet-50 v1.5 FP32 latency without DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# visualizing output

# the image was read with OpenCV as BGR; convert it to RGB before handing it to matplotlib
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(rgb_img)
plt.show()
print("ResNet-50 v1.5 FP32 predictions with DLS enabled:\n")

# extracted indices are shifted by one before the label lookup
top1_names = pp.get_imagenet_names(imagenet.extract_top1(output_dls[0]) + 1)
print(f"Top-1 prediction: {top1_names}")
top5_names = pp.get_imagenet_names(imagenet.extract_top5(output_dls[0]) + 1)
print(f"Top-5 predictions: {top5_names}")

## SSD MobileNet v2 in int8 precision using TFLite

This example shows the performance of the SSD MobileNet v2 model quantized to int8 precision with the use of the TFLite converter.
Models in mixed fp32/int8 precision, such as the one described here, are expected to offer a significant speed-up with only a slight degradation in accuracy.
You can read more on SSD MobileNet architecture here: https://arxiv.org/pdf/1801.04381.pdf

In [None]:
# passed as target_shape when requesting pre-processed input; also used to scale predicted boxes
input_shape = (300, 300)
# detections scoring below this value are skipped during post-processing
threshold = 0.3
# int8 TFLite model file handed to the interpreter
path_to_model = "ssd_mobilenet_v2/ssd_mobilenet_v2_tflite_int8.tflite"

In [None]:
# loading the .tflite model and initializing Interpreter

# thread count comes from the same benchmark helper used for the TF session config
interpreter = tf.lite.Interpreter(
    model_path=path_to_model,
    num_threads=bench_utils.get_intra_op_parallelism_threads()
)

# tensors must be allocated before inputs can be set or the model invoked
interpreter.allocate_tensors()
# tensor metadata; the "index" entries are used later with set_tensor / get_tensor
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [None]:
# initialization of COCO dataset
# NOTE(review): COCODataset is not imported in any cell visible here — confirm it is in scope before this cell runs
coco = COCODataset(
    batch_size=LAT_BATCH_SIZE,
    color_model="BGR",
    images_filename_base="COCO_val2014_000000000000",
    pre_processing="SSD"
)

# six inputs are fetched and discarded; presumably each call advances to the next
# image, so the seventh one is used for this example — TODO confirm
for _ in range(6):
    _ = coco.get_input_array(target_shape=input_shape)
input_array = coco.get_input_array(target_shape=input_shape)

# assignment of input image to input tensor
interpreter.set_tensor(input_details[0]["index"], input_array)

# for the purpose of visualizing results let's load the image without pre-processing
img = cv2.imread(str(coco.path_to_latest_image))
# cv2.imread reports failure by returning None instead of raising, which would
# otherwise only surface later as an obscure error while drawing or converting
if img is None:
    raise FileNotFoundError(f"could not read image: {coco.path_to_latest_image}")

In [None]:
# running the model with DLS enabled

tf.DLS.force_enable()

# warm-up run, excluded from the measurement
interpreter.invoke()

# actual run; perf_counter is a monotonic, high-resolution clock, so the
# measured interval cannot be distorted by system clock adjustments
start = time.perf_counter()
interpreter.invoke()
finish = time.perf_counter()

# perf_counter returns seconds; convert to milliseconds for reporting
latency_ms = (finish - start) * 1000
print("\nSSD MobileNet v2 INT8 latency with DLS: {:.0f} ms\n".format(latency_ms))

In [None]:
# visualizing output

# post-processing
def post_process(image, det_boxes, det_classes, det_scores, num_det):
    """Draw every detection whose score reaches `threshold` onto `image`.

    Relies on the notebook globals `LAT_BATCH_SIZE`, `threshold`, `input_shape`,
    `coco` and `pp`.

    Args:
        image: image to draw on; passed through pp.draw_bbox for each kept detection.
        det_boxes: boxes indexed as [image][detection], each ordered
            [top, left, bottom, right]; presumably normalized, since they are
            scaled by input_shape[0] here — TODO confirm.
        det_classes: class ids indexed as [image][detection].
        det_scores: scores indexed as [image][detection].
        num_det: number of detections: a scalar, a one-element array, or an
            array with one count per image.

    Returns:
        The result of the last pp.draw_bbox call, or `image` itself when no
        detection reached the threshold.
    """
    # Flatten the count(s) up front: calling int() directly on an array is
    # deprecated in recent NumPy and fails outright for more than one element.
    counts = np.asarray(num_det).reshape(-1)

    for i in range(LAT_BATCH_SIZE):
        # a single count applies to every image; otherwise use the per-image count
        num_for_image = int(counts[i] if counts.size > 1 else counts[0])

        for d in range(num_for_image):

            # skip detections whose score does not reach the set threshold
            if det_scores[i][d] < threshold:
                continue

            # first let's switch order of bbox boundaries from [top left bottom right] to [left top right bottom]
            converted_bbox = coco.convert_bbox_to_coco_order(
                det_boxes[i][d] * input_shape[0],
                1, 0, 3, 2,
                absolute=False
            )

            # then rescale back to original image ratio
            converted_bbox = coco.rescale_bbox(i, converted_bbox)

            # we can now draw bbox on the original input image
            image = pp.draw_bbox(image, converted_bbox, int(det_classes[i][d]))

    return image

# NOTE(review): assumes the interpreter outputs are ordered boxes, classes, scores, detection count — confirm for this model
detection_boxes = interpreter.get_tensor(output_details[0]["index"])
detection_classes = interpreter.get_tensor(output_details[1]["index"])
detection_classes += 1  # model uses indexing from 0 while COCO dataset starts with idx of 1
detection_scores = interpreter.get_tensor(output_details[2]["index"])
num_detections = interpreter.get_tensor(output_details[3]["index"])

# show the post-processed images (converted from BGR to RGB for matplotlib)
plt.imshow(cv2.cvtColor(
    post_process(img, detection_boxes, detection_classes, detection_scores, num_detections),
    cv2.COLOR_BGR2RGB
))
plt.show()
print("SSD MobileNet v2 INT8 output with DLS enabled\n")

## SSD Inception v2 in fp16 precision using TF1 API

This example shows the performance of the SSD Inception v2 model converted to fp16 precision.
Models in fp16 precision are expected to offer accuracy on par with their fp32 counterparts and up to a 2x inference speed-up
on compatible hardware when run with DLS. You can read more on SSD architecture here: https://arxiv.org/pdf/1512.02325.pdf