# Convert a model to TensorRT to accelerate the process

In [29]:
import torch
import torchvision
from torchvision import transforms
ONNX_FILE_PATH = 'mobilenetv2-7.onnx'
transformations = transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [2]:
def preprocess_image(img_path, transformation):

    # read input image
    input_img = cv2.imread(img_path)
    # do transformations
    input_data = transformation(image=input_img)["image"]
    # prepare batch
    batch_data = torch.unsqueeze(input_data, 0)

    return batch_data


def postprocess(output_data):
    # get class names
    with open("imagenet_classes.txt") as f:
        classes = [line.strip() for line in f.readlines()]
    # calculate human-readable value by softmax
    confidences = torch.nn.functional.softmax(output_data, dim=1)[0] * 100
    # find top predicted classes
    _, indices = torch.sort(output_data, descending=True)
    i = 0
    # print the top classes predicted by the model
    while confidences[indices[0][i]] > 0.5:
        class_idx = indices[0][i]
        print(
            "class:",
            classes[class_idx],
            ", confidence:",
            confidences[class_idx].item(),
            "%, index:",
            class_idx.item(),
        )
        i += 1

## Load Evaluation Images

In [3]:
testset = torchvision.datasets.ImageNet(root='./data', split='val',
                                       transform=transformations)
dataloader = torch.utils.data.DataLoader(testset, batch_size=32,
                                         shuffle=False, num_workers=2)


## Export TorchVision Model to ONNX model(Only need to run once)

In [4]:
dummy_input = torch.randn(10, 3, 224, 224, device='cuda')
model = torch.hub.load('pytorch/vision:v0.6.0', 'mobilenet_v2', pretrained=True)
model.cuda()
input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
output_names = [ "output1" ]

torch.onnx.export(model, dummy_input, "mobilenet.onnx", verbose=True, input_names=input_names, output_names=output_names)

Using cache found in /home/jason/.cache/torch/hub/pytorch_vision_v0.6.0


graph(%actual_input_1 : Float(10:150528, 3:50176, 224:224, 224:1),
      %learned_0 : Float(32:27, 3:9, 3:3, 3:1),
      %learned_1 : Float(32:1),
      %learned_2 : Float(32:1),
      %learned_3 : Float(32:1),
      %learned_4 : Float(32:1),
      %learned_6 : Float(32:9, 1:9, 3:3, 3:1),
      %learned_7 : Float(32:1),
      %learned_8 : Float(32:1),
      %learned_9 : Float(32:1),
      %learned_10 : Float(32:1),
      %learned_12 : Float(16:32, 32:1, 1:1, 1:1),
      %learned_13 : Float(16:1),
      %learned_14 : Float(16:1),
      %learned_15 : Float(16:1),
      %features.1.conv.2.running_var : Float(16:1),
      %features.2.conv.0.0.weight : Float(96:16, 16:1, 1:1, 1:1),
      %features.2.conv.0.1.weight : Float(96:1),
      %features.2.conv.0.1.bias : Float(96:1),
      %features.2.conv.0.1.running_mean : Float(96:1),
      %features.2.conv.0.1.running_var : Float(96:1),
      %features.2.conv.1.0.weight : Float(96:9, 1:9, 3:3, 3:1),
      %features.2.conv.1.1.weight : Float(96:

In [30]:
import tensorrt as trt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda 
import time

input_size = 32

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(model_path):
    with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network() as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser: 
        builder.max_workspace_size = 1<<20
        builder.max_batch_size = 1
        with open(model_path, "rb") as f:
            parser.parse(f.read())
        engine = builder.build_cuda_engine(network)
        return engine

def alloc_buf(engine):
    # host cpu mem
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
    # allocate gpu mem
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream


def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    # async version
    # with engine.create_execution_context() as context:  # cost time to initialize
    # cuda.memcpy_htod_async(in_gpu, inputs, stream)
    # context.execute_async(1, [int(in_gpu), int(out_gpu)], stream.handle, None)
    # cuda.memcpy_dtoh_async(out_cpu, out_gpu, stream)
    # stream.synchronize()

    # sync version
    cuda.memcpy_htod(in_gpu, inputs)
    context.execute(1, [int(in_gpu), int(out_gpu)])
    cuda.memcpy_dtoh(out_cpu, out_gpu)
    return out_cpu

inputs = np.random.random((1, 3, input_size, input_size)).astype(np.float32)
engine = build_engine(ONNX_FILE_PATH)
context = engine.create_execution_context()
for _ in range(10):
    t1 = time.time()
    in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
    res = inference(engine, context, inputs.reshape(-1), out_cpu, in_gpu, out_gpu, stream)
    print(res)
    print("cost time: ", time.time()-t1)

AttributeError: 'NoneType' object has no attribute 'create_execution_context'

## Initialize model in TensorRT

In [24]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt

# logger to capture errors, warnings, and other information during the build and inference phases
TRT_LOGGER = trt.Logger()

def build_engine(onnx_file_path):
    # initialize TensorRT engine and parse ONNX model
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
        # parse ONNX
        parser = trt.OnnxParser(network, TRT_LOGGER)
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            parser.parse(model.read())
        print('Completed parsing of ONNX file')
        # allow TensorRT to use up to 1GB of GPU memory for tactic selection
        builder.max_workspace_size = 1 << 30
        # we have only one image in batch
        builder.max_batch_size = 1
        # use FP16 mode if possible
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 20
        with builder.build_engine(network, config) as engine:
            print(engine)
            context = engine.create_execution_context()
            return engine, context
    
#     print('Building engine')
#     context = engine.create_execution_context()
#     return engine, context
# #     if builder.platform_has_fast_fp16:
# #         builder.fp16_mode = True
# #         # generate TensorRT engine optimized for the target platform
# #         print('Building an engine...')
# #         engine = builder.build_cuda_engine(network)
# #         context = engine.create_execution_context()
# #         print("Completed creating Engine")

#     return engine, context

## Pipeline

In [25]:
# initialize TensorRT engine and parse ONNX model
engine, context = build_engine(ONNX_FILE_PATH)
print(engine, context)
# get sizes of input and output and allocate memory required for input data and for output data
for binding in engine:
    if engine.binding_is_input(binding):  # we expect only one input
        input_shape = engine.get_binding_shape(binding)
        input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
        device_input = cuda.mem_alloc(input_size)
    else:  # and one output
        output_shape = engine.get_binding_shape(binding)
        # create page-locked memory buffers (i.e. won't be swapped to disk)
        host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
        device_output = cuda.mem_alloc(host_output.nbytes)

# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()


# preprocess input data
host_input = np.array(preprocess_image("dog.jpg").numpy(), dtype=np.float32, order='C')
cuda.memcpy_htod_async(device_input, host_input, stream)

# run inference
context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
cuda.memcpy_dtoh_async(host_output, device_output, stream)
stream.synchronize()

# postprocess results
output_data = torch.Tensor(host_output).reshape(engine.max_batch_size, output_shape[0])
postprocess(output_data)

Beginning ONNX file parsing
Completed parsing of ONNX file


AttributeError: __enter__

In [3]:
import tensorflow as tf
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
mobilenet_v2 = tf.keras.applications.MobileNetV2(weights='imagenet')
mobilenet_v2.save('mobilenet_v2')
img = tf.keras.preprocessing.image.load_img('dog.jpg', target_size=(224, 224))
x = tf.keras.preprocessing.image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = tf.keras.applications.mobilenet_v2.preprocess_input(x)
# Run inference
preds = mobilenet_v2.predict(x)
print('Predicted:', tf.keras.applications.mobilenet_v2.decode_predictions(preds, top=5)[0])

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: mobilenet_v2\assets
Predicted: [('n02099712', 'Labrador_retriever', 0.47311324), ('n02109961', 'Eskimo_dog', 0.0550954), ('n02091244', 'Ibizan_hound', 0.02530424), ('n02099601', 'golden_retriever', 0.023565773), ('n02110806', 'basenji', 0.014359273)]


In [None]:
# from tensorflow.python.compiler.tensorrt import trt_convert as trt

params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
  precision_mode='FP16',
  is_dynamic_op=True)
# params = trt.DEFAULT_TRT_CONVERSION_PARAMS
converter = trt.TrtGraphConverterV2(
      input_saved_model_dir='mobilenet_v2',
      conversion_params=params)
converter.convert()
saved_model_dir_trt = 'mobilenet_v2.trt'
converter.save(saved_model_dir_trt)
# Load the particular signature from the TRT graph
root = tf.saved_model.load(saved_model_dir_trt)
concrete_func = root.signatures['serving_default']

#hide_output
# Gather the ImageNet labels first and prepare them
labels_path = tf.keras.utils.get_file('ImageNetLabels.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt')
imagenet_labels = np.array(open(labels_path).read().splitlines())
# Perform inference
labeling = concrete_func(tf.constant(x.astype('float32')))
activations = tf.nn.softmax(labeling['predictions'])
imagenet_labels[np.argsort(activations)[0,::-1][:5]+1]