# Classification on GPU

In [1]:
from torch import onnx as tonnx
import torch
import tensorrt as trt
from gpu.utils import to_GiB, return_pruning_params, DummyDataset, Calibrator, configure_quantization_and_inputs, CifarDataLoader, EntropyCalibrator
from pytorch_lightning.callbacks import ModelPruning
from torch.nn.utils.prune import is_pruned
import numpy as np
from pycuda import driver as cuda
import torch.utils.data as tdata


In [2]:
model = torch.hub.load(
    "chenyaofo/pytorch-cifar-models",
    "cifar10_mobilenetv2_x1_0",
    pretrained=True,
)

Using cache found in /home/blue/.cache/torch/hub/chenyaofo_pytorch-cifar-models_master


In [3]:
torch.onnx.export(model, torch.randn((1, 3, 32, 32)), "mbv3.onnx")

In [4]:
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(EXPLICIT_BATCH)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, TRT_LOGGER)

[12/20/2022-22:50:19] [TRT] [I] [MemUsageChange] Init CUDA: CPU +194, GPU +0, now: CPU 308, GPU 202 (MiB)
[12/20/2022-22:50:20] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +7, GPU +2, now: CPU 334, GPU 204 (MiB)


In [5]:
with open("mbv3.onnx", "rb") as model:
    ok = parser.parse(model.read())

config.max_workspace_size = to_GiB(1)

  config.max_workspace_size = to_GiB(1)


In [6]:
plan = builder.build_serialized_network(network, config)
with open("mbv3.trt", "wb") as f:
    f.write(plan)

[12/20/2022-22:50:21] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +230, GPU +94, now: CPU 564, GPU 298 (MiB)
[12/20/2022-22:50:21] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +111, GPU +44, now: CPU 675, GPU 342 (MiB)
[12/20/2022-22:50:21] [TRT] [W] TensorRT was linked against cuDNN 8.4.1 but loaded cuDNN 8.3.2
[12/20/2022-22:50:21] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[12/20/2022-22:50:31] [TRT] [I] Some tactics do not have sufficient workspace memory to run. Increasing workspace size will enable more tactics, please check verbose output for requested sizes.
[12/20/2022-22:50:34] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[12/20/2022-22:50:34] [TRT] [I] Total Host Persistent Memory: 83200
[12/20/2022-22:50:34] [TRT] [I] Total Device Persistent Memory: 38912
[12/20/2022-22:50:34] [TRT] [I] Total Scratch Memory: 1536
[12/20/2022-22:50:34] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory al

# FP16

In [7]:
config, network = configure_quantization_and_inputs(config, network, fp16=True, inputs_fp16=False, int8=False, inputs_int8=False)

In [8]:
plan = builder.build_serialized_network(network, config)
with open("mbv3_fp16.trt", "wb") as f:
    f.write(plan)

[12/20/2022-22:50:34] [TRT] [W] FP16 support requested on hardware without native FP16 support, performance will be negatively affected.
[12/20/2022-22:50:35] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 992, GPU 456 (MiB)
[12/20/2022-22:50:35] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 992, GPU 464 (MiB)
[12/20/2022-22:50:35] [TRT] [W] TensorRT was linked against cuDNN 8.4.1 but loaded cuDNN 8.3.2
[12/20/2022-22:50:35] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[12/20/2022-22:50:36] [TRT] [W] Weights [name=Conv_0 + PWN(Clip_3).weight] had the following issues when converted to FP16:
[12/20/2022-22:50:36] [TRT] [W]  - Subnormal FP16 values detected. 
[12/20/2022-22:50:36] [TRT] [W]  - Values less than smallest positive FP16 Subnormal value detected. Converting to FP16 minimum subnormalized value. 
[12/20/2022-22:50:36] [TRT] [W] If this is not the desired behavior, please modify the 

# INT 8

In [5]:
cuda.init()
dev = cuda.Device(0)
ctx = dev.make_context()
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(EXPLICIT_BATCH)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, TRT_LOGGER)
with open("mbv3.onnx", "rb") as model:
    ok = parser.parse(model.read())

config.max_workspace_size = to_GiB(1)

calib_set = CifarDataLoader("dataset", calibration=True)
calib_set = tdata.Subset(calib_set, range(0, 500))

config, network = configure_quantization_and_inputs(config, network, fp16=False, int8=True, inputs_fp16=False, inputs_int8=False)
config.int8_calibrator = EntropyCalibrator(
    calib_set, "cache.txt", 1
)
plan = builder.build_serialized_network(network, config)
with open("mbv3_int8.trt", "wb") as f:
    f.write(plan)

[12/20/2022-23:03:36] [TRT] [I] The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[12/20/2022-23:03:36] [TRT] [I] [MemUsageChange] Init CUDA: CPU +54, GPU +0, now: CPU 1251, GPU 820 (MiB)
[12/20/2022-23:03:37] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +5, GPU +2, now: CPU 1256, GPU 822 (MiB)


  config.max_workspace_size = to_GiB(1)


Files already downloaded and verified
[12/20/2022-23:03:38] [TRT] [W] FP16 support requested on hardware without native FP16 support, performance will be negatively affected.
[12/20/2022-23:03:38] [TRT] [I] Reading Calibration Cache for calibrator: EntropyCalibration2
[12/20/2022-23:03:38] [TRT] [I] Generated calibration scales using calibration cache. Make sure that calibration cache has latest scales.
[12/20/2022-23:03:38] [TRT] [I] To regenerate calibration cache, please delete the existing one. TensorRT will generate a new calibration cache.
[12/20/2022-23:03:38] [TRT] [W] Missing scale and zero-point for tensor input.416, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/20/2022-23:03:38] [TRT] [W] Missing scale and zero-point for tensor (Unnamed Layer* 99) [Constant]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/20/2022-23:03:38] [TRT] [W] Missing scale and zero-point for t

# Pose Estimation on Embedded GPUs

In [1]:
from BronchoTrack.BronchoTrack.models.offsetnet import OffsetNet
from BronchoTrack.BronchoTrack.models.bronchonet import BronchoNetDoubleTemporalConvLateFusion

In [2]:
model = BronchoNetDoubleTemporalConvLateFusion()



In [None]:
tonnx.export(model, torch.randn((1, 2, 3, 256, 256)),  "broncho.onnx", verbose=True, opset_version=9)

In [3]:
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(EXPLICIT_BATCH)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, TRT_LOGGER)

[12/20/2022-21:52:33] [TRT] [I] [MemUsageChange] Init CUDA: CPU +194, GPU +0, now: CPU 271, GPU 202 (MiB)
[12/20/2022-21:52:34] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +6, GPU +2, now: CPU 296, GPU 204 (MiB)


In [4]:
with open("broncho.onnx", "rb") as model:
    ok = parser.parse(model.read())

config.max_workspace_size = to_GiB(1)

[12/20/2022-21:52:38] [TRT] [W] onnx2trt_utils.cpp:369: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.


  config.max_workspace_size = to_GiB(1)


In [5]:
plan = builder.build_serialized_network(network, config)
with open("broncho.trt", "wb") as f:
    f.write(plan)

[12/20/2022-21:53:44] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +229, GPU +94, now: CPU 993, GPU 298 (MiB)
[12/20/2022-21:53:45] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +112, GPU +44, now: CPU 1105, GPU 342 (MiB)
[12/20/2022-21:53:45] [TRT] [W] TensorRT was linked against cuDNN 8.4.1 but loaded cuDNN 8.3.2
[12/20/2022-21:53:45] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[12/20/2022-21:54:11] [TRT] [I] Some tactics do not have sufficient workspace memory to run. Increasing workspace size will enable more tactics, please check verbose output for requested sizes.
[12/20/2022-21:54:40] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[12/20/2022-21:54:41] [TRT] [I] Total Host Persistent Memory: 2436368
[12/20/2022-21:54:41] [TRT] [I] Total Device Persistent Memory: 7805952
[12/20/2022-21:54:41] [TRT] [I] Total Scratch Memory: 147456
[12/20/2022-21:54:41] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU me

# FP16

In [None]:
config, network = configure_quantization_and_inputs(config, network, fp16=True, int8=False)

In [None]:
plan = builder.build_serialized_network(network, config)
with open("broncho_fp16.trt", "wb") as f:
    f.write(plan)

## INT8 Quantization

In [None]:
config, network = configure_quantization_and_inputs(config, network, fp16=True, int8=True)

In [None]:
int8_calib_set = DummyDataset()

In [None]:
config.int8_calibrator = Calibrator(
    int8_calib_set, 1
)

In [None]:
plan = builder.build_serialized_network(network, config)
with open("broncho_int8.trt", "wb") as f:
    f.write(plan)

## Pruning

In [3]:
pruner = ModelPruning(
        pruning_fn="ln_structured",
        parameters_to_prune=return_pruning_params(model),
        amount=0.3,
        use_global_unstructured=False,
        pruning_norm=1,
        pruning_dim=0,
        parameter_names=['weight'],
        use_lottery_ticket_hypothesis=False,
        prune_on_train_epoch_end=True,
        make_pruning_permanent=True,
        verbose=1
    )
pruner.apply_pruning(0.3)

Applied `ln_structured`. Pruned: 0/14053752 (0.00%) -> 4214994/14053752 (29.99%)


In [4]:
pruner.apply_pruning(0.3)
print("Pruning has been applied as pre-hooks. The network appear as pruned -> Pruned?", is_pruned(model))
pruner.make_pruning_permanent(model)
print("Now prune hooks are deleted, then the network appears as unpruned -> Pruned?", is_pruned(model))

Applied `ln_structured`. Pruned: 4214994/14053752 (29.99%) -> 7162594/14053752 (50.97%)


Pruning has been applied as pre-hooks. The network appear as pruned -> Pruned? True
Now prune hooks are deleted, then the network appears as unpruned -> Pruned? False


In [None]:
tonnx.export(model, torch.randn(1, 2, 3, 256, 256),  "broncho_30.onnx", verbose=True, opset_version=16)

In [None]:
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(EXPLICIT_BATCH)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, TRT_LOGGER)

with open("broncho.onnx", "rb") as model:
    ok = parser.parse(model.read())

config.max_workspace_size = to_GiB(1)

plan = builder.build_serialized_network(network, config)
with open("broncho_30.trt", "wb") as f:
    f.write(plan)

# Benchmark

In [5]:
!trtexec --loadEngine="broncho.trt" --niter 300

/bin/bash: trtexec: command not found


In [None]:
!trtexec --loadEngine="broncho_fp16.trt" --niter 300

In [None]:
!trtexec --loadEngine="broncho_int8.trt" --niter 300

In [None]:
!trtexec --loadEngine="broncho_pruned.trt" --niter 300

# Inference

In [None]:
cuda.init()
dev = cuda.Device(0)
ctx = dev.make_context()

In [None]:
with open("broncho_fp16.trt", "rb") as f, trt.Runtime(
    TRT_LOGGER
) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

In [None]:
trt_context = engine.create_execution_context()

In [None]:
(
    inputs,
    outputs,
    bindings,
    stream,
) = allocate_buffers(engine)

# Exercise

Convert a pretrained network (e.g. mobilenetv2) for classification into trt and make inference with it