# Running DeepConsensus in TensorRT

## Imports

In [1]:
# %%bash 

# Optional one-time environment setup. The cell is left commented out so that
# "Run All" does not reinstall packages; uncomment every line (keeping %%bash
# as the first line of the cell) to install the tools used by this notebook.
# pip install --upgrade pip setuptools wheel
# pip install deepconsensus[gpu]==1.1.0 onnxruntime tensorrt pycuda tf2onnx gsutil

In [2]:
import onnx

In [3]:
import os
# Hide every CUDA device from this process before TensorFlow is imported in the
# next cell, so the TensorFlow/ONNX export steps run on the CPU. A later cell
# points CUDA_VISIBLE_DEVICES at a real GPU just before PyCUDA is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [4]:
import tensorflow as tf
import onnx
import colorama
import numpy as np
import random
import PIL
from IPython.display import display
from deepconsensus.models import model_configs
from deepconsensus.models import model_utils
from deepconsensus.models import data_providers
from deepconsensus.utils import dc_constants

2023-01-25 22:53:02.161060: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-25 22:53:03.149763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-25 22:53:03.149853: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


## Load the examples

In [5]:
%%bash
# Stop at the first failing command: without this, a failed checkpoint download
# is masked whenever the final command of the cell succeeds.
set -euo pipefail

# Download Model
# The URL is quoted so the shell hands the `*` wildcard to gsutil untouched.
mkdir -p deepconsensus_model
gsutil cp -r "gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/*" deepconsensus_model/

# Download test data
gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz

Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.data-00000-of-00001...
Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.index...
Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/params.json...
| [3 files][ 85.7 MiB/ 85.7 MiB]                                                
Operation completed over 3 objects/85.7 MiB.                                     
Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz...
| [1 files][ 90.6 MiB/ 90.6 MiB]                                                
Operation completed over 1 objects/90.6 MiB.                                     


In [6]:
# Locations of the checkpoint and the test shard downloaded by the previous cell.
checkpoint_path = 'deepconsensus_model/checkpoint'
tfrecord_path = 'tf-test.tfrecord.gz'

# Number of examples:
batch_size = 8192  #@param

# Model/data parameters are read from the params.json stored next to the checkpoint.
params = model_utils.read_params_from_json(checkpoint_path=checkpoint_path)

ds = data_providers.get_dataset(
    tfrecord_path,
    num_epochs=None,
    batch_size=batch_size,
    params=params,
    inference=False,
)

# Pull a single batch out of the dataset so its contents can be inspected.
batch = next(iter(ds.take(1)))

# Report the shape of every field of interest in that batch.
keys = ['name', 'label', 'rows', 'num_passes', 'window_pos']
for key in keys:
  field_shape = batch[key].shape
  print(f'{key}.shape: {field_shape}')

2023-01-25 22:53:10.496734: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-01-25 22:53:10.496775: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: hcsa-dgx32gb
2023-01-25 22:53:10.496785: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: hcsa-dgx32gb
2023-01-25 22:53:10.496886: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.65.1
2023-01-25 22:53:10.496922: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1
2023-01-25 22:53:10.496930: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 515.65.1
2023-01-25 22:53:10.497373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with

name.shape: (8192, 1)
label.shape: (8192, 100)
rows.shape: (8192, 85, 100, 1)
num_passes.shape: (8192, 1)
window_pos.shape: (8192, 1)


## Load the model

In [7]:
# Instantiate the network described by `params` and wrap it for restoring.
model = model_utils.get_model(params)
checkpoint = tf.train.Checkpoint(model=model)

# Shape of a batch holding one example; used for the summary printout.
row_size = data_providers.get_total_rows(params.max_passes)
input_shape = (1, row_size, params.max_length, params.num_channels)
model_utils.print_model_summary(model, input_shape)

# Restore the weights. A partial match is tolerated, but every object that the
# model does have must be found in the checkpoint.
restore_status = checkpoint.restore(checkpoint_path)
restore_status.expect_partial().assert_existing_objects_matched()

# Adjust the params for inference only after the model has been built and
# restored (the return value is unused, so this presumably edits `params`
# in place -- confirm against model_utils).
model_utils.modify_params(
    params=params,
    speedy=True,
    max_length=100,
    is_training=False,
)

Model: "encoder_only_learned_values_transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 relative_position_embedding  multiple                 0         
  (RelativePositionEmbedding                                     
 )                                                               
                                                                 
 encoder_stack (EncoderStack  multiple                 7320200   
 )                                                               
                                                                 
 dense (Dense)               multiple                  1405      
                                                                 
 softmax (Softmax)           multiple                  0         
                                                                 
 bases_embedding (ModifiedOn  multiple                 40        
 DeviceEmbedding)          

In [8]:
from keras.layers import Input

# Call the model once on a symbolic input (batch dimension dropped from
# `input_shape`) so that the summary below reports concrete output shapes
# instead of "multiple".
per_example_shape = input_shape[1:]
inputs = Input(shape=per_example_shape)
outputs = model.call(inputs, training=False)
model.summary()

Model: "encoder_only_learned_values_transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 relative_position_embedding  (100, 280)               0         
  (RelativePositionEmbedding                                     
 )                                                               
                                                                 
 encoder_stack (EncoderStack  {'self_attention_layer_0  7320200  
 )                           ': (None, 100, 280),                
                              'attention_scores_0': (            
                             None, 2, 100, 100),                 
                              'ffn_layer_0': (None, 1            
                             00, 280),                           
                              'self_attention_layer_1            
                             ': (None, 100, 280),                
                           

In [9]:
# Reference output of the TensorFlow model on the inspected batch; its shape is
# reused later to size the TensorRT output buffer.
example_rows = batch['rows']
softmax_output = model.predict(example_rows)

In [10]:
# Index of the highest-scoring class along the last axis of the TensorFlow output.
tf_preds = np.argmax(softmax_output, axis=-1)

## Export the model in the ONNX format

In [11]:
# Write the restored model to disk in SavedModel format; tf2onnx reads this
# directory in the next cell.
tf.saved_model.save(
    model,
    'deepconsensus_model/SavedModel',
)



INFO:tensorflow:Assets written to: deepconsensus_model/SavedModel/assets


INFO:tensorflow:Assets written to: deepconsensus_model/SavedModel/assets


In [12]:
%%bash

# Convert the exported SavedModel into a temporary ONNX graph.
python -m tf2onnx.convert \
    --saved-model deepconsensus_model/SavedModel/ \
    --output deepconsensus_model/pbdc_temp.onnx

2023-01-25 22:54:20.380121: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-25 22:54:21.345931: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-25 22:54:21.346010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-01-25 22:54:26,395 - INFO - Signatures found in model: [serving_default].
2023-01-25 22:54:26,396 - INFO - Output names: ['output_1']
2023-01-25 22:54:31,044 - I

2023-01-25 22:54:39,541 - INFO - replacing einsum node 'StatefulPartitionedCall/encoder_only_learned_values_transformer/Transformer/encode/encoder_stack/layer_3/self_attention/pre_post_processing_wrapper_6/self_attention_3/value/einsum/Einsum' by its decomposed version, name of the last node 'Identity__2477'.
2023-01-25 22:54:39,549 - INFO - replacing einsum node 'StatefulPartitionedCall/encoder_only_learned_values_transformer/Transformer/encode/encoder_stack/layer_3/self_attention/pre_post_processing_wrapper_6/self_attention_3/query/einsum/Einsum' by its decomposed version, name of the last node 'Identity__2512'.
2023-01-25 22:54:39,557 - INFO - replacing einsum node 'StatefulPartitionedCall/encoder_only_learned_values_transformer/Transformer/encode/encoder_stack/layer_2/self_attention/pre_post_processing_wrapper_4/self_attention_2/query/einsum/Einsum' by its decomposed version, name of the last node 'Identity__2547'.
2023-01-25 22:54:39,568 - INFO - replacing einsum node 'StatefulPar

In [13]:
# Read the freshly converted graph back in and validate it before editing it.
temp_onnx_path = 'deepconsensus_model/pbdc_temp.onnx'
onnx_model = onnx.load_model(temp_onnx_path)
onnx.checker.check_model(onnx_model)

In [14]:
# Fix the leading dimension of every graph input to `batch_size`.
# The loop variable is deliberately not called `input`: at notebook scope that
# would shadow the builtin for the rest of the kernel session, and reusing
# `inputs` would overwrite the Keras input tensor created earlier.
for graph_input in onnx_model.graph.input:
    batch_dim = graph_input.type.tensor_type.shape.dim[0]
    batch_dim.dim_value = batch_size

In [15]:
# Persist the edited graph (batch dimension pinned by the previous cell) under
# its final name; this is the file handed to trtexec later in the notebook.
model_name = "deepconsensus_model/pbdc.onnx"
onnx.save_model(onnx_model, model_name)

In [16]:
%%bash

# Remove the intermediate ONNX file. `-f` keeps the cell idempotent: re-running
# it after the file is already gone no longer fails.
rm -f deepconsensus_model/pbdc_temp.onnx

## Running the model in TensorRT

In [17]:
import os

# Index of the GPU exposed to the TensorRT / PyCUDA part of the notebook.
igpu = 1

# Both variables are set before PyCUDA is imported in the next cell.
os.environ.update(
    CUDA_VISIBLE_DEVICES=str(igpu),
    CUDA_MODULE_LOADING="LAZY",
)

In [18]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

In [19]:
# Toggle between a half-precision and a single-precision engine.
USE_FP16 = False

# The host buffer dtype and the engine file name both follow the toggle.
if USE_FP16:
    target_dtype = np.float16
    trt_engine = "deepconsensus_model/pbdc_float16.trt"
else:
    target_dtype = np.float32
    trt_engine = "deepconsensus_model/pbdc_float32.trt"

In [20]:
if USE_FP16:
    !/usr/src/tensorrt/bin/trtexec --onnx="deepconsensus_model/pbdc.onnx" --saveEngine="$trt_engine" --useCudaGraph --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
else:
    !/usr/src/tensorrt/bin/trtexec --onnx="deepconsensus_model/pbdc.onnx" --saveEngine="$trt_engine" --useCudaGraph

&&&& RUNNING TensorRT.trtexec [TensorRT v8502] # /usr/src/tensorrt/bin/trtexec --onnx=deepconsensus_model/pbdc.onnx --saveEngine=deepconsensus_model/pbdc_float32.trt --useCudaGraph
[01/25/2023-22:54:49] [I] === Model Options ===
[01/25/2023-22:54:49] [I] Format: ONNX
[01/25/2023-22:54:49] [I] Model: deepconsensus_model/pbdc.onnx
[01/25/2023-22:54:49] [I] Output:
[01/25/2023-22:54:49] [I] === Build Options ===
[01/25/2023-22:54:49] [I] Max batch: explicit batch
[01/25/2023-22:54:49] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[01/25/2023-22:54:49] [I] minTiming: 1
[01/25/2023-22:54:49] [I] avgTiming: 8
[01/25/2023-22:54:49] [I] Precision: FP32
[01/25/2023-22:54:49] [I] LayerPrecisions: 
[01/25/2023-22:54:49] [I] Calibration: 
[01/25/2023-22:54:49] [I] Refit: Disabled
[01/25/2023-22:54:49] [I] Sparsity: Disabled
[01/25/2023-22:54:49] [I] Safe mode: Disabled
[01/25/2023-22:54:49] [I] DirectIO mode: Disabled
[01/25/2023-22:54:49] [I

In [21]:
# Keep the logger in a named variable so it stays alive alongside the runtime.
trt_logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(trt_logger)

# Read the serialized engine through a context manager so the file handle is
# closed as soon as the bytes are in memory.
with open(trt_engine, "rb") as engine_file:
    engine = runtime.deserialize_cuda_engine(engine_file.read())

# Fail here with a clear message rather than with an AttributeError on None in
# the next statement if the engine could not be deserialized.
if engine is None:
    raise RuntimeError(f"Failed to deserialize TensorRT engine from {trt_engine!r}")

context = engine.create_execution_context()

[01/25/2023-22:59:59] [TRT] [W]  (foreignNode) cuBLASLt subversions: compiled against 11.10.3.0 but running against 11.11.3.0.


In [22]:
# Host buffer that receives the engine output. Its shape mirrors the TensorFlow
# reference output; np.asarray accepts both a NumPy array (what Keras
# `predict` is documented to return, which has no `.numpy()` method) and a
# TensorFlow tensor.
batch_output = np.empty(np.asarray(softmax_output).shape, dtype=target_dtype)

# Allocate device memory
# The input buffer is sized from the batch as stored in the dataset, the output
# buffer from the host array above.
d_input = cuda.mem_alloc(batch['rows'].numpy().nbytes)
d_output = cuda.mem_alloc(batch_output.nbytes)

# Device addresses passed to the execution context, input first, then output.
bindings = [int(d_input), int(d_output)]

# Stream on which the copies and the inference are enqueued.
stream = cuda.Stream()

In [23]:
from pycuda import driver

def tpredict(batch): # result gets copied into output
    """Run one batch through the TensorRT engine.

    Args:
        batch: Array-like batch of input rows (NumPy array or TensorFlow
            tensor). It is converted to a C-contiguous array of
            `target_dtype` before upload, so an FP16 engine built with fp16
            input bindings receives half-precision data.

    Returns:
        The notebook-level `batch_output` array, filled with the engine
        output. The same array object is reused on every call, so copy it
        (e.g. with `.astype(...)`) if the values must survive the next call.

    Raises:
        RuntimeError: If TensorRT reports that the inference could not be
            enqueued.
    """
    # The local reference keeps the host array alive until the stream has
    # been synchronized below.
    host_input = np.ascontiguousarray(batch, dtype=target_dtype)

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, host_input, stream)
    # Execute model
    enqueued = context.execute_async_v2(bindings, stream.handle, None)
    if not enqueued:
        raise RuntimeError("TensorRT execute_async_v2 failed to enqueue the inference")
    # Transfer predictions back
    cuda.memcpy_dtoh_async(batch_output, d_output, stream)
    # Synchronize the stream so the output buffer is complete before returning
    stream.synchronize()

    return batch_output

In [24]:
# Run the inspected batch through TensorRT. `tpredict` returns the shared
# `batch_output` buffer; `.astype` makes a float32 copy of it.
trt_softmax_output = tpredict(batch['rows']).astype(np.float32)

In [25]:
# Index of the highest-scoring class along the last axis of the TensorRT output.
trt_preds = np.argmax(trt_softmax_output, axis=-1)

In [26]:
# Loss object built from the params; the cells below call its `eval` method
# with the labels and the TensorRT softmax output.
loss = model_utils.get_deepconsensus_loss(params)

In [27]:
# Mean loss of the TensorRT output on the inspected batch. The value comes
# from the loss object, so it is reported as a loss rather than an accuracy.
batch_loss = loss.eval(batch['label'], trt_softmax_output).numpy().mean()
print("Batch loss: {:.3f}".format(batch_loss))

Batch accuracy: 0.538


In [28]:
%%time

total_loss = 0

for i, batch in enumerate(ds.take(count=-1)):
    trt_softmax_output = tpredict(batch['rows'])
    batch_loss = loss.eval(batch['label'], tf.convert_to_tensor(trt_softmax_output, dtype=tf.float32)).numpy().mean()
    total_loss = (total_loss * i + batch_loss) / (i + 1)
    
print("Dataset average accuracy: {:.3f}".format(total_loss))

Dataset average accuracy: 0.512
CPU times: user 2min 6s, sys: 26.8 s, total: 2min 33s
Wall time: 19.2 s


## Assessing DeepConsensus accuracy

In [29]:
%%bash
# Stop at the first failing command instead of continuing silently.
set -euo pipefail

mkdir -p train

# `-n` (no-clobber) skips the copy when the destination already exists, so
# re-running the notebook does not download the 2.2 GiB shard again.
gsutil cp -n gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/train/tf-train-00000-of-00500.tfrecord.gz ./train/tf-train-00000-of-00500.tfrecord.gz

Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/train/tf-train-00000-of-00500.tfrecord.gz...
- [1 files][  2.2 GiB/  2.2 GiB]   63.5 MiB/s                                   
Operation completed over 1 objects/2.2 GiB.                                      


In [30]:
# Local path of the training shard downloaded by the previous cell.
dataset_path = "train/tf-train-00000-of-00500.tfrecord.gz"

In [31]:
# Rebind `ds` to the training shard, with the same settings that were used for
# the test shard earlier in the notebook.
train_dataset_kwargs = dict(
    num_epochs=None,
    batch_size=batch_size,
    params=params,
    inference=False,
)
ds = data_providers.get_dataset(dataset_path, **train_dataset_kwargs)

In [32]:
%%time

total_loss = 0

for i, batch in enumerate(ds.take(count=-1)):
    trt_softmax_output = tpredict(batch['rows']).astype(np.float32)
    batch_loss = loss.eval(batch['label'], trt_softmax_output).numpy().mean()
    total_loss = (total_loss * i + batch_loss) / (i + 1)
    
print("Dataset average accuracy: {:.3f}".format(total_loss))

2023-01-25 23:01:12.788346: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 80323 of 1000000
2023-01-25 23:01:22.787872: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 161540 of 1000000
2023-01-25 23:01:32.788095: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 243088 of 1000000
2023-01-25 23:01:42.787843: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 323821 of 1000000
2023-01-25 23:01:52.787863: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 404795 of 1000000
2023-01-25 23:02:02.787970: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 486087 of 1000000
2023-01-25 23:02:12.788076: I tensorflow/core/kernels/data/shuffle_data

Dataset average accuracy: 1.075
CPU times: user 1h 7min 51s, sys: 13min 34s, total: 1h 21min 25s
Wall time: 9min 51s
