# Test de rapidité du modèle utilisé avec TensorRT

Dans ce notebook, nous allons tester la rapidité du modèle TensorRT

In [1]:
import tensorflow as tf
import numpy as np
import cv2
import os
import shutil

from tensorflow import keras
from matplotlib import pyplot as plt

### Chargement du moteur de l'IA n°1 : Détection des coordonnées des articulations

In [2]:
import tensorrt as trt

# Construction de la class du logger
class MyLogger(trt.ILogger):
    def __init__(self):
        trt.ILogger.__init__(self)

    def log(self, severity, msg):
        print("%s : %s" %(severity,msg))
        pass

In [3]:
import pycuda.driver as cuda
import pycuda.autoinit

logger = MyLogger()
runtime = trt.Runtime(logger)
trt.init_libnvinfer_plugins(logger, namespace="")

with open("model_jetson.engine", "rb") as f:
    engine_IA1 = runtime.deserialize_cuda_engine(f.read())

Severity.INFO : [MemUsageChange] Init CUDA: CPU +225, GPU +0, now: CPU 320, GPU 2762 (MiB)
Severity.VERBOSE : Registered plugin creator - ::BatchTilePlugin_TRT version 1
Severity.VERBOSE : Registered plugin creator - ::BatchedNMS_TRT version 1
Severity.VERBOSE : Registered plugin creator - ::BatchedNMSDynamic_TRT version 1
Severity.VERBOSE : Registered plugin creator - ::CoordConvAC version 1
Severity.VERBOSE : Registered plugin creator - ::CropAndResize version 1
Severity.VERBOSE : Registered plugin creator - ::CropAndResizeDynamic version 1
Severity.VERBOSE : Registered plugin creator - ::DecodeBbox3DPlugin version 1
Severity.VERBOSE : Registered plugin creator - ::DetectionLayer_TRT version 1
Severity.VERBOSE : Registered plugin creator - ::EfficientNMS_TRT version 1
Severity.VERBOSE : Registered plugin creator - ::EfficientNMS_ONNX_TRT version 1
Severity.VERBOSE : Registered plugin creator - ::EfficientNMS_Explicit_TF_TRT version 1
Severity.VERBOSE : Registered plugin creator - ::E

### Chargement du moteur de l'IA n°2 : Détection de la position

In [4]:
with open("model_classification_multi.engine", "rb") as f:
    engine_IA2 = runtime.deserialize_cuda_engine(f.read())

Severity.INFO : Loaded engine size: 1 MiB
Severity.VERBOSE : Using cublas as a tactic source
Severity.INFO : [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 723, GPU 3169 (MiB)
Severity.VERBOSE : Using cuDNN as a tactic source
Severity.INFO : [MemUsageChange] Init cuDNN: CPU +0, GPU +0, now: CPU 723, GPU 3169 (MiB)
Severity.VERBOSE : Deserialization required 28354 microseconds.
Severity.INFO : [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +0, now: CPU 0, GPU 14 (MiB)


 ### Création des contextes

In [5]:
context_IA1 = engine_IA1.create_execution_context()

Severity.VERBOSE : Using cublas as a tactic source
Severity.INFO : [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 722, GPU 3169 (MiB)
Severity.VERBOSE : Using cuDNN as a tactic source
Severity.INFO : [MemUsageChange] Init cuDNN: CPU +0, GPU +0, now: CPU 722, GPU 3169 (MiB)
Severity.VERBOSE : Total per-runner device persistent memory is 13678592
Severity.VERBOSE : Total per-runner host persistent memory is 157056
Severity.VERBOSE : Allocated activation device memory of size 18387456
Severity.INFO : [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +30, now: CPU 0, GPU 44 (MiB)


In [6]:
context_IA2 = engine_IA2.create_execution_context()

Severity.VERBOSE : Using cublas as a tactic source
Severity.INFO : [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 729, GPU 3177 (MiB)
Severity.VERBOSE : Using cuDNN as a tactic source
Severity.INFO : [MemUsageChange] Init cuDNN: CPU +0, GPU +0, now: CPU 729, GPU 3177 (MiB)
Severity.VERBOSE : Total per-runner device persistent memory is 0
Severity.VERBOSE : Total per-runner host persistent memory is 18592
Severity.VERBOSE : Allocated activation device memory of size 7168
Severity.INFO : [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 44 (MiB)


### Allocation de l'espace mémoire hôte et GPU pour l'IA1

In [7]:
# Réservation de la mémoire pour l'entrée
size_input_IA1 = trt.volume(engine_IA1.get_binding_shape(0))* engine_IA1.max_batch_size
input_host_mem_IA1 = cuda.pagelocked_empty(size_input_IA1, np.int32)
input_device_mem_IA1 = cuda.mem_alloc(input_host_mem_IA1.nbytes)

In [8]:
# Réservation de la mémoire pour les sorties
output_device_mem_IA1 = [];
format_sorties_IA1 = [];
types_sorties_IA1 = [];

for i in range(engine_IA1.num_bindings):
    if not engine_IA1.binding_is_input(i):
        size_output = trt.volume(engine_IA1.get_binding_shape(i))* engine_IA1.max_batch_size
        output_host_mem = cuda.pagelocked_empty(size_output, trt.nptype(engine_IA1.get_binding_dtype(i)))
        output_device_mem_IA1.append(cuda.mem_alloc(output_host_mem.nbytes))
        format_sorties_IA1.append(engine_IA1.get_binding_shape(i))
        types_sorties_IA1.append(trt.nptype(engine_IA1.get_binding_dtype(i)))

In [9]:
# Récupère les adresses en GPU des buffers entrées / sorties
binding_entree_IA1 = int(input_device_mem_IA1)

binding_sorties_IA1 = []
for output_ in output_device_mem_IA1:
    binding_sorties_IA1.append(int(output_))

### Allocation de l'espace mémoire hôte et GPU pour l'IA2

In [10]:
# Réservation de la mémoire pour l'entrée
size_input_IA2 = trt.volume(engine_IA2.get_binding_shape(0))* engine_IA2.max_batch_size
input_host_mem_IA2 = cuda.pagelocked_empty(size_input_IA2, np.float32)
input_device_mem_IA2 = cuda.mem_alloc(input_host_mem_IA2.nbytes)

In [11]:
# Réservation de la mémoire pour les sorties
output_device_mem_IA2 = [];
format_sorties_IA2 = [];
types_sorties_IA2 = [];

for i in range(engine_IA2.num_bindings):
    if not engine_IA2.binding_is_input(i):
        size_output = trt.volume(engine_IA2.get_binding_shape(i))* engine_IA2.max_batch_size
        output_host_mem = cuda.pagelocked_empty(size_output, trt.nptype(engine_IA2.get_binding_dtype(i)))
        output_device_mem_IA2.append(cuda.mem_alloc(output_host_mem.nbytes))
        format_sorties_IA2.append(engine_IA2.get_binding_shape(i))
        types_sorties_IA2.append(trt.nptype(engine_IA2.get_binding_dtype(i)))

In [12]:
# Récupère les adresses en GPU des buffers entrées / sorties
binding_entree_IA2 = int(input_device_mem_IA2)

binding_sorties_IA2 = []
for output_ in output_device_mem_IA2:
    binding_sorties_IA2.append(int(output_))

### Exécution des IAs

On commence par récupérer une image du dataset :

In [None]:
output_host_mem_IA1[0][0]

TypeError: 'tuple' object is not callable

In [16]:
import time

nbr_run = 100
delais = []

bindings_IA1 = [binding_entree_IA1, binding_sorties_IA1[0]]
bindings_IA2 = [binding_entree_IA2, binding_sorties_IA2[0]]

image = tf.keras.preprocessing.image.load_img("Corbeau_NonCorrect.jpg",target_size=(256, 256))
image = np.expand_dims(image,axis=0)

# Initialisation des calculs
print("Initialisation des calculs...")
for i in range(5):  
    # IA1
    np.copyto(input_host_mem_IA1,image.ravel())
    cuda.memcpy_htod(input_device_mem_IA1, input_host_mem_IA1)
    context_IA1.execute_v2(bindings=bindings_IA1)
    output_host_mem_IA1 = []
    for i in range(len(output_device_mem_IA1)):
        output_host_mem_IA1.append(np.zeros(format_sorties_IA1[i],types_sorties_IA1[i]))
    for i in range(len(output_host_mem_IA1)):
        cuda.memcpy_dtoh(output_host_mem_IA1[i], output_device_mem_IA1[i])
    
    # IA2
    x = output_host_mem_IA1[0][0]           # (1,17,3)
    np.copyto(input_host_mem_IA2,x.ravel())
    cuda.memcpy_htod(input_device_mem_IA2, input_host_mem_IA2)
    context_IA2.execute_v2(bindings=bindings_IA2)
    output_host_mem_IA2 = []
    for i in range(len(output_device_mem_IA2)):
        output_host_mem_IA2.append(np.zeros(format_sorties_IA2[i],types_sorties_IA2[i]))
    for i in range(len(output_host_mem_IA2)):
        cuda.memcpy_dtoh(output_host_mem_IA2[i], output_device_mem_IA2[i])

# Lance les inférences
for i in range(nbr_run):
    time0 = time.time()

    # IA1
    np.copyto(input_host_mem_IA1,image.ravel())
    cuda.memcpy_htod(input_device_mem_IA1, input_host_mem_IA1)
    context_IA1.execute_v2(bindings=bindings_IA1)
    output_host_mem_IA1 = []
    for i in range(len(output_device_mem_IA1)):
        output_host_mem_IA1.append(np.zeros(format_sorties_IA1[i],types_sorties_IA1[i]))
    for i in range(len(output_host_mem_IA1)):
        cuda.memcpy_dtoh(output_host_mem_IA1[i], output_device_mem_IA1[i])
    
    # IA2
    x = output_host_mem_IA1[0][0]           # (1,17,3)
    np.copyto(input_host_mem_IA2,x.ravel())
    cuda.memcpy_htod(input_device_mem_IA2, input_host_mem_IA2)
    context_IA2.execute_v2(bindings=bindings_IA2)
    output_host_mem_IA2 = []
    for i in range(len(output_device_mem_IA2)):
        output_host_mem_IA2.append(np.zeros(format_sorties_IA2[i],types_sorties_IA2[i]))
    for i in range(len(output_host_mem_IA2)):
        cuda.memcpy_dtoh(output_host_mem_IA2[i], output_device_mem_IA2[i])
    
    time_end = time.time()
        
    delais = np.append(delais,time_end - time0)
        
    if i%10 == 0:
        print("Etape %d-%d moyenne : %4.1f ms" %(i,i+5,(delais[-10:].mean())*1000))

Initialisation des calculs...
Etape 0-5 moyenne : 52.3 ms
Etape 0-5 moyenne : 55.6 ms
Etape 0-5 moyenne : 56.5 ms
Etape 0-5 moyenne : 55.3 ms
Etape 0-5 moyenne : 54.0 ms
Etape 0-5 moyenne : 54.1 ms
Etape 0-5 moyenne : 53.5 ms
Etape 0-5 moyenne : 53.1 ms
Etape 0-5 moyenne : 52.7 ms
Etape 0-5 moyenne : 52.1 ms
Etape 0-5 moyenne : 51.6 ms
Etape 0-5 moyenne : 50.4 ms
Etape 0-5 moyenne : 48.9 ms
Etape 0-5 moyenne : 47.5 ms
Etape 0-5 moyenne : 46.7 ms
Etape 0-5 moyenne : 45.3 ms
Etape 0-5 moyenne : 44.5 ms
Etape 0-5 moyenne : 43.6 ms
Etape 0-5 moyenne : 42.6 ms
Etape 0-5 moyenne : 42.0 ms
Etape 0-5 moyenne : 41.3 ms
Etape 0-5 moyenne : 40.6 ms
Etape 0-5 moyenne : 40.0 ms
Etape 0-5 moyenne : 40.0 ms
Etape 0-5 moyenne : 39.9 ms
Etape 0-5 moyenne : 39.9 ms
Etape 0-5 moyenne : 39.6 ms
Etape 0-5 moyenne : 39.3 ms
Etape 0-5 moyenne : 39.2 ms
Etape 0-5 moyenne : 39.2 ms
Etape 0-5 moyenne : 39.1 ms
Etape 0-5 moyenne : 39.1 ms
Etape 0-5 moyenne : 39.3 ms
Etape 0-5 moyenne : 39.3 ms
Etape 0-5 moyenne 