In [1]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.driver as cuda
import pycuda.autoinit  # initializes CUDA driver
import numpy as np
import time
import torch
import cv2 as cv

In [2]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# 1. Load the engine
# Deserialize the serialized engine file; `runtime` stays bound at module level
# next to `engine`, exactly as before.
with open("checkpoints/RetinaFace-R50.engine", "rb") as f:
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(f.read())

# Fail here with a clear message instead of an AttributeError on `None` further
# down when the file is not a usable engine for this runtime.
if engine is None:
    raise RuntimeError(
        "Failed to deserialize TensorRT engine from checkpoints/RetinaFace-R50.engine"
    )

# 2. Create context
context = engine.create_execution_context()
if context is None:
    raise RuntimeError("Failed to create a TensorRT execution context")

# 3. Allocate buffers
# `inputs` / `outputs` hold (host_buffer, device_buffer) pairs; `bindings` holds
# the device addresses as ints, ordered by I/O tensor index.
inputs, outputs, bindings, stream = [], [], [], cuda.Stream()

for i in range(engine.num_io_tensors):
    tensor_name = engine.get_tensor_name(i)
    shape = engine.get_tensor_shape(tensor_name)
    # Buffers are sized once from the engine's declared shape, so a dimension
    # reported as negative (not fixed at build time) cannot be sized here.
    if any(dim < 0 for dim in shape):
        raise RuntimeError(
            f"Tensor {tensor_name!r} has a dynamic shape {tuple(shape)}; "
            "fixed-size buffers cannot be allocated for it"
        )
    size = trt.volume(shape)
    dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))

    # One flat page-locked host array plus a device allocation of equal size.
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc(host_mem.nbytes)

    bindings.append(int(device_mem))
    if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
        inputs.append((host_mem, device_mem))
    else:
        outputs.append((host_mem, device_mem))

In [None]:
def infer(input_numpy):
    """Run one inference pass through the TensorRT engine.

    Args:
        input_numpy: Array holding exactly as many elements as the first
            input's host buffer (``inputs[0][0]``). It is flattened before
            being copied, so only the element count matters here.

    Returns:
        The module-level ``outputs`` list of ``(host_buffer, device_buffer)``
        pairs. The host buffers are overwritten by every call, so copy them
        if the results must survive the next call.

    Raises:
        ValueError: If ``input_numpy`` has a different number of elements
            than the input host buffer.
        RuntimeError: If TensorRT reports that the inference could not be
            enqueued.
    """
    host_in, device_in = inputs[0]

    # Validate the size explicitly: np.copyto would otherwise silently
    # broadcast a single-element array across the whole buffer.
    flat_input = np.asarray(input_numpy).ravel()
    if flat_input.size != host_in.size:
        raise ValueError(
            f"input has {flat_input.size} elements, expected {host_in.size}"
        )

    # Copy input data to host buffer
    np.copyto(host_in, flat_input)

    # Transfer to GPU
    cuda.memcpy_htod_async(device_in, host_in, stream)

    # Execute on the same stream as the copies, so the upload, the inference
    # and the downloads below run in submission order.
    for idx in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(idx), bindings[idx])
    if not context.execute_async_v3(stream_handle=stream.handle):
        raise RuntimeError("TensorRT failed to enqueue inference")

    # Transfer every output back, however many the engine exposes.
    for host_out, device_out in outputs:
        cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()
    return outputs

In [8]:
def measure_inference_time(repetitions=100, input_shape=(3, 640, 640), warmup=10):
    """Measure the mean wall-clock time of one ``infer`` call, in seconds.

    Args:
        repetitions: Number of timed ``infer`` calls; must be at least 1.
            One random input sample is generated per repetition.
        input_shape: Shape of a single sample without the leading batch
            axis. Each call receives an array of shape ``(1, *input_shape)``.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Elapsed seconds across the timed loop divided by ``repetitions``.

    Raises:
        ValueError: If ``repetitions`` is smaller than 1.
    """
    if repetitions < 1:
        raise ValueError("repetitions must be a positive integer")

    # Draw float32 samples directly instead of creating a float64 array and
    # converting it, which would briefly hold three times the memory.
    rng = np.random.default_rng()
    input_tensor = rng.standard_normal(
        (repetitions, *input_shape), dtype=np.float32
    )

    # warmup -- wrap the index so that fewer repetitions than warm-up rounds
    # never produces an empty slice.
    for i in range(warmup):
        sample = i % repetitions
        infer(input_tensor[sample:sample + 1])

    # perf_counter is the high-resolution clock meant for measuring intervals.
    start = time.perf_counter()
    for i in range(repetitions):
        infer(input_tensor[i:i + 1])
    end = time.perf_counter()

    return (end - start) / repetitions

# Run the benchmark with its defaults and report the mean latency in milliseconds.
avg_seconds = measure_inference_time()
print("Avg inference time:", avg_seconds * 1000, "ms")


Avg inference time: 13.821144104003906 ms


### PyTorch average inference time: 30–33 ms
#### TensorRT (≈13.8 ms) is roughly 2.2–2.4× faster, i.e. more than a 100% throughput boost