In [1]:
import tensorrt as trt
import pycuda.driver as cuda
import torchvision.transforms as transforms
import pycuda.autoinit  # initializes CUDA driver
import numpy as np
import time
import torch
import cv2 as cv

In [5]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
ENGINE_PATH = "checkpoints/arcface-r100-glint360k_fp16.engine"

# 1. Load the engine
with open(ENGINE_PATH, "rb") as f:
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(f.read())

# deserialize_cuda_engine signals failure by returning None rather than raising;
# stop here with a readable message instead of an AttributeError further down.
if engine is None:
    raise RuntimeError(f"Failed to deserialize TensorRT engine from {ENGINE_PATH!r}")

# 2. Create context
context = engine.create_execution_context()
if context is None:
    raise RuntimeError("Failed to create a TensorRT execution context")

# 3. Allocate buffers
# `inputs` / `outputs` collect (page-locked host array, device allocation) pairs,
# `bindings` collects the device addresses in engine tensor order (the list that
# is later handed to execute_v2), and `stream` is used for the async copies.
inputs, outputs, bindings, stream = [], [], [], cuda.Stream()

for i in range(engine.num_io_tensors):
    tensor_name = engine.get_tensor_name(i)
    tensor_shape = engine.get_tensor_shape(tensor_name)

    # Dynamic dimensions are reported as -1, which makes the element count
    # negative (or silently wrong when two of them cancel out). The buffers
    # below are sized once, so they need fully specified shapes.
    if any(dim < 0 for dim in tensor_shape):
        raise ValueError(
            f"Tensor {tensor_name!r} has a dynamic shape {tuple(tensor_shape)}; "
            "cannot size a fixed buffer for it"
        )

    size = trt.volume(tensor_shape)
    dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))

    # Flat page-locked host array plus a device allocation of the same byte size.
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc(host_mem.nbytes)

    bindings.append(int(device_mem))
    if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
        inputs.append((host_mem, device_mem))
    else:
        outputs.append((host_mem, device_mem))

In [6]:
def infer(input_numpy):
    """Run one forward pass of the loaded engine on ``input_numpy``.

    Uses the module-level ``context``, ``inputs``, ``outputs``, ``bindings``
    and ``stream`` created by the setup cell.

    Args:
        input_numpy: Array fed to the engine tensor named ``"input"``. Its
            shape is set as the input shape on the context, and its flattened
            contents are copied into the first host input buffer, so the
            element count has to match that buffer.

    Returns:
        The module-level ``outputs`` list of ``(host_buffer, device_buffer)``
        pairs. Only ``outputs[0]`` is copied back from the device. The host
        buffers are reused on every call, so copy anything that must survive
        the next ``infer`` call.
    """
    host_in, device_in = inputs[0]
    host_out, device_out = outputs[0]

    context.set_input_shape("input", input_numpy.shape)

    # Stage the flattened input in the page-locked host buffer.
    np.copyto(host_in, input_numpy.ravel())

    # Upload to the GPU. execute_v2 takes no stream argument, so nothing orders
    # it after a copy queued on `stream`; wait for the upload to finish first so
    # the engine never reads a half-written input buffer.
    cuda.memcpy_htod_async(device_in, host_in, stream)
    stream.synchronize()

    # Execute (blocking call).
    context.execute_v2(bindings)

    # Download the first output and wait until it has landed in host memory.
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()
    return outputs

In [7]:
def normalize(img):
    """Shift and scale a channels-first image with mean 0.5 and std 0.5.

    The statistics are shaped ``(3, 1, 1)``, so they broadcast over an input
    laid out as ``(3, H, W)``; values in ``[0, 1]`` end up in ``[-1, 1]``.

    Note: a later cell defines another ``normalize`` with channels-last
    statistics, which replaces this one once that cell has been run.
    """
    channel_shape = (3, 1, 1)
    offset = np.full(channel_shape, 0.5, dtype=np.float32)
    scale = np.full(channel_shape, 0.5, dtype=np.float32)
    centered = img - offset
    return centered / scale

In [28]:
def measure_inference_time(repetitions=100, batch_size=10, warmup=10, infer_fn=None):
    """Return the average wall-clock seconds per inference call on random input.

    Args:
        repetitions: Number of timed calls; must be positive.
        batch_size: Number of samples in the random ``(batch_size, 3, 112, 112)``
            float32 batch passed to every call.
        warmup: Number of untimed calls made before the measurement starts.
        infer_fn: Callable that receives the batch. Defaults to the
            module-level ``infer``.

    Returns:
        Elapsed seconds divided by ``repetitions``, i.e. the time per *batch*,
        not per sample.

    Raises:
        ValueError: If ``repetitions`` is not positive.
    """
    if repetitions <= 0:
        raise ValueError("repetitions must be a positive integer")

    run = infer if infer_fn is None else infer_fn

    # Only one batch is ever fed to the engine, so allocate exactly that much.
    # Sizing the array by `repetitions` would shrink the batch whenever
    # repetitions < batch_size.
    batch = np.random.randn(batch_size, 3, 112, 112).astype(np.float32)

    # Warm-up calls keep one-off initialisation cost out of the measurement.
    for _ in range(warmup):
        run(batch)

    # perf_counter is monotonic and high resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(repetitions):
        run(batch)
    elapsed = time.perf_counter() - start

    return elapsed / repetitions

# measure_inference_time() returns seconds per infer() call; scale to milliseconds for display.
print("Avg inference time:", measure_inference_time() * 1000, "ms")


Avg inference time: 6.216003894805908 ms


In [3]:
from src.constants import QDRANT_PORT, QDRANT_HOST
from qdrant_client import QdrantClient
from qdrant_client.http import models

# Name of the Qdrant collection that the face embedding is upserted into below.
collection_name = "faces"
# NOTE(review): QDRANT_PORT is passed as grpc_port, but prefer_grpc is not set here —
# confirm that gRPC is really the intended transport and that QDRANT_PORT is the gRPC port.
client = QdrantClient(QDRANT_HOST, grpc_port=QDRANT_PORT)

In [8]:
def normalize(img):
    """Shift and scale a channels-last image with mean 0.5 and std 0.5.

    The statistics are shaped ``(1, 1, 3)``, so they broadcast over an input
    laid out as ``(H, W, 3)``; values in ``[0, 1]`` end up in ``[-1, 1]``.

    This redefines the earlier ``normalize`` cell, whose statistics were
    shaped for channels-first input.
    """
    channel_shape = (1, 1, 3)
    offset = np.full(channel_shape, 0.5, dtype=np.float32)
    scale = np.full(channel_shape, 0.5, dtype=np.float32)
    centered = img - offset
    return centered / scale

In [9]:
# Preprocess one face image into the layout the benchmark cell feeds to infer():
# float32 with shape (1, 3, 112, 112).
image_bgr = cv.imread("amir.jpg")
if image_bgr is None:
    # cv.imread returns None instead of raising when the file is missing or unreadable.
    raise FileNotFoundError("Could not read image: amir.jpg")

# OpenCV decodes to BGR channel order.
# NOTE(review): the ArcFace reference preprocessing feeds RGB — confirm this
# matches how the checkpoint was exported.
image_rgb = cv.cvtColor(image_bgr, cv.COLOR_BGR2RGB)
if image_rgb.shape[:2] != (112, 112):
    # Assumes the picture is already a cropped/aligned face; a plain resize only
    # fixes the spatial size, it does not align anything.
    image_rgb = cv.resize(image_rgb, (112, 112))

# normalize() (as defined in the preceding cell) works on channels-last data,
# so scale to [0, 1] and normalize before changing the layout.
image_hwc = normalize(image_rgb.astype(np.float32) / 255)

# HWC -> CHW, then add the batch axis so the flattened copy inside infer()
# is in NCHW order.
image = np.ascontiguousarray(image_hwc.transpose(2, 0, 1)[np.newaxis], dtype=np.float32)

# infer() returns (host, device) buffer pairs; the host buffer is reused on the
# next call, so convert it to a plain Python list right away.
embedding_host = infer(image)[0][0]
vec = embedding_host.tolist()

In [10]:
# Store the embedding as point 1 of the collection, tagged with the person's name.
face_point = models.PointStruct(id=1, vector=vec, payload={"name": "amir"})
client.upsert(collection_name=collection_name, points=[face_point])

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)