Create the INT8

In [None]:
import numpy as np
import os

calibration_data_path = "/home/guoy/led_detection/training/yolov8/yolo_val/extra_npy"
files = sorted([os.path.join(calibration_data_path, f) for f in os.listdir(calibration_data_path) if f.endswith(".npy")])

for f in files[:5]:  # 只检查前5个
    data = np.load(f)
    print(f"检查 {f}: shape={data.shape}, dtype={data.dtype}, min={data.min()}, max={data.max()}")


尝试1

In [4]:
import tensorrt as trt
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit
import ctypes
import onnx

class RetinaNetInt8Calibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, calibration_data_folder, onnx_model_path, batch_size=1):
        super(RetinaNetInt8Calibrator, self).__init__()
        self.batch_size = batch_size
        self.data_files = [os.path.join(calibration_data_folder, f) 
                           for f in os.listdir(calibration_data_folder) if f.endswith(".npy")]
        self.data_files.sort()
        self.current_index = 0
        self.input_shape = self.get_input_shape(onnx_model_path)
        self.host_input = np.empty(self.input_shape, dtype=np.float32)  # 直接用 NumPy 申请 Host 内存
    
    def get_input_shape(self, onnx_model_path):
        onnx_model = onnx.load(onnx_model_path)
        input_tensor = onnx_model.graph.input[0]
        input_shape = []
        for dim in input_tensor.type.tensor_type.shape.dim:
            if dim.dim_value is not None and dim.dim_value > 0:
                input_shape.append(dim.dim_value)
            else:
                input_shape.append(self.batch_size)  # 避免 None 值
        print("Expected input shape:", input_shape)
        return tuple(input_shape)

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        if self.current_index + self.batch_size > len(self.data_files):
            return None  # 结束校准
        
        batch_files = self.data_files[self.current_index:self.current_index + self.batch_size]
        batch_data = [np.load(f).astype(np.float32) for f in batch_files]
        batch_data = np.stack(batch_data, axis=0)  # 确保 batch 维度正确
        self.current_index += self.batch_size
        
        if batch_data.shape != self.input_shape:
            raise ValueError(f"Batch shape {batch_data.shape} does not match expected input shape {self.input_shape}")
        
        np.copyto(self.host_input, batch_data)  # 直接复制数据到 Host 端固定页内存
        return [ctypes.c_void_p(self.host_input.ctypes.data)]  # 确保返回的是 `void*` 指针
    
    def read_calibration_cache(self):
        cache_file = "calibration.cache"
        if os.path.exists(cache_file):
            with open(cache_file, "rb") as f:
                return f.read()
        return None
    
    def write_calibration_cache(self, cache):
        with open("calibration.cache", "wb") as f:
            f.write(cache)

# 用法示例：使用 TensorRT 进行 INT8 量化转换
onnx_model_path = "/home/guoy/led_detection/training/RetinaNet/RetinaNet_v1.onnx"
calibration_data_folder = "/home/guoy/led_detection/training/yolov8/yolo_val/extra_npy_fixed"
int8_calibrator = RetinaNetInt8Calibrator(calibration_data_folder, onnx_model_path)

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open(onnx_model_path, "rb") as model_file:
    if not parser.parse(model_file.read()):
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        raise ValueError("Failed to parse ONNX model")

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.INT8)
config.int8_calibrator = int8_calibrator

serialized_engine = builder.build_serialized_network(network, config)

if serialized_engine is None:
    raise RuntimeError("Failed to build TensorRT engine")

engine_path = "/home/guoy/led_detection/training/RetinaNet/RetinaNet_v1_int8.engine"
with open(engine_path, "wb") as f:
    f.write(serialized_engine)

print(f"INT8 TensorRT engine saved at {engine_path}")


Expected input shape: [1, 3, 640, 640]
[03/20/2025-23:11:58] [TRT] [I] The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.


  config.int8_calibrator = int8_calibrator


[03/20/2025-23:12:00] [TRT] [I] Calibration table does not match calibrator algorithm type.
[03/20/2025-23:12:00] [TRT] [I] Perform graph optimization on calibration graph.
[03/20/2025-23:12:00] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[03/20/2025-23:12:00] [TRT] [I] Compiler backend is used during engine build.
[03/20/2025-23:12:01] [TRT] [I] Detected 1 inputs and 3 output network tensors.
[03/20/2025-23:12:02] [TRT] [I] Total Host Persistent Memory: 313120 bytes
[03/20/2025-23:12:02] [TRT] [I] Total Device Persistent Memory: 673280 bytes
[03/20/2025-23:12:02] [TRT] [I] Max Scratch Memory: 2151424 bytes
[03/20/2025-23:12:02] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 315 steps to complete.
[03/20/2025-23:12:02] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 43.6311ms to assign 34 blocks to 315 nodes requiring 70206976 bytes.
[03/20/2025-23:12:02] [TRT] [I] Total Activation Memory: 70206976

[ERROR] Exception caught in get_batch(): Unable to cast Python instance to C++ type (#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for details)


RuntimeError: Failed to build TensorRT engine

尝试2

In [1]:
import tensorrt as trt
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit

onnx_file = "RetinaNet_v1.onnx"
trt_engine_path = "RetinaNet_16_int8.trt"


# 🔹 创建 TensorRT Logger
logger = trt.Logger(trt.Logger.INFO)

# 🔹 创建 TensorRT Builder 和 Network
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

# 🔹 解析 ONNX 模型
with open(onnx_file, 'rb') as model:
    if not parser.parse(model.read()):
        for i in range(parser.num_errors):
            print(f"ONNX Parsing Error {i}: {parser.get_error(i)}")
        raise RuntimeError("Failed to parse ONNX model")
    else:
        print("ONNX model successfully parsed!")

# 🔹 创建 TensorRT 配置
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)  # 8GB 工作空间
config.set_flag(trt.BuilderFlag.INT8)  # 开启 INT8 模式

# **INT8 校准器**
class Int8EntropyCalibrator2(trt.IInt8EntropyCalibrator2):
    def __init__(self, calibration_data_path, batch_size=1):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.batch_size = batch_size
        self.data_path = calibration_data_path
        self.data_files = sorted(os.listdir(calibration_data_path))  # 获取所有图片
        self.current_index = 0
        self.device_input = None
        
        # 预加载校准数据
        self.data = []
        for file in self.data_files:
            file_path = os.path.join(calibration_data_path, file)
            img = np.load(file_path, allow_pickle=False).astype(np.float32)
            if img.shape == (640, 640, 3):
                img = img.transpose(2, 0, 1)  # 转换为 (3, 640, 640)
            img = img.reshape(1, 3, 640, 640)  # 确保是 (1, 3, 640, 640)
            img = np.ascontiguousarray(img)

            self.data.append(img)
        
        self.device_input = cuda.mem_alloc(self.batch_size * self.data[0].nbytes)  # 申请显存
        
    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        print(f"🧐 正在获取 batch, 当前索引: {self.current_index}")
        if self.current_index + self.batch_size > len(self.data):
            return None  # 结束
        batch = self.data[self.current_index:self.current_index + self.batch_size]
        
        # 先在 CPU 上处理数据
        batch = np.ascontiguousarray(batch, dtype=np.float32)  # 确保是 float32
        batch = batch.ravel()  # 确保是一维数据
        
        # **强制拷贝到 CPU 并转换**
        batch_cpu = np.empty_like(batch, dtype=np.float32)
        np.copyto(batch_cpu, batch)  # 确保数据从 GPU 到 CPU 再传输
        
        # 拷贝到 GPU
        cuda.memcpy_htod(self.device_input, batch_cpu)
        self.current_index += self.batch_size
        return [self.device_input]

    def read_calibration_cache(self):
        return None  # 不使用缓存

    def write_calibration_cache(self, cache):
        pass  # 不写入缓存

# Assuming you have a folder with calibration data in .npy format
calibration_data_folder = "/home/guoy/led_detection/training/yolov8/yolo_val/extra_npy"

# 赋值 INT8 校准器
config.int8_calibrator = Int8EntropyCalibrator2(calibration_data_folder)

# 创建优化配置
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 640, 640), (1, 3, 640, 640), (1, 3, 640, 640))  
config.add_optimization_profile(profile)

# Build the TensorRT engine
print("🚀 Building TensorRT engine, please wait...")
engine = builder.build_serialized_network(network, config)
if engine is None:
    print("❌ Failed to build the TensorRT engine!")
    exit(1)

# Save the engine to a file
with open(trt_engine_path, "wb") as f:
    f.write(engine)
print(f"✅ TensorRT engine has been saved to {trt_engine_path}")


[03/20/2025-23:12:13] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 40, GPU 1794 (MiB)
[03/20/2025-23:12:15] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +2754, GPU +446, now: CPU 2996, GPU 2240 (MiB)
ONNX model successfully parsed!


  config.int8_calibrator = Int8EntropyCalibrator2(calibration_data_folder)


🚀 Building TensorRT engine, please wait...
[03/20/2025-23:12:15] [TRT] [I] Perform graph optimization on calibration graph.
[03/20/2025-23:12:15] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[03/20/2025-23:12:15] [TRT] [I] Compiler backend is used during engine build.
[03/20/2025-23:12:17] [TRT] [I] Detected 1 inputs and 3 output network tensors.
[03/20/2025-23:12:18] [TRT] [I] Total Host Persistent Memory: 313120 bytes
[03/20/2025-23:12:18] [TRT] [I] Total Device Persistent Memory: 673280 bytes
[03/20/2025-23:12:18] [TRT] [I] Max Scratch Memory: 2151424 bytes
[03/20/2025-23:12:18] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 315 steps to complete.
[03/20/2025-23:12:18] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 38.375ms to assign 34 blocks to 315 nodes requiring 70206976 bytes.
[03/20/2025-23:12:18] [TRT] [I] Total Activation Memory: 70206976 bytes
[03/20/2025-23:12:18] [TRT] [I] Total Weigh

TypeError: a bytes-like object is required, not 'NoneType'

个人觉得最接近正确的代码

In [None]:
import tensorrt as trt
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit
import ctypes
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

onnx_file = "./RetinaNet_v3.onnx"
engine_file = "./model_8.trt"
calibration_data_path = "./cali_datei"

# ✅ TensorRT Logger
logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

# ✅ Load ONNX Model
with open(onnx_file, "rb") as model:
    if not parser.parse(model.read()):
        for i in range(parser.num_errors):
            print(f"ONNX Parsing Error {i}: {parser.get_error(i)}")
        raise RuntimeError("Failed to parse ONNX model")
    else:
        print("✅ ONNX model successfully parsed!")

# ✅ Create TensorRT Config
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)
config.set_flag(trt.BuilderFlag.INT8)  # 🔥 Enable INT8
config.set_flag(trt.BuilderFlag.DISABLE_TIMING_CACHE)  # 🔥 Avoid Timing Cache Errors

import tensorrt as trt
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit
import ctypes

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, calibration_data_folder, batch_size=1, cache_file="calibration.cache"):
        super(Int8Calibrator, self).__init__()

        self.batch_size = batch_size
        self.cache_file = cache_file

        # 🔥 获取所有 .npy 文件
        self.image_paths = sorted([
            os.path.join(calibration_data_folder, f)
            for f in os.listdir(calibration_data_folder) if f.endswith(".npy")
        ])
        self.current_index = 0

        # ✅ 预分配 GPU 设备内存 (Device Memory)
        self.device_input = cuda.mem_alloc(batch_size * 3 * 640 * 640 * np.float32().nbytes)

        # ✅ 预分配 CPU (Host) 内存
        self.pinned_memory = np.zeros((batch_size, 3, 640, 640), dtype=np.float32)

        # ✅ 创建一个生成器 (batches) 提供数据
        self.batches = self._batch_generator()

        print(f"✅ Initialized Int8Calibrator with {len(self.image_paths)} calibration images")

    def _batch_generator(self):
        """ 生成校准数据 """
        for i in range(0, len(self.image_paths), self.batch_size):
            batch = []
            for j in range(self.batch_size):
                if i + j < len(self.image_paths):
                    npy_file = self.image_paths[i + j]
                    img = np.load(npy_file).astype(np.float32)

                if img.shape == (1, 3, 640, 640):
                    img = img.squeeze(0)

                    batch.append(img)

            if len(batch) > 0:
                yield np.array(batch)

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        try:
            # ✅ 从生成器获取数据
            data = next(self.batches)

            print(f"✅ Loaded batch, shape: {data.shape}")

            # ✅ 复制数据到 GPU
            cuda.memcpy_htod(self.device_input, data)

            # ✅ 返回 GPU 设备内存指针
            return [int(self.device_input)]

        except StopIteration:
            print("❌ No more calibration data available")
            return None

    def read_calibration_cache(self):
        """✅ 读取 INT8 校准缓存"""
        try:
            with open(self.cache_file, "rb") as f:
                cache = f.read()
            print(f"✅ Using existing INT8 calibration cache: {self.cache_file}")
            return cache
        except FileNotFoundError:
            print(f"❌ Calibration cache not found, running fresh calibration")
            return None

    def write_calibration_cache(self, cache):
        """✅ 写入 INT8 校准缓存"""
        if cache is None or len(cache) == 0:
            print("❌ Calibration cache is empty, possible issue!")
        else:
            with open(self.cache_file, "wb") as f:
                f.write(cache)
            print(f"✅ Calibration cache saved: {self.cache_file}")

# ✅ Apply INT8 Calibrator
config.int8_calibrator = Int8Calibrator(calibration_data_path)

# ✅ Create Optimization Profile
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 640, 640), (1, 3, 640, 640), (1, 3, 640, 640))  # 🔥 Static batch size = 1
config.add_optimization_profile(profile)

# ✅ Build TensorRT INT8 Engine
serialized_engine = builder.build_serialized_network(network, config)

if serialized_engine is None:
    raise RuntimeError("❌ Failed to build TensorRT INT8 engine!")

# ✅ Save TensorRT INT8 Engine
with open(engine_file, "wb") as f:
    f.write(serialized_engine)

print(f"✅ INT8 TensorRT engine saved at: {engine_file}")


: 