## 步骤 1：基准测试

In [1]:
import torch
import torchvision.models as models
from torchvision.models import ResNet50_Weights
import time

# 1. 加载预训练模型
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).to("cuda")
model.eval()

# 2. 导出模型
torch.save(model, "model.pth")

# 3. 输入数据
input_tensor = torch.randn(8, 3, 224, 224).to("cuda")

# 4. 进行推理
time_start = time.time()
with torch.inference_mode():
    output = model(input_tensor)
time_end = time.time()
print("Pytorch 推理结果：", torch.argmax(output, dim=1))
print("Pytorch 推理时间：", time_end - time_start)

Pytorch 推理结果： tensor([904, 904, 490, 490, 490, 904, 490, 490], device='cuda:0')
Pytorch 推理时间： 0.18204164505004883


## 步骤 2：使用 ONNX 优化模型

（目前onnxruntime还不支持cuda12，无法启用gpu后端）

In [10]:
import torch
import torch.onnx
import onnxruntime as ort
import numpy as np
import time


# 1. 转换并导出静态和动态模型
torch.onnx.export(
    torch.load("model.pth", map_location=torch.device("cpu")),
    torch.randn(1, 3, 224, 224).to("cpu"),
    "model.onnx",
)
dynamic_axes = {
    "input": {0: "batch_size", 2: "height", 3: "width"},  # 设置输入张量的名称是'input'，仅固定通道维度
    "output": {0: "batch_size", 2: "height", 3: "width"},  # 设置输出张量的名称是'output'，仅固定通道维度
}
torch.onnx.export(
    torch.load("model.pth", map_location=torch.device("cpu")),
    torch.randn(1, 3, 224, 224).to("cpu"),
    "model_dynamic.onnx",
    export_params=True,
    do_constant_folding=False,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes=dynamic_axes,
)

# 2. 输入数据
input_data = np.random.rand(8, 3, 224, 224).astype(np.float32)
input_data = np.array(input_tensor.to("cpu"), dtype=np.float32)

# 3. 创建推理会话并
sess_options = ort.SessionOptions()  # 创建会话配置
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL  # 顺序执行
sess_options.graph_optimization_level = (
    ort.GraphOptimizationLevel.ORT_ENABLE_ALL
)  # 启用所有适用的图优化技术
sess_options.optimized_model_filepath = "model_optimized.onnx"  # 优化后的模型保存路径
providers = ["CPUExecutionProvider", "CUDAExecutionProvider"]  # 设置执行提供者
session = ort.InferenceSession("model_dynamic.onnx", sess_options, providers=providers)

# 4. 执行推理
start_time = time.time()
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name
output_data = session.run([output_name], {input_name: input_data})
end_time = time.time()
print("ONNX 推理结果：", np.argmax(output_data[0], axis=1))
print("ONNX 推理时间：", end_time - start_time)

ONNX 推理结果： [904 904 490 490 490 904 490 490]
ONNX 推理时间： 0.08050775527954102


## 步骤 3：使用 TensorRT 优化模型
（目前存在未知bug，推理结果不正常）

### 原生接口

In [None]:
FP16 = False
if FP16:
    !trtexec --onnx=model_dynamic.onnx --minShapes=input:1x3x112x112 --optShapes=input:1x3x224x224 --maxShapes=input:1x3x448x448 --saveEngine=model_dynamic.trt --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
else:
    !trtexec --onnx=model_dynamic.onnx --minShapes=input:1x3x112x112 --optShapes=input:1x3x224x224 --maxShapes=input:1x3x448x448 --saveEngine=model_dynamic.trt --explicitBatch

### Python接口

In [4]:
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import time


# 1. 通过builder创建一个网络
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(EXPLICIT_BATCH)

# 2. 解析ONNX文件并设置构建配置（需要使用普通未经过优化的onnx模型）
parser = trt.OnnxParser(network, TRT_LOGGER)
with open("model.onnx", "rb") as model:
    parser.parse(model.read())
config = builder.create_builder_config()  # 创建构建设置
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)  # 设置内存大小
profile = builder.create_optimization_profile()  # 创建优化配置
min_shape = (1, 3, 224, 224)  # 设置最小尺寸
opt_shape = (4, 3, 256, 256)  # 设置优先尺寸
max_shape = (8, 3, 384, 384)  # 设置最大尺寸
profile.set_shape("input", min_shape, opt_shape, max_shape)
config.add_optimization_profile(profile)  # 添加优化配置

# 3. 转换并导出模型
serialized_engine = builder.build_serialized_network(network, config)
with open("model.trt", "wb") as f:
    f.write(serialized_engine)

# 4. 创建引擎和上下文管理器
with open("model.trt", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

# 5. 分配输入输出的内存
BATCH_SIZE = 8
input_data = np.random.randn(BATCH_SIZE, 3, 224, 224).astype(np.float32)
input_data = np.array(input_tensor.to("cpu"), dtype=np.float32)
output_data = np.empty([BATCH_SIZE, 1000], dtype=np.float32)
d_input = cuda.mem_alloc(1 * input_data.nbytes)
d_output = cuda.mem_alloc(1 * output_data.nbytes)
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()


# 6. 推理和计时
def predict(input_data):
    # 将数据转换到驱动上去
    cuda.memcpy_htod_async(d_input, input_data, stream)
    # 异步执行模型
    context.execute_async_v2(bindings, stream.handle, None)  # 同步推理为execute_v2
    # 将数据从驱动上转换回来
    cuda.memcpy_dtoh_async(output_data, d_output, stream)
    # 阻塞调用线程从而同步CUDA流
    stream.synchronize()
    return output_data


start_time = time.time()
prediction = predict(input_data)
predicted_class = np.argmax(prediction, axis=1)
end_time = time.time()
print("TensorRT 预测结果：", predicted_class)
print("TensorRT 推理时间：", end_time - start_time)

TensorRT 预测结果： [490   0   0   0   0   0   0   0]
TensorRT 推理时间： 0.0017352104187011719
