## 步骤 1：基准测试 pytorch2.2

In [1]:
import torch
import torchvision.models as models
from torchvision.models import ResNet50_Weights
import time

# 1. 加载预训练模型
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).to("cuda")
model.eval()

# 2. 导出模型
torch.save(model, "model.pth")

# 3. 输入数据
raw_data = torch.randn((8, 3, 224, 224), dtype=torch.float32, device="cpu")
input_tensor = raw_data.to("cuda")

# 4. 进行推理
dummy = True
if dummy:
    output = model(input_tensor)
time_start = time.time()
num = 1000
for _ in range(num):
    with torch.inference_mode():
        output = model(input_tensor)
    predicted_class = torch.argmax(output, dim=1)
time_end = time.time()
print("Pytorch 推理结果：", predicted_class)
print("Pytorch 推理时间：", (time_end - time_start) / num)

  _torch_pytree._register_pytree_node(


Pytorch 推理结果： tensor([904, 490, 490, 490, 904, 490, 490, 490], device='cuda:0')
Pytorch 推理时间： 0.0043642823696136476


## 步骤 2：使用 ONNX1.7 优化模型

In [2]:
import torch
import torch.onnx
import onnxruntime as ort
import numpy as np
import time


# 1. 转换并导出静态和动态模型
torch.onnx.export(
    torch.load("model.pth", map_location=torch.device("cpu")),
    torch.randn(8, 3, 224, 224).to("cpu"),
    "model.onnx",
)
torch.onnx.export(
    torch.load("model.pth", map_location=torch.device("cpu")),
    torch.randn(1, 3, 224, 224).to("cpu"),
    "model_dynamic.onnx",
    export_params=True,
    do_constant_folding=False,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {
            0: "batch_size",
            2: "height",
            3: "width",
        },  # 设置输入张量的名称是'input'，仅固定通道维度
        "output": {
            0: "batch_size",
            2: "height",
            3: "width",
        },  # 设置输出张量的名称是'output'，仅固定通道维度
    },
)

# 2. 输入数据
input_data = np.asarray(raw_data, dtype=np.float32)

# 3. 创建推理会话
tensorrt_accelerate = False
if tensorrt_accelerate:
    sess_options = ort.SessionOptions()  # 创建会话配置
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL  # 顺序执行
    providers = [
        "TensorrtExecutionProvider",
        "CUDAExecutionProvider",
    ]  # 设置执行提供者
else:
    sess_options = ort.SessionOptions()  # 创建会话配置
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL  # 顺序执行
    sess_options.graph_optimization_level = (
        ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    )  # 启用所有适用的图优化技术
    providers = [
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ]  # 设置执行提供者
session = ort.InferenceSession("model_dynamic.onnx", sess_options, providers=providers)

# 4. 执行推理
dummy = True
if dummy:
    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name
    output_data = session.run([output_name], {input_name: input_data})
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name
start_time = time.time()
num = 1000
for i in range(num):
    output_data = session.run([output_name], {input_name: input_data})
    predicted_class = np.argmax(output_data[0], axis=1)
end_time = time.time()
print("ONNX 推理结果：", predicted_class)
print("ONNX 推理时间：", (time_end - time_start) / num)

ONNX 推理结果： [904 490 490 490 904 490 490 490]
ONNX 推理时间： 0.0043642823696136476


## 步骤 3：使用 TensorRT8.6 优化模型

### 原生接口

In [4]:
FP16 = False
if FP16:
    !trtexec --onnx=model_dynamic.onnx --minShapes=input:1x3x112x112 --optShapes=input:4x3x224x224 --maxShapes=input:8x3x448x448 --saveEngine=model_dynamic.trt --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
else:
    !trtexec --onnx=model_dynamic.onnx --minShapes=input:1x3x112x112 --optShapes=input:4x3x224x224 --maxShapes=input:8x3x448x448 --saveEngine=model_dynamic.trt --explicitBatch

&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=model_dynamic.onnx --minShapes=input:1x3x112x112 --optShapes=input:4x3x224x224 --maxShapes=input:8x3x448x448 --saveEngine=model_dynamic.trt --explicitBatch
[02/05/2024-14:33:50] [W] --explicitBatch flag has been deprecated and has no effect!
[02/05/2024-14:33:50] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[02/05/2024-14:33:50] [I] === Model Options ===
[02/05/2024-14:33:50] [I] Format: ONNX
[02/05/2024-14:33:50] [I] Model: model_dynamic.onnx
[02/05/2024-14:33:50] [I] Output:
[02/05/2024-14:33:50] [I] === Build Options ===
[02/05/2024-14:33:50] [I] Max batch: explicit batch
[02/05/2024-14:33:50] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[02/05/2024-14:33:50] [I] minTiming: 1
[02/05/2024-14:33:50] [I] avgTiming: 8
[02/05/2024-14:33:50] [I] Precision: FP32
[02/05/2024-14:33:50] [I

### Python接口

In [6]:
import pycuda.autoinit
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import time


# 1. 通过builder创建一个网络
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(EXPLICIT_BATCH)

# 2. 解析ONNX文件并设置构建配置（需要使用普通未经过优化的onnx模型）
parser = trt.OnnxParser(network, TRT_LOGGER)
with open("model_dynamic.onnx", "rb") as model:
    parser.parse(model.read())
config = builder.create_builder_config()  # 创建构建设置
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)  # 设置内存大小
profile = builder.create_optimization_profile()  # 创建优化配置
min_shape = (1, 3, 112, 112)  # 设置最小尺寸
opt_shape = (4, 3, 224, 224)  # 设置优先尺寸
max_shape = (8, 3, 448, 448)  # 设置最大尺寸
profile.set_shape("input", min_shape, opt_shape, max_shape)
config.add_optimization_profile(profile)  # 添加优化配置

# 3. 转换并导出模型
serialized_engine = builder.build_serialized_network(network, config)
with open("model.trt", "wb") as f:
    f.write(serialized_engine)

# 4. 创建引擎和上下文管理器
with open("model_dynamic.trt", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# 5. 分配输入输出的内存
stream = cuda.Stream()
input_data = np.asarray(raw_data, dtype=np.float32)
output_data = np.empty([raw_data.shape[0], 1000], dtype=np.float32)
d_input = cuda.mem_alloc(input_data.nbytes)
d_output = cuda.mem_alloc(output_data.nbytes)
bindings = [int(d_input), int(d_output)]
context.set_input_shape(engine.get_tensor_name(0), input_data.shape)


# 6. 推理和计时
def predict(input_data):
    # 将数据转换到驱动上去
    cuda.memcpy_htod_async(d_input, input_data, stream)
    # 异步执行模型
    context.execute_async_v2(bindings, stream.handle, None)  # 同步推理为execute_v2
    # 将数据从驱动上转换回来
    cuda.memcpy_dtoh_async(output_data, d_output, stream)
    # 阻塞调用线程从而同步CUDA流
    stream.synchronize()
    return output_data


dummy = True
if dummy:
    prediction = predict(input_data)
start_time = time.time()
num = 1000
for _ in range(num):
    prediction = predict(input_data)
    predicted_class = np.argmax(prediction, axis=1)
end_time = time.time()
print("TensorRT 预测结果：", predicted_class)
print("TensorRT 推理时间：", (end_time - start_time) / num)

[02/05/2024-14:36:10] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
TensorRT 预测结果： [904 490 490 490 904 490 490 490]
TensorRT 推理时间： 0.0033461697101593016
