# 模型部署————ONNX以及TensorRT使用教程  
> 本代码设备基本配置如下：  
> **显卡**：TITAN Xp 12G CUDA Version: 13.0  
> **torch版本**：2.3.0+cu121  
> **系统**： Ubuntu 22.04.3 LTS

In [None]:
!pip install tensorrt
!pip install onnx onnxruntime pycuda

## 导出ONNX模型

In [None]:
import time
import torch
import torch.nn as nn
import onnxruntime as ort
from PIL import Image
import numpy as np
from transformers import CLIPProcessor, CLIPModel

import warnings
warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

 
class ImgModelWrapper(nn.Module):
    """Single-input module that exposes only a model's image-feature branch.

    The wrapped ``model`` must provide ``get_image_features(pixel_values=...)``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pixel_values):
        """Return ``model.get_image_features`` evaluated on ``pixel_values``."""
        return self.model.get_image_features(pixel_values=pixel_values)

class TxtModelWrapper(nn.Module):
    """Two-input module that exposes only a model's text-feature branch.

    The wrapped ``model`` must provide
    ``get_text_features(input_ids=..., attention_mask=...)``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Return ``model.get_text_features`` for the given token ids and mask."""
        return self.model.get_text_features(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

def trans_clip_onnx(clip_model_name, image_path, text=None):
    """Export the image and text branches of a CLIP checkpoint to ONNX.

    Two files are written to the current working directory:
    ``clip_img.onnx`` (input ``pixel_values``) and ``clip_txt.onnx``
    (inputs ``input_ids`` and ``attention_mask``, both with dynamic batch
    and sequence axes).

    Args:
        clip_model_name: Identifier or path passed to ``from_pretrained``.
        image_path: Path of a sample image used as the tracing input for the
            image branch.
        text: Sample caption(s) used as the tracing input for the text
            branch. Defaults to ``["a photo of a cat"]``.
    """
    # Use a None sentinel rather than a list default, which would be one
    # shared object across every call.
    if text is None:
        text = ["a photo of a cat"]

    # Load the model and its preprocessing pipeline.
    model = CLIPModel.from_pretrained(clip_model_name, use_safetensors=True)
    processor = CLIPProcessor.from_pretrained(clip_model_name, use_safetensors=True)
    model.eval()  # same explicit inference mode as the other functions in this file

    # Build the sample inputs. Convert to RGB (as classify_image_and_compare
    # does) so grayscale / RGBA files are handled, and close the file handle.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(text=text, images=image, return_tensors="pt", padding='max_length')

    # Wrap each branch so the exporter sees a plain forward(...) signature.
    img_model = ImgModelWrapper(model)
    txt_model = TxtModelWrapper(model)

    torch.onnx.export(
        img_model,
        (inputs.pixel_values,),  # trailing comma: a real one-element args tuple
        "clip_img.onnx",
        opset_version=17,
        do_constant_folding=True,
        input_names=['pixel_values'],
    )
    torch.onnx.export(
        txt_model,
        (inputs.input_ids, inputs.attention_mask),
        "clip_txt.onnx",
        opset_version=17,
        do_constant_folding=True,
        input_names=['input_ids', 'attention_mask'],
        # Let the text model accept any number of captions of any token length.
        dynamic_axes={
            'input_ids': {0: 'batch', 1: 'seq'},
            'attention_mask': {0: 'batch', 1: 'seq'},
        },
    )

def test_model_pt(clip_model_name, image_path):
    """Time the image branch in PyTorch and in ONNX Runtime on one image.

    Prints the elapsed time of each backend and the largest absolute
    difference between the two feature vectors. ``clip_img.onnx`` must
    already exist in the current working directory.

    Args:
        clip_model_name: Identifier or path passed to ``from_pretrained``.
        image_path: Path of the image to run through both backends.
    """
    model = CLIPModel.from_pretrained(clip_model_name, use_safetensors=True)
    processor = CLIPProcessor.from_pretrained(clip_model_name, use_safetensors=True)
    model.eval()

    # Convert to RGB so grayscale / RGBA files are handled, and close the file.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")

    # ---- PyTorch ----
    start = time.perf_counter()
    with torch.no_grad():
        pt_features = model.get_image_features(pixel_values=inputs.pixel_values)
    pt_features = pt_features.cpu().numpy()
    print(f"原始推理使用时间：{time.perf_counter() - start:.2f} 秒")

    # ---- ONNX Runtime ----
    # Create the session and inputs before starting the clock, so the figure
    # reported as inference time does not include loading the model file
    # (the PyTorch model above was likewise loaded outside its timed region).
    ort_session = ort.InferenceSession("clip_img.onnx", providers=["CPUExecutionProvider"])
    ort_inputs = {"pixel_values": inputs.pixel_values.cpu().numpy()}

    start = time.perf_counter()
    ort_outs = ort_session.run(None, ort_inputs)
    print(f"ONNX 推理使用时间：{time.perf_counter() - start:.2f} 秒")

    # Report how closely the exported model reproduces the PyTorch output.
    max_diff = float(np.max(np.abs(pt_features - ort_outs[0])))
    print(f"最大绝对误差：{max_diff:.6e}")

def _best_label_by_cosine(image_features, text_features, labels):
    """Return ``(label, score)`` for the text embedding closest to the image.

    Both arrays are L2-normalised along the last axis, so each score is a
    cosine similarity. ``image_features`` is expected to hold one row and
    ``text_features`` one row per entry of ``labels``.
    """
    image_unit = image_features / np.linalg.norm(image_features, axis=-1, keepdims=True)
    text_unit = text_features / np.linalg.norm(text_features, axis=-1, keepdims=True)

    # (1, D) @ (D, N) -> (1, N); drop the image axis to get one score per label.
    scores = (image_unit @ text_unit.T).squeeze(0)
    best = int(np.argmax(scores))
    return labels[best], float(scores[best])


def classify_image_and_compare(clip_model_name, image_path, candidate_labels):
    """Zero-shot classify one image with PyTorch and with ONNX Runtime.

    For each backend, prints the best-matching label, its cosine similarity
    and the combined image + text inference time. ``clip_img.onnx`` and
    ``clip_txt.onnx`` must already exist in the current working directory.

    Args:
        clip_model_name: Identifier or path passed to ``from_pretrained``.
        image_path: Path of the image to classify.
        candidate_labels: Non-empty sequence of label strings to score.

    Raises:
        ValueError: If ``candidate_labels`` is empty.
    """
    # Fail early with a clear message instead of an opaque error further down.
    if len(candidate_labels) == 0:
        raise ValueError("candidate_labels must not be empty")

    model = CLIPModel.from_pretrained(clip_model_name, use_safetensors=True)
    processor = CLIPProcessor.from_pretrained(clip_model_name, use_safetensors=True)
    model.eval()

    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(text=candidate_labels, images=image, return_tensors="pt", padding=True)

    # ---------------- PyTorch inference ----------------
    t0 = time.time()
    with torch.no_grad():
        pt_image_features = model.get_image_features(pixel_values=inputs.pixel_values)   # shape (1, D)
        pt_text_features = model.get_text_features(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
        )  # shape (N, D)
    pt_time = time.time() - t0

    pt_best_label, pt_best_score = _best_label_by_cosine(
        pt_image_features.cpu().numpy(),
        pt_text_features.cpu().numpy(),
        candidate_labels,
    )

    # ---------------- ONNX inference ----------------
    # Sessions and input dicts are prepared outside the timed region so only
    # the two run() calls are measured.
    ort_img_sess = ort.InferenceSession("clip_img.onnx", providers=["CPUExecutionProvider"])
    ort_txt_sess = ort.InferenceSession("clip_txt.onnx", providers=["CPUExecutionProvider"])

    ort_inputs_img = {"pixel_values": inputs.pixel_values.cpu().numpy().astype(np.float32)}
    ort_inputs_txt = {
        "input_ids": inputs.input_ids.cpu().numpy().astype(np.int64),
        "attention_mask": inputs.attention_mask.cpu().numpy().astype(np.int64),
    }

    t1 = time.time()
    ort_img_out = ort_img_sess.run(None, ort_inputs_img)
    ort_txt_out = ort_txt_sess.run(None, ort_inputs_txt)
    onnx_time = time.time() - t1

    onnx_best_label, onnx_best_score = _best_label_by_cosine(
        ort_img_out[0],
        ort_txt_out[0],
        candidate_labels,
    )

    # ---------------- Side-by-side report ----------------
    print("=== PyTorch 结果 ===")
    print(f"预测标签: {pt_best_label}")
    print(f"相似度(score): {pt_best_score:.6f}")
    print(f"推理时间: {pt_time:.6f} 秒 (包含 image & text 推理)")

    print("\n=== ONNX 结果 ===")
    print(f"预测标签: {onnx_best_label}")
    print(f"相似度(score): {onnx_best_score:.6f}")
    print(f"推理时间: {onnx_time:.6f} 秒 (包含 image & text 推理)")

if __name__ == '__main__':
    import os

    clip_model = "openai/clip-vit-base-patch32"
    image_path = "./test_2.jpg"

    # The comparison below loads clip_img.onnx and clip_txt.onnx from the
    # working directory, so export them first when either one is missing.
    if not (os.path.isfile("clip_img.onnx") and os.path.isfile("clip_txt.onnx")):
        trans_clip_onnx(clip_model, image_path)

    candidate_labels = [
        "a photo of a cat",
        "a photo of a dog",
        "a photo of a car",
        "a photo of a building",
        "a photo of a person",
    ]
    classify_image_and_compare(clip_model, image_path, candidate_labels)

## TensorRT推理  
### Linux下安装TensorRT
在安装TensorRT前，**首先确保已经安装了CUDA和cuDNN**。**安装方式一**：`pip install tensorrt`。**安装方式二**：  
**首先**，下载`TensorRT`。访问[官网](https://developer.nvidia.com/tensorrt#)下载`TensorRT`安装包（Linux服务器可以先在Windows浏览器上获取下载地址，然后直接在终端执行 `wget -c 地址` 下载，这样就不需要再把安装包上传到服务器）
> 比如说下载链接：`https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.13.3/tars/TensorRT-10.13.3.9.Linux.x86_64-gnu.cuda-12.9.tar.gz?t=eyJscyI6ImdzZW8iLCJsc2QiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyJ9` 那么就可以直接：`wget -c "https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.13.3/tars/TensorRT-10.13.3.9.Linux.x86_64-gnu.cuda-12.9.tar.gz?t=eyJscyI6ImdzZW8iLCJsc2QiOiJodHRwczovL3d3dy5nb29nbGUuY29tLyJ9"      -O TensorRT-10.13.3.9.Linux.x86_64-gnu.cuda-12.9.tar.gz`

![](https://s2.loli.net/2025/10/16/dATZbpzSWYl1wyg.png)
**安装**，安装`TensorRT`  
*第一步、解压安装包*：
```bash
tar -xzvf TensorRT-10.13.3.9.Linux.x86_64-gnu.cuda-12.9.tar.gz
cd TensorRT-10.13.3.9
```
*第二步、添加环境变量*  
```bash
vim ~/.bashrc
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/hy-tmp/TensorRT-10.13.3.9/lib
export PATH=$PATH:/hy-tmp/TensorRT-10.13.3.9/bin
source ~/.bashrc
```  
*第三步、安装python包*  
```bash
python --version # Python 3.11.8
cd python
pip install tensorrt-10.13.3.9-cp311-none-linux_x86_64.whl 
```
> 解压后的 `python` 目录中一般都有 `tensorrt-x.x.x.x-cp3x-none-linux_x86_64.whl` 文件，其中 `cp` 后面表示的是 Python 版本（例如 `cp311` 对应 Python 3.11），`tensorrt` 后面表示的是 TensorRT 版本，安装时需选择与当前 Python 版本匹配的文件  

### TensorRT推理
TensorRT有自己的一套推理流程，我们在使用PyTorch或TensorFlow导出模型权重后，需要进一步转换。TensorRT最终需要的是一个TensorRT Engine，这个Engine是由TensorRT的Builder构建，而Builder需要一个TensorRT Network。TensorRT Network是由TensorRT Parser解析的ONNX模型构建的。
![](https://s2.loli.net/2025/10/17/VGwpv8hWS3XIfem.png)
**第一步、转换为TensorRT Engine**。需要完成操作：1、转换并填充Network对象；2、编写构建配置对象（Config）；3、编写优化配置对象（Optimization Profile）。整个过程参考下面的`build_engine_from_onnx`函数，对于其中更加具体的细节描述如下：
> 该函数主要有3个核心内容。1、创建3个核心对象：`builder`、`config`、`parser`，分别负责构建、配置、解析。

**1、使用显式批处理**：批处理分为显式批处理（explicit batch）和隐式批处理（implicit batch）。假设网络的输入张量形状为`(n, c, h, w)`，在隐式批处理模式下，网络只需要指定输入形状为`(c, h, w)`，批次维度是隐式的，并且在运行时可动态指定；在显式批处理模式下，批量维度需要在网络中显式定义，甚至可以不用批量维度，对于动态批量大小的需求，可以使用Optimization Profile配置动态形状。（注：TensorRT 10 起已移除隐式批处理，网络始终为显式批处理，`EXPLICIT_BATCH` 标志已被弃用并会被忽略，下面的写法主要用于兼容旧版本。）
>`network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))`  
  
**2、动态形状**：通过Optimization Profile的`set_shape`设定输入尺寸范围。其参数依次为：输入节点名称、可接受的最小输入尺寸、最优的输入尺寸、可接受的最大输入尺寸。


**第二步、使用TensorRT进行推理**。