In [1]:
import torch
import os 
os.environ['HF_ENDPOINT']='https://hf-mirror.com'
from transformers import AutoTokenizer, AutoModelForSequenceClassification
file_path ='/root/RAG/Rerank/model'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('maidalun1020/bce-reranker-base_v1',cache_dir=file_path)
model = AutoModelForSequenceClassification.from_pretrained(
    'maidalun1020/bce-reranker-base_v1',
    device_map=device,
    torch_dtype=torch.float16,
)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [15]:
pairs=[["你好，好久不见","是啊，好久不见"],["你好，你叫什么名字","我叫RAG，你好，你叫什么名字？"]]
output_ids = tokenizer(pairs, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
input_ids, attention_mask = output_ids['input_ids'], output_ids['attention_mask']
scores = model(input_ids.to(device), attention_mask.to(device), return_dict=True)

In [16]:
output_ids

{'input_ids': tensor([[     0,      6, 124084,  ...,      1,      1,      1],
        [     0,      6, 124084,  ...,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [17]:
type(scores)

transformers.modeling_outputs.SequenceClassifierOutput

In [18]:
scores[0]

tensor([[0.4136],
        [0.3779]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddmmBackward0>)

In [20]:
#将pytorch模型转化为onnx引擎
#1、定义输入张量的形状信息
input_id_=torch.randint(2,1000,(1,512)).to('cuda')
attention_mask_=torch.ones(1,512).long().to('cuda')
#转化模型
import torch
torch.onnx.export(
    model, #原模型
    (input_id_.to(torch.int64),attention_mask_.to(torch.int64)), #输入张量，接受一个张量或者元组
    "./rerank.onnx",
    export_params=True, #是否保存模型的权重信息
    opset_version=17, #17支持INormalizationLayer，防止溢出
    do_constant_folding=True,  #是否执行常量折叠优化
    input_names=['input_ids','attention_mask'], #输入的名字
    output_names=['output'],
    dynamic_axes={
        'input_ids':{0:'batch_size',1:'sequence_length'},
        'attention_mask':{0:'batch_size',1:'sequence_length'},
        'output':{0:'batch_size'}
    }          #可变长度，在NLP中批次和序列长度都是可变长度
    
)

In [None]:
# 将 PyTorch 模型转化为 ONNX 引擎
# 1、定义输入张量的形状信息
import numpy as np
import torch

# 创建输入张量
input_id_ = torch.randint(2, 1000, (1, 512),dtype=torch.int64).to('cuda')  # 在 GPU 上创建 input_ids
attention_mask_ = torch.ones((1, 512), dtype=torch.int64).to('cuda')  # 正确创建 attention_mask 并转到 GPU

# 转化模型
torch.onnx.export(
    model,  # 原模型
    (input_id_, attention_mask_),  # 输入张量，接受一个张量或者元组
    "tensorrt_engine/embedding.onnx",
    export_params=True,  # 是否保存模型的权重信息
    opset_version=17,  # 17支持 INormalizationLayer，防止溢出
    do_constant_folding=True,  # 是否执行常量折叠优化
    input_names=['input_ids', 'attention_mask'],  # 输入的名字
    output_names=['last_hidden_state', 'pooler_output'],  # 输出的名字
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence_length'},
        'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
        'last_hidden_state': {0: 'batch_size', 1: 'sequence_length'},
        'pooler_output': {0: 'batch_size'},
    }  # 可变长度，在 NLP 中批次和序列长度都是可变长度
)


In [1]:
import tensorrt as trt

In [1]:
#将onnx转化为tensorrt引擎
import sys
sys.path.append('/root/anaconda3/lib/python3.11/site-packages')
import tensorrt as trt
logger=trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger,namespace='')

True

In [2]:

builder=trt.Builder(logger)

In [3]:
config=builder.create_builder_config()

In [4]:
config.set_flag(trt.BuilderFlag.FP16)

In [5]:
profile = builder.create_optimization_profile()
profile.set_shape("input_ids", (20, 512),(64, 512),(200, 512))  # 输入的最小、默认批量大小、最大批次
profile.set_shape("attention_mask",(20,512),(64,512),(200,512))
config.add_optimization_profile(profile)

0

In [6]:
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE,1<<33)

In [8]:
network=builder.create_network()

In [9]:
parser=trt.OnnxParser(network,logger)

In [10]:
success=parser.parse_from_file('rerank.onnx')

[11/28/2024-09:38:55] [TRT] [W] ModelImporter.cpp:429: Make sure input input_ids has Int64 binding.
[11/28/2024-09:38:55] [TRT] [W] ModelImporter.cpp:429: Make sure input attention_mask has Int64 binding.


In [11]:
serialized_engine=builder.build_serialized_network(network,config)

In [12]:
with open('rerank_100k_8G_engine','wb') as f:
          f.write(serialized_engine)

In [2]:

import os
#CUDA_VISIBLE_DEVICES=0
#os.environ['CUDA_VISIBLE_DEVICES']='0'
#读取tensorrt引擎
import sys
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
logger=trt.Logger(trt.Logger.INFO)
runtime=trt.Runtime(logger)
trt.init_libnvinfer_plugins(logger,'')
with open('/root/autodl-tmp/tensorrt/bce_rerank_100k_8G_engine','rb') as f:
    serialized_engine=f.read()
    engine=runtime.deserialize_cuda_engine(serialized_engine)
import numpy as np
with engine.create_execution_context() as context:
    context.set_input_shape('attention_mask', (20,512))
    context.set_input_shape('input_ids', (20,512))
    input_data=np.arange(20,512,dtype=np.int64)
    attention_mask=np.ones((20,512),dtype=np.int64)
    pooler_output=np.empty((20,1),dtype=np.float32)
    d_input_ids=cuda.mem_alloc(input_data.nbytes)
    d_input_mask=cuda.mem_alloc(input_data.nbytes)
    d_pooler_output=cuda.mem_alloc(pooler_output.nbytes)
    context.set_tensor_address('input_ids', int(d_input_ids))
    context.set_tensor_address('attention_mask',int(d_input_mask))
    context.set_tensor_address('output', int(d_pooler_output))
    stream=cuda.Stream()
    cuda.memcpy_htod_async(d_input_ids,input_data,stream)
    cuda.memcpy_htod_async(d_input_mask,input_data,stream)
    bindings = [int(d_input_ids),int(d_input_mask),int(d_pooler_output)]
    context.execute_v2(bindings)
    cuda.memcpy_dtoh(pooler_output,d_pooler_output)

print(pooler_output)

[12/13/2024-23:03:12] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[12/13/2024-23:03:12] [TRT] [I] Loaded engine size: 1063 MiB
[12/13/2024-23:03:12] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +1060, now: CPU 0, GPU 1060 (MiB)
[12/13/2024-23:03:12] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +6302, now: CPU 0, GPU 7362 (MiB)
[[-3.0896327e-08]
 [-3.2526316e-08]
 [-2.6624102e-08]
 [-3.4854573e-08]
 [-3.1129307e-08]
 [-2.9965001e-08]
 [-3.1594972e-08]
 [-2.5692788e-08]
 [-3.4156084e-08]
 [-3.1594968e-08]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]
 [ 0.0000000e+00]]
