### 在项目开始之前，我们首先导入相关的库包。

In [None]:
# 导入系统库
import abc
import sys
from functools import partial
import argparse
import os
import random
import time
# 导入python的其他库
import numpy as np
from scipy import stats
import pandas as pd
from tqdm import tqdm
from scipy.special import softmax
from scipy.special import expit
# 导入Paddle库
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import inference

#导入PaddleNLP相关的库
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import load_dataset, MapDataset
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.utils.downloader import get_path_from_url
from visualdl import LogWriter
from utils.data import convert_pairwise_example

In [None]:
# 数据读取逻辑
def read_simcse_text(data_path):
    """Reads data."""
    with open(data_path, 'r', encoding='utf-8') as f:
        for i,line in enumerate(f):
            if i==0:
                continue
            data = line.rstrip()
            # 这里的text_a和text_b是一样的
            yield {'text_a': data, 'text_b': data}

train_set_file='./train_demo.csv'
train_ds = load_dataset(read_simcse_text, data_path=train_set_file, lazy=False)
# 展示3条数据
for i  in range(3):
    print(train_ds[i])

#### 构建Dataloader

在训练神经网络之前，我们需要构建小批量的数据，所以需要借助Dataloader,在组装小批量的数据的之前我们认识一下下面的API:

| API                             | 简介                                       |
| ------------------------------- | :----------------------------------------- |
| `paddlenlp.data.Stack`          | 堆叠N个具有相同shape的输入数据来构建一个batch |
| `paddlenlp.data.Pad`            | 将长度不同的多个句子padding到统一长度，取N个输入数据中的最大长度 |
| `paddlenlp.data.Tuple`          | 将多个batchify函数包装在一起 |

更多数据处理操作详见： [https://paddlenlp.readthedocs.io/zh/latest/data_prepare/data_preprocess.html](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/data_preprocess.html)

In [None]:
# 由于文本是序列数据，数据对齐。如下面会对每个序列补0，和a的长度保持一致
a = [1, 2, 3, 4]
b = [5, 6, 7]
c = [8, 9]
result = Pad(pad_val=0)([a, b, c])
print("Padded Data: \n", result)
print()

# 组装minibatch需要使用
a = [1, 2, 3, 4]
b = [3, 4, 5, 6]
c = [5, 6, 7, 8]
result = Stack()([a, b, c])
print("Stacked Data: \n", result)
print()

data = [
        [[1, 2, 3, 4], [1]],
        [[5, 6, 7], [0]],
        [[8, 9], [1]],
       ]
batchify_fn = Tuple(Pad(pad_val=0), Stack())
ids, labels = batchify_fn(data)
print("ids: \n", ids)
print()
print("labels: \n", labels)
print()

In [None]:
# 明文数据 -> ID 序列训练数据

def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)

    shuffle = True if mode == 'train' else False
    if mode == 'train':
        # 分布式批采样器加载数据的一个子集。
        # 每个进程可以传递给DataLoader一个DistributedBatchSampler的实例，每个进程加载原始数据的一个子集。
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        # 批采样器的基础实现，
        # 用于 paddle.io.DataLoader 中迭代式获取mini-batch的样本下标数组，数组长度与 batch_size 一致。
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    # 组装mini-batch
    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)

def convert_example(example, tokenizer, max_seq_length=512, do_evalute=False):

    result = []

    for key, text in example.items():
        if 'label' in key:
            # do_evaluate
            result += [example['label']]
        else:
            # do_train
            encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
            input_ids = encoded_inputs["input_ids"]
            token_type_ids = encoded_inputs["token_type_ids"]
            result += [input_ids, token_type_ids]

    return result

# 语义索引的维度最大为64，可以根据自己的情况调节长度
max_seq_length=64
# 根据经验 batch_size越大效果越好
batch_size=32
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-3.0-medium-zh')
# 给convert_example赋予默认的值，如tokenizer，max_seq_length
trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length)
# [pad]对齐的函数
batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # tilte_segment
    ): [data for data in fn(samples)]

# 构建训练的Dataloader
train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

展示一下输入的dataloader的数据

In [5]:
for idx, batch in enumerate(train_data_loader):
    if idx == 0:
        print(batch)
        break

[Tensor(shape=[32, 62], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1    , 12056, 1441 , ..., 0    , 0    , 0    ],
        [1    , 338  , 3003 , ..., 0    , 0    , 0    ],
        [1    , 32   , 159  , ..., 0    , 0    , 0    ],
        ...,
        [1    , 296  , 242  , ..., 0    , 0    , 0    ],
        [1    , 19   , 266  , ..., 0    , 0    , 0    ],
        [1    , 12056, 1441 , ..., 0    , 0    , 0    ]]), Tensor(shape=[32, 62], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]), Tensor(shape=[32, 62], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1    , 12056, 1441 , ..., 0    , 0    , 0    ],
        [1    , 338  , 3003 , ..., 0    , 0    , 0    ],
        [1    , 32   , 159  , ..., 0    , 0    , 0    ],
        ...,
        [1    , 296  , 



上面展示的是一个batch的数据，包含两个Tensor，第一个Tensor表示的是input_ids，第二个Tensor表示的是token_type_ids；第一个Tensor中，32是batch_size的维度，64代表的是序列的长度，表示输入的文本的最大长度是64；第二个Tensor中，32表示的也是batch_size，64表示的是序列的长度。

#### 模型构建

接下来搭建SimCSE模型，主要部分是用query和title分别得到embedding向量，然后计算余弦相似度。

![](https://ai-studio-static-online.cdn.bcebos.com/fa84f7db963c4efd82b971f3ef477c070e5300acee3e4e788b4a6c56dd31c003)


上图是SimCSE的原理图，SimCSE主要是通过dropout来把同一个句子变成正样本（做两次前向，但是dropout有随机因素，所以产生的向量不一样，但是本质上还是表示的是同一句话），把一个batch里面其他的句子变成负样本的。

In [6]:
class SimCSE(nn.Layer):
    def __init__(self,
                 pretrained_model,
                 dropout=None,
                 margin=0.0,
                 scale=20,
                 output_emb_size=None):

        super().__init__()

        self.ptm = pretrained_model
        # 显式的加一个dropout来控制
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)

        # if output_emb_size is greater than 0, then add Linear layer to reduce embedding_size, 
        # 考虑到性能和效率，我们推荐把output_emb_size设置成256
        # 向量越大，语义信息越丰富，但消耗资源越多
        self.output_emb_size = output_emb_size
        if output_emb_size > 0:
            weight_attr = paddle.ParamAttr(
                initializer=nn.initializer.TruncatedNormal(std=0.02))
            self.emb_reduce_linear = paddle.nn.Linear(
                768, output_emb_size, weight_attr=weight_attr)

        self.margin = margin
        # 为了使余弦相似度更容易收敛，我们选择把计算出来的余弦相似度扩大scale倍，一般设置成20左右
        self.sacle = scale

    # 加入jit注释能够把该提取向量的函数导出成静态图
    # 对应input_id,token_type_id两个
    @paddle.jit.to_static(input_spec=[paddle.static.InputSpec(shape=[None, None], dtype='int64'),paddle.static.InputSpec(shape=[None, None], dtype='int64')])
    def get_pooled_embedding(self,
                             input_ids,
                             token_type_ids=None,
                             position_ids=None,
                             attention_mask=None,
                             with_pooler=True):

        # Note: cls_embedding is poolerd embedding with act tanh 
        sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids,
                                                  position_ids, attention_mask)

        if with_pooler == False:
            cls_embedding = sequence_output[:, 0, :]

        if self.output_emb_size > 0:
            cls_embedding = self.emb_reduce_linear(cls_embedding)
        cls_embedding = self.dropout(cls_embedding)
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/functional/normalize_cn.html
        cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
        return cls_embedding

    def forward(self,
                query_input_ids,
                title_input_ids,
                query_token_type_ids=None,
                query_position_ids=None,
                query_attention_mask=None,
                title_token_type_ids=None,
                title_position_ids=None,
                title_attention_mask=None):
        
        # 第 1 次编码: 文本经过无监督语义索引模型编码后的语义向量 
        # [N, 768]
        query_cls_embedding = self.get_pooled_embedding(
            query_input_ids, query_token_type_ids, query_position_ids,
            query_attention_mask)

        # 第 2 次编码: 文本经过无监督语义索引模型编码后的语义向量 
        # [N, 768]
        title_cls_embedding = self.get_pooled_embedding(
            title_input_ids, title_token_type_ids, title_position_ids,
            title_attention_mask)

        # 相似度矩阵: [N, N]
        cosine_sim = paddle.matmul(
            query_cls_embedding, title_cls_embedding, transpose_y=True)

        # substract margin from all positive samples cosine_sim()
        # 填充self.margin值，比如margin为0.2，query_cls_embedding.shape[0]=2 
        # margin_diag: [0.2,0.2]
        margin_diag = paddle.full(
            shape=[query_cls_embedding.shape[0]],
            fill_value=self.margin,
            dtype=paddle.get_default_dtype())
        # input paddle.diag(margin_diag): [[0.2,0],[0,0.2]]
        # input cosine_sim : [[1.0,0.6],[0.6,1.0]]
        # output cosine_sim: [[0.8,0.6],[0.6,0.8]]
        cosine_sim = cosine_sim - paddle.diag(margin_diag)

        # scale cosine to ease training converge
        cosine_sim *= self.sacle

        # 转化成多分类任务: 对角线元素是正例，其余元素为负例
        # labels : [0,1,2,3]
        labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64')
        # labels : [[0],[1],[2],[3]]
        labels = paddle.reshape(labels, shape=[-1, 1])

        # 交叉熵损失函数
        loss = F.cross_entropy(input=cosine_sim, label=labels)
        return loss

You can set full_graph=True, then you can assign input spec.




上述代码的相似度矩阵计算示例如下：

![](https://ai-studio-static-online.cdn.bcebos.com/abf8d97a72d34eefafe6610b63aba654bdf853bf5aae4f4287ca7e2d83beab63)

#### 训练配置



In [7]:
# 关键参数
scale=20 # 推荐值: 10 ~ 30
margin=0.1 # 推荐值: 0.0 ~ 0.2
# SimCSE的dropout的参数，也可以使用预训练语言模型默认的dropout参数
dropout=0.2
# 向量映射的维度，默认的输出是768维，推荐通过线性层映射成256维
output_emb_size=256
# 训练的epoch数目
epochs=1
weight_decay=0.0
# 学习率
learning_rate=5E-5
warmup_proportion=0.0

#### 加载预训练模型
1. 加载预训练模型 ERNIE 3.0-Medium 进行热启
2. 定义优化器 AdamOptimizer

In [8]:
# 设置 ERNIE-3.0-Medium-zh 预训练模型
model_name_or_path='ernie-3.0-medium-zh'
pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
       model_name_or_path,
       hidden_dropout_prob=dropout,
       attention_probs_dropout_prob=dropout)
print("loading model from {}".format(model_name_or_path))

# 实例化SimCSE，SimCSE使用的Encoder是ERNIE-3.0-Medium-zh
model = SimCSE(
        pretrained_model,
        margin=margin,
        scale=scale,
        output_emb_size=output_emb_size)
# 训练的总步数
num_training_steps = len(train_data_loader) * epochs
# warmpup操作，学习率先上升后下降
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps,
                                         warmup_proportion)

# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
# 设置优化器
optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)


[32m[2024-06-01 15:36:50,728] [    INFO][0m - Already cached C:\Users\kitx86\.paddlenlp\models\ernie-3.0-medium-zh\model_state.pdparams[0m
[32m[2024-06-01 15:36:50,729] [    INFO][0m - Loading weights file model_state.pdparams from cache at C:\Users\kitx86\.paddlenlp\models\ernie-3.0-medium-zh\model_state.pdparams[0m
[32m[2024-06-01 15:36:51,176] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
- This IS expected if you are initializing ErnieModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).[0m
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

loading model from ernie-3.0-medium-zh


#### 模型训练

上面的训练配置完毕以后，下面就可以开始训练了。

In [None]:
save_dir='checkpoint'
save_steps=100
time_start=time.time()
global_step = 0
tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
        # 其中query和title为同一条数据
        loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)
        # 每隔10个step进行打印日志
        global_step += 1
        if global_step % 10 == 0:
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
            tic_train = time.time()
        # 反向
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
        # 每隔save_steps保存模型
        if global_step % save_steps == 0:
            save_path = os.path.join(save_dir, "model_%d" % (global_step))
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            save_param_path = os.path.join(save_path, 'model_state.pdparams')
            paddle.save(model.state_dict(), save_param_path)
            tokenizer.save_pretrained(save_path)
time_end=time.time()
print('totally cost {} seconds'.format(time_end-time_start))

#### 模型预测

由于本项目使用的demo数据，在预测部分为了保证效果，我们使用已经用全量数据训练好的模型，首先下载训练好的SimCSE模型，然后进行解压

In [10]:
if(not os.path.exists('simcse_model.zip')):
    get_path_from_url('https://bj.bcebos.com/v1/paddlenlp/models/simcse_model.zip',root_dir='.')
# 解压SimCSE模型
# !unzip -o simcse_model.zip -d pretrained/

In [11]:
from utils.data import convert_example_test

# 加载预训练好的无监督语义索引模型 SimCSE
params_path='./pretrained/model_20000/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)
# 定义两条文本数据
test_data = ['国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据', '语义检索相关的论文']
# 给convert_example_test赋予默认值
test_func = partial(
        convert_example_test,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length)
# pad对齐操作
test_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # text_segment
    ): [data for data in fn(samples)]

# conver_example function's input must be dict
corpus_ds = MapDataset(test_data)
# 构造Dataloader
corpus_data_loader = create_dataloader(
        corpus_ds,
        mode='predict',
        batch_size=batch_size,
        batchify_fn=test_batchify_fn,
        trans_fn=test_func)

all_embeddings = []
# 切换成eval模式，固定住 dropout
model.eval()
# 预测的时候不保存梯度
with paddle.no_grad():
    for batch_data in corpus_data_loader:
        input_ids, token_type_ids = batch_data
        input_ids = paddle.to_tensor(input_ids, dtype='int64')
        token_type_ids = paddle.to_tensor(token_type_ids, dtype='int64')
        # 抽取向量
        text_embeddings = model.get_pooled_embedding(input_ids, token_type_ids)
        all_embeddings.append(text_embeddings)

text_embedding=all_embeddings[0]
print(text_embedding.shape)
print(text_embedding.numpy())

You can set full_graph=True, then you can assign input spec.



[2, 256]
[[-2.38936171e-02 -1.86179597e-02 -3.87326293e-02 -2.14510243e-02
   5.80730699e-02 -7.30442628e-02  4.47707027e-02 -4.79865447e-03
  -1.11777507e-01  3.06986496e-02 -7.58780241e-02 -1.39245540e-02
  -3.18336859e-02  7.34863579e-02 -6.20127991e-02 -7.75121376e-02
  -2.94753723e-02 -1.10602945e-01  1.43525824e-02  7.38698058e-03
  -6.20281848e-04 -1.13605276e-01  5.09278849e-03  1.68028884e-02
  -9.10069644e-02 -4.97759227e-03  9.68626812e-02 -4.01212536e-02
  -4.31677513e-02 -6.29915670e-02 -2.77599022e-02 -1.28067151e-01
   4.59447299e-04  1.12159485e-02 -2.34453306e-02 -1.29382834e-02
  -9.12685394e-02  8.15406442e-02 -7.75527675e-03  3.35520543e-02
  -7.50929788e-02 -6.48786947e-02  8.32832009e-02 -3.95188704e-02
   1.33434922e-01 -7.62892142e-02  2.10594863e-01  9.67187062e-02
  -4.11144681e-02  5.90532506e-03  2.87344866e-02 -1.25124678e-01
  -9.30405185e-02 -1.03214726e-01  1.13968330e-03 -1.43238613e-02
   9.50518623e-02 -1.65595077e-02 -4.30812314e-02  5.52676395e-02
 

从输出结果可以看出，两条文本被抽取成了2条256维度的向量。

### 有监督语义索引
#### 数据准备

使用文献的的query, title, keywords，构造带正标签的数据集，不包含负标签样本

```
宁夏社区图书馆服务体系布局现状分析	       宁夏社区图书馆服务体系布局现状分析社区图书馆,社区图书馆服务,社区图书馆服务体系
人口老龄化对京津冀经济	                 京津冀人口老龄化对区域经济增长的影响京津冀,人口老龄化,区域经济增长,固定效应模型
英语广告中的模糊语	                  模糊语在英语广告中的应用及其功能模糊语,英语广告,表现形式,语用功能
甘氨酸二肽的合成	                      甘氨酸二肽合成中缩合剂的选择甘氨酸,缩合剂,二肽
```

In [12]:
def read_text_pair(data_path):
    """Reads data."""
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = line.rstrip().split("\t")
            if len(data) != 2:
                continue
            # 可以看到有监督数据使用query title pair的
            # 所以text_a和text_b不一样
            yield {'text_a': data[0], 'text_b': data[1]}

train_set_file='./train.csv'
train_ds = load_dataset(
        read_text_pair, data_path=train_set_file, lazy=False)
# 打印3条文本
for i in range(3):
    print(train_ds[i])

{'text_a': '从《唐律疏义》看唐代封爵贵族的法律特权', 'text_b': '从《唐律疏义》看唐代封爵贵族的法律特权《唐律疏义》,封爵贵族,法律特权'}
{'text_a': '宁夏社区图书馆服务体系布局现状分析', 'text_b': '宁夏社区图书馆服务体系布局现状分析社区图书馆,社区图书馆服务,社区图书馆服务体系'}
{'text_a': '人口老龄化对京津冀经济', 'text_b': '京津冀人口老龄化对区域经济增长的影响京津冀,人口老龄化,区域经济增长,固定效应模型'}


可以看到有监督的In-batch Negatives的训练输入的文本text_a,text_b是不一样的，表示的是text_a和text_b是相似的文本。

#### 模型构建

In [13]:
from utils.base_model import SemanticIndexBase

class SemanticIndexBatchNeg(SemanticIndexBase):
    def __init__(self,
                 pretrained_model,
                 dropout=None,
                 margin=0.3,
                 scale=30,
                 output_emb_size=None):
        super().__init__(pretrained_model, dropout, output_emb_size)

        self.margin = margin
        # Used scaling cosine similarity to ease converge
        self.sacle = scale

    def forward(self,
                query_input_ids,
                title_input_ids,
                query_token_type_ids=None,
                query_position_ids=None,
                query_attention_mask=None,
                title_token_type_ids=None,
                title_position_ids=None,
                title_attention_mask=None):

        query_cls_embedding = self.get_pooled_embedding(
            query_input_ids, query_token_type_ids, query_position_ids,
            query_attention_mask)

        title_cls_embedding = self.get_pooled_embedding(
            title_input_ids, title_token_type_ids, title_position_ids,
            title_attention_mask)

        cosine_sim = paddle.matmul(
            query_cls_embedding, title_cls_embedding, transpose_y=True)

        # substract margin from all positive samples cosine_sim()
        margin_diag = paddle.full(
            shape=[query_cls_embedding.shape[0]],
            fill_value=self.margin,
            dtype=paddle.get_default_dtype())

        cosine_sim = cosine_sim - paddle.diag(margin_diag)

        # scale cosine to ease training converge
        cosine_sim *= self.sacle

        labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64')
        labels = paddle.reshape(labels, shape=[-1, 1])

        loss = F.cross_entropy(input=cosine_sim, label=labels)

        return loss

从模型层面来讲SimCSE的结构和Inbatch-Negatives的网络结构没有区别，唯一最大的区别是训练过程使用了有监督的数据。

#### 训练配置

定义模型训练的超参，优化器等等。

In [14]:
# 关键参数
scale=20 # 推荐值: 10 ~ 30
margin=0.1 # 推荐值: 0.0 ~ 0.2
# 最大序列长度
max_seq_length=64
epochs=1
learning_rate=5E-5
warmup_proportion=0.0
weight_decay=0.0
save_steps=10
batch_size=64
output_emb_size=256

In [15]:
pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-3.0-medium-zh')
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-3.0-medium-zh')
trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length) 

batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # tilte_segment
    ): [data for data in fn(samples)]  

train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
# Inbatch-Negatives
model = SemanticIndexBatchNeg(
        pretrained_model,
        margin=margin,
        scale=scale,
        output_emb_size=output_emb_size)

num_training_steps = len(train_data_loader) * epochs

lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps,
                                         warmup_proportion) 

# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)  

[32m[2024-06-01 15:37:42,301] [    INFO][0m - Already cached C:\Users\kitx86\.paddlenlp\models\ernie-3.0-medium-zh\model_state.pdparams[0m
[32m[2024-06-01 15:37:42,302] [    INFO][0m - Loading weights file model_state.pdparams from cache at C:\Users\kitx86\.paddlenlp\models\ernie-3.0-medium-zh\model_state.pdparams[0m
[32m[2024-06-01 15:37:42,510] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
- This IS expected if you are initializing ErnieModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).[0m
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

#### 模型训练

模型训练过程如下：

1.从dataloader中取出小批量数据

2.输入到模型中做前向

3.求损失函数

3.反向传播更新梯度

In [None]:
def do_train(model,train_data_loader):
    
    global_step = 0
    tic_train = time.time()
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch

            loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)

            global_step += 1
            if global_step % 5 == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % save_steps == 0:
                save_path = os.path.join(save_dir, "model_%d" % global_step)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                save_param_path = os.path.join(save_path, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_path)

do_train(model,train_data_loader)


#### 模型预测

模型预测部分加载训练好的模型，然后输入两条示例数据进行预测抽取向量。

In [18]:
# !wget https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip 

if(not os.path.exists('inbatch_model.zip')):
    get_path_from_url('https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip',root_dir='.')

# !unzip -o inbatch_model.zip -d pretrained/

Archive:  inbatch_model.zip
  inflating: pretrained/model_40/model_state.pdparams  
  inflating: pretrained/model_40/vocab.txt  
  inflating: pretrained/model_40/tokenizer_config.json  


In [16]:
max_seq_length=64
output_emb_size=256
batch_size=1
pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0')
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
model = SemanticIndexBatchNeg(
        pretrained_model,
        margin=margin,
        scale=scale,
        output_emb_size=output_emb_size)
params_path='./pretrained/model_40/model_state.pdparams'
test_data = ["国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据"]
# 加载模型
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

test_func = partial(
        convert_example_test,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length)

test_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # text_segment
    ): [data for data in fn(samples)]

# conver_example function's input must be dict
corpus_ds = MapDataset(test_data)

corpus_data_loader = create_dataloader(
        corpus_ds,
        mode='predict',
        batch_size=batch_size,
        batchify_fn=test_batchify_fn,
        trans_fn=test_func)

all_embeddings = []
model.eval()
with paddle.no_grad():
    for batch_data in corpus_data_loader:
        input_ids, token_type_ids = batch_data
        input_ids = paddle.to_tensor(input_ids, dtype='int64')
        token_type_ids = paddle.to_tensor(token_type_ids, dtype='int64')
        text_embeddings = model.get_pooled_embedding(input_ids, token_type_ids)
        all_embeddings.append(text_embeddings)

text_embedding=all_embeddings[0]
print(text_embedding.shape)
print(text_embedding.numpy())

[32m[2024-06-01 15:38:09,170] [    INFO][0m - Already cached C:\Users\kitx86\.paddlenlp\models\ernie-1.0\model_state.pdparams[0m
[32m[2024-06-01 15:38:09,170] [    INFO][0m - Loading weights file model_state.pdparams from cache at C:\Users\kitx86\.paddlenlp\models\ernie-1.0\model_state.pdparams[0m
[32m[2024-06-01 15:38:09,545] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
- This IS expected if you are initializing ErnieModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).[0m
[32m[2024-06-01 15:38:18,748] [    INFO][0m - All the weights of ErnieModel were initialized from the model checkpoint at ern

[1, 256]
[[ 0.07830597 -0.1403687   0.03433796 -0.14967981 -0.03386056  0.06630653
   0.01357932  0.03531206  0.02411087  0.0200086   0.0572399  -0.08119465
   0.06286903  0.06509128  0.0719341   0.06378152  0.00466652  0.04162082
   0.09570316 -0.04379361  0.02414248  0.04090641  0.00377305  0.02440115
  -0.14362058  0.01603018  0.00888093  0.01143386 -0.05140319  0.05263366
   0.05756421 -0.10277359 -0.02346238  0.01396164  0.01603572  0.05159243
   0.02901114  0.07017182  0.01261908  0.01556086  0.01930514 -0.01243163
  -0.00569786 -0.18707049 -0.03321618  0.00187249  0.0162295   0.07697202
   0.04228468  0.02567651  0.06987978 -0.03897954 -0.03276315  0.00794893
   0.00679751  0.06019309 -0.07035226  0.00915677 -0.04332462  0.03413434
   0.00501309 -0.07866681 -0.11235965 -0.03109914 -0.02952522 -0.08572644
   0.01957936  0.0331405   0.06148316  0.14422092 -0.03898615  0.04563016
  -0.08464758 -0.07827485 -0.03478685  0.00342681 -0.05538592 -0.03798989
  -0.02372422  0.06929096 -0.

### 模型部署

模型部署首先需要把模型转换成静态图模型。

In [17]:
output_path='./output/recall'
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64"),  # input_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64")  # segment_ids
        ])
# Save in static graph model.
save_path = os.path.join(output_path, "inference")
print(save_path)
paddle.jit.save(model, save_path)

./output/recall\inference




In [18]:
from utils.data import convert_example_recall_infer
from scipy.special import softmax
from scipy import spatial

class RecallPredictor(object):
    def __init__(self,
                 model_dir,
                 device="gpu",
                 max_seq_length=128,
                 batch_size=32,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size

        model_file = model_dir + "/output/recall/inference.get_pooled_embedding.pdmodel"
        params_file = model_dir + "/output/recall/inference.get_pooled_embedding.pdiparams"
        if not os.path.exists(model_file):
            raise ValueError("not find model file path {}".format(model_file))
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))
        config = paddle.inference.Config(model_file, params_file)

        if device == "gpu":
            # set GPU configs accordingly
            # such as intialize the gpu memory, enable tensorrt
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
                "int8": inference.PrecisionType.Int8
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
            if enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)

        config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])



    def extract_embedding(self, data, tokenizer):
        """
        Predicts the data labels.
        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
        Returns:
            results(obj:`dict`): All the feature vectors.
        """

        examples = []
        for text in data:
            input_ids, segment_ids = convert_example_recall_infer(text, tokenizer)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # segment
        ): fn(samples)

        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        return logits

    def predict(self, data, tokenizer):
        """
        Predicts the data labels.
        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
        Returns:
            results(obj:`dict`): All the predictions probs.
        """

        examples = []
        for idx, text in enumerate(data):
            input_ids, segment_ids = convert_example_recall_infer({idx: text[0]}, tokenizer)
            title_ids, title_segment_ids = convert_example_recall_infer({
                idx: text[1]
            }, tokenizer)
            examples.append(
                (input_ids, segment_ids, title_ids, title_segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # segment
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # segment
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # segment
        ): fn(samples)


        query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(
            examples)
        self.input_handles[0].copy_from_cpu(query_ids)
        self.input_handles[1].copy_from_cpu(query_segment_ids)
        self.predictor.run()
        query_logits = self.output_handle.copy_to_cpu()

        self.input_handles[0].copy_from_cpu(title_ids)
        self.input_handles[1].copy_from_cpu(title_segment_ids)
        self.predictor.run()
        title_logits = self.output_handle.copy_to_cpu()

        result = [
            float(1 - spatial.distance.cosine(arr1, arr2))
            for arr1, arr2 in zip(query_logits, title_logits)
        ]
        return result

In [19]:
model_dir = './output/recall'
# device='gpu'
device='cpu'
max_seq_length=64
use_tensorrt = False
batch_size =32 
precision = 'fp32'
cpu_threads = 1
enable_mkldnn =False
predictor = RecallPredictor(model_dir, device, max_seq_length,
                          batch_size, use_tensorrt, precision,
                          cpu_threads, enable_mkldnn)


id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
res = predictor.extract_embedding(corpus_list, tokenizer)
print('抽取向量')
print(res.shape)
print(res)


抽取向量
(1, 256)
[[ 0.07830597 -0.14036867  0.03433809 -0.14967988 -0.03386061  0.06630668
   0.01357943  0.03531198  0.0241108   0.02000859  0.05723994 -0.08119434
   0.06286899  0.06509119  0.07193426  0.0637813   0.00466663  0.0416208
   0.09570334 -0.0437936   0.02414249  0.04090639  0.00377312  0.02440133
  -0.14362063  0.0160301   0.00888102  0.01143374 -0.05140326  0.05263374
   0.05756426 -0.10277364 -0.02346236  0.0139618   0.01603572  0.05159238
   0.0290112   0.0701718   0.01261923  0.01556097  0.01930504 -0.01243166
  -0.00569788 -0.18707052 -0.03321596  0.00187244  0.01622939  0.07697207
   0.04228462  0.02567671  0.06987959 -0.03897937 -0.0327629   0.00794878
   0.00679753  0.06019303 -0.07035235  0.00915689 -0.04332461  0.0341342
   0.00501306 -0.07866668 -0.1123596  -0.03109922 -0.02952531 -0.08572659
   0.01957938  0.03314063  0.06148323  0.14422084 -0.03898621  0.04563032
  -0.08464757 -0.07827486 -0.03478681  0.00342693 -0.05538582 -0.03798985
  -0.02372419  0.06929099 



In [20]:
corpus_list = [['中西方语言与文化的差异', '中西方文化差异以及语言体现中西方文化,差异,语言体现'],
                   ['中西方语言与文化的差异', '飞桨致力于让深度学习技术的创新与应用更简单']]
res = predictor.predict(corpus_list, tokenizer)
print('计算相似度')
print(res)

计算相似度
[0.9592697000068264, 0.047252719767472806]


导出静态图接下来就是部署了，目前部署支持C++和Pipeline两种方式，由于aistudio不支持部署环境，需要部署的话可以参考链接:[https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/recall/in_batch_negative/deploy](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/recall/in_batch_negative/deploy)




### 基于Milvus的效果展示

在实际上线中，我们需要使用向量检索引擎，由于aistudio不支持搭建Milvus,有条件的同学可以本地搭建一个，使用Docker安装。


#### 基于Milvus搭建召回服务

我们使用[Milvus](https://milvus.io/)开源工具进行召回，milvus的搭建教程请参考官方教程  [milvus官方安装教程](https://milvus.io/cn/docs/v1.1.1/milvus_docker-cpu.md)本案例使用的是milvus的1.1.1版本，搭建完以后启动milvus


```
cd [Milvus root path]/core/milvus
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:[Milvus root path]/core/milvus/lib
cd scripts
./start_server.sh

```


#### 基于Milvus的召回效果展示


输入的样本为：

```
国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据
```

下面分别是抽取的向量和召回的结果：

```
[1, 256]
[[ 0.06374735 -0.08051944  0.05118101 -0.05855767 -0.06969483  0.05318566
   0.079629    0.02667932 -0.04501902 -0.01187392  0.09590752 -0.05831281
   ....
5677638 国有股权参股对家族企业创新投入的影响混合所有制改革,国有股权,家族企业,创新投入 0.5417419672012329
1321645 高管政治联系对民营企业创新绩效的影响——董事会治理行为的非线性中介效应高管政治联系,创新绩效,民营上市公司,董事会治理行为,中介效应 0.5445536375045776
1340319 国有控股上市公司资产并购重组风险探讨国有控股上市公司,并购重组,防范对策 0.5515031218528748
....
```

上述流程就是召回的全流程，如果对精度要求不高或者数据量不高，可以完全使用召回模型得到的结果。如果数据量比较大，或者有多路召回的结果，则需要下面的排序方案，排序的作用就是对召回的结果进行重排，使得结果更加精确。

In [21]:
# 构建读取函数，读取原始数据
def read(src_path, is_predict=False):
    data=pd.read_csv(src_path,sep='\t')
    for index, row in tqdm(data.iterrows()):
        query=row['query']
        title=row['title']
        neg_title=row['neg_title']
        yield {'query':query, 'title':title,'neg_title':neg_title}

def read_test(src_path, is_predict=False):
    data=pd.read_csv(src_path,sep='\t')
    for index, row in tqdm(data.iterrows()):
        query=row['query']
        title=row['title']
        label=row['label']
        yield {'query':query, 'title':title,'label':label}


test_file='./dev_ranking_demo.csv'
train_file='./train_ranking_demo.csv'

train_ds=load_dataset(read,src_path=train_file,lazy=False)
dev_ds=load_dataset(read_test,src_path=test_file,lazy=False)
print('打印一条训练集')
print(train_ds[0])
print('打印一条验证集')
print(dev_ds[0])

1000it [00:00, 29410.80it/s]
1000it [00:00, 33331.78it/s]

打印一条训练集
{'query': '检测干眼', 'title': '诊断干眼的非侵入性新检测技术及应用价值干眼;非侵入性;检测技术;诊断', 'neg_title': '自身免疫性干眼调节T细胞及细胞因子的表达干眼CD4+CD25+Treg细胞,Th17细胞,干燥综合征,细胞因子'}
打印一条验证集
{'query': '中国特色社会主义文化建设理论', 'title': '建设社会主义文化强国的理论与实践思考社会主义文化强国,社会主义现代化国家,文化建设', 'label': 1}





### 模型构建

![](https://ai-studio-static-online.cdn.bcebos.com/9b52bdae342f4d83ba80f86833d632ada5ed12abd72f4e7e8703002368732351)

排序模型是pair-wise的结构，如图所示，query和titile正样本会经过encoder得到一个输出的相似度S1，query和title负样本也会经过Encoder得到一个输出的相似度S2,然后模型根据S1和S2求Triplet损失，其中S1的相似度要大于S2。



比如：对于文本：
```
个人所得税税务筹划      基于新个税视角下的个人所得税纳税筹划分析新个税;个人所得税;纳税筹划      个人所得税工资薪金税务筹划研究个人所得税,工资薪金,税务筹划
```
最终构造出来一条正样本对和一条负样本对，如下：

```
正样本对：[CLS]个人所得税税务筹划[SEP]基于新个税视角下的个人所得税纳税筹划分析新个税;个人所得税;纳税筹划[SEP]
负样本对：[CLS]个人所得税税务筹划[SEP]个人所得税工资薪金税务筹划研究个人所得税,工资薪金,税务筹划[SEP]
```

In [22]:
class PairwiseMatching(nn.Layer):
    def __init__(self, pretrained_model, dropout=None, margin=0.1):
        super().__init__()
        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        self.margin = margin

        # hidden_size -> 1, calculate similarity
        self.similarity = nn.Linear(self.ptm.config["hidden_size"], 1)

    # 用于导出静态图模型来计算概率
    @paddle.jit.to_static(input_spec=[paddle.static.InputSpec(shape=[None, None], dtype='int64'),paddle.static.InputSpec(shape=[None, None], dtype='int64')])
    def get_pooled_embedding(self,
                             input_ids,
                             token_type_ids=None,
                             position_ids=None,
                             attention_mask=None):
        _, cls_embedding = self.ptm(input_ids, token_type_ids,
                                        position_ids, attention_mask)
        cls_embedding = self.dropout(cls_embedding)
        # 计算相似度
        sim = self.similarity(cls_embedding)
        return sim


    def predict(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):

        _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
                                    attention_mask)

        cls_embedding = self.dropout(cls_embedding)
        sim_score = self.similarity(cls_embedding)
        sim_score = F.sigmoid(sim_score)
        return sim_score

    def forward(self,
                pos_input_ids,
                neg_input_ids,
                pos_token_type_ids=None,
                neg_token_type_ids=None,
                pos_position_ids=None,
                neg_position_ids=None,
                pos_attention_mask=None,
                neg_attention_mask=None):

        _, pos_cls_embedding = self.ptm(pos_input_ids, pos_token_type_ids,
                                        pos_position_ids, pos_attention_mask)

        _, neg_cls_embedding = self.ptm(neg_input_ids, neg_token_type_ids,
                                        neg_position_ids, neg_attention_mask)

        pos_embedding = self.dropout(pos_cls_embedding)
        neg_embedding = self.dropout(neg_cls_embedding)

        pos_sim = self.similarity(pos_embedding)
        neg_sim = self.similarity(neg_embedding)

        pos_sim = F.sigmoid(pos_sim)
        neg_sim = F.sigmoid(neg_sim)

        labels = paddle.full(
            shape=[pos_cls_embedding.shape[0]], fill_value=1.0, dtype='float32')

        loss = F.margin_ranking_loss(
            pos_sim, neg_sim, labels, margin=self.margin)

        return loss

You can set full_graph=True, then you can assign input spec.



### 训练配置
配置模型所需要的一些超参数，实例化模型，优化器等等。

In [23]:
# 关键参数
margin=0.2 # 推荐取值 0.0 ~ 0.2
eval_step=100
max_seq_length=128
epochs=3
batch_size=32
warmup_proportion=0.0
weight_decay=0.0
save_step=100

#### 加载预训练模型 ERNIG-Gram
基于 ERNIE-3.0-Medium-zh 热启训练单塔 Pair-wise 排序模型，并定义数据读取的 DataLoader

In [24]:
pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-3.0-medium-zh')
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(
        'ernie-3.0-medium-zh')

trans_func_train = partial(
        convert_pairwise_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length)

trans_func_eval = partial(
        convert_pairwise_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        phase="eval")

batchify_fn_train = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # pos_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # pos_pair_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # neg_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64')  # neg_pair_segment
    ): [data for data in fn(samples)]

batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=batch_size,
        batchify_fn=batchify_fn_train,
        trans_fn=trans_func_train)

dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=batch_size,
        batchify_fn=batchify_fn_eval,
        trans_fn=trans_func_eval)
model = PairwiseMatching(pretrained_model, margin=margin)

[32m[2024-06-01 15:40:32,596] [    INFO][0m - Already cached C:\Users\kitx86\.paddlenlp\models\ernie-3.0-medium-zh\model_state.pdparams[0m
[32m[2024-06-01 15:40:32,597] [    INFO][0m - Loading weights file model_state.pdparams from cache at C:\Users\kitx86\.paddlenlp\models\ernie-3.0-medium-zh\model_state.pdparams[0m
[32m[2024-06-01 15:40:32,912] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
- This IS expected if you are initializing ErnieModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).[0m
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

In [25]:
# 打印训练集的batch数据
for item in train_data_loader:
    print(item)
    break

# 打印验证集的batch数据
for item in dev_data_loader:
    print(item)
    break

[Tensor(shape=[32, 71], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1    , 12052, 127  , ..., 0    , 0    , 0    ],
        [1    , 69   , 12   , ..., 92   , 2    , 0    ],
        [1    , 12052, 215  , ..., 0    , 0    , 0    ],
        ...,
        [1    , 12   , 403  , ..., 57   , 393  , 2    ],
        [1    , 1086 , 306  , ..., 0    , 0    , 0    ],
        [1    , 756  , 387  , ..., 0    , 0    , 0    ]]), Tensor(shape=[32, 71], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 1, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]), Tensor(shape=[32, 88], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1    , 12052, 127  , ..., 0    , 0    , 0    ],
        [1    , 69   , 12   , ..., 0    , 0    , 0    ],
        [1    , 12052, 215  , ..., 0    , 0    , 0    ],
        ...,
        [1    , 12   , 



训练集的数据包含4个Tensor，分别表示的是query和正样本title的input_ids和token_type_ids，以及query和负样本title的input_ids和token_type_ids。

验证集则不一样，包含3个Tensor，除了了query后天title拼接成的input_id和token_type_ids的形式为，还有label，表明这条query和title是否相似，1表示的是相似，0表示的是不相似。

### 模型训练

下面是模型训练过程，由于在训练的时候使用了评估，所以先构建评估函数。

In [26]:
@paddle.no_grad()
def evaluate(model, metric, data_loader, phase="dev"):
    model.eval()
    metric.reset()

    for idx, batch in enumerate(data_loader):
        input_ids, token_type_ids, labels = batch
        # 类别为正的概率
        pos_probs = model.predict(input_ids=input_ids, token_type_ids=token_type_ids)
        # 类别为负的概率
        neg_probs = 1.0 - pos_probs

        preds = np.concatenate((neg_probs, pos_probs), axis=1)
        metric.update(preds=preds, labels=labels)

    print("eval_{} auc:{:.3}".format(phase, metric.accumulate()))
    metric.reset()
    model.train()

下面是排序模型的训练过程。

In [38]:
def do_train(model,train_data_loader,dev_data_loader):

    num_training_steps = len(train_data_loader) * epochs

    lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps,
                                         warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    # 使用AUC作为评估指标
    metric = paddle.metric.Auc()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            pos_input_ids, pos_token_type_ids, neg_input_ids, neg_token_type_ids = batch

            loss = model(
                pos_input_ids=pos_input_ids,
                neg_input_ids=neg_input_ids,
                pos_token_type_ids=pos_token_type_ids,
                neg_token_type_ids=neg_token_type_ids)
            # 每隔10个step打印日志
            global_step += 1
            if global_step % 10 == 0 :
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            # 反向求梯度
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            # 每隔eval_step进行评估
            if global_step % eval_step == 0:
                evaluate(model, metric, dev_data_loader, "dev")
            # 每隔save_steps保存模型
            if global_step % save_step == 0:
                save_path = os.path.join(save_dir, "model_%d" % global_step)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                save_param_path = os.path.join(save_path, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_path)

do_train(model,train_data_loader,dev_data_loader)

KeyboardInterrupt: 


### 效果评估

下面是效果评估，首先下载训练好的预训练模型，然后进行解压。

In [40]:
if(not os.path.exists('ernie_gram_sort.zip')):
    get_path_from_url('https://bj.bcebos.com/v1/paddlenlp/models/ernie_gram_sort.zip',root_dir='.')
# !unzip -o ernie_gram_sort.zip -d pretrained/

'unzip' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


加载训练好的模型，进行评估。

In [27]:
pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')
model = PairwiseMatching(pretrained_model, margin=margin)
init_from_ckpt='pretrained/model_30000/model_state.pdparams'
state_dict = paddle.load(init_from_ckpt)
model.set_dict(state_dict)
metric = paddle.metric.Auc()
evaluate(model, metric, dev_data_loader, "dev")

[32m[2024-06-01 15:41:45,580] [    INFO][0m - Already cached C:\Users\kitx86\.paddlenlp\models\ernie-gram-zh\model_state.pdparams[0m
[32m[2024-06-01 15:41:45,580] [    INFO][0m - Loading weights file model_state.pdparams from cache at C:\Users\kitx86\.paddlenlp\models\ernie-gram-zh\model_state.pdparams[0m
[32m[2024-06-01 15:41:48,975] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
[32m[2024-06-01 15:41:56,887] [    INFO][0m - All model checkpoint weights were used when initializing ErnieGramModel.
[0m
[32m[2024-06-01 15:41:56,888] [    INFO][0m - All the weights of ErnieGramModel were initialized from the model checkpoint at ernie-gram-zh.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ErnieGramModel for predictions without further training.[0m
[32m[2024-06-01 15:41:56,920] [    INFO][0m - Already cached C:\Users\kitx86\.paddlenlp\models\ernie-gram-zh\vocab.txt[0m
[32m[2024-06-01 15:41:

KeyboardInterrupt: 

排序模块用的指标是AUC，随机抽出一对样本，用训练得到的分类起来对两个样本进行预测，预测得到正样本概率>负样本的概率的概率。 一般AUC达到0.7以上就算是不错的，但也要根据任务场景进行分析，有的可能连0.7也达不到，但是效果也是非常不错的。

### 模型推理



In [28]:
from utils.data import read_text_pair
input_file='test_pairwise.csv'
valid_ds = load_dataset(read_text_pair, data_path=input_file, lazy=False)
# 打印一条数据
print(valid_ds[0])

{'query': '中西方语言与文化的差异', 'title': '第二语言习得的一大障碍就是文化差异。'}


In [29]:
trans_func = partial(
        convert_pairwise_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        phase="predict")

batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # segment_ids
    ): [data for data in fn(samples)]

test_data_loader = create_dataloader(
        valid_ds,
        mode='predict',
        batch_size=batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
# 打印测试的样本
for item in test_data_loader:
    print(item)
    break

[Tensor(shape=[5, 53], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1   , 12  , 213 , 58  , 405 , 545 , 54  , 68  , 73  , 5   , 859 , 712 ,
         2   , 131 , 177 , 405 , 545 , 489 , 116 , 5   , 7   , 19  , 843 , 1767,
         113 , 10  , 68  , 73  , 859 , 712 , 12043, 2   , 0   , 0   , 0   , 0   ,
         0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ,
         0   , 0   , 0   , 0   , 0   ],
        [1   , 12  , 213 , 58  , 405 , 545 , 54  , 68  , 73  , 5   , 859 , 712 ,
         2   , 1465, 68  , 73  , 367 , 591 , 86  , 12  , 20  , 68  , 73  , 51  ,
         137 , 241 , 812 , 216 , 1043, 3093, 1140, 1465, 68  , 73  , 30  , 12  ,
         20  , 68  , 73  , 30  , 241 , 812 , 30  , 1197, 1285, 2   , 0   , 0   ,
         0   , 0   , 0   , 0   , 0   ],
        [1   , 12  , 213 , 58  , 405 , 545 , 54  , 68  , 73  , 5   , 859 , 712 ,
         2   , 158 , 12  , 213 , 58  , 119 , 495 , 68  , 73  , 111 , 38  , 5   ,
         859 , 712 , 335 , 514 , 65



In [30]:
def predict(model, data_loader):

    batch_probs = []
    model.eval()

    with paddle.no_grad():
        for batch_data in data_loader:
            input_ids, token_type_ids = batch_data

            input_ids = paddle.to_tensor(input_ids, dtype='int64')
            token_type_ids = paddle.to_tensor(token_type_ids, dtype='int64')
            # 输入query title pair得到预测的概率
            batch_prob = model.predict(
                input_ids=input_ids, token_type_ids=token_type_ids).numpy()

            batch_probs.append(batch_prob)
        if(len(batch_prob)==1):
            batch_probs=np.array(batch_probs)
        else:
            batch_probs = np.concatenate(batch_probs, axis=0)
        return batch_probs



y_probs = predict(model, test_data_loader)
valid_ds = load_dataset(read_text_pair, data_path=input_file, lazy=False)
# 打印输出
for idx, prob in enumerate(y_probs):
    text_pair = valid_ds[idx]
    text_pair["pred_prob"] = prob[0]
    print(text_pair)

{'query': '中西方语言与文化的差异', 'title': '第二语言习得的一大障碍就是文化差异。', 'pred_prob': 0.8511221}
{'query': '中西方语言与文化的差异', 'title': '跨文化视角下中国文化对外传播路径琐谈跨文化,中国文化,传播,翻译', 'pred_prob': 0.7862962}
{'query': '中西方语言与文化的差异', 'title': '从中西方民族文化心理的差异看英汉翻译语言,文化,民族文化心理,思维方式,翻译', 'pred_prob': 0.91767514}
{'query': '中西方语言与文化的差异', 'title': '中英文化差异对翻译的影响中英文化,差异,翻译的影响', 'pred_prob': 0.8601747}
{'query': '中西方语言与文化的差异', 'title': '浅谈文化与语言习得文化,语言,文化与语言的关系,文化与语言习得意识,跨文化交际', 'pred_prob': 0.8944413}


### 预测部署

首先把动态图模型转换成静态图模型。

In [31]:
output_path='output/rank'
model.eval()

# Convert to static graph with specific input description
model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64"),  # input_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64")  # segment_ids
        ])
# Save in static graph model.
save_path = os.path.join(output_path, "inference")
paddle.jit.save(model, save_path)

You can set full_graph=True, then you can assign input spec.

You can set full_graph=True, then you can assign input spec.



定义Predictor用于加载静态图的模型参数进行预测。

In [32]:
class Predictor(object):
    def __init__(self,
                 model_dir,
                 device="gpu",
                 max_seq_length=128,
                 batch_size=32,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size

        model_file = model_dir + "/output/rank/inference.get_pooled_embedding.pdmodel"
        params_file = model_dir + "/output/rank/inference.get_pooled_embedding.pdiparams"
        if not os.path.exists(model_file):
            raise ValueError("not find model file path {}".format(model_file))
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))
        config = paddle.inference.Config(model_file, params_file)

        if device == "gpu":
            # set GPU configs accordingly
            # such as intialize the gpu memory, enable tensorrt
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
                "int8": inference.PrecisionType.Int8
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
            if enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)

        config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

     

    def predict(self, data, tokenizer):
        
        examples = []
        for text in data:
            input_ids, segment_ids = convert_example_ranking(
                text,
                tokenizer,
                max_seq_length=self.max_seq_length,
                is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # segment
        ): fn(samples)


        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        sim_score = self.output_handle.copy_to_cpu()

        sim_score = expit(sim_score)

        return sim_score

读取测试集的文本，把文本利用convert_example_ranking函数转换成id向量的形式。

In [33]:
def convert_example_ranking(example, tokenizer, max_seq_length=512, is_test=False):

    query, title = example["query"], example["title"]

    encoded_inputs = tokenizer(
        text=query, text_pair=title, max_seq_len=max_seq_length)

    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]

    if not is_test:
        label = np.array([example["label"]], dtype="int64")
        return input_ids, token_type_ids, label
    else:
        return input_ids, token_type_ids

input_file='test_pairwise.csv'

test_ds = load_dataset(read_text_pair,data_path=input_file, lazy=False)

data = [{'query': d['query'], 'title': d['title']} for d in test_ds]

batches = [
        data[idx:idx + batch_size]
        for idx in range(0, len(data), batch_size)
    ]
print(batches[0])

[{'query': '中西方语言与文化的差异', 'title': '第二语言习得的一大障碍就是文化差异。'}, {'query': '中西方语言与文化的差异', 'title': '跨文化视角下中国文化对外传播路径琐谈跨文化,中国文化,传播,翻译'}, {'query': '中西方语言与文化的差异', 'title': '从中西方民族文化心理的差异看英汉翻译语言,文化,民族文化心理,思维方式,翻译'}, {'query': '中西方语言与文化的差异', 'title': '中英文化差异对翻译的影响中英文化,差异,翻译的影响'}, {'query': '中西方语言与文化的差异', 'title': '浅谈文化与语言习得文化,语言,文化与语言的关系,文化与语言习得意识,跨文化交际'}]


实例化Predictor，然后进行预测。

In [34]:
model_dir='output/rank'
device='gpu'
max_seq_length=128
batch_size=32
# 可以安装对应的Tensorrt之后进行加速
use_tensorrt=False
# 精度，也可以选择fp16，精度几乎无损
precision='fp32'
# cpu的线程数目
cpu_threads=10
# 可以在CPU的情况下进行加速
enable_mkldnn=False

predictor = Predictor(model_dir, device, max_seq_length,
                          batch_size, use_tensorrt, precision,
                          cpu_threads, enable_mkldnn)
results = []
for batch_data in batches:
    results.extend(predictor.predict(batch_data, tokenizer))

for idx, text in enumerate(data):
    print('Data: {} \t prob: {}'.format(text, results[idx]))


Data: {'query': '中西方语言与文化的差异', 'title': '第二语言习得的一大障碍就是文化差异。'} 	 prob: [0.8511221]
Data: {'query': '中西方语言与文化的差异', 'title': '跨文化视角下中国文化对外传播路径琐谈跨文化,中国文化,传播,翻译'} 	 prob: [0.78629637]
Data: {'query': '中西方语言与文化的差异', 'title': '从中西方民族文化心理的差异看英汉翻译语言,文化,民族文化心理,思维方式,翻译'} 	 prob: [0.91767514]
Data: {'query': '中西方语言与文化的差异', 'title': '中英文化差异对翻译的影响中英文化,差异,翻译的影响'} 	 prob: [0.86017483]
Data: {'query': '中西方语言与文化的差异', 'title': '浅谈文化与语言习得文化,语言,文化与语言的关系,文化与语言习得意识,跨文化交际'} 	 prob: [0.8944415]


