In [None]:
# 导入系统库
from functools import partial
import os
import time

# 导入python的其他库
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.special import expit

# 导入Paddle库
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import inference

#导入PaddleNLP相关的库
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import load_dataset, MapDataset
from paddlenlp.transformers import LinearDecayWithWarmup
from NeuralSearch.utils.data import convert_pairwise_example

# 忽略所有警告
import warnings
warnings.filterwarnings("ignore")

### 模型构建

![](https://ai-studio-static-online.cdn.bcebos.com/9b52bdae342f4d83ba80f86833d632ada5ed12abd72f4e7e8703002368732351)

排序模型是pair-wise的结构，如图所示，query和title正样本会经过Encoder得到一个输出的相似度S1，query和title负样本也会经过Encoder得到一个输出的相似度S2，然后模型根据S1和S2求Triplet损失（代码中通过 margin ranking loss 实现），训练目标是让S1的相似度大于S2。



In [None]:
# Reader generators that turn the raw TSV files into example dicts.
def read(src_path, is_predict=False):
    """Yield one training example per row of a tab-separated file.

    Args:
        src_path: Path of a TSV file with `query`, `title` and `neg_title`
            columns.
        is_predict: Accepted for interface compatibility; not used here.

    Yields:
        dict with the keys `query`, `title` and `neg_title`.
    """
    frame = pd.read_csv(src_path, sep='\t')
    field_names = ('query', 'title', 'neg_title')
    for _, record in tqdm(frame.iterrows()):
        yield {name: record[name] for name in field_names}

def read_test(src_path, is_predict=False):
    """Yield one labelled evaluation example per row of a tab-separated file.

    Args:
        src_path: Path of a TSV file with `query`, `title` and `label`
            columns.
        is_predict: Accepted for interface compatibility; not used here.

    Yields:
        dict with the keys `query`, `title` and `label`.
    """
    frame = pd.read_csv(src_path, sep='\t')
    for _, record in tqdm(frame.iterrows()):
        example = {}
        example['query'] = record['query']
        example['title'] = record['title']
        example['label'] = record['label']
        yield example

def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    """Wrap a dataset in a `paddle.io.DataLoader`.

    Args:
        dataset: Dataset object; must provide `map` when `trans_fn` is given.
        mode: `'train'` selects a shuffling `DistributedBatchSampler`; any
            other value selects a non-shuffling `BatchSampler`.
        batch_size: Number of samples per mini-batch.
        batchify_fn: Collate function handed to the DataLoader.
        trans_fn: Optional per-example transform applied via `dataset.map`.

    Returns:
        The configured `paddle.io.DataLoader` (created with return_list=True).
    """
    if trans_fn:
        dataset = dataset.map(trans_fn)

    is_training = mode == 'train'
    # Training uses the distributed sampler (each process is meant to load its
    # own subset of the data); evaluation/prediction uses the plain sampler.
    if is_training:
        sampler_cls = paddle.io.DistributedBatchSampler
    else:
        sampler_cls = paddle.io.BatchSampler
    batch_sampler = sampler_cls(
        dataset, batch_size=batch_size, shuffle=is_training)

    # Assemble the mini-batches.
    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)


# Paths of the demo files: the dev file feeds the evaluation set, the train
# file feeds the pairwise training set.
test_file='./dev_ranking_demo.csv'
train_file='./train_ranking_demo.csv'

# Build both datasets from the reader generators (`read` yields
# query/title/neg_title, `read_test` yields query/title/label).
train_ds=load_dataset(read,src_path=train_file,lazy=False)
dev_ds=load_dataset(read_test,src_path=test_file,lazy=False)
# Show one example from each split as a sanity check.
print('打印一条训练集')
print(train_ds[0])
print('打印一条验证集')
print(dev_ds[0])

比如：对于文本：
```
个人所得税税务筹划      基于新个税视角下的个人所得税纳税筹划分析新个税;个人所得税;纳税筹划      个人所得税工资薪金税务筹划研究个人所得税,工资薪金,税务筹划
```
最终构造出来一条正样本对和一条负样本对，如下：

```
正样本对：[CLS]个人所得税税务筹划[SEP]基于新个税视角下的个人所得税纳税筹划分析新个税;个人所得税;纳税筹划[SEP]
负样本对：[CLS]个人所得税税务筹划[SEP]个人所得税工资薪金税务筹划研究个人所得税,工资薪金,税务筹划[SEP]
```

In [None]:
class PairwiseMatching(nn.Layer):
    """Single-tower pairwise ranking model on top of a pretrained encoder.

    A (query, title) pair is encoded jointly; the second output of the
    encoder is passed through dropout and a linear layer that maps
    `hidden_size -> 1`, giving one similarity score per pair. Training
    compares the score of a positive pair with the score of a negative pair
    through a margin ranking loss.

    Args:
        pretrained_model: Encoder exposing `config["hidden_size"]` and
            returning two outputs when called with
            `(input_ids, token_type_ids, position_ids, attention_mask)`.
        dropout: Dropout probability; 0.1 is used when None.
        margin: Margin of the ranking loss.
    """

    def __init__(self, pretrained_model, dropout=None, margin=0.1):
        super().__init__()
        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        self.margin = margin

        # hidden_size -> 1, calculate similarity
        self.similarity = nn.Linear(self.ptm.config["hidden_size"], 1)

    # Decorated so that the method can be exported as a static graph and used
    # to compute scores at inference time.
    @paddle.jit.to_static(input_spec=[paddle.static.InputSpec(shape=[None, None], dtype='int64'),paddle.static.InputSpec(shape=[None, None], dtype='int64')])
    def get_pooled_embedding(self,
                             input_ids,
                             token_type_ids=None,
                             position_ids=None,
                             attention_mask=None):
        """Return the raw (pre-sigmoid) similarity score of each input pair."""
        _, cls_embedding = self.ptm(input_ids, token_type_ids,
                                        position_ids, attention_mask)
        cls_embedding = self.dropout(cls_embedding)
        # Project the pooled representation to a single similarity value.
        sim = self.similarity(cls_embedding)
        return sim


    def predict(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
        """Return the sigmoid-squashed similarity score of each input pair."""

        _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
                                    attention_mask)

        cls_embedding = self.dropout(cls_embedding)
        sim_score = self.similarity(cls_embedding)
        sim_score = F.sigmoid(sim_score)
        return sim_score

    def forward(self,
                pos_input_ids,
                neg_input_ids,
                pos_token_type_ids=None,
                neg_token_type_ids=None,
                pos_position_ids=None,
                neg_position_ids=None,
                pos_attention_mask=None,
                neg_attention_mask=None):
        """Compute the margin ranking loss between positive and negative pairs.

        Both pairs go through the same encoder and scoring head; the loss
        pushes the positive score above the negative score by at least
        `self.margin`.

        Returns:
            The loss tensor produced by `F.margin_ranking_loss`.
        """

        _, pos_cls_embedding = self.ptm(pos_input_ids, pos_token_type_ids,
                                        pos_position_ids, pos_attention_mask)

        _, neg_cls_embedding = self.ptm(neg_input_ids, neg_token_type_ids,
                                        neg_position_ids, neg_attention_mask)

        pos_embedding = self.dropout(pos_cls_embedding)
        neg_embedding = self.dropout(neg_cls_embedding)

        pos_sim = self.similarity(pos_embedding)
        neg_sim = self.similarity(neg_embedding)

        pos_sim = F.sigmoid(pos_sim)
        neg_sim = F.sigmoid(neg_sim)

        # Target of 1 means "the first input should rank higher". The labels
        # are created with the same shape as the scores; a 1-D label vector
        # would broadcast against the [batch, 1] scores into a
        # [batch, batch] intermediate before the reduction.
        labels = paddle.ones_like(pos_sim)

        loss = F.margin_ranking_loss(
            pos_sim, neg_sim, labels, margin=self.margin)

        return loss

### 训练配置
配置模型所需要的一些超参数，实例化模型，优化器等等。

In [None]:
# Key hyperparameters
margin=0.2 # recommended range: 0.0 ~ 0.2
eval_step=100  # do_train evaluates every `eval_step` global steps
max_seq_length=128  # forwarded to convert_pairwise_example
epochs=3  # NOTE: reassigned in the training cell before do_train runs
batch_size=32
warmup_proportion=0.0  # NOTE: reassigned in the training cell
weight_decay=0.0  # NOTE: reassigned in the training cell
save_step=100  # do_train saves a checkpoint every `save_step` global steps

#### 加载预训练模型 ERNIE 3.0
基于 ERNIE-3.0-Medium-zh 热启训练单塔 Pair-wise 排序模型，并定义数据读取的 DataLoader

In [None]:
# Warm-start the single-tower pair-wise ranking model from ERNIE-3.0-Medium-zh
# and define the DataLoaders used to read the data.
pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-3.0-medium-zh')
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(
        'ernie-3.0-medium-zh')

# Per-example transform for training (convert_pairwise_example with its
# default phase).
trans_func_train = partial(
        convert_pairwise_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length)

# Per-example transform for evaluation (phase="eval").
trans_func_eval = partial(
        convert_pairwise_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        phase="eval")

# Collate function for training batches: four padded int64 fields
# (positive ids/segments, negative ids/segments).
batchify_fn_train = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # pos_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # pos_pair_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # neg_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64')  # neg_pair_segment
    ): [data for data in fn(samples)]

# Collate function for evaluation batches: padded ids/segments plus the
# stacked labels.
batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=batch_size,
        batchify_fn=batchify_fn_train,
        trans_fn=trans_func_train)

dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=batch_size,
        batchify_fn=batchify_fn_eval,
        trans_fn=trans_func_eval)
# The ranking model is built with the `margin` value defined at this point.
model = PairwiseMatching(pretrained_model, margin=margin)

训练集的数据包含4个Tensor，分别表示的是query和正样本title的input_ids和token_type_ids，以及query和负样本title的input_ids和token_type_ids。

验证集则不一样，包含3个Tensor：除了query和title拼接后得到的input_ids和token_type_ids之外，还有label，表明这条query和title是否相似，1表示相似，0表示不相似。

### 模型训练

下面是模型训练过程，由于在训练的时候使用了评估，所以先构建评估函数。

In [None]:
@paddle.no_grad()
def evaluate(model, metric, data_loader, phase="dev"):
    """Score every batch of `data_loader` and print the accumulated metric.

    The model is switched to eval mode for the pass and back to train mode
    afterwards; the metric is reset both before and after use.

    Args:
        model: Model exposing `predict(input_ids=..., token_type_ids=...)`.
        metric: Metric object with `reset`, `update` and `accumulate`.
        data_loader: Iterable of `(input_ids, token_type_ids, labels)` batches.
        phase: Label used in the printed message.
    """
    model.eval()
    metric.reset()

    for input_ids, token_type_ids, labels in data_loader:
        # Probability of the positive class.
        prob_positive = model.predict(
            input_ids=input_ids, token_type_ids=token_type_ids)
        # Probability of the negative class.
        prob_negative = 1.0 - prob_positive

        # Column 0 holds the negative class, column 1 the positive class.
        two_class_scores = np.concatenate(
            (prob_negative, prob_positive), axis=1)
        metric.update(preds=two_class_scores, labels=labels)

    auc_value = metric.accumulate()
    print("eval_{} auc:{:.3}".format(phase, auc_value))
    metric.reset()
    model.train()

下面是排序模型的训练过程。

In [None]:
# Directory under which do_train writes its checkpoints.
save_dir='checkpoint'
# Key hyperparameters
# NOTE(review): `scale`, `dropout` and `output_emb_size` are not referenced
# anywhere else in the visible notebook code — confirm whether they are
# leftovers before relying on them.
scale=20 # recommended range: 10 ~ 30
# NOTE(review): the training model was already constructed with the earlier
# `margin` value; this reassignment only affects models built after this cell.
margin=0.1 # recommended range: 0.0 ~ 0.2
# Dropout parameter for SimCSE; the default dropout of the pretrained
# language model can be used instead.
dropout=0.2
# Dimension of the projected embedding: the default output is 768-d and a
# linear projection to 256-d is recommended.
output_emb_size=256
# Number of training epochs (replaces the value set in the earlier cell).
epochs=1
weight_decay=0.0
# Learning rate
learning_rate=5E-5
warmup_proportion=0.0

def do_train(model,train_data_loader,dev_data_loader):
    """Train the pairwise ranking model with periodic evaluation and saving.

    Reads these notebook-level settings: `epochs`, `learning_rate`,
    `warmup_proportion`, `weight_decay`, `eval_step`, `save_step`,
    `save_dir` and `tokenizer`.

    Args:
        model: Model whose call returns the training loss for a batch of
            positive/negative pairs.
        train_data_loader: Yields
            `(pos_input_ids, pos_token_type_ids, neg_input_ids, neg_token_type_ids)`.
        dev_data_loader: Loader passed to `evaluate` every `eval_step` steps.
    """

    num_training_steps = len(train_data_loader) * epochs

    lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps,
                                         warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set keeps the per-parameter membership test in
    # `apply_decay_param_fun` constant-time.
    decay_params = {
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    # AUC is the evaluation metric.
    metric = paddle.metric.Auc()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            pos_input_ids, pos_token_type_ids, neg_input_ids, neg_token_type_ids = batch

            loss = model(
                pos_input_ids=pos_input_ids,
                neg_input_ids=neg_input_ids,
                pos_token_type_ids=pos_token_type_ids,
                neg_token_type_ids=neg_token_type_ids)
            # Log every 10 steps.
            global_step += 1
            if global_step % 10 == 0 :
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            # Evaluate every `eval_step` steps.
            if global_step % eval_step == 0:
                evaluate(model, metric, dev_data_loader, "dev")
            # Save a checkpoint every `save_step` steps.
            if global_step % save_step == 0:
                save_path = os.path.join(save_dir, "model_%d" % global_step)
                # exist_ok avoids the check-then-create race and makes
                # re-running the cell safe.
                os.makedirs(save_path, exist_ok=True)
                save_param_path = os.path.join(save_path, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_path)

# Run training with the model and data loaders built in the earlier cells.
do_train(model,train_data_loader,dev_data_loader)


### 效果评估

加载训练好的模型，进行评估。

In [None]:
# Load the ERNIE-Gram backbone and tokenizer, then restore the ranking
# checkpoint that is evaluated below.
pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')
model = PairwiseMatching(pretrained_model, margin=margin)
init_from_ckpt='pretrained/model_30000/model_state.pdparams'
state_dict = paddle.load(init_from_ckpt)
model.set_dict(state_dict)

# The dev loader from the training section was tokenized with the ERNIE 3.0
# tokenizer, whose vocabulary differs from ERNIE-Gram's. Feeding those ids to
# this model would give meaningless (or out-of-range) inputs, so the dev data
# is reloaded and re-tokenized with the tokenizer that matches the model.
# The dataset is reloaded rather than re-mapped because the earlier `map`
# call already attached the old transform to `dev_ds`.
dev_ds = load_dataset(read_test, src_path=test_file, lazy=False)
trans_func_eval = partial(
        convert_pairwise_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        phase="eval")
batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=batch_size,
        batchify_fn=batchify_fn_eval,
        trans_fn=trans_func_eval)

metric = paddle.metric.Auc()
evaluate(model, metric, dev_data_loader, "dev")

排序模块用的指标是AUC，其含义是：随机抽出一对样本（一个正样本和一个负样本），用训练得到的分类器对这两个样本进行预测，预测得到正样本的概率大于负样本的概率的概率。一般AUC达到0.7以上就算是不错的，但也要根据任务场景进行分析，有的任务可能连0.7也达不到，但是效果也是非常不错的。

### 模型推理

In [None]:
from NeuralSearch.utils.data import read_text_pair
# File with the text pairs to score.
input_file='test_pairwise.csv'
valid_ds = load_dataset(read_text_pair, data_path=input_file, lazy=False)
# Show one example.
print(valid_ds[0])

In [None]:
# Per-example transform for inference (phase="predict"), using the tokenizer
# that is current at this point of the notebook.
trans_func = partial(
        convert_pairwise_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        phase="predict")

# Collate function: pad ids and segment ids; there is no label at inference.
batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # segment_ids
    ): [data for data in fn(samples)]

# Any mode other than 'train' gives a non-shuffling BatchSampler, so the
# prediction order follows the dataset order.
test_data_loader = create_dataloader(
        valid_ds,
        mode='predict',
        batch_size=batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
# Print the first batch only.
for item in test_data_loader:
    print(item)
    break

In [None]:
def predict(model, data_loader):
    """Score every (query, title) pair produced by `data_loader`.

    Args:
        model: Model exposing `eval()` and
            `predict(input_ids=..., token_type_ids=...)`, whose result has a
            `.numpy()` method.
        data_loader: Iterable of `(input_ids, token_type_ids)` batches.

    Returns:
        A numpy array with the per-batch results concatenated along axis 0
        (one row per input pair). An empty loader yields an empty array of
        shape `(0, 1)`.
    """

    batch_probs = []
    model.eval()

    with paddle.no_grad():
        for batch_data in data_loader:
            input_ids, token_type_ids = batch_data

            input_ids = paddle.to_tensor(input_ids, dtype='int64')
            token_type_ids = paddle.to_tensor(token_type_ids, dtype='int64')
            # Feed the query/title pair to get the predicted probability.
            batch_prob = model.predict(
                input_ids=input_ids, token_type_ids=token_type_ids).numpy()

            batch_probs.append(batch_prob)

    if not batch_probs:
        # Nothing to score: return an empty result instead of failing.
        # float32 mirrors the usual model output dtype (assumed).
        return np.empty((0, 1), dtype='float32')

    # Concatenation handles a single batch as well, so no special case is
    # needed for a one-element final batch.
    return np.concatenate(batch_probs, axis=0)



y_probs = predict(model, test_data_loader)
# Reload the raw text pairs so they can be printed next to their scores.
valid_ds = load_dataset(read_text_pair, data_path=input_file, lazy=False)
# Print each pair together with its predicted probability.
for idx, prob in enumerate(y_probs):
    text_pair = valid_ds[idx]
    # Each row of y_probs holds the score at index 0.
    text_pair["pred_prob"] = prob[0]
    print(text_pair)

### 预测部署

首先把动态图模型转换成静态图模型。

In [None]:
# Directory that receives the exported static-graph model.
output_path='output/rank'
model.eval()

# Convert to static graph with specific input description
# NOTE(review): the first two positional parameters of `forward` are
# `pos_input_ids` and `neg_input_ids`, while the spec comments below say
# input_ids / segment_ids — confirm how these specs bind to `forward`.
model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64"),  # input_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64")  # segment_ids
        ])
# Save in static graph model.
# NOTE(review): the Predictor in this notebook loads files named
# `inference.get_pooled_embedding.*`, presumably emitted because
# `get_pooled_embedding` is decorated with `to_static` — confirm.
save_path = os.path.join(output_path, "inference")
paddle.jit.save(model, save_path)

In [None]:
def convert_example_ranking(example, tokenizer, max_seq_length=512, is_test=False):
    """Tokenize one query/title example into model input features.

    Args:
        example: Mapping with `query` and `title` (and `label` unless
            `is_test` is True).
        tokenizer: Callable accepting `text`, `text_pair` and `max_seq_len`
            and returning a mapping with `input_ids` and `token_type_ids`.
        max_seq_length: Forwarded to the tokenizer as `max_seq_len`.
        is_test: When True the label is omitted from the result.

    Returns:
        `(input_ids, token_type_ids)` when `is_test` is True, otherwise
        `(input_ids, token_type_ids, label)` with `label` an int64 array of
        shape `(1,)`.
    """
    encoding = tokenizer(
        text=example["query"],
        text_pair=example["title"],
        max_seq_len=max_seq_length)
    features = (encoding["input_ids"], encoding["token_type_ids"])

    if is_test:
        return features
    return features + (np.array([example["label"]], dtype="int64"), )


定义Predictor用于加载静态图的模型参数进行预测。

In [None]:
class Predictor(object):
    """Runs the exported static-graph ranking model through Paddle Inference.

    Args:
        model_dir: Directory holding the exported
            `inference.get_pooled_embedding.pdmodel/.pdiparams` files. For
            backward compatibility a project root containing them under
            `output/rank/` is accepted as well.
        device: `"gpu"`, `"cpu"` or `"xpu"`; other values leave the config
            at its defaults.
        max_seq_length: Maximum sequence length used when tokenizing.
        batch_size: Stored on the instance and used as the TensorRT maximum
            batch size.
        use_tensorrt: Enable the TensorRT engine (GPU only).
        precision: `"fp32"`, `"fp16"` or `"int8"`; looked up on GPU.
        cpu_threads: Number of CPU math library threads (CPU only).
        enable_mkldnn: Enable MKLDNN acceleration (CPU only).

    Raises:
        ValueError: If the model or parameter file cannot be found.
    """

    # File-name prefix of the exported model files.
    _MODEL_PREFIX = "inference.get_pooled_embedding"

    def __init__(self,
                 model_dir,
                 device="gpu",
                 max_seq_length=128,
                 batch_size=32,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size

        model_file, params_file = self._resolve_model_files(model_dir)
        config = paddle.inference.Config(model_file, params_file)

        if device == "gpu":
            # set GPU configs accordingly
            # such as intialize the gpu memory, enable tensorrt
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
                "int8": inference.PrecisionType.Int8
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
            if enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)

        config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

    @classmethod
    def _resolve_model_files(cls, model_dir):
        """Locate the exported model and parameter files for `model_dir`.

        The files are looked up directly inside `model_dir` first (matching
        an export to `<model_dir>/inference`), then under the legacy
        `<model_dir>/output/rank` layout.

        Returns:
            Tuple `(model_file, params_file)`.

        Raises:
            ValueError: If no model file, or its parameter file, exists.
        """
        candidate_dirs = (model_dir, model_dir + "/output/rank")
        tried = []
        for directory in candidate_dirs:
            model_file = os.path.join(directory, cls._MODEL_PREFIX + ".pdmodel")
            if os.path.exists(model_file):
                break
            tried.append(model_file)
        else:
            raise ValueError("not find model file path {}".format(
                " or ".join(tried)))

        params_file = os.path.join(directory, cls._MODEL_PREFIX + ".pdiparams")
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))
        return model_file, params_file

    def predict(self, data, tokenizer):
        """Score one batch of query/title examples.

        Args:
            data: Iterable of mappings with `query` and `title`.
            tokenizer: Tokenizer forwarded to `convert_example_ranking`; also
                supplies the padding ids.

        Returns:
            The model output after applying the logistic sigmoid (`expit`).
        """

        examples = []
        for text in data:
            input_ids, segment_ids = convert_example_ranking(
                text,
                tokenizer,
                max_seq_length=self.max_seq_length,
                is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input
            # Segment ids are padded with the token-type pad id, consistent
            # with the other collate functions in this notebook.
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # segment
        ): fn(samples)


        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        sim_score = self.output_handle.copy_to_cpu()

        # The exported graph returns the raw similarity, so squash it here.
        sim_score = expit(sim_score)

        return sim_score

读取测试集的文本，把文本利用convert_example_ranking函数转换成id向量的形式。

In [None]:
# File with the text pairs to score.
input_file='test_pairwise.csv'

test_ds = load_dataset(read_text_pair,data_path=input_file, lazy=False)

# Keep only the fields the predictor needs.
data = [{'query': d['query'], 'title': d['title']} for d in test_ds]

# Split the examples into consecutive chunks of at most `batch_size` items.
batches = [
        data[idx:idx + batch_size]
        for idx in range(0, len(data), batch_size)
    ]
# Show the first chunk as a sanity check.
print(batches[0])

实例化Predictor，然后进行预测。

In [None]:
model_dir='output/rank'
device='gpu'
max_seq_length=128
batch_size=32
# Inference can be accelerated after installing the matching TensorRT.
use_tensorrt=False
# Precision; fp16 can also be chosen, with almost no loss of accuracy.
precision='fp32'
# Number of CPU threads.
cpu_threads=10
# Can accelerate inference when running on CPU.
enable_mkldnn=False

predictor = Predictor(model_dir, device, max_seq_length,
                          batch_size, use_tensorrt, precision,
                          cpu_threads, enable_mkldnn)
# Collect the scores of all batches in input order.
results = []
for batch_data in batches:
    results.extend(predictor.predict(batch_data, tokenizer))

# Print every example next to its score.
for idx, text in enumerate(data):
    print('Data: {} \t prob: {}'.format(text, results[idx]))
