In [1]:
# Switch the kernel's working directory to the workspace folder so that the
# relative paths used by later cells (e.g. 'ernie-m-large/...') resolve there.
# NOTE(review): this is an absolute, environment-specific path — adjust it when
# running outside the original environment.
%cd /home/aistudio/work/

/home/aistudio/work


In [2]:
# Import the libraries used throughout the notebook.
import numpy as np
import time
import os
import paddle
import paddle.nn.functional as F
import paddle.nn as nn

In [3]:
!pip install --upgrade paddlenlp==2.3.4

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Import the PaddleNLP packages needed below (done after the install cell so the
# pinned version is the one that gets imported).
import paddlenlp as ppnlp
from paddlenlp.data import Tuple, Pad

In [5]:
# Key hyperparameters.
# Factor the cosine similarities are multiplied by (recommended: 10 ~ 30).
scale = 20
# Margin subtracted from the positive-pair similarities (recommended: 0.0 ~ 0.2).
margin = 0.1
# Dropout rate for SimCSE; the pretrained language model's default dropout
# could be used instead.
dropout = 0.2
# Size of the sentence embedding after the linear projection layer
# (256 is the recommended value).
output_emb_size = 256
# Maximum sequence length; tune it to the length of your own texts.
max_seq_length = 140
# Empirically, the larger the batch size the better the result.
batch_size = 30
# Number of training epochs.
epochs = 1
weight_decay = 0.0
# Learning rate.
learning_rate = 5e-5
warmup_proportion = 0.0

In [6]:
# Use the ERNIE-M model as the encoder.
MODEL_NAME_OR_PATH = "ernie-m-large"
# Alternative: load from previously saved parameters instead, e.g.
# MODEL_NAME_OR_PATH = 'checkpoint'

# Both dropout probabilities of the encoder are overridden with the configured
# SimCSE dropout rate.
encoder_dropout_kwargs = {
    "hidden_dropout_prob": dropout,
    "attention_probs_dropout_prob": dropout,
}
pretrained_model = ppnlp.transformers.ErnieMModel.from_pretrained(
    MODEL_NAME_OR_PATH, **encoder_dropout_kwargs)

# The tokenizer turns raw text into the input format the model accepts. Its
# class has to match the chosen model; see the PaddleNLP documentation.
tokenizer = ppnlp.transformers.AutoTokenizer.from_pretrained("./ernie-m-large")

[2022-08-06 10:52:03,188] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-m-large/ernie_m_large.pdparams
W0806 10:52:03.191593 32432 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0806 10:52:03.196009 32432 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.
[2022-08-06 10:52:11,280] [    INFO] - We are using <class 'paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer'> to load './ernie-m-large'.


In [27]:
class SimCSE(nn.Layer):
    """SimCSE-style sentence encoder wrapped around a pretrained transformer.

    `get_pooled_embedding` turns token ids into L2-normalised sentence vectors
    and is decorated for static-graph export. `forward` computes the in-batch
    contrastive (cross-entropy) loss over the cosine-similarity matrix.

    Args:
        pretrained_model: Encoder called as
            `pretrained_model(input_ids, position_ids, attention_mask)` and
            expected to return `(sequence_output, pooled_output)`.
        dropout: Dropout rate applied to the sentence embedding. `None`
            falls back to 0.1.
        margin: Value subtracted from the positive-pair (diagonal)
            similarities before scaling.
        scale: Factor the similarities are multiplied by to ease convergence.
        output_emb_size: Size of the projected embedding. `None` or a
            non-positive value disables the projection layer.
    """

    def __init__(self,
                 pretrained_model,
                 dropout=None,
                 margin=0.0,
                 scale=20,
                 output_emb_size=None):

        super().__init__()

        self.ptm = pretrained_model
        # Explicit dropout layer so the rate can be controlled from here.
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)

        # A positive output_emb_size adds a linear layer that reduces the
        # embedding size (256 is recommended as a performance/quality
        # trade-off: larger vectors carry more information but cost more).
        # `None` is normalised to 0 so that the `> 0` checks below and in
        # `get_pooled_embedding` never compare None with an int.
        self.output_emb_size = output_emb_size if output_emb_size is not None else 0
        if self.output_emb_size > 0:
            weight_attr = paddle.ParamAttr(
                initializer=nn.initializer.TruncatedNormal(std=0.02))
            # The input width (1024) has to match the hidden size of
            # `pretrained_model`.
            self.emb_reduce_linear = paddle.nn.Linear(
                1024, self.output_emb_size, weight_attr=weight_attr)

        self.margin = margin
        # The cosine similarities are multiplied by `scale` (usually around
        # 20) to make training converge more easily.
        self.scale = scale
        # Misspelled alias of `scale`, kept so code that reads the old
        # attribute name keeps working.
        self.sacle = scale

    # The jit decorator allows this embedding-extraction function to be
    # exported as a static graph; the single InputSpec describes `input_ids`.
    @paddle.jit.to_static(input_spec=[paddle.static.InputSpec(shape=[None, None], dtype='int64')])
    def get_pooled_embedding(self,
                             input_ids,
                             position_ids=None,
                             attention_mask=None,
                             with_pooler=True):
        """Encode token ids into L2-normalised sentence embeddings.

        Args:
            input_ids: int64 tensor of token ids (rank 2 per the InputSpec).
            position_ids: Optional position ids forwarded to the encoder.
            attention_mask: Optional attention mask forwarded to the encoder.
            with_pooler: When truthy, use the encoder's pooled output;
                otherwise use the first-token vector of the sequence output.

        Returns:
            The sentence embedding after optional linear projection, dropout
            and L2 normalisation along the last axis.
        """
        # Note: cls_embedding is the pooled embedding (with tanh activation).
        sequence_output, cls_embedding = self.ptm(input_ids, position_ids, attention_mask)

        if not with_pooler:
            cls_embedding = sequence_output[:, 0, :]

        if self.output_emb_size > 0:
            cls_embedding = self.emb_reduce_linear(cls_embedding)
        cls_embedding = self.dropout(cls_embedding)
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/functional/normalize_cn.html
        cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
        return cls_embedding

    def forward(self,
                query_input_ids,
                title_input_ids=None,
                query_position_ids=None,
                query_attention_mask=None,
                title_position_ids=None,
                title_attention_mask=None):
        """Compute the in-batch contrastive loss.

        Args:
            query_input_ids: Token ids of the first view of each text.
            title_input_ids: Optional token ids of the second view. When
                omitted, the query inputs are used for the second view.
            query_position_ids, query_attention_mask: Optional encoder inputs
                for the first view.
            title_position_ids, title_attention_mask: Optional encoder inputs
                for the second view (used only with `title_input_ids`).

        Returns:
            The loss returned by `F.cross_entropy` over the scaled similarity
            matrix, where row i's positive is column i.
        """
        # First encoding: [N, emb], where emb is output_emb_size when the
        # projection layer is active.
        query_cls_embedding = self.get_pooled_embedding(
            query_input_ids, query_position_ids, query_attention_mask)

        # Second encoding: [N, emb].
        if title_input_ids is not None:
            # Explicit second view supplied by the caller.
            title_cls_embedding = self.get_pooled_embedding(
                title_input_ids, title_position_ids, title_attention_mask)
        elif self.training:
            # Unsupervised SimCSE: encode the same text again so that dropout
            # yields a different view of it.
            title_cls_embedding = self.get_pooled_embedding(
                query_input_ids, query_position_ids, query_attention_mask)
        else:
            # Dropout is inactive in eval mode, so a second pass would give
            # the same vectors; reuse the first result.
            title_cls_embedding = query_cls_embedding

        # Similarity matrix: [N, N]
        cosine_sim = paddle.matmul(
            query_cls_embedding, title_cls_embedding, transpose_y=True)

        # Subtract the margin from all positive-pair similarities.
        # E.g. with margin 0.2 and a batch of 2, margin_diag is [0.2, 0.2].
        margin_diag = paddle.full(
            shape=[query_cls_embedding.shape[0]],
            fill_value=self.margin,
            dtype=paddle.get_default_dtype())
        # paddle.diag(margin_diag): [[0.2, 0], [0, 0.2]]
        # cosine_sim before:        [[1.0, 0.6], [0.6, 1.0]]
        # cosine_sim after:         [[0.8, 0.6], [0.6, 0.8]]
        cosine_sim = cosine_sim - paddle.diag(margin_diag)

        # Scale the cosine similarities to ease convergence.
        cosine_sim = cosine_sim * self.scale

        # Cast as multi-class classification: diagonal entries are positives,
        # everything else is a negative.
        # labels: [0, 1, 2, 3]
        labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64')
        # labels: [[0], [1], [2], [3]]
        labels = paddle.reshape(labels, shape=[-1, 1])

        # Cross-entropy loss.
        loss = F.cross_entropy(input=cosine_sim, label=labels)
        return loss

In [28]:
# Build the SimCSE wrapper with the hyperparameters from the configuration cell.
# Previously only `output_emb_size` was forwarded, so the configured `dropout`,
# `margin` and `scale` were silently replaced by the class defaults.
model = SimCSE(
    pretrained_model,
    dropout=dropout,
    margin=margin,
    scale=scale,
    output_emb_size=output_emb_size)

In [29]:
# Checkpoint to load into the SimCSE wrapper (relative to the working directory).
params_path = 'ernie-m-large/model_state.pdparams'

if params_path and os.path.isfile(params_path):
    state_dict = paddle.load(params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % params_path)
else:
    # Make a skipped load visible: otherwise the following cells would export
    # the model with whatever weights it currently holds, without any hint
    # that the checkpoint was never applied.
    print("WARNING: no parameters loaded, checkpoint not found: %r" % params_path)

Loaded parameters from ernie-m-large/model_state.pdparams


In [30]:
# Dynamic-to-static conversion. `input_spec` describes the inputs the model
# expects; a `None` entry in `shape` marks a dimension of variable size.
model.eval()
input_ids_spec = paddle.static.InputSpec(shape=[None, None], dtype="int64")  # input_ids
model = paddle.jit.to_static(model, input_spec=[input_ids_spec])

In [31]:
# Save the converted model; this yields the files infer_model/model.pdmodel and
# infer_model/model.pdiparams.
infer_model_save_path = os.path.join("ernie-m-large", "infer_model/model")
paddle.jit.save(layer=model, path=infer_model_save_path)