In [None]:
from utils_8.data import load_vocab, load_lcqmc_data
# Load the train / dev / test splits; 'lcqmc' is the location argument handed to the loader.
# NOTE(review): each example is later unpacked as (text_a, text_b, label) — confirm against load_lcqmc_data.
train_data, dev_data, test_data = load_lcqmc_data('lcqmc')
# Load the vocabulary; below it is used as a token -> id mapping.
word2id_dict = load_vocab()

In [None]:
from paddle.io import Dataset

class LCQMCDataset(Dataset):
    """Map-style dataset over sentence pairs.

    Every raw example is unpacked as ``(text_a, text_b, label)``. An item is
    returned as ``(input_ids, segment_ids, label)`` where the two texts are
    joined as ``[CLS] text_a [SEP] text_b [SEP]``.
    """

    def __init__(self, data, word2id_dict):
        # Token -> id lookup table.
        self.word2id_dict = word2id_dict
        # Raw examples, indexed by position.
        self.examples = data
        # Id of the leading [CLS] placeholder token.
        self.cls_id = self.word2id_dict['[CLS]']
        # Id of the [SEP] token that closes each sentence.
        self.sep_id = self.word2id_dict['[SEP]']

    def __getitem__(self, idx):
        """Return the encoded ``(input_ids, segment_ids, label)`` sample at ``idx``."""
        return self.words_to_id(self.examples[idx])

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def _encode(self, text):
        """Map every item of ``text`` to its id, falling back to the [UNK] id."""
        ids = []
        for token in text:
            if token in self.word2id_dict:
                ids.append(self.word2id_dict[token])
            else:
                # The [UNK] entry is only looked up when a token is actually missing.
                ids.append(self.word2id_dict['[UNK]'])
        return ids

    def words_to_id(self, example):
        """Encode one ``(text_a, text_b, label)`` example.

        Returns:
            ``(input_ids, segment_ids, label)``: two equally long lists of ints
            plus the label converted with ``int``.
        """
        text_a, text_b, label = example
        input_ids_a = self._encode(text_a)
        input_ids_b = self._encode(text_b)
        # [CLS] a... [SEP] b... [SEP]
        input_ids = [self.cls_id, *input_ids_a, self.sep_id, *input_ids_b, self.sep_id]
        # Segment 0 covers [CLS] + text_a + first [SEP]; segment 1 covers text_b + final [SEP].
        first_segment = [0] * (len(input_ids_a) + 2)
        second_segment = [1] * (len(input_ids_b) + 1)
        return input_ids, first_segment + second_segment, int(label)

    @property
    def label_list(self):
        """Label names: '0' means not similar, '1' means similar."""
        return ['0', '1']

# Wrap the training split.
train_dataset = LCQMCDataset(train_data, word2id_dict)
# Wrap the validation split.
dev_dataset = LCQMCDataset(dev_data, word2id_dict)
# Wrap the test split.
test_dataset = LCQMCDataset(test_data, word2id_dict)

In [None]:
from paddle.io import DataLoader
import paddle

def collate_fn(batch_data, pad_val=0, max_seq_len=512):
    """Collate encoded samples into padded batch tensors.

    Args:
        batch_data: iterable of ``(input_ids, segment_ids, label)`` samples where
            the first two entries are Python lists.
        pad_val: value appended to short sequences (used for both input ids and
            segment ids).
        max_seq_len: sequences longer than this are truncated to this length.

    Returns:
        ``((input_ids, segment_ids), labels)`` with every list converted through
        ``paddle.to_tensor``. All sequences in the batch share one length, which
        never exceeds ``max_seq_len``.
    """
    input_ids, segment_ids, labels = [], [], []
    for input_id, segment_id, label in batch_data:
        # Truncate over-long sequences.
        input_ids.append(input_id[:max_seq_len])
        segment_ids.append(segment_id[:max_seq_len])
        labels.append(label)
    # Pad to the longest *truncated* sequence. Measuring the untruncated length
    # would pad the batch beyond max_seq_len whenever a sample was cut.
    max_len = max((len(ids) for ids in input_ids), default=0)
    input_ids = [ids + [pad_val] * (max_len - len(ids)) for ids in input_ids]
    segment_ids = [ids + [pad_val] * (max_len - len(ids)) for ids in segment_ids]
    return (paddle.to_tensor(input_ids), paddle.to_tensor(segment_ids)), paddle.to_tensor(labels)

batch_size = 32
# All three loaders share the same batching settings; only the dataset differs.
# NOTE(review): the training loader is not shuffled either — confirm this is intended.
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
train_loader = DataLoader(dataset=train_dataset, **loader_kwargs)
dev_loader = DataLoader(dataset=dev_dataset, **loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, **loader_kwargs)

# Show the first mini-batch only, then stop iterating.
for item in train_loader:
    print(item)
    break

In [None]:
import paddle
import paddle.nn as nn

# 输入编码
class WordEmbedding(nn.Layer):
    """Token embedding whose output is scaled by ``sqrt(emb_size)``.

    The table is initialised from a normal distribution with mean 0 and standard
    deviation ``emb_size ** -0.5``.
    """

    def __init__(self, vocab_size, emb_size, padding_idx=0):
        super().__init__()
        # Width of each embedding vector.
        self.emb_size = emb_size
        normal_init = nn.initializer.Normal(0.0, emb_size ** -0.5)
        self.word_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_size,
            padding_idx=padding_idx,
            weight_attr=paddle.ParamAttr(initializer=normal_init),
        )

    def forward(self, word):
        """Look up ``word`` ids and multiply the vectors by ``sqrt(emb_size)``."""
        scale = self.emb_size ** 0.5
        return scale * self.word_embedding(word)
    
# Fix the RNG seed before the layer is built so its random initialisation is repeatable.
paddle.seed(2023)
# A toy input of three token ids.
X = paddle.to_tensor([1, 0, 2])
# Vocabulary of 10 tokens, each mapped to a 4-dimensional vector.
word_embed = WordEmbedding(10, 4)
print("输入编码为：{}".format(X.numpy()))
word_out = word_embed(X)
print("输出编码为：{}".format(word_out.numpy()))

In [None]:
# 分段编码
class SegmentEmbedding(nn.Layer):
    """Embedding lookup for segment (sentence) ids."""

    def __init__(self, vocab_size, emb_size):
        super().__init__()
        # Width of each embedding vector.
        self.emb_size = emb_size
        # One row per segment id.
        self.seg_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)

    def forward(self, word):
        """Return the segment vectors for the segment ids in ``word``."""
        return self.seg_embedding(word)
    
# Fix the RNG seed before the layer is built so its random initialisation is repeatable.
paddle.seed(2023)
# Toy segment ids: 0 marks tokens of the first sentence, 1 marks tokens of the second.
X = paddle.to_tensor([0, 0, 1, 1])
# Two possible segment ids, each mapped to a 4-dimensional vector.
word_embed = SegmentEmbedding(2, 4)
print("输入编码为：{}".format(X.numpy()))
word_out = word_embed(X)
print("输出为：{}".format(word_out.numpy()))

In [None]:
# 位置编码
import numpy as np
import paddle

# position_size 为句子划分成字符或者词的长度，hidden_size为词向量的维度。
def get_sinusoid_encoding(position_size, hidden_size):
    """Build the sinusoidal position-encoding table.

    For position ``pos`` and column ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / hidden_size)``; even columns hold the sine
    of that angle and odd columns hold the cosine.

    Args:
        position_size: number of positions (rows).
        hidden_size: width of each position vector (columns).

    Returns:
        ``float32`` numpy array of shape ``(position_size, hidden_size)``. Either
        dimension may be 0, in which case an empty array of that shape is returned.
    """
    # Column vector of positions, so the division below broadcasts to (position_size, hidden_size).
    positions = np.arange(position_size, dtype="float64").reshape(-1, 1)
    # The "i" of the formula: columns 2i and 2i+1 share one frequency.
    pair_index = np.arange(hidden_size) // 2
    scale = np.power(10000, 2 * pair_index / hidden_size)
    sinusoid = positions / scale
    # Even columns (0, 2, 4, ...) take the sine.
    sinusoid[:, 0::2] = np.sin(sinusoid[:, 0::2])
    # Odd columns (1, 3, 5, ...) take the cosine.
    sinusoid[:, 1::2] = np.cos(sinusoid[:, 1::2])
    return sinusoid.astype("float32")

paddle.seed(2023)
# Toy sizes: 4 positions, 3-dimensional position vectors.
position_size = 4
hidden_size = 3
encoding_vec = get_sinusoid_encoding(position_size=position_size, hidden_size=hidden_size)
print("位置编码的输出：{}".format(encoding_vec))

In [None]:
class PositionalEmbedding(nn.Layer):
    """Position-id lookup whose table is initialised from the sinusoidal encoding."""

    def __init__(self, max_length, emb_size):
        super().__init__()
        self.emb_size = emb_size
        # Use the sinusoid table as the initial embedding weight.
        sinusoid_table = get_sinusoid_encoding(max_length, self.emb_size)
        table_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(sinusoid_table))
        self.pos_encoder = nn.Embedding(num_embeddings=max_length, embedding_dim=self.emb_size,
                                        weight_attr=table_attr)

    def forward(self, pos):
        """Return the position vectors for the position ids in ``pos``."""
        encoded = self.pos_encoder(pos)
        # Stop gradients at the looked-up vectors so the position encoding is not updated.
        encoded.stop_gradient = True
        return encoded
    
paddle.seed(2023)
# Draw 3 random position ids. `high` is exclusive, so it must equal the table size (4 rows)
# used below; a larger bound could produce id 4, which is outside the table.
out = paddle.randint(low=0, high=4, shape=[3])
print('输入向量为：{}'.format(out.numpy()))
# Table with 4 positions, each a 5-dimensional vector.
pos_embed = PositionalEmbedding(4, 5)
pos_out = pos_embed(out)
print('位置编码的输出为：{}'.format(pos_out.numpy()))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot_curve(size, y):
    """Plot columns 4-7 of the first sequence in ``y`` against position.

    Args:
        size: number of positions drawn on the x axis.
        y: tensor indexed as ``y[0, :, d:d+1]`` whose slices expose ``.numpy()``.

    The figure is saved to 'att-vis2.pdf'.
    """
    plt.figure(figsize=(15, 5))
    positions = np.arange(size)
    dims = [4, 5, 6, 7]
    colors = ['#E20079', '#3D3D3F', '#8e004D', '#284279']
    linestyles = ['-', '--', '-.', ':']
    # One curve per plotted dimension, each with its own colour and line style.
    for dim, color, linestyle in zip(dims, colors, linestyles):
        plt.plot(positions, y[0, :, dim:dim + 1].numpy(), color=color, linestyle=linestyle)
    plt.legend(["dim %d" % dim for dim in dims], fontsize='large')
    plt.savefig('att-vis2.pdf')

# Position table with 5000 positions, each a 20-dimensional vector.
model = PositionalEmbedding(emb_size=20, max_length=5000)
# Position ids 0..99, i.e. the first 100 positions.
size = 100
# Shape [1, size]: a single sequence of position ids.
X = paddle.arange((size)).reshape([1, size])
# Encode the 100 positions; y has shape [1, 100, 20].
y = model(X)
# Visualise columns 4, 5, 6 and 7 of the encoded positions.
plot_curve(size=size, y=y)

In [None]:
class TransformerEmbeddings(nn.Layer):
    """Input embedding layer: token + segment + position embeddings.

    The three embeddings are summed, layer-normalised and passed through dropout.

    Args:
        vocab_size: number of rows in the token embedding table.
        hidden_size: width of every embedding vector.
        hidden_dropout_prob: dropout probability applied to the summed embedding.
        position_size: number of positions in the position table.
        segment_size: number of distinct segment ids.
    """
    def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, position_size=512, segment_size=2):
        super(TransformerEmbeddings, self).__init__()
        # Token embedding.
        self.word_embeddings = WordEmbedding(vocab_size=vocab_size, emb_size=hidden_size)
        # Position embedding.
        self.position_embeddings = PositionalEmbedding(max_length=position_size, emb_size=hidden_size)
        # Segment embedding.
        self.segment_embeddings = SegmentEmbedding(vocab_size=segment_size, emb_size=hidden_size)
        # Layer normalisation over the hidden dimension.
        self.layer_norm = nn.LayerNorm(hidden_size)
        # Dropout applied after normalisation.
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, input_ids, segment_ids=None, position_ids=None):
        """Embed ``input_ids``.

        Args:
            input_ids: integer tensor of token ids.
            segment_ids: integer tensor shaped like ``input_ids``; when ``None``
                every token is assigned segment 0.
            position_ids: integer tensor shaped like ``input_ids``; when ``None``
                positions ``0, 1, 2, ...`` along the last axis are used.

        Returns:
            The embedded tensor after layer normalisation and dropout.
        """
        if position_ids is None:
            # All-ones tensor, e.g. [1, 1, 1, 1].
            ones = paddle.ones_like(input_ids, dtype="int64")
            # Running count along the sequence axis, e.g. [1, 2, 3, 4].
            seq_length = paddle.cumsum(ones, axis=-1)
            # Zero-based positions, e.g. [0, 1, 2, 3].
            position_ids = seq_length - ones
            position_ids.stop_gradient = True
        if segment_ids is None:
            # The signature advertises segment_ids as optional; without this fallback
            # None would be passed straight into the embedding lookup.
            segment_ids = paddle.zeros_like(input_ids, dtype="int64")
        # Token embedding.
        input_embeddings = self.word_embeddings(input_ids)
        # Segment embedding.
        segment_embeddings = self.segment_embeddings(segment_ids)
        # Position embedding.
        position_embeddings = self.position_embeddings(position_ids)
        # Sum the three embeddings.
        embeddings = input_embeddings + segment_embeddings + position_embeddings
        # Layer normalisation.
        embeddings = self.layer_norm(embeddings)
        # Dropout.
        embeddings = self.dropout(embeddings)
        return embeddings


Transformer组块: 多头注意力层、加与规范化层、前馈层、加与规范化层

In [None]:
import paddle.nn as nn

class AddNorm(nn.Layer):
    """Residual connection followed by layer normalisation."""

    def __init__(self, size, dropout_rate):
        super().__init__()
        self.layer_norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, X, H):
        """Return ``LayerNorm(X + Dropout(H))``.

        Args:
            X: input of the wrapped sub-layer.
            H: output of the wrapped sub-layer.
        """
        residual = X + self.dropout(H)
        return self.layer_norm(residual)

In [None]:
import paddle.nn.functional as F

class PositionwiseFFN(nn.Layer):
    """Position-wise feed-forward layer: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, input_size, mid_size, dropout=0.1):
        super().__init__()
        # Expand from input_size to mid_size, then project back to input_size.
        self.W_1 = nn.Linear(input_size, mid_size)
        self.W_2 = nn.Linear(mid_size, input_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        """Apply the feed-forward transformation to ``X``."""
        hidden = F.relu(self.W_1(X))
        hidden = self.dropout(hidden)
        return self.W_2(hidden)

In [None]:
class TransformerBlock(nn.Layer):
    """One encoder block: self-attention, add & norm, feed-forward, add & norm.

    Args:
        input_size: width of the input vectors.
        head_num: number of attention heads.
        ffn_size: hidden width of the position-wise feed-forward layer.
        dropout: dropout used inside add & norm; also the fallback for the two
            rates below.
        attn_dropout: dropout inside multi-head attention (``dropout`` when None).
        act_dropout: dropout inside the feed-forward layer (``dropout`` when None).
    """

    def __init__(self, input_size, head_num, ffn_size, dropout=0.1, attn_dropout=None, act_dropout=None):
        super().__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        # Keep the configuration on the instance.
        self.input_size = input_size
        self.head_num = head_num
        self.ffn_size = ffn_size
        self.dropout = dropout
        self.attn_dropout = attn_dropout
        self.act_dropout = act_dropout
        # Multi-head self-attention; need_weights=True makes it return the attention weights too.
        self.multi_head_attention = nn.MultiHeadAttention(embed_dim=input_size, num_heads=head_num,
                                                          dropout=attn_dropout, need_weights=True)
        # Position-wise feed-forward layer.
        self.ffn = PositionwiseFFN(input_size, ffn_size, act_dropout)
        # NOTE(review): a single AddNorm (one LayerNorm) is reused after both sub-layers — confirm
        # this sharing is intended.
        self.addnorm = AddNorm(size=input_size, dropout_rate=dropout)

    def forward(self, X, src_mask=None):
        """Run the block on ``X`` and return ``(output, attention_weights)``."""
        attended, atten_weights = self.multi_head_attention(X, attn_mask=src_mask)
        hidden = self.addnorm(X, attended)
        hidden = self.addnorm(hidden, self.ffn(hidden))
        return hidden, atten_weights

In [None]:
# 模型汇总
class Model_Transformer(nn.Layer):
    """Sentence-pair classifier built from the hand-written Transformer blocks.

    Args:
        vocab_size: size of the token vocabulary.
        n_block: number of stacked ``TransformerBlock`` encoders.
        hidden_size: width of the token vectors.
        heads_num: number of attention heads.
        intermediate_size: hidden width of the position-wise feed-forward layer.
        hidden_dropout: dropout of the embedding layer and of add & norm.
        attention_dropout: dropout inside multi-head attention.
        act_dropout: dropout inside the feed-forward layer.
        position_size: number of positions in the position table.
        num_classes: number of output classes.
        padding_idx: id of the padding token, used to build the attention mask.
    """
    def __init__(self, vocab_size, n_block=2, hidden_size=768, heads_num=12, intermediate_size=3072, hidden_dropout=0.1,
                 attention_dropout=0.1, act_dropout=0, position_size=512, num_classes=2, padding_idx=0):
        super(Model_Transformer, self).__init__()
        # Vocabulary size.
        self.vocab_size = vocab_size
        # Number of encoder blocks.
        self.n_block = n_block
        # Width of each token vector.
        self.hidden_size = hidden_size
        # Number of attention heads.
        self.heads_num = heads_num
        # Hidden width of the feed-forward layer.
        self.intermediate_size = intermediate_size
        # Dropout of the embedding layer.
        self.hidden_dropout = hidden_dropout
        # Dropout inside multi-head attention.
        self.attention_dropout = attention_dropout
        # Size of the position table.
        self.position_size = position_size
        # Number of classes.
        self.num_classes = num_classes
        # Dropout inside the feed-forward layer.
        self.act_dropout = act_dropout
        # Id of the padding token.
        self.padding_idx = padding_idx
        # Attention weights of the most recent forward pass, one entry per block.
        # Created here so the `attention_weight` property is readable before the first forward.
        self._attention_weights = []
        # Token, segment and position embeddings.
        self.embeddings = TransformerEmbeddings(vocab_size=self.vocab_size, hidden_size=self.hidden_size, hidden_dropout_prob=self.hidden_dropout, position_size=self.position_size)
        # Stack of encoder blocks.
        self.layers = nn.LayerList([])
        for i in range(n_block):
            encoder_layer = TransformerBlock(input_size=hidden_size, head_num=heads_num, ffn_size=intermediate_size, dropout=hidden_dropout,
                                             attn_dropout=attention_dropout, act_dropout=act_dropout)
            self.layers.append(encoder_layer)
        # Fully connected layer.
        self.dense = nn.Linear(hidden_size, hidden_size)
        # Tanh activation.
        self.activation = nn.Tanh()
        # Final classifier.
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs, position_ids=None, attention_mask=None):
        """Classify a batch.

        Args:
            inputs: pair ``(input_ids, segment_ids)``.
            position_ids: optional explicit position ids, forwarded to the embedding layer.
            attention_mask: optional mask passed to every block; built from the
                padding positions when ``None``.

        Returns:
            The classifier logits computed from the vector at position 0.
        """
        input_ids, segment_ids = inputs
        # Additive mask: -1e9 where input_ids equals padding_idx, 0 elsewhere, with two
        # singleton axes inserted at positions 1 and 2.
        if attention_mask is None:
            attention_mask = paddle.unsqueeze((input_ids == self.padding_idx).astype("float32") * -1e9, axis=[1,2])
        # Embed the input.
        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, segment_ids=segment_ids)
        sequence_output = embedding_output
        # Reset the stored weights for this forward pass.
        self._attention_weights = []
        # Run the blocks, collecting the attention weights of each.
        for i, encoder_layer in enumerate(self.layers):
            sequence_output, atten_weights = encoder_layer(sequence_output, src_mask=attention_mask)
            self._attention_weights.append(atten_weights)
        # The vector at position 0 serves as the sentence-level representation.
        first_token_tensor = sequence_output[:, 0]
        # Output layer.
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        # Sentence-level classification.
        logits = self.classifier(pooled_output)
        return logits
    
    @property
    def attention_weight(self):
        """Attention weights of the last forward pass (empty list before any forward)."""
        return self._attention_weights

In [None]:
from nndl_8_2 import Accuracy, RunnerV3
import os
import paddle.nn.functional as F

# Seed the RNG before the model is built so parameter initialisation is repeatable.
paddle.seed(2023)
# Number of attention heads.
heads_num = 4
epochs = 3
# NOTE(review): hard-coded vocabulary size — presumably equals len(word2id_dict); confirm.
vocab_size = 21128
num_classes = 2
# Id of the [PAD] token, used by the model to build its attention mask.
padding_idx = word2id_dict['[PAD]']
# Cross-entropy loss.
criterion = nn.CrossEntropyLoss()
# Accuracy is the evaluation metric.
metric = Accuracy()
model = Model_Transformer(vocab_size=vocab_size, n_block=1, num_classes=num_classes, padding_idx=padding_idx, heads_num=heads_num)
# Names of the parameters eligible for weight decay: everything whose structured name
# contains neither "bias" nor "norm".
decay_params = [p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])]

# Optimizer.
# NOTE(review): weight_decay is 0.0, so the decay filter above presumably has no effect — confirm.
optimizer = paddle.optimizer.AdamW(learning_rate=5E-5, parameters=model.parameters(), weight_decay=0.0, apply_decay_param_fun=lambda x:x in decay_params)

runner = RunnerV3(model=model, optimizer=optimizer, loss_fn=criterion, metric=metric)
# Checkpoint path handed to the runner.
save_path = './checkpoint/model_best03.pdparams'
runner.train(train_loader=train_loader, dev_loader=dev_loader, num_epochs=epochs, log_steps=100, eval_steps=500, save_path=save_path)

In [None]:
from nndl_8_2 import plot

# Plot what the runner recorded during training; 'att-loss-acc3.pdf' is passed as the figure name.
# NOTE(review): the exact curves drawn are decided by nndl_8_2.plot — confirm there.
plot(runner=runner, fig_name='att-loss-acc3.pdf')

In [None]:
# Reload the checkpoint written during training and score it on the test loader.
model_path = './checkpoint/model_best03.pdparams'
runner.load_model(model_path=model_path)
# The second value returned by evaluate is not used here.
accuracy, _ = runner.evaluate(test_loader)
print(f"Evaluate on test set, Accuracy:{accuracy:.5f}")

In [None]:
# Reload the checkpoint written during training.
model_path = './checkpoint/model_best03.pdparams'
runner.load_model(model_path=model_path)
# The sentence pair to classify.
text_a = '电脑怎么录像'
text_b = '如何在计算机上录视频'
# Id of [CLS].
cls_id = word2id_dict['[CLS]']
# Id of [SEP].
sep_id = word2id_dict['[SEP]']
# Character-level ids of text_a; unknown characters fall back to [UNK].
input_ids_a = [word2id_dict[item] if item in word2id_dict else word2id_dict['[UNK]']
               for item in text_a]
# Character-level ids of text_b; unknown characters fall back to [UNK].
input_ids_b = [word2id_dict[item] if item in word2id_dict else word2id_dict['[UNK]']
               for item in text_b]
# Join as [CLS] text_a [SEP] text_b [SEP].
input_ids = [cls_id] + input_ids_a +[sep_id] + input_ids_b + [sep_id]
# Segment 0 covers [CLS] + text_a + [SEP]; segment 1 covers text_b + [SEP].
segment_ids = [0]*(len(input_ids_a)+2) + [1]*(len(input_ids_b)+1)
# Convert to tensors with a leading batch axis of size 1.
input_ids = paddle.to_tensor([input_ids])
segment_ids = paddle.to_tensor([segment_ids])
inputs = [input_ids, segment_ids]
# Run the model.
logits = runner.predict(inputs)
# Index of the largest score along axis 1 for the single sample.
label_id = paddle.argmax(logits, axis=1).numpy()[0]
print('预测的label标签 {}'.format(label_id))

注意力可视化

In [None]:
# Load the trained weights into the model.
model_path = './checkpoint/model_best03.pdparams'
loaded_dict = paddle.load(model_path)
model.load_dict(loaded_dict)
# Switch to evaluation mode once, before inference.
model.eval()
# The sentence pair to inspect.
text_a = '电脑怎么录像?'
text_b = '如何在计算机上录视频'
# Axis labels for the heat maps: one entry per input position.
texts = ['CLS'] + list(text_a) + ['SEP'] + list(text_b) + ['SEP']
# Look up the special-token ids here so this cell does not depend on variables
# left behind by the prediction cell.
cls_id = word2id_dict['[CLS]']
sep_id = word2id_dict['[SEP]']
# Character-level ids of both texts; unknown characters fall back to [UNK].
input_ids_a = [word2id_dict[item] if item in word2id_dict else word2id_dict['[UNK]']
               for item in text_a]
input_ids_b = [word2id_dict[item] if item in word2id_dict else word2id_dict['[UNK]']
               for item in text_b]
# Join as [CLS] text_a [SEP] text_b [SEP].
input_ids = [cls_id] + input_ids_a + [sep_id] + input_ids_b + [sep_id]
# Segment 0 covers [CLS] + text_a + [SEP]; segment 1 covers text_b + [SEP].
segment_ids = [0]*(len(input_ids_a)+2) + [1]*(len(input_ids_b)+1)
print("输入的文本为：{}".format(texts))
print("输入的id形式：{}".format(input_ids))
# Convert to tensors with a leading batch axis of size 1.
input_ids = paddle.to_tensor([input_ids])
segment_ids = paddle.to_tensor([segment_ids])
inputs = [input_ids, segment_ids]
# Forward pass without gradient tracking; the return value is the classifier logits.
with paddle.no_grad():
    pooled_output = model(inputs)
# Attention weights of the first encoder block, as a numpy array.
atten_weights = model.attention_weight[0].numpy()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
import pandas as pd

# Attention weights of the first (only) sample in the batch; indexed per head below.
data_attention = atten_weights[0]
plt.clf()
font_size = 25
# Font file used for the tick labels.
# NOTE(review): 'simhei.ttf' is a relative path — confirm the file ships next to the notebook.
font = FontProperties(fname='simhei.ttf', size=font_size)
# Draw one heat map per attention head (heads_num heads in total).
for head in range(heads_num):
    # Rows and columns are both labelled with the input tokens.
    data = pd.DataFrame(data=data_attention[head], index=texts, columns=texts)
    f, ax = plt.subplots(figsize=(13, 13))
    # Render the weights as a heat map without a colour bar.
    sns.heatmap(data=data, ax=ax, cmap='OrRd', cbar=False)
    # Rotate the y tick labels by 270 degrees.
    label_y = ax.get_yticklabels()
    plt.setp(label_y, rotation=270, horizontalalignment="right", fontproperties=font)
    # Keep the x tick labels unrotated.
    label_x = ax.get_xticklabels()
    plt.setp(label_x, rotation=0, horizontalalignment='right', fontproperties=font)
    # One PDF per head.
    plt.savefig('att-vis3_{}.pdf'.format(head))
    plt.show()

基于框架API实现文本语义匹配

In [None]:
class Model_Transformer_v1(nn.Layer):
    """Sentence-pair classifier whose encoder uses ``nn.TransformerEncoderLayer``.

    Args:
        vocab_size: size of the token vocabulary.
        n_block: number of stacked encoder layers.
        hidden_size: width of the token vectors.
        heads_num: number of attention heads.
        intermediate_size: hidden width of the feed-forward layer.
        hidden_dropout: dropout of the embedding layer and of the encoder layers.
        attention_dropout: dropout inside multi-head attention.
        act_dropout: dropout inside the feed-forward layer.
        position_size: number of positions in the position table.
        num_classes: number of output classes.
        padding_idx: id of the padding token, used to build the attention mask.
    """

    def __init__(self, vocab_size, n_block=1, hidden_size=768, heads_num=12, intermediate_size=3072, hidden_dropout=0.1,
                 attention_dropout=0.1, act_dropout=0, position_size=512, num_classes=2, padding_idx=0):
        super().__init__()
        # Keep the configuration on the instance.
        self.vocab_size = vocab_size
        self.n_block = n_block
        self.hidden_size = hidden_size
        self.heads_num = heads_num
        self.intermediate_size = intermediate_size
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.position_size = position_size
        self.num_classes = num_classes
        self.act_dropout = act_dropout
        self.padding_idx = padding_idx
        # Token, segment and position embeddings.
        self.embeddings = TransformerEmbeddings(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            hidden_dropout_prob=self.hidden_dropout,
            position_size=self.position_size,
        )
        # Encoder stack built from the framework's encoder layer.
        self.layers = nn.LayerList([
            nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=heads_num,
                dim_feedforward=intermediate_size,
                dropout=hidden_dropout,
                attn_dropout=attention_dropout,
                act_dropout=act_dropout,
            )
            for _ in range(n_block)
        ])
        # Fully connected layer followed by tanh.
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()
        # Final classifier.
        self.classifier = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, inputs, position_ids=None, attention_mask=None):
        """Return the classifier logits for ``inputs = (input_ids, segment_ids)``."""
        input_ids, segment_ids = inputs
        if attention_mask is None:
            # Additive mask: -1e9 where input_ids equals padding_idx, 0 elsewhere, with two
            # singleton axes inserted at positions 1 and 2.
            is_padding = (input_ids == self.padding_idx).astype("float32")
            attention_mask = paddle.unsqueeze(is_padding * -1e9, axis=[1,2])
        sequence_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, segment_ids=segment_ids)
        # Reset on every call; this variant never appends to it.
        self._attention_weights = []
        for encoder_layer in self.layers:
            sequence_output = encoder_layer(sequence_output, src_mask=attention_mask)
        # The vector at position 0 serves as the sentence-level representation.
        first_token_tensor = sequence_output[:,0]
        pooled_output = self.activation(self.dense(first_token_tensor))
        return self.classifier(pooled_output)

In [None]:
# 模型训练

# Seed the RNG before the model is built so parameter initialisation is repeatable.
paddle.seed(2023)
# Number of attention heads.
heads_num = 4
epochs = 3
# NOTE(review): hard-coded vocabulary size — presumably equals len(word2id_dict); confirm.
vocab_size = 21128
num_classes = 2
# Cross-entropy loss.
criterion = nn.CrossEntropyLoss()
# Accuracy is the evaluation metric.
metric = Accuracy()
# Classifier built on the framework encoder layer.
# NOTE(review): padding_idx, Accuracy and RunnerV3 are not defined in this cell; they must
# already exist in the kernel from an earlier cell.
model = Model_Transformer_v1(vocab_size=vocab_size, n_block=1, num_classes=num_classes, heads_num=heads_num,
                             padding_idx=padding_idx)
# Names of the parameters eligible for weight decay: everything whose structured name
# contains neither 'bias' nor 'norm'.
decay_params = [p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ['bias', 'norm'])]

# Optimizer.
# NOTE(review): weight_decay is 0.0, so the decay filter above presumably has no effect — confirm.
optimizer = paddle.optimizer.AdamW(learning_rate=5E-5, parameters=model.parameters(), weight_decay=0.0,
                                   apply_decay_param_fun=lambda x:x in decay_params)

runner = RunnerV3(model=model, optimizer=optimizer, loss_fn=criterion, metric=metric)
# Checkpoint path handed to the runner.
save_path = './checkpoint/model_best04.pdparams'
runner.train(train_loader=train_loader, dev_loader=dev_loader, num_epochs=epochs, log_steps=100, eval_steps=500, save_path=save_path)

In [None]:
# Reload the checkpoint written during training and score it on the test loader.
model_path = './checkpoint/model_best04.pdparams'
runner.load_model(model_path=model_path)
# The second value returned by evaluate is not used here.
accuracy, _ = runner.evaluate(test_loader)
print(f"Evaluate on test set, Accuracy: {accuracy:.5f}")

增加Transformer层数的实验

In [60]:
# Seed the RNG before the model is built so parameter initialisation is repeatable.
paddle.seed(2023)
# Number of attention heads.
heads_num = 4
epochs = 3
# NOTE(review): hard-coded vocabulary size — presumably equals len(word2id_dict); confirm.
vocab_size = 21128
num_classes = 2
# Cross-entropy loss.
criterion = nn.CrossEntropyLoss()
# Accuracy is the evaluation metric.
metric = Accuracy()
# Same classifier as before, but with two stacked encoder layers (n_block=2).
model = Model_Transformer_v1(vocab_size=vocab_size, n_block=2, num_classes=num_classes, heads_num=heads_num, padding_idx=padding_idx)
# Names of the parameters eligible for weight decay: everything whose structured name
# contains neither 'bias' nor 'norm'.
decay_params = [p.name for n,p in model.named_parameters()
                if not any(nd in n for nd in ['bias', 'norm'])]

# Optimizer.
# NOTE(review): weight_decay is 0.0, so the decay filter above presumably has no effect — confirm.
optimizer = paddle.optimizer.AdamW(learning_rate=5E-5, parameters=model.parameters(), weight_decay=0.0,
                                   apply_decay_param_fun=lambda x:x in decay_params)

runner = RunnerV3(model=model, optimizer=optimizer, loss_fn=criterion, metric=metric)
# Checkpoint path handed to the runner.
save_path = './checkpoint/model_best05.pdparams'
runner.train(train_loader=train_loader, dev_loader=dev_loader, num_epochs=epochs, log_steps=100,
             eval_steps=500, save_path=save_path)

[Train] epoch: 0/3, step: 1300/22386, loss: 0.53706
[Train] epoch: 0/3, step: 1400/22386, loss: 0.47577
[Train] epoch: 0/3, step: 1500/22386, loss: 0.71870
[Evaluate]  dev score: 0.58328, dev loss: 0.71074
[Evaluate] best accuracy performence has been updated: 0.55987 --> 0.58328
[Train] epoch: 0/3, step: 1600/22386, loss: 0.54842
[Train] epoch: 0/3, step: 1700/22386, loss: 0.60748
[Train] epoch: 0/3, step: 1800/22386, loss: 0.61626
[Train] epoch: 0/3, step: 1900/22386, loss: 0.39963
[Train] epoch: 0/3, step: 2000/22386, loss: 0.45947
[Evaluate]  dev score: 0.58828, dev loss: 0.72194
[Evaluate] best accuracy performence has been updated: 0.58328 --> 0.58828
[Train] epoch: 0/3, step: 2100/22386, loss: 0.66097
[Train] epoch: 0/3, step: 2200/22386, loss: 0.40756
[Train] epoch: 0/3, step: 2300/22386, loss: 0.45504
[Train] epoch: 0/3, step: 2400/22386, loss: 0.52569
[Train] epoch: 0/3, step: 2500/22386, loss: 0.65277
[Evaluate]  dev score: 0.61350, dev loss: 0.67520
[Evaluate] best accuracy

In [61]:
# Reload the checkpoint written during training and score it on the test loader.
model_path = './checkpoint/model_best05.pdparams'
runner.load_model(model_path=model_path)
# The second value returned by evaluate is not used here.
accuracy, _ = runner.evaluate(test_loader)
print(f"Evaluate on test set, Accuracy: {accuracy:.5f}")

Evaluate on test set, Accuracy: 0.70688
