In [1]:
import torch
from torch import nn
from d2l import torch as d2l

### 输入表示

In [2]:
def get_tokens_and_segments(tokens_a, tokens_b=None):
    """获取输入序列的词元及其片段索引"""
    tokens = ['<cls>'] + tokens_a + ['<sep>']
    # 0和1分别标记片段A和B
    segments = [0] * (len(tokens_a) + 2)
    if tokens_b is not None:
        tokens += tokens_b + ['<sep>']
        segments += [1] * (len(tokens_a) + 1)
    return tokens, segments

**BERTEncoder类。与TransformerEncoder不同，BERTEncoder使⽤⽚段嵌⼊和可学习的位置嵌⼊。**

In [52]:
class BERTEncoder(nn.Module):
    """Bert编码器"""

    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers,
                 dropout, max_len=1000, key_size=768, query_size=768, value_size=768, **kwargs):
        super(BERTEncoder, self).__init__(**kwargs)
        self.token_embedding = nn.Embedding(vocab_size, num_hiddens)
        self.segment_embedding = nn.Embedding(2, num_hiddens)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module(f'{i}', d2l.EncoderBlock(key_size, query_size, value_size, num_hiddens, norm_shape,
                                                          ffn_num_input, ffn_num_hiddens, num_heads, dropout, True))
            # 在BERT中，位置嵌入是可学习的，因此我们创建一个足够长的位置参数
            self.pos_embedding = nn.Parameter(torch.randn(1, max_len, num_hiddens))
            # print(f'pos_embedding: {self.pos_embedding}')

    def forward(self, tokens, segments, valid_lens):
        # 在以下代码段中，X的形状保持不变：(批量大小，最大序列长度，num_hiddens)
        # print(f'tokens: {tokens}')
        # print(f'segments: {segments}')
        # print(f'valid_lens: {valid_lens}')
        X = self.token_embedding(tokens) + self.segment_embedding(segments)
        # print(f'tokens and segments embedding: {X}')
        X = X + self.pos_embedding.data[:, :X.shape[1], :]
        # print(f'tokens and segments and pos embedding: {X}')
        for blk in self.blks:
            X = blk(X, valid_lens)
            # print(f'X in blk: {X}')
        return X


**假设词表⼤⼩为10000，为了演⽰BERTEncoder的前向推断，让我们创建⼀个实例并初始化它的参数**

In [53]:
vocab_size, num_hiddens, ffn_num_hiddens, num_heads = 10000, 768, 1024, 4
norm_shape, ffn_num_input, num_layers, dropout = [768], 768, 2, 0.2
encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input,
                      ffn_num_hiddens, num_heads, num_layers, dropout)

In [54]:
tokens = torch.randint(0, vocab_size, (2, 8))
segments = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1]])
encoded_X = encoder(tokens, segments, None)
encoded_X.shape

torch.Size([2, 8, 768])

In [55]:
print(f'encoded_X: {encoded_X}')

encoded_X: tensor([[[-1.3943,  0.0458, -0.5456,  ...,  0.2800,  0.9279,  0.0133],
         [ 0.4907, -1.6218, -0.5231,  ..., -0.8197,  1.4296,  0.7573],
         [ 0.0541, -2.1216,  0.6969,  ..., -0.2749,  0.4089,  0.7051],
         ...,
         [-1.9607, -2.1127, -0.8636,  ..., -1.3949,  1.0475, -1.3129],
         [-1.2140, -0.5986,  0.2625,  ..., -2.6860, -0.6815, -0.4885],
         [-0.1200, -1.2591,  0.6356,  ..., -0.0253, -0.3451, -0.3682]],

        [[-1.6944,  0.4904,  1.3444,  ..., -0.9685,  0.9147,  0.1177],
         [-0.5462, -1.6796, -0.4770,  ..., -0.8922, -0.5093, -0.3878],
         [-0.1904,  0.0251, -1.6325,  ..., -0.3845, -0.5029,  0.5741],
         ...,
         [ 0.1681, -0.7113, -1.0024,  ..., -1.1642,  0.8973, -1.2125],
         [-0.4128,  0.0712,  1.0136,  ..., -1.4053, -0.3935, -0.8266],
         [-1.3334, -1.0205,  0.0063,  ..., -1.3598, -1.1006,  1.1586]]],
       grad_fn=<NativeLayerNormBackward0>)


### 预训练任务
**BERTEncoder的前向推断给出了输⼊⽂本的每个词元和插⼊的特殊标记“<cls>”及“<seq>”的BERT表⽰。
接下来，我们将使⽤这些表⽰来计算预训练BERT的损失函数。预训练包括以下两个任务：掩蔽语⾔模型和下
⼀句预测。**

#### 遮蔽语言模型（Masked Language Modeling）
**语⾔模型使⽤左侧的上下⽂预测词元。为了双向编码上下⽂以表⽰每个词元，BERT随机掩蔽词元并使⽤来⾃双向上下⽂的词元以⾃监督的⽅式预测掩蔽词元。此任务称为掩蔽语⾔模型。**
**在这个预训练任务中，将随机选择15%的词元作为预测的掩蔽词元。要预测⼀个掩蔽词元⽽不使⽤标签作弊，⼀个简单的⽅法是总是⽤⼀个特殊的“<mask>”替换输⼊序列中的词元。然⽽，⼈造特殊词元“<mask>”不会出现在微调中。为了避免预训练和微调之间的这种不匹配，如果为预测⽽屏蔽词元（例如，在“this movie is great”中选择掩蔽和预测“great”），则在输⼊中将其替换为：
• 80%时间为特殊的“<mask>“词元（例如，“this movie is great”变为“this movie is<mask>”；
• 10%时间为随机词元（例如，“this movie is great”变为“this movie is drink”）；
• 10%时间内为不变的标签词元（例如，“this movie is great”变为“this movie is great”）。
请注意，在15%的时间中，有10%的时间插⼊了随机词元。这种偶然的噪声⿎励BERT在其双向上下⽂编码中不那么偏向于掩蔽词元（尤其是当标签词元保持不变时）。**

**我们实现了下⾯的MaskLM类来预测BERT预训练的掩蔽语⾔模型任务中的掩蔽标记。预测使⽤单隐藏层的多层感知机（self.mlp）。在前向推断中，它需要两个输⼊：BERTEncoder的编码结果和⽤于预测的词元位置。输出是这些位置的预测结果。**

In [56]:
class MaskLM(nn.Module):
    """BERT的遮蔽语言模型任务"""

    def __init__(self, vocab_size, num_hiddens, num_inputs=768, **kwargs):
        super(MaskLM, self).__init__(**kwargs)
        self.mlp = nn.Sequential(nn.Linear(num_inputs, num_hiddens),
                                 nn.ReLU(),
                                 nn.LayerNorm(num_hiddens),
                                 nn.Linear(num_hiddens, vocab_size))

    def forward(self, X, pred_positions):
        num_pred_position = pred_positions.shape[1]
        print(f'num_pred_position: {num_pred_position}')
        pred_positions = pred_positions.reshape(-1)
        print(f'pred_positions: {pred_positions}')
        batch_size = X.shape[0]
        print(f'batch_size: {batch_size}')
        batch_idx = torch.arange(0, batch_size)
        # 假设batch_size=2，num_pred_positions=3
        # 那么batch_idx是np.array（[0,0,0,1,1,1]）
        batch_idx = torch.repeat_interleave(batch_idx, num_pred_position)
        print(f'batch_idx: {batch_idx}')
        masked_X = X[batch_idx, pred_positions]
        print(f'masked_X: {masked_X}')
        masked_X = masked_X.reshape((batch_size, num_pred_position, -1))
        print(f'masked_X: {masked_X}')
        mlm_Y_hat = self.mlp(masked_X)
        return mlm_Y_hat


**为了演⽰MaskLM的前向推断，我们创建了其实例mlm并对其进⾏了初始化。回想⼀下，来⾃BERTEncoder的正向推断encoded_X表⽰2个BERT输⼊序列。我们将mlm_positions定义为在encoded_X的任⼀输⼊序列中预测的3个指⽰。mlm的前向推断返回encoded_X的所有掩蔽位置mlm_positions处的预测结果mlm_Y_hat。对于每个预测，结果的⼤⼩等于词表的⼤⼩。**

In [57]:
mlm = MaskLM(vocab_size, num_hiddens)
mlm_position = torch.tensor([[1, 5, 2], [6, 1, 5]])
mlm_Y_hat = mlm(encoded_X, mlm_position)
mlm_Y_hat.shape

num_pred_position: 3
pred_positions: tensor([1, 5, 2, 6, 1, 5])
batch_size: 2
batch_idx: tensor([0, 0, 0, 1, 1, 1])
masked_X: tensor([[ 0.4907, -1.6218, -0.5231,  ..., -0.8197,  1.4296,  0.7573],
        [-1.9607, -2.1127, -0.8636,  ..., -1.3949,  1.0475, -1.3129],
        [ 0.0541, -2.1216,  0.6969,  ..., -0.2749,  0.4089,  0.7051],
        [-0.4128,  0.0712,  1.0136,  ..., -1.4053, -0.3935, -0.8266],
        [-0.5462, -1.6796, -0.4770,  ..., -0.8922, -0.5093, -0.3878],
        [ 0.1681, -0.7113, -1.0024,  ..., -1.1642,  0.8973, -1.2125]],
       grad_fn=<IndexBackward0>)
masked_X: tensor([[[ 0.4907, -1.6218, -0.5231,  ..., -0.8197,  1.4296,  0.7573],
         [-1.9607, -2.1127, -0.8636,  ..., -1.3949,  1.0475, -1.3129],
         [ 0.0541, -2.1216,  0.6969,  ..., -0.2749,  0.4089,  0.7051]],

        [[-0.4128,  0.0712,  1.0136,  ..., -1.4053, -0.3935, -0.8266],
         [-0.5462, -1.6796, -0.4770,  ..., -0.8922, -0.5093, -0.3878],
         [ 0.1681, -0.7113, -1.0024,  ..., -1.1642,  

torch.Size([2, 3, 10000])

In [58]:
print(f'mlm_Y_hat: {mlm_Y_hat}')

mlm_Y_hat: tensor([[[-0.7321,  0.2989, -0.0655,  ..., -0.0333, -0.2578,  0.9658],
         [-0.5161, -0.1170, -0.3803,  ...,  0.4254, -0.1207, -0.7576],
         [-0.4004, -0.0165,  0.9563,  ...,  0.0673,  0.4122,  0.7785]],

        [[ 0.2433,  1.0621,  1.3504,  ..., -0.6983, -0.3634,  0.0222],
         [ 0.7957,  0.2007,  0.2906,  ...,  0.7890,  0.6155,  0.4064],
         [ 0.3141,  0.5112,  0.1292,  ..., -0.1900, -0.3586, -0.2589]]],
       grad_fn=<AddBackward0>)


**通过掩码下的预测词元mlm_Y的真实标签mlm_Y_hat，我们可以计算在BERT预训练中的遮蔽语⾔模型任务
的交叉熵损失。**

In [59]:
print(f'tokens: {tokens}')

tokens: tensor([[ 636, 6395, 7491, 9497, 7918, 2362, 6437, 5562],
        [7906,  485, 9052, 3778, 9504, 5929, 1732, 8488]])


In [60]:
mlm_Y = torch.tensor([[7, 8, 9], [10, 20, 30]])
loss = nn.CrossEntropyLoss(reduction='none')
mlm_l = loss(mlm_Y_hat.reshape((-1, vocab_size)), mlm_Y.reshape(-1))
mlm_l.shape

torch.Size([6])

#### 下⼀句预测（Next Sentence Prediction）

**尽管掩蔽语⾔建模能够编码双向上下⽂来表⽰单词，但它不能显式地建模⽂本对之间的逻辑关系。为了帮助
理解两个⽂本序列之间的关系，BERT在预训练中考虑了⼀个⼆元分类任务——下⼀句预测。在为预训练⽣成
句⼦对时，有⼀半的时间它们确实是标签为“真”的连续句⼦；在另⼀半的时间⾥，第⼆个句⼦是从语料库
中随机抽取的，标记为“假”。**

**下⾯的NextSentencePred类使⽤单隐藏层的多层感知机来预测第⼆个句⼦是否是BERT输⼊序列中第⼀
个句⼦的下⼀个句⼦。由于Transformer编码器中的⾃注意⼒，特殊词元“<cls>”的BERT表⽰已经对输⼊的
两个句⼦进⾏了编码。因此，多层感知机分类器的输出层（self.output）以X作为输⼊，其中X是多层感
知机隐藏层的输出，⽽MLP隐藏层的输⼊是编码后的“<cls>”词元。**

In [61]:
class NextSentencePred(nn.Module):
    """BERT的下一句预测任务"""

    def __init__(self, num_inputs, **kwargs):
        super(NextSentencePred, self).__init__(**kwargs)
        self.output = nn.Linear(num_inputs, 2)

    def forward(self, x):
        # X的形状：（batch_size, num_hiddens）
        return self.output(x)

**可以看到，NextSentencePred实例的前向推断返回每个BERT输⼊序列的⼆分类预测。**

In [62]:
encoded_X_nsp = encoded_X
encoded_X_nsp = torch.flatten(encoded_X_nsp, start_dim=1)
# NSP的输入形状：（batch_size,num_hiddens）
nsp = NextSentencePred(encoded_X_nsp.shape[-1])
nsp_Y_hat = nsp(encoded_X_nsp)
nsp_Y_hat.shape

torch.Size([2, 2])

In [63]:
print(encoded_X.shape)

torch.Size([2, 8, 768])


In [64]:
encoded_X_nsp.shape

torch.Size([2, 6144])

In [65]:
nsp_y = torch.tensor([0, 1])
nsp_l = loss(nsp_Y_hat, nsp_y)
nsp_l.shape

torch.Size([2])

In [66]:
print(nsp_l)

tensor([1.0632, 0.8091], grad_fn=<NllLossBackward0>)


**值得注意的是，上述两个预训练任务中的所有标签都可以从预训练语料库中获得，⽽⽆需⼈⼯标注。原始
的BERT已经在图书语料库 [Zhu et al., 2015]和英⽂维基百科的连接上进⾏了预训练。这两个⽂本语料库⾮常
庞⼤：它们分别有8亿个单词和25亿个单词。**

#### 整合代码

**在预训练BERT时，最终的损失函数是掩蔽语⾔模型损失函数和下⼀句预测损失函数的线性组合。现在我们
可以通过实例化三个类BERTEncoder、MaskLM和NextSentencePred来定义BERTModel类。前向推断返
回编码后的BERT表⽰encoded_X、掩蔽语⾔模型预测mlm_Y_hat和下⼀句预测nsp_Y_hat。**

In [67]:
class BERTModel(nn.Module):
    """BERT模型"""

    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers,
                 dropout, max_len=1000, key_size=768, query_size=768, value_size=768, hid_in_features=768,
                 mlm_in_features=768, nsp_in_features=768):
        super(BERTModel, self).__init__()
        self.encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                                   num_layers, dropout, max_len=max_len, key_size=key_size, query_size=query_size,
                                   value_size=value_size)
        self.hidden = nn.Sequential(nn.Linear(hid_in_features, num_hiddens), nn.Tanh())
        self.mlm = MaskLM(vocab_size, num_hiddens, mlm_in_features)
        self.nsp = NextSentencePred(nsp_in_features)

    def forward(self, tokens, segments, valid_lens=None, pred_positions=None):
        encoded_X = self.encoder(tokens, segments, valid_lens)
        if pred_positions is not None:
            mlm_Y_hat = self.mlm(encoded_X, pred_positions)
        else:
            mlm_Y_hat = None
        nsp_Y_hat = self.nsp(self.hidden(encoded_X[:, 0, :]))
        return encoded_X, mlm_Y_hat, nsp_Y_hat