**参考资料：**
* [BertWithPretrained](https://github.com/moon-hotel/BertWithPretrained)

**模型架构：**

![Bert_model](Model.jpg "model")

## 一、全局参数管理对象config的创建

In [1]:
import json
import copy
import six
import logging

class BertConfig:
    """Container for the hyper-parameters shared by the BERT components.

    Every constructor argument is stored unchanged as an instance attribute of
    the same name, so ``vars(config)`` is the complete configuration.
    """

    def __init__(self,
                 vocab_size=21128,  # size of the token vocabulary
                 hidden_size=768,  # width of the hidden representation
                 num_hidden_layers=12,  # number of self-attention blocks
                 num_attention_heads=12,  # heads per self-attention block
                 intermediate_size=3072,  # width of the feed-forward layer
                 pad_token_id=0,  # id of the padding token
                 hidden_act="gelu",  # activation function name
                 hidden_dropout_prob=0.1,  # dropout rate
                 attention_probs_dropout_prob=0.1,  # attention dropout rate
                 max_position_embeddings=512,  # maximum sequence length
                 type_vocab_size=2,  # number of token-type (segment) ids
                 initializer_range=0.02):
        """Constructs BertConfig.

        Args:
          vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
          hidden_size: Size of the encoder layers and the pooler layer.
          num_hidden_layers: Number of hidden layers in the Transformer encoder.
          num_attention_heads: Number of attention heads for each attention layer in
            the Transformer encoder.
          intermediate_size: The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
          pad_token_id: Id of the padding token.
          hidden_act: The non-linear activation function (function or string) in the
            encoder and pooler.
          hidden_dropout_prob: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
          attention_probs_dropout_prob: The dropout ratio for the attention
            probabilities.
          max_position_embeddings: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
          type_vocab_size: The vocabulary size of the `token_type_ids` passed into
            `BertModel`.
          initializer_range: The stdev of the truncated_normal_initializer for
            initializing all weight matrices.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.pad_token_id = pad_token_id
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a config from a Python dictionary of parameters.

        Every key of ``json_object`` is copied verbatim onto the instance, so
        keys that are not constructor arguments become extra attributes.
        Parameters missing from the dictionary keep the constructor defaults,
        except ``vocab_size`` which starts out as ``None``.

        :param json_object: mapping of parameter name to value
        :return: an instance of ``cls`` (subclasses get their own type back)
        """
        # Instantiate through cls so that subclasses are not silently
        # downgraded to a plain BertConfig.
        config = cls(vocab_size=None)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a config from a json file of parameters.

        :param json_file: path of a UTF-8 encoded JSON file
        :return: an instance of ``cls`` built by ``from_dict``
        """
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(json_file, 'r', encoding='utf-8') as reader:
            text = reader.read()
        logging.info(f"成功导入BERT配置文件 {json_file}")
        return cls.from_dict(json.loads(text))

    def to_dict(self):
        """Serializes this instance to a Python dictionary (a deep copy)."""
        return copy.deepcopy(self.__dict__)

    def to_json_string(self):
        """Serializes this instance to a JSON string ending with a newline."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

In [3]:
config = BertConfig()
# Dump every hyper-parameter currently stored on the config object.
for name, value in vars(config).items():
    print(name, '=', value)

vocab_size = 21128
hidden_size = 768
num_hidden_layers = 12
num_attention_heads = 12
hidden_act = gelu
intermediate_size = 3072
pad_token_id = 0
hidden_dropout_prob = 0.1
attention_probs_dropout_prob = 0.1
max_position_embeddings = 512
type_vocab_size = 2
initializer_range = 0.02


## 二、输入编码

**模型架构：**

![Bert_model](Input.jpg "model")

In [4]:
import torch.nn as nn
import torch
from torch.nn.init import normal_

In [5]:
# 词嵌入 编码
class TokenEmbedding(nn.Module):
    """Look-up table that maps token ids to dense vectors.

    :param vocab_size: number of rows in the embedding table
    :param hidden_size: width of each embedding vector
    :param pad_token_id: id whose embedding is pinned to the zero vector
                         (forwarded to ``nn.Embedding`` as ``padding_idx``)
    :param initializer_range: standard deviation of the normal initialiser
    """

    def __init__(self, vocab_size, hidden_size, pad_token_id=0, initializer_range=0.02):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
        self._reset_parameters(initializer_range)

    def forward(self, input_ids):
        """
        :param input_ids: shape : [input_ids_len, batch_size]
        :return: shape: [input_ids_len, batch_size, hidden_size]
        """
        return self.embedding(input_ids)

    def _reset_parameters(self, initializer_range):
        """Draw every weight matrix from N(0, initializer_range**2)."""
        for p in self.parameters():
            if p.dim() > 1:
                normal_(p, mean=0.0, std=initializer_range)
        # nn.Embedding zero-fills the padding row at construction time, but the
        # normal_ call above overwrites it. Restore the zero vector so padded
        # positions contribute nothing to the summed input embedding.
        padding_idx = self.embedding.padding_idx
        if padding_idx is not None:
            with torch.no_grad():
                self.embedding.weight[padding_idx].fill_(0)

In [6]:
# 单词位置 编码
class PositionalEmbedding(nn.Module):
    """
    Learned position embedding.

      *** Note: unlike the original Transformer, whose position encoding is
                computed from a formula, this one is an ordinary Embedding
                layer. The table has only `max_position_embeddings` rows,
                which bounds the sequence length that can be encoded. ***
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
                                                 --------  GoogleResearch
    https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/modeling.py
    """

    def __init__(self, hidden_size, max_position_embeddings=512, initializer_range=0.02):
        super().__init__()
        # One row per position: [max_position_embeddings, hidden_size]
        self.embedding = nn.Embedding(max_position_embeddings, hidden_size)
        self._reset_parameters(initializer_range)

    def forward(self, position_ids):
        """
        :param position_ids: [1,position_ids_len]
        :return: [position_ids_len, 1, hidden_size]
        """
        looked_up = self.embedding(position_ids)  # [1, position_ids_len, hidden_size]
        # Swap the first two axes so the sequence axis comes first.
        return looked_up.transpose(0, 1)

    def _reset_parameters(self, initializer_range):
        """Draw every weight matrix from N(0, initializer_range**2)."""
        matrices = (param for param in self.parameters() if param.dim() > 1)
        for weight in matrices:
            normal_(weight, mean=0.0, std=initializer_range)

In [7]:
# 句子编码
class SegmentEmbedding(nn.Module):
    """Embedding of token-type (segment) ids.

    :param type_vocab_size: number of distinct token-type ids
    :param hidden_size: width of each embedding vector
    :param initializer_range: standard deviation of the normal initialiser
    """

    def __init__(self, type_vocab_size, hidden_size, initializer_range=0.02):
        super().__init__()
        # One row per token type: [type_vocab_size, hidden_size]
        self.embedding = nn.Embedding(type_vocab_size, hidden_size)
        self._reset_parameters(initializer_range)

    def forward(self, token_type_ids):
        """
        :param token_type_ids:  shape: [token_type_ids_len, batch_size]
        :return: shape: [token_type_ids_len, batch_size, hidden_size]
        """
        segment_vectors = self.embedding(token_type_ids)
        return segment_vectors

    def _reset_parameters(self, initializer_range):
        """Draw every weight matrix from N(0, initializer_range**2)."""
        matrices = (param for param in self.parameters() if param.dim() > 1)
        for weight in matrices:
            normal_(weight, mean=0.0, std=initializer_range)

In [8]:
class BertEmbeddings(nn.Module):
    """
    Input layer: the sum of three embeddings, followed by LayerNorm and dropout.
        1. TokenEmbedding      : one vector per token id
        2. PositionalEmbedding : one learned vector per position
        3. SegmentEmbedding    : one vector per token-type id
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        init_range = config.initializer_range

        # Output shape: [src_len, batch_size, hidden_size]
        self.word_embeddings = TokenEmbedding(vocab_size=config.vocab_size,
                                              hidden_size=hidden_size,
                                              pad_token_id=config.pad_token_id,
                                              initializer_range=init_range)

        # Output shape: [src_len, 1, hidden_size]
        self.position_embeddings = PositionalEmbedding(
            max_position_embeddings=config.max_position_embeddings,
            hidden_size=hidden_size,
            initializer_range=init_range)

        # Output shape: [src_len, batch_size, hidden_size]
        self.token_type_embeddings = SegmentEmbedding(type_vocab_size=config.type_vocab_size,
                                                      hidden_size=hidden_size,
                                                      initializer_range=init_range)

        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Pre-computed [0, 1, ..., max_position_embeddings - 1] with shape
        # [1, max_position_embeddings]; registered as a buffer of the module.
        all_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", all_positions)

    def forward(self,
                input_ids=None,
                position_ids=None,
                token_type_ids=None):
        """
        :param input_ids: token ids of the input sequence, shape: [src_len, batch_size]
        :param position_ids: position indices, i.e. [0,1,2,...,src_len-1], shape: [1,src_len];
                             taken from the pre-computed buffer when omitted
        :param token_type_ids: segment ids such as [0,0,0,0,1,1,1,1], shape: [src_len,batch_size];
                               all zeros when omitted
        :return: [src_len, batch_size, hidden_size]
        """
        seq_len = input_ids.size(0)

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_len]  # [1, src_len]
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids,
                                              device=self.position_ids.device)  # [src_len, batch_size]

        # [src_len,batch_size,hidden_size] + [src_len,1,hidden_size] + [src_len,batch_size,hidden_size];
        # the positional term broadcasts over the batch axis.
        summed = (self.word_embeddings(input_ids)
                  + self.position_embeddings(position_ids)
                  + self.token_type_embeddings(token_type_ids))

        return self.dropout(self.LayerNorm(summed))

## 三、训练数据预处理

In [48]:
class Vocab:
    """
    Vocabulary built from a local vocab file with one token per line.
    vocab = Vocab(vocab_path)
    print(vocab.itos)  # list of every token, in file order
    print(vocab.itos[2])  # token stored at a given index
    print(vocab.stoi)  # dict mapping each token to its index
    print(vocab.stoi['我'])  # index of a given token
    print(len(vocab))  # number of lines read from the file
    """
    UNK = '[UNK]'

    def __init__(self, vocab_path):
        with open(vocab_path, 'r', encoding='utf-8') as handle:
            # Only the newline is removed; other whitespace is part of the token.
            self.itos = [raw.strip('\n') for raw in handle]
        # For a token that occurs on several lines the last line wins.
        self.stoi = {word: index for index, word in enumerate(self.itos)}

    def __getitem__(self, token):
        """Return the index of ``token``; unknown tokens map to the index of
        ``[UNK]`` (``None`` when the file has no ``[UNK]`` entry)."""
        if token in self.stoi:
            return self.stoi[token]
        return self.stoi.get(Vocab.UNK)

    def __len__(self):
        return len(self.itos)

In [53]:
def build_vocab(vocab_path):
    """
    Build a ``Vocab`` from the vocab file at ``vocab_path``.
    vocab = build_vocab(vocab_path)
    print(vocab.itos)  # list of every token, in file order
    print(vocab.itos[2])  # token stored at a given index
    print(vocab.stoi)  # dict mapping each token to its index
    print(vocab.stoi['我'])  # index of a given token
    """
    vocabulary = Vocab(vocab_path)
    return vocabulary

# Load the vocabulary from the local vocab file; `vocab` is used as a
# notebook-level global by the cells and functions below.
vocab = build_vocab('./data/vocab.txt')
# Sanity check: index of one token, and the token stored at index 0.
print(vocab.stoi['我'])
print(vocab.itos[0])

2769
[PAD]


In [40]:
import random

text = ['红酥手，黄縢酒，满城春色宫墙柳。东风恶，欢情薄。一怀愁绪，几年离索。春如旧，人空瘦，泪痕红鲛绡透。桃花落，闲池阁。山盟虽在，锦书难托。',
        '十年生死两茫茫。不思量，自难忘。千里孤坟，无处话凄凉。纵使相逢应不识，尘满面，鬓如霜。夜来幽梦忽还乡，小轩窗，正梳妆。相顾无言，惟有泪千行。料得年年断肠处，明月夜，短松冈。']

# Split every line of `text` into sentences on the full stop '。'.
# `paragraphs` is a list of paragraphs, each a list of sentences without the
# terminating '。'.
paragraphs = []

for line in text:
    # Strip before splitting so surrounding whitespace/newlines never end up
    # inside a sentence.
    line = line.strip()
    # Empty pieces (after the final '。', from '。。', or from a blank line)
    # are dropped; a line that does not end in '。' keeps its last character.
    sentences = [piece for piece in line.split('。') if piece]
    if sentences:
        paragraphs.append(sentences)

random.shuffle(paragraphs)  # shuffle the order of the paragraphs

for paragraph in paragraphs:
    print(paragraph)





['十年生死两茫茫', '不思量，自难忘', '千里孤坟，无处话凄凉', '纵使相逢应不识，尘满面，鬓如霜', '夜来幽梦忽还乡，小轩窗，正梳妆', '相顾无言，惟有泪千行', '料得年年断肠处，明月夜，短松冈']
['红酥手，黄縢酒，满城春色宫墙柳', '东风恶，欢情薄', '一怀愁绪，几年离索', '春如旧，人空瘦，泪痕红鲛绡透', '桃花落，闲池阁', '山盟虽在，锦书难托']


In [41]:
# NSP任务
def get_next_sentence_sample(sentence, next_sentence, paragraphs):
    """Build one Next-Sentence-Prediction sample.

    With probability 0.5 the real next sentence is kept (label ``True``).
    Otherwise a sentence is drawn at random from a randomly chosen paragraph
    until it differs from the real next sentence (label ``False``).

    :param sentence: the current sentence
    :param next_sentence: the sentence that really follows ``sentence``
    :param paragraphs: list of paragraphs (lists of sentences) to sample from
    :return: (sentence, chosen next sentence, is_next)
    """
    # random.random() is uniform on [0, 1)
    if random.random() < 0.5:
        return sentence, next_sentence, True

    replacement = next_sentence
    while replacement == next_sentence:
        replacement = random.choice(random.choice(paragraphs))
    return sentence, replacement, False

In [85]:
from transformers import BertTokenizer
# Demo cell: build the token ids and segment ids of one NSP sample.
model_name = 'bert-base-chinese'
# Only the bound `tokenize` method is kept; ids are looked up through `vocab`.
tokenizer = BertTokenizer.from_pretrained(model_name).tokenize

for paragraph in paragraphs:
    for i in range(len(paragraph) - 1):
        sentence, next_sentence, is_next = get_next_sentence_sample(paragraph[i], paragraph[i + 1], paragraphs) # build one NSP sample
        token_a_ids = [vocab[token] for token in tokenizer(sentence)]
        token_b_ids = [vocab[token] for token in tokenizer(next_sentence)]
        # Layout: [CLS] sentence A [SEP] sentence B [SEP]
        token_ids = [vocab['[CLS]']] + token_a_ids + [vocab['[SEP]']] + token_b_ids
        token_ids += [vocab['[SEP]']]
        seg1 = [0] * (len(token_a_ids) + 2)  # +2 accounts for [CLS] and the middle [SEP]
        seg2 = [1] * (len(token_b_ids) + 1)
        segs = seg1 + seg2
        # NOTE: this only leaves the inner loop, so the outer loop still visits
        # every paragraph and the variables printed below come from the first
        # sentence pair of the LAST paragraph.
        break

print(f" ## 当前句文本：{sentence}")
print(f" ## 下一句文本：{next_sentence}")
print(f" ## 下一句标签：{is_next}")
print(token_ids)
print(segs)
print(vocab.stoi['红'])
print(vocab.stoi['[CLS]'])


 ## 当前句文本：红酥手，黄縢酒，满城春色宫墙柳
 ## 下一句文本：东风恶，欢情薄
 ## 下一句标签：True
[101, 5273, 6989, 2797, 8024, 7942, 100, 6983, 8024, 4007, 1814, 3217, 5682, 2151, 1870, 3394, 102, 691, 7599, 2626, 8024, 3614, 2658, 5946, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
5273
101


In [91]:
# MLM任务数据处理
def replace_masked_tokens(token_ids, candidate_pred_positions, num_mlm_preds):
    """
    Apply the MLM corruption to ``token_ids`` and build the matching labels.

    Candidate positions are consumed in the given order until
    ``num_mlm_preds`` positions have been selected. Each selected position is
    replaced by the ``[MASK]`` id (80%), left unchanged (10%), or replaced by a
    random id drawn from ``[0, len(vocab.stoi) - 1]`` (10%).

    :param token_ids: token ids of one sample; not modified
    :param candidate_pred_positions: indices that may be selected, in the order
                                     they should be tried
    :param num_mlm_preds: maximum number of positions to select
    :return: (mlm_input_tokens_id, mlm_label); the label is the original token
             id at selected positions and 0 everywhere else, so that those
             positions can be ignored when the loss is computed
    """
    mlm_input_tokens_id = list(token_ids)
    pred_positions = []
    for mlm_pred_position in candidate_pred_positions:
        if len(pred_positions) >= num_mlm_preds:
            break  # enough positions have been selected
        if random.random() < 0.8:
            # 80% of the time: replace with the id of [MASK]
            masked_token_id = vocab['[MASK]']
        elif random.random() < 0.5:
            # 10% of the time: keep the original token
            masked_token_id = token_ids[mlm_pred_position]
        else:
            # 10% of the time: replace with a random token id
            masked_token_id = random.randint(0, len(vocab.stoi) - 1)
        mlm_input_tokens_id[mlm_pred_position] = masked_token_id
        pred_positions.append(mlm_pred_position)

    # A set gives constant-time membership tests while building the labels.
    selected = set(pred_positions)
    mlm_label = [token_id if idx in selected else 0
                 for idx, token_id in enumerate(token_ids)]
    return mlm_input_tokens_id, mlm_label

def get_masked_sample(token_ids):
    """
    Mask part of the given token ids for the MLM task.
    :param token_ids:         e.g. [101, 1031, 4895, 2243, 1033, 10029, 2000, 2624, 1031,....]
    :return: mlm_input_tokens_id:  [101, 1031, 103, 2243, 1033, 10029, 2000, 103,  1031, ...]
                        mlm_label:  [ 0,   0,   4895,  0,    0,    0,    0,   2624,  0,...]
    """
    # Special tokens are never predicted, so their positions are not candidates.
    special_ids = [vocab['[CLS]'], vocab['[SEP]']]
    candidate_pred_positions = [position
                                for position, token_id in enumerate(token_ids)
                                if token_id not in special_ids]
    # Shuffle so that the positions picked downstream are random.
    random.shuffle(candidate_pred_positions)
    # 15% of the sequence length (special tokens included), at least one.
    num_mlm_preds = max(1, round(len(token_ids) * 0.15))
    return replace_masked_tokens(token_ids, candidate_pred_positions, num_mlm_preds)


In [87]:
# Demo cell: mask the sample built above and show the id of [MASK], the
# corrupted input ids and the MLM labels (0 = position not predicted).
mlm_input_tokens_id, mlm_label= get_masked_sample(token_ids)
print(vocab['[MASK]'])
print(mlm_input_tokens_id)
print(mlm_label)

 ## Mask数量为: 4
103
[101, 5273, 103, 2797, 8024, 7942, 103, 6983, 8024, 4007, 103, 3217, 5682, 2151, 1870, 3394, 102, 691, 7599, 2626, 8024, 103, 2658, 5946, 102]
[0, 0, 6989, 0, 0, 0, 100, 0, 0, 0, 1814, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3614, 0, 0, 0]


In [88]:
# Summary of the sample built above: the sentence pair and its NSP label, the
# tokens/ids before masking, then the tokens and MLM labels after masking.
print(f"## 当前句文本：{sentence}")
print(f"## 下一句文本：{next_sentence}")
print(f"## 是否为下一句：{is_next}")
print(f"==================")
print(f"## Mask之前的词元结果:{[vocab.itos[t] for t in token_ids]}")
print(f"## Mask之前的label ids:{token_ids}")
print(f"## 两句话位置标记segs:{segs}")
print(f"==================")
print(f"## Mask之后的词元结果:{[vocab.itos[t] for t in mlm_input_tokens_id]}")
# mlm_label is produced by the masking step, so it is labelled "after
# masking"; the original repeated the "before masking" label here.
print(f"## Mask之后的label ids:{mlm_label}")
print(f"## 当前样本构造结束================== \n\n")



## 当前句文本：红酥手，黄縢酒，满城春色宫墙柳
## 下一句文本：东风恶，欢情薄
## 是否为下一句：True
## Mask之前的词元结果:['[CLS]', '红', '酥', '手', '，', '黄', '[UNK]', '酒', '，', '满', '城', '春', '色', '宫', '墙', '柳', '[SEP]', '东', '风', '恶', '，', '欢', '情', '薄', '[SEP]']
## Mask之前的label ids:[101, 5273, 6989, 2797, 8024, 7942, 100, 6983, 8024, 4007, 1814, 3217, 5682, 2151, 1870, 3394, 102, 691, 7599, 2626, 8024, 3614, 2658, 5946, 102]
## 两句话位置标记segs:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
## Mask之后的词元结果:['[CLS]', '红', '[MASK]', '手', '，', '黄', '[MASK]', '酒', '，', '满', '[MASK]', '春', '色', '宫', '墙', '柳', '[SEP]', '东', '风', '恶', '，', '[MASK]', '情', '薄', '[SEP]']
## Mask之前的label ids:[0, 0, 6989, 0, 0, 0, 100, 0, 0, 0, 1814, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3614, 0, 0, 0]




In [92]:
def data_process(paragraphs):
    """
    Turn the formatted paragraphs into training samples for both the NSP and
    the MLM task.
    :param paragraphs: list of paragraphs, each a list of sentences
    :return: {'data': list of [token_ids, segs, nsp_label, mlm_label] tensors,
              'max_len': length of the longest token sequence in the data}
    """
    samples = []
    longest = 0  # can later serve as the padding length

    for paragraph in paragraphs:
        # Pair every sentence with the one that follows it.
        for current, following in zip(paragraph, paragraph[1:]):
            sentence, next_sentence, is_next = get_next_sentence_sample(
                current, following, paragraphs)  # build one NSP sample

            ids_a = [vocab[token] for token in tokenizer(sentence)]
            ids_b = [vocab[token] for token in tokenizer(next_sentence)]
            # Layout: [CLS] sentence A [SEP] sentence B [SEP]
            plain_ids = [vocab['[CLS]'], *ids_a, vocab['[SEP]'], *ids_b, vocab['[SEP]']]

            # Segment 0 covers [CLS] + A + [SEP]; segment 1 covers B + [SEP].
            segment_ids = [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)
            segs = torch.tensor(segment_ids, dtype=torch.long)
            nsp_label = torch.tensor(int(is_next), dtype=torch.long)

            masked_ids, label_ids = get_masked_sample(plain_ids)
            token_ids = torch.tensor(masked_ids, dtype=torch.long)
            mlm_label = torch.tensor(label_ids, dtype=torch.long)

            longest = max(longest, token_ids.size(0))
            samples.append([token_ids, segs, nsp_label, mlm_label])

    return {'data': samples, 'max_len': longest}

In [93]:
# Build the whole dataset once; `all_data` is read as a global by generate_batch.
all_data = data_process(paragraphs)

In [109]:
# Show the four tensors of the first sample: token ids, segment ids,
# NSP label and MLM labels.
for t in all_data['data'][0]:
    print(t)

tensor([ 101, 1282, 2399, 4495, 3647,  697, 5755, 5755,  102,  679,  103,  103,
        8024, 5632, 3398, 2563,  102])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
tensor(1)
tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 2590, 7030,
           0,    0, 7410,    0,    0])


In [105]:
# Length of the longest token sequence recorded by data_process.
all_data['max_len']

33

In [114]:
def pad_sequence(sequences, batch_first=False, max_len=None, padding_value=0):
    """
    Pad (or truncate) a list of variable length Tensors to a common length and
    stack them.

    a = torch.ones(25)
    b = torch.ones(22)
    c = torch.ones(15)
    pad_sequence([a, b, c],max_len=None).size()
    torch.Size([25, 3])

    :param sequences: list of tensors whose first dimension is the sequence
                      length; any trailing dimensions must agree
    :param batch_first: if True the result is [batch_size, max_len, ...],
                        otherwise [max_len, batch_size, ...]
    :param max_len: when given, every sequence is padded or truncated to this
                    fixed length; when None, the longest sequence of the batch
                    is used
    :param padding_value: value written into the padded positions; it is
                          stored in the dtype of the sequence being padded
    :return: the stacked tensor
    """
    if max_len is None:
        max_len = max(s.size(0) for s in sequences)
    out_tensors = []
    for tensor in sequences:
        missing = max_len - tensor.size(0)
        if missing > 0:
            # new_full keeps the dtype and device of the sequence, so padding
            # never changes the dtype of the batch or mixes devices.
            padding = tensor.new_full((missing, *tensor.shape[1:]), padding_value)
            tensor = torch.cat([tensor, padding], dim=0)
        else:
            tensor = tensor[:max_len]
        out_tensors.append(tensor)
    out_tensors = torch.stack(out_tensors, dim=1)
    if batch_first:
        return out_tensors.transpose(0, 1)

    return out_tensors

In [163]:
def generate_batch(data_batch):
    """Collate function: turn a list of samples into padded batch tensors.

    :param data_batch: list of (token_ids, segs, nsp_label, mlm_label) samples
    :return: b_token_ids  [max_len, batch_size]
             b_segs       [max_len, batch_size]
             b_mask       [batch_size, max_len], True where the token id is 0
             b_mlm_label  [max_len, batch_size]
             b_nsp_label  [batch_size]
    """
    # Every batch is padded to the longest sequence of the whole dataset.
    max_len = all_data['max_len']

    def pad(column):
        # Sequence-first layout, padded with 0.
        return pad_sequence(column,
                            padding_value=0,
                            batch_first=False,
                            max_len=max_len)

    # Split the samples into one list per field.
    token_column = [token_ids for token_ids, _, _, _ in data_batch]
    segment_column = [segs for _, segs, _, _ in data_batch]
    nsp_column = [nsp_label for _, _, nsp_label, _ in data_batch]
    mlm_column = [mlm_label for _, _, _, mlm_label in data_batch]

    b_token_ids = pad(token_column)
    b_segs = pad(segment_column)
    b_mlm_label = pad(mlm_column)

    # True marks positions whose token id is 0.
    b_mask = (b_token_ids == 0).transpose(0, 1)
    b_nsp_label = torch.tensor(nsp_column, dtype=torch.long)

    return b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label

In [164]:
import torch
from torch.utils.data import DataLoader

# Wrap the sample list in a DataLoader; generate_batch pads and stacks each
# mini-batch of 4 samples.
train_loader = DataLoader(all_data['data'], batch_size=4,
                            shuffle=True, collate_fn = generate_batch)

# Number of samples in the dataset.
len(train_loader.dataset)

11

In [187]:
# Inspect the shapes of the first batch and the mask of its first sample,
# then stop.
for b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label in train_loader:
    print('token_ids:',b_token_ids.shape)
    print('segs:',b_segs.shape)
    print('mask:',b_mask.shape)
    print('mlm_label:',b_mlm_label.shape)
    print('nsp_label:',b_nsp_label.shape)
    print('mask:',b_mask[0])
    break

token_ids: torch.Size([33, 4])
segs: torch.Size([33, 4])
mask: torch.Size([4, 33])
mlm_label: torch.Size([33, 4])
nsp_label: torch.Size([4])
mask: tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True])


## 编码处理测试

In [194]:
# Smoke test: run the first batch through the embedding layer and print the
# shape of the result.
bert_embedding = BertEmbeddings(config)
for i,(b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label) in enumerate(train_loader):
    # position_ids is omitted, so the layer uses its pre-computed buffer.
    bert_embedding_result = bert_embedding(input_ids = b_token_ids, token_type_ids = b_segs)
    print(bert_embedding_result.shape)
    break


torch.Size([33, 4, 768])


## BERT的Encoder实现

**模型架构：**

![Bert_Encoder](BertEncoder.jpg "model")

In [195]:
# 自注意力机制
class BertSelfAttention(nn.Module):
    """
    Multi-head attention, the counterpart of the attention_layer function in
    the GoogleResearch code; implemented as a thin wrapper around
    ``nn.MultiheadAttention``.
    https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/modeling.py#L558
    """

    def __init__(self, config):
        super().__init__()
        self.multi_head_attention = nn.MultiheadAttention(
            embed_dim=config.hidden_size,  # embedding width
            num_heads=config.num_attention_heads,  # number of heads
            dropout=config.attention_probs_dropout_prob)  # attention dropout rate

    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        """
        :param query: [tgt_len, batch_size, hidden_size], tgt_len is the target sequence length
        :param key:   [src_len, batch_size, hidden_size], src_len is the source sequence length
        :param value: [src_len, batch_size, hidden_size], src_len is the source sequence length
        :param attn_mask: [tgt_len,src_len] or [num_heads*batch_size,tgt_len, src_len]
        Normally only used when decoding, to hide the positions after the current
        step while all decoder inputs are fed at once. BERT is built on the
        Transformer encoder only, so no such mask is needed; what BERT calls
        attention_mask is in fact the key_padding_mask.
        :param key_padding_mask: [batch_size, src_len], src_len is the source sequence length
        :return:
        attn_output: [tgt_len, batch_size, hidden_size]
        attn_output_weights: [batch_size, tgt_len, src_len]
        """
        attention_result = self.multi_head_attention(query,
                                                     key,
                                                     value,
                                                     attn_mask=attn_mask,
                                                     key_padding_mask=key_padding_mask)
        return attention_result

In [196]:
# 自注意机制输出标准化
class BertSelfOutput(nn.Module):
    """Dropout, residual connection and LayerNorm applied to the attention output.

    No dense projection is applied here (presumably because
    nn.MultiheadAttention already contains an output projection - confirm).
    """

    def __init__(self, config):
        super().__init__()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """
        :param hidden_states: [src_len, batch_size, hidden_size]
        :param input_tensor: [src_len, batch_size, hidden_size]
        :return: [src_len, batch_size, hidden_size]
        """
        dropped = self.dropout(hidden_states)
        # Residual connection followed by layer normalisation.
        return self.LayerNorm(dropped + input_tensor)

In [197]:
# bert注意力机制实现
class BertAttention(nn.Module):
    """Self-attention followed by its dropout / residual / LayerNorm output block."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self,
                hidden_states,
                attention_mask=None):
        """
        :param hidden_states: [src_len, batch_size, hidden_size]
        :param attention_mask: [batch_size, src_len]
        :return: [src_len, batch_size, hidden_size]
        """
        # Query, key and value are all the same tensor (self-attention); the
        # mask is passed on as key_padding_mask.
        attended, *_ = self.self(hidden_states,
                                 hidden_states,
                                 hidden_states,
                                 attn_mask=None,
                                 key_padding_mask=attention_mask)
        # attended: [src_len, batch_size, hidden_size]
        return self.output(attended, hidden_states)

In [204]:
# 激活函数
def get_activation(activation_string):
    """Map an activation name (case-insensitive) to a fresh module instance.

    :param activation_string: one of "linear", "relu", "gelu", "tanh"
    :return: ``None`` for "linear", otherwise a new ``nn.Module``
    :raises ValueError: if the name is not supported
    """
    act = activation_string.lower()
    if act == "linear":
        return None

    factories = {"relu": nn.ReLU, "gelu": nn.GELU, "tanh": nn.Tanh}
    if act not in factories:
        raise ValueError("Unsupported activation: %s" % act)
    return factories[act]()

# 全连接层
class BertIntermediate(nn.Module):
    """Feed-forward expansion: a linear layer followed by the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # hidden_act is either a name resolved through get_activation or an
        # object that is used as the activation directly.
        activation = config.hidden_act
        self.intermediate_act_fn = (get_activation(activation)
                                    if isinstance(activation, str)
                                    else activation)

    def forward(self, hidden_states):
        """
        :param hidden_states: [src_len, batch_size, hidden_size]
        :return: [src_len, batch_size, intermediate_size]
        """
        projected = self.dense(hidden_states)  # [src_len, batch_size, intermediate_size]
        if self.intermediate_act_fn is None:
            # No activation configured: the projection is returned as is.
            return projected
        return self.intermediate_act_fn(projected)

# BERT输出层
class BertOutput(nn.Module):
    """Project the intermediate activations back to hidden_size, then apply
    dropout, the residual connection and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """
        :param hidden_states: [src_len, batch_size, intermediate_size]
        :param input_tensor: [src_len, batch_size, hidden_size]
        :return: [src_len, batch_size, hidden_size]
        """
        projected = self.dropout(self.dense(hidden_states))  # [src_len, batch_size, hidden_size]
        # Residual connection followed by layer normalisation.
        return self.LayerNorm(projected + input_tensor)

In [199]:
class BertLayer(nn.Module):
    """One encoder block: attention, feed-forward expansion, output projection."""

    def __init__(self, config):
        super().__init__()
        self.bert_attention = BertAttention(config)
        self.bert_intermediate = BertIntermediate(config)
        self.bert_output = BertOutput(config)

    def forward(self,
                hidden_states,
                attention_mask=None):
        """
        :param hidden_states: [src_len, batch_size, hidden_size]
        :param attention_mask: [batch_size, src_len], masks out the padding positions
        :return: [src_len, batch_size, hidden_size]
        """
        attended = self.bert_attention(hidden_states, attention_mask)
        # attended: [src_len, batch_size, hidden_size]
        expanded = self.bert_intermediate(attended)
        # expanded: [src_len, batch_size, intermediate_size]
        # The attention output doubles as the residual input of the output block.
        return self.bert_output(expanded, attended)

In [200]:
class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` BertLayer blocks applied in sequence."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert_layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
            self,
            hidden_states,
            attention_mask=None):
        """
        :param hidden_states: [src_len, batch_size, hidden_size]
        :param attention_mask: [batch_size, src_len]
        :return: list with the output of every layer, in order; each entry is
                 [src_len, batch_size, hidden_size]
        """
        all_encoder_layers = []
        current = hidden_states
        for layer in self.bert_layers:
            # Each layer consumes the output of the previous one.
            current = layer(current, attention_mask)
            all_encoder_layers.append(current)

        return all_encoder_layers

In [208]:
class BertPooler(nn.Module):
    """Collapse the encoder output into one vector per sequence.

    The pooling used here is the average over all token positions, followed
    by a linear projection and a tanh. No other pooling strategy (such as
    taking only the first token) is implemented in this class.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Pool over the sequence dimension.

        :param hidden_states: [src_len, batch_size, hidden_size]
        :return: [batch_size, hidden_size]
        """
        # Average across dim 0 (the token axis) -> [batch_size, hidden_size]
        averaged = hidden_states.mean(dim=0)
        projected = self.dense(averaged)  # [batch_size, hidden_size]
        return self.activation(projected)

In [206]:
class BertModel(nn.Module):
    """BERT backbone: embeddings, encoder stack and pooler."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert_embeddings = BertEmbeddings(config)
        self.bert_encoder = BertEncoder(config)
        self.bert_pooler = BertPooler(config)
        # Must run after the sub-modules exist and self.config is set.
        self._reset_parameters()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None):
        """Embed the inputs, encode them and pool the last layer.

        IMPORTANT: in attention_mask, positions that are masked out are
        marked with 1 (True) and positions that are kept with 0 (False).

        :param input_ids: [src_len, batch_size]
        :param attention_mask: [batch_size, src_len], masks out the padding positions
        :param token_type_ids: [src_len, batch_size]; may be omitted when the
               model input consists of a single sequence
        :param position_ids: [1, src_len]; in practice usually left unset
        :return: (pooled_output, all_encoder_outputs) where pooled_output is
                 [batch_size, hidden_size] and all_encoder_outputs holds one
                 [src_len, batch_size, hidden_size] tensor per encoder layer
        """
        # embedded: [src_len, batch_size, hidden_size]
        embedded = self.bert_embeddings(input_ids=input_ids,
                                        position_ids=position_ids,
                                        token_type_ids=token_type_ids)

        # One output per encoder layer (num_hidden_layers entries).
        all_encoder_outputs = self.bert_encoder(embedded,
                                                attention_mask=attention_mask)

        # Last layer only: [src_len, batch_size, hidden_size]
        final_layer = all_encoder_outputs[-1]

        # Pooled by BertPooler -> [batch_size, hidden_size].
        # NOTE(review): BertPooler in this file averages over all tokens
        # rather than taking the first ([CLS]) token.
        pooled_output = self.bert_pooler(final_layer)

        return pooled_output, all_encoder_outputs

    def _reset_parameters(self):
        r"""Initiate parameters in the transformer model.

        Every parameter with more than one dimension is re-drawn from a
        normal distribution with mean 0 and std `config.initializer_range`;
        parameters with one dimension or fewer are left untouched.
        """
        std = self.config.initializer_range
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            normal_(param, mean=0.0, std=std)


In [210]:
# Smoke test: push a single batch through the bare BertModel and print the
# shapes of the pooled output and of the last encoder layer.
bert_model = BertModel(config)

for i, batch in enumerate(train_loader):
    b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label = batch
    bert_model_output, all_encoder_outputs = bert_model(
        input_ids=b_token_ids,
        attention_mask=b_mask,
        token_type_ids=b_segs,
    )
    print(bert_model_output.shape)
    print(all_encoder_outputs[-1].shape)
    break  # only the first batch is needed


torch.Size([4, 768])
torch.Size([33, 4, 768])


## 预训练任务实现


In [211]:
# Masked language model (MLM) head
class BertForLMTransformHead(nn.Module):
    """Transform + vocabulary projection used for masked-LM prediction.

    Factored into its own class because it is needed both by a standalone
    MLM task and by the combined NSP + MLM pre-training task.

    ref: https://github.com/google-research/bert/blob/master/run_pretraining.py
        lines 248-262
    """

    def __init__(self, config, bert_model_embedding_weights=None):
        """
        :param config: provides hidden_size, vocab_size and hidden_act
               (either an activation name or a callable)
        :param bert_model_embedding_weights: optional token-embedding matrix of
               shape [vocab_size, hidden_size]. When given, it is used as the
               decoder weight so that the output weights are the same as the
               input embeddings; the decoder still has its own output-only
               bias for each token. When None, the decoder keeps its own
               independently initialised weight.
        :raises ValueError: if the supplied weights do not have shape
                [vocab_size, hidden_size]
        """
        super(BertForLMTransformHead, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = get_activation(config.hidden_act)
        else:
            self.transform_act_fn = config.hidden_act

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        # decoder.weight: [vocab_size, hidden_size]

        if bert_model_embedding_weights is not None:
            expected_shape = (config.vocab_size, config.hidden_size)
            if tuple(bert_model_embedding_weights.shape) != expected_shape:
                raise ValueError(
                    "bert_model_embedding_weights must have shape "
                    f"{expected_shape}, got {tuple(bert_model_embedding_weights.shape)}")
            if isinstance(bert_model_embedding_weights, nn.Parameter):
                # Reuse the very same Parameter so both layers stay tied
                # during training.
                self.decoder.weight = bert_model_embedding_weights
            else:
                self.decoder.weight = nn.Parameter(bert_model_embedding_weights)

        # Output-only bias, one entry per vocabulary token, starting at zero.
        self.decoder.bias = nn.Parameter(torch.zeros(config.vocab_size))

    def forward(self, hidden_states):
        """
        :param hidden_states: [src_len, batch_size, hidden_size], the output of
               the last BERT layer
        :return: [src_len, batch_size, vocab_size] vocabulary logits
        """
        hidden_states = self.dense(hidden_states)  # [src_len, batch_size, hidden_size]
        hidden_states = self.transform_act_fn(hidden_states)  # [src_len, batch_size, hidden_size]
        hidden_states = self.LayerNorm(hidden_states)  # [src_len, batch_size, hidden_size]
        hidden_states = self.decoder(hidden_states)
        # hidden_states: [src_len, batch_size, vocab_size]

        return hidden_states

In [212]:
class BertForPretrainingModel(nn.Module):
    """BERT pre-training model combining the MLM and NSP objectives."""

    def __init__(self, config, bert_pretrained_model_dir=None):
        # NOTE(review): bert_pretrained_model_dir is accepted but not used here.
        super().__init__()
        self.config = config
        self.bert = BertModel(config)
        # Masked-token prediction head; no embedding weights are handed over.
        self.mlm_prediction = BertForLMTransformHead(config, None)
        # Two-way classifier for next-sentence prediction.
        self.nsp_prediction = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids,  # [src_len, batch_size]
                attention_mask=None,  # [batch_size, src_len], masks out padding
                token_type_ids=None,  # [src_len, batch_size]
                position_ids=None,
                masked_lm_labels=None,  # [src_len, batch_size]
                next_sentence_labels=None):  # [batch_size]
        """Compute MLM and NSP logits and, when both label sets are given, the loss.

        :return: (total_loss, mlm_logits, nsp_logits) when both
                 masked_lm_labels and next_sentence_labels are provided,
                 otherwise (mlm_logits, nsp_logits). mlm_logits is
                 [src_len, batch_size, vocab_size]; nsp_logits is [batch_size, 2].
        """
        pooled_output, all_encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )

        # Output of the last BERT layer: [src_len, batch_size, hidden_size]
        last_layer = all_encoder_outputs[-1]

        # [src_len, batch_size, vocab_size]
        mlm_prediction_logits = self.mlm_prediction(last_layer)
        # [batch_size, 2]
        nsp_pred_logits = self.nsp_prediction(pooled_output)

        if masked_lm_labels is None or next_sentence_labels is None:
            # Without both label sets only the logits are returned.
            return mlm_prediction_logits, nsp_pred_logits

        # The MLM labels use 0 both for padding and for positions that are not
        # prediction targets, so label 0 is excluded from the MLM loss.
        mlm_criterion = nn.CrossEntropyLoss(ignore_index=0)
        # NSP labels legitimately contain class 0, so it needs a separate
        # criterion without ignore_index. (If the MLM labels used some other
        # filler value, a single criterion could serve both tasks.)
        nsp_criterion = nn.CrossEntropyLoss()

        flat_mlm_logits = mlm_prediction_logits.reshape(-1, self.config.vocab_size)
        mlm_loss = mlm_criterion(flat_mlm_logits, masked_lm_labels.reshape(-1))
        nsp_loss = nsp_criterion(nsp_pred_logits.reshape(-1, 2),
                                 next_sentence_labels.reshape(-1))
        total_loss = mlm_loss + nsp_loss
        return total_loss, mlm_prediction_logits, nsp_pred_logits

In [216]:
# Smoke test: run one batch through the pre-training model with both label
# sets and print the combined loss and the logit shapes.
Bert_Pre_model = BertForPretrainingModel(config)

for i, batch in enumerate(train_loader):
    b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label = batch
    total_loss, mlm_prediction_logits, nsp_pred_logits = Bert_Pre_model(
        input_ids=b_token_ids,
        attention_mask=b_mask,
        token_type_ids=b_segs,
        next_sentence_labels=b_nsp_label,
        masked_lm_labels=b_mlm_label,
    )
    print(total_loss)
    print(mlm_prediction_logits.shape)
    print(nsp_pred_logits.shape)
    break  # only the first batch is needed

tensor(10.8286, grad_fn=<AddBackward0>)
torch.Size([33, 4, 21128])
torch.Size([4, 2])


## 预训练模型整体实现

In [218]:
import argparse
## Hyper-parameter configuration
parser = argparse.ArgumentParser()  # create the argument parser instance

# Adam optimizer options
parser.add_argument("--lr", type=float, default=0.0002, help="adam: learning rate")
parser.add_argument("--b1", type=float, default=0.5, help="adam: decay of first order momentum of gradient")
# b2 is Adam's second-moment decay rate; the help text previously repeated the --b1 wording.
parser.add_argument("--b2", type=float, default=0.999, help="adam: decay of second order momentum of gradient")


# Read the parsed values.
# When run as a script, parse the real command line with parser.parse_args();
# inside a Jupyter notebook an explicit empty argument list is passed instead.
opt = parser.parse_args(args=[])
print(opt)

Namespace(lr=0.0002, b1=0.5, b2=0.999)


In [222]:
Bert_Pre_model = BertForPretrainingModel(config)
optimizer = torch.optim.Adam(
    Bert_Pre_model.parameters(),
    lr=opt.lr,
    betas=(opt.b1, opt.b2),
)

## ----------
##  Training
## ----------
## Train for several epochs; the mean batch loss is reported per epoch.
for epoch in range(10):
    losses = 0
    for i, batch in enumerate(train_loader):
        b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label = batch

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        loss, mlm_prediction_logits, nsp_pred_logits = Bert_Pre_model(
            input_ids=b_token_ids,
            attention_mask=b_mask,
            token_type_ids=b_segs,
            next_sentence_labels=b_nsp_label,
            masked_lm_labels=b_mlm_label,
        )
        loss.backward()
        optimizer.step()
        losses += loss.item()

    # Average over the number of batches in the loader.
    train_loss = losses / len(train_loader)

    print('整体训练误差：', 'Epoch %d' % (epoch + 1), train_loss)



整体训练误差： Epoch 1 11.202541987101236
整体训练误差： Epoch 2 8.934702396392822
整体训练误差： Epoch 3 8.143661975860596
整体训练误差： Epoch 4 7.198670069376628
整体训练误差： Epoch 5 6.7806064287821455
整体训练误差： Epoch 6 6.426120758056641
整体训练误差： Epoch 7 8.323813279469809
整体训练误差： Epoch 8 6.039237817128499
整体训练误差： Epoch 9 5.490671475728353
整体训练误差： Epoch 10 5.126372655232747
