# Bert 的Pytorch实现

```
内容参考blog：https://wmathor.com/index.php/archives/1457/
```

## 准备数据集

In [1]:
# import依赖包
import torch
import torch.nn as nn
import numpy as np
import re    # 对数据集进行分句子，以及删除不需要的标点符号
import math
from random import random  # 生辰随机数
from random import randrange
from random import shuffle
from random import randint
import torch.optim as optim
import torch.utils.data as Data

In [10]:
# 构造fake trian data
text = (
    'Hello, how are you? I am Romeo.\n' # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
    'Nice meet you too. How are you today?\n' # R
    'Great. My baseball team won the competition.\n' # J
    'Oh Congratulations, Juliet\n' # R
    'Thank you Romeo\n' # J
    'Where are you going today?\n' # R
    'I am going shopping. What about you?\n' # J
    'I am going to visit my grandmother. she is not very well' # R
)

# 对数据集进行分句子，以及删除不需要的标点符号
sentences = re.sub("[,.!?\\-]", "", text.lower()).split("\n")

## 注意实际上text的处理会更加繁琐，而且真实的input text也不会简单可以按照\n 来进行分句处理。
# 不重复的word vocab
vocab = list(set(" ".join(sentences).split()))
word2idx = {'[PAD]':0, '[CLS]':1, '[SEP]':2, '[MASK]':3}
for i, w in enumerate(vocab):
    word2idx[w] = i + 4
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = len(word2idx)

token_list = []
for sentence in sentences:
    temp = [word2idx[s] for s in sentence.split()]
    token_list.append(temp)
token_list

[[30, 10, 6, 22, 32, 21, 39],
 [30, 39, 7, 19, 28, 15, 8, 37, 24, 22],
 [8, 24, 22, 29, 10, 6, 22, 33],
 [25, 7, 4, 16, 13, 9, 17],
 [5, 26, 15],
 [14, 22, 39],
 [36, 6, 22, 12, 33],
 [32, 21, 12, 23, 27, 31, 22],
 [32, 21, 12, 37, 38, 7, 11, 20, 28, 34, 35, 18]]

In [3]:
# 模型的config文件参数
# BERT parameter
max_seq_len = 30
batch_size = 6
max_pred = 5
layers_num = 6
heads_num = 12
model_dim = 756
ffn_dim = 756 * 4
per_head_dim = 756 / 12
segments_num = 2
epoch_num = 180
dropout=0.1

## 数据预处理
* 按照MASK的标准深沉数据： 一句话中的15%被MASK，被MASK中 80%被[MASK] 替换，10% 被随机替换，10% 不做任何变化
* 构造Dataloader，方便数据训练的时候进行迭代

In [4]:
# make fake data
def make_data():
    """构造一个batch_size（6） 的样本，其中NSP任务为positive和negative的样本各3个 
    """
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2:
        # 一条数据一条数据的生成
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]
        segment_ids = [0] + len(tokens_a) * [0] + [0] + [1] * len(tokens_b) + [1]
        # MASK LM
        ## 按照 sentence 长度的15%来确定需要 MASK 的位置
        n_pred = min(max_pred, max(int(len(input_ids) * 0.15), 1))
        # 过滤special token得到备选mask的index
        cand_mask_pos = [i for i, v in enumerate(input_ids) if v > 3]
        shuffle(cand_mask_pos)
        masked_pos, masked_token = [], []
        for pos in cand_mask_pos[:n_pred]:
            masked_pos.append(pos)
            masked_token.append(input_ids[pos])
            random_value = random()
            if random_value < 0.8:
                input_ids[pos] = word2idx['[MASK]']
            elif random_value > 0.9:
                # 随机选择其他token 进行替换
                index = randint(0, vocab_size -1)
                while index < 3 or index == input_ids[pos]:
                    index = randint(0, vocab_size -1)
                input_ids[pos] = index
            
        # zero padding 的token 也要mask 15% （这一步很奇怪）
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_token = masked_token + [0] * n_pad
            masked_pos = masked_pos + [0] * n_pad

        # zero-padding
        n_pads = max_seq_len - len(input_ids)
        input_ids = input_ids + n_pads * [0]
        segment_ids = segment_ids + n_pads * [0]
        
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_token, masked_pos, True])
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_token, masked_pos, False])
            negative += 1
    return batch

batch = make_data()
print(len(batch))

6


In [5]:
input_ids, segment_ids, masked_token, masked_pos, isNext = zip(*batch) # 此时 input_ids 是 list 类型，直接传入BERT model 会报错

# 这一步不能少，否则在迭代数据时出问题
input_ids, segment_ids, masked_token, masked_pos, isNext = \
    torch.LongTensor(input_ids), torch.LongTensor(segment_ids), torch.LongTensor(masked_token), torch.LongTensor(masked_pos), torch.LongTensor(isNext)

class MyDataSet(Data.Dataset):

    def __init__(self, input_ids, segment_ids, masked_token, maksed_pos, isNext):
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_token = masked_token
        self.masked_pos = maksed_pos
        self.isNext = isNext
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.segment_ids[idx], self.masked_token[idx], self.masked_pos[idx], self.isNext[idx]

loader = Data.DataLoader(MyDataSet(input_ids, segment_ids, masked_token, masked_pos, isNext), batch_size, True)

## 训练 & 测试

In [6]:
from bert_util import BERT

model = BERT(vocab_size=vocab_size, max_seq_len=max_seq_len, model_dim=model_dim,
             heads_num=heads_num, ffn_dim=ffn_dim, layers_num=layers_num, dropout=dropout)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=1e-3)

In [7]:
for epoch in range(epoch_num):
    for input_ids, segment_ids, masked_token, masked_pos, isNext in loader:
        logits_clsf, logits_lm = model(input_ids, segment_ids, masked_pos)
        loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_token.view(-1))
        loss_lm = loss_lm.mean()
        loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
        loss = loss_lm + loss_clsf
        if (epoch + 1) % 10 == 0:  # 每迭代10epoch就打印一次loss
            print(f'Epoch:{epoch + 1},  loss= {loss:.2f}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch:10,  loss= 3.83
Epoch:20,  loss= 3.43
Epoch:30,  loss= 3.10
Epoch:40,  loss= 2.86
Epoch:50,  loss= 2.69
Epoch:60,  loss= 2.56
Epoch:70,  loss= 2.47
Epoch:80,  loss= 2.40
Epoch:90,  loss= 2.33
Epoch:100,  loss= 2.28
Epoch:110,  loss= 2.22
Epoch:120,  loss= 2.18
Epoch:130,  loss= 2.13
Epoch:140,  loss= 2.09
Epoch:150,  loss= 2.05
Epoch:160,  loss= 2.01
Epoch:170,  loss= 1.97
Epoch:180,  loss= 1.93


## 测试一下

In [22]:
# Predict mask tokens and isNext
model.eval()
inputs_id, segment_ids, masked_token, masked_pos, isNext = batch[0]
print(text)
print('*****' * 10)
print([idx2word[w] for w in inputs_id if idx2word[w] != '[PAD]'])
print('*****'* 20)
logits_clsf, logits_lm = model(torch.LongTensor([inputs_id]), \
                 torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))

Hello, how are you? I am Romeo.
Hello, Romeo My name is Juliet. Nice to meet you.
Nice meet you too. How are you today?
Great. My baseball team won the competition.
Oh Congratulations, Juliet
Thank you Romeo
Where are you going today?
I am going shopping. What about you?
I am going to visit my grandmother. she is not very well
**************************************************
['[CLS]', 'great', 'my', 'baseball', 'team', 'won', 'the', 'competition', '[SEP]', 'great', 'my', 'baseball', '[MASK]', 'won', '[MASK]', 'competition', '[SEP]']
****************************************************************************************************


In [24]:
print(logits_lm.shape)
print(logits_clsf.shape)

torch.Size([1, 5, 40])
torch.Size([1, 2])


In [32]:
clsf_score = logits_clsf.data.max(1)[1].data.numpy()[0]
if clsf_score:
    print('isNext')
else:
    print('not isNext')

not isNext


In [42]:
lm_indices = logits_lm.data.max(-1)[1][0].data.numpy()
label_pos = [pos for pos in masked_token if pos !=0]
predict_pos = [pos for pos in lm_indices if pos !=0]
print('label pos', label_pos)
print('predict pos:', predict_pos)

label pos [16, 9]
predict pos: [16, 16]
