## BERT手把手

[BERT for dummies — Step by Step Tutorial](https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03)

In [1]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences  # padding句子用
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


### GPU

In [2]:
torch.cuda.is_available()

True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
n_gpu = torch.cuda.device_count()
print(n_gpu)

2


In [5]:
print(torch.cuda.get_device_name(0))

Tesla M40 24GB


### 数据处理

1. 读取数据，放在 `biaoji.txt`中


In [6]:
file = "biaoji_raw.txt"

with open(file, encoding="utf-8") as f:
    sentences = [line for line in f.readlines()]
f.close()

In [7]:
sentences

['上海今天的天气怎么样\n',
 '杭州今天有点下雨\n',
 '昨天成都出太阳了\n',
 '今年北京的空气不太好\n',
 '冬天的哈尔滨冰天雪地\n',
 '武汉有很多樱花树\n',
 '金华生产的火腿很出名\n',
 '上海在地图上紧挨着杭州\n',
 '海南岛在冬天里面也很热']

In [8]:
sentences[0]

'上海今天的天气怎么样\n'

In [9]:
sentences[1]

'杭州今天有点下雨\n'

2. 按字拆分：

    自动在首位添加`[CLS]`和`[SEP]`
    转为input ids

In [10]:
tokenizer = BertTokenizer.from_pretrained('./bert-chinese/', do_lower_case=True)
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x7ff8706c9fd0>

In [11]:
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

In [12]:
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
[101, 677, 3862, 791, 1921, 4638, 1921, 3698, 2582, 720, 3416, 102]


In [13]:
print ("Tokenize the 2nd sentence:")
print (tokenized_texts[1])

Tokenize the 2nd sentence:
[101, 3343, 2336, 791, 1921, 3300, 4157, 678, 7433, 102]


In [14]:
print (len(tokenized_texts))  # 9句话

9


3. 为句子padding：
    
    将句子的input ids padding到64.

In [15]:
# 句子最长长度
MAX_LEN = 64

# 输入padding
# 此函数在keras里面
input_ids = pad_sequences([txt for txt in tokenized_texts],
                          maxlen=MAX_LEN, 
                          dtype="long", 
                          truncating="post", 
                          padding="post")

In [16]:
input_ids[0]

array([ 101,  677, 3862,  791, 1921, 4638, 1921, 3698, 2582,  720, 3416,
        102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

In [17]:
input_ids[1]

array([ 101, 3343, 2336,  791, 1921, 3300, 4157,  678, 7433,  102,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

In [18]:
# 试试转换回来
raw_texts = [tokenizer.decode(ids) for ids in input_ids]
print(raw_texts)
print(len(raw_texts))

['[CLS] 上 海 今 天 的 天 气 怎 么 样 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] 杭 州 今 天 有 点 下 雨 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] 昨 天 成 都 出 太 阳 了 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

### BERT的输入准备

1. 注意力mask（attention masks）：

    padding部分是不需要被attention到的。
    相当于这部分就是，真实句子为1，padding部分为0。
    得到attention masks

In [19]:
# 创建attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

In [20]:
attention_masks

[[1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.

2. 准备tensor和迭代器

    首先准备labels，其实我们的labels可以认为：
    - 1：这句话在说天气
    - 0：这句话没在说天气
    
```
['上海今天的天气怎么样\n',
 '杭州今天有点下雨\n',
 '昨天成都出太阳了\n',
 '今年北京的空气不太好\n',
 '冬天的哈尔滨冰天雪地\n',
 '武汉有很多樱花树\n',
 '金华生产的火腿很出名\n',
 '上海在地图上紧挨着杭州\n',
 '海南岛在冬天里面也很热']
 ```
 
 > 以后这里可以用`train_test_split`来分
 
然后拆分训练集和验证集，attention masks 也要：

In [21]:
labels = [1, 1, 1, 0, 1, 0, 0, 0, 1]
len(labels)

9

In [22]:
train_inputs = input_ids[0:7]
validation_inputs = input_ids[7:]
train_labels = labels[0:7]
validation_labels = labels[7:]

In [23]:
train_labels

[1, 1, 1, 0, 1, 0, 0]

In [24]:
validation_labels

[0, 1]

In [25]:
train_masks = attention_masks[0:7]
validation_masks = attention_masks[7:]

In [26]:
# tensor化
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [27]:
print(validation_inputs)
print(validation_labels)
print(validation_masks)

tensor([[ 101,  677, 3862, 1762, 1765, 1745,  677, 5165, 2917, 4708, 3343, 2336,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 101, 3862, 1298, 2270, 1762, 1100, 1921, 7027, 7481,  738, 2523, 4178,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
tensor([0, 1])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 

In [28]:
# Select a batch size for training. 
batch_size = 2

创建迭代器

In [29]:
# Create an iterator of our data with torch DataLoader 

# 形成训练数据集
train_data = TensorDataset(train_inputs, train_masks, train_labels)  
# 随机采样
train_sampler = RandomSampler(train_data) 
# 读取数据
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


# 形成验证数据集
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# 随机采样
validation_sampler = SequentialSampler(validation_data)
# 读取数据
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

### BERT的微调

在准备好输入以后，现在我们开始微调BERT模型。

使用`BertForSequenceClassification`，它就是一个普通BERT模型，在最后面加了一个线形层用于分类。

1. 导入模型

In [30]:
# 读取 BertForSequenceClassification 模型，
# 是一个预训练的BERT模型，在最后面加了一个线形层用于分类。

model = BertForSequenceClassification.from_pretrained("./bert-chinese/", 
                                                      num_labels=2)  # 
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

2. 准备微调：
    
   其中，`no_decay`见[issue#492](https://github.com/huggingface/transformers/issues/492)

In [34]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

# 权重衰减
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
     'weight_decay': 0.0}]

In [36]:
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=1e-3)

In [37]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [38]:
# Store our loss and accuracy for plotting
train_loss_set = []
# Number of training epochs 
epochs = 4

开始训练

In [46]:
# BERT training loop
for _ in trange(epochs, desc="Epoch"): 
    ## 训练
    
    # 开启训练模式
    model.train()
    tr_loss = 0  # train loss
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # 把batch放入GPU
        batch = tuple(t.to(device) for t in batch)
        # 解包batch
        b_input_ids, b_input_mask, b_labels = batch
        # 梯度归零
        optimizer.zero_grad()
        # 前向传播loss计算
        output = model(input_ids=b_input_ids, 
                     attention_mask=b_input_mask, 
                     labels=b_labels)  # 有labels的时候，且labels>1就直接返回Cross-Entropy
        loss = output[0]
        print(loss)
        # 反向传播
        loss.backward()
        # Update parameters and take a step using the computed gradient
        # 更新模型参数
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    print(f"Train loss: {tr_loss/nb_tr_steps}")

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

tensor(0.8430, device='cuda:0', grad_fn=<NllLossBackward>)


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 22.41 GiB total capacity; 1.83 GiB already allocated; 1.44 MiB free; 96.49 MiB cached)