In [27]:
import re
import torch
import torch.nn as nn
from transformers import BertForTokenClassification, BertTokenizer
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm, trange
from keras.preprocessing.sequence import pad_sequences  # padding
import gc

In [28]:
gc.collect()

313

In [22]:
file = "dh_msra.txt"

In [45]:
all_sentences_separate = []
all_letter_labels = []
with open(file, encoding="utf-8") as f:
    single_sentence = []
    single_sentence_labels = []
    for s in f.readlines():
        if s != "\n":
            word, label = s.split("\t")
            label = label.strip("\n")
            single_sentence.append(word)
            single_sentence_labels.append(label)
            label_set.add(label)
        elif s == "\n":
            all_sentences_separate.append(single_sentence)
            all_letter_labels.append(single_sentence_labels)
            single_sentence = []
            single_sentence_labels = []

In [46]:
print(all_sentences_separate[0:2])
print(all_letter_labels[0:2])
print(f"\n所有的标签：{label_set}")

[['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！'], ['藏', '书', '本', '来', '就', '是', '所', '有', '传', '统', '收', '藏', '门', '类', '中', '的', '第', '一', '大', '户', '，', '只', '是', '我', '们', '结', '束', '温', '饱', '的', '时', '间', '太', '短', '而', '已', '。']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

所有的标签：{'I-ORG', 'B-ORG', 'B-PER', 'I-PER', 'I-LOC', 'B-LOC', 'O'}


In [75]:
# GPU
print("Is CUDA available: ", torch.cuda.is_available())
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU numbers: ", n_gpu)
print("device_name: ", torch.cuda.get_device_name(0))
torch.cuda.set_device(1)
print(f"Current device: {torch.cuda.current_device()}")

Is CUDA available:  True
GPU numbers:  2
device_name:  Tesla M40 24GB
Current device: 1


In [21]:
# 构建 tag 到 索引 的字典
tag_to_ix = {"B-LOC": 0,
             "I-LOC": 1, 
             "B-ORG": 2, 
             "I-ORG": 3,
             "B-PER": 4,
             "I-PER": 5,
             "O": 6,
             "[CLS]":7,
             "[SEP]":8,
             "[PAD]":9}

ix_to_tag = {0:"B-LOC", 
             1:"I-LOC", 
             2:"B-ORG", 
             3:"I-ORG",
             4:"B-PER",
             5:"I-PER",
             6:"O",
             7:"[CLS]",
             8:"[SEP]",
             9:"[PAD]"}

In [50]:
all_sentences = []  # 句子

for one_sentence in all_sentences_separate:
    sentence = "".join(one_sentence)
    all_sentences.append(sentence)

print(all_sentences[0:2])

['当希望工程救助的百万儿童成长起来，科教兴国蔚然成风时，今天有收藏价值的书你没买，明日就叫你悔不当初！', '藏书本来就是所有传统收藏门类中的第一大户，只是我们结束温饱的时间太短而已。']


In [52]:
all_labels = []  # labels
for letter_labels in all_letter_labels:
    labels = [tag_to_ix[t] for t in letter_labels]
    all_labels.append(labels)

print(all_labels[0:2])
print(len(all_labels[0]))

[[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]]
50


In [56]:
print(len(all_labels))

55289


In [55]:
# word2token
tokenizer = BertTokenizer.from_pretrained('./bert-chinese/', do_lower_case=True)
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_sentences]

In [54]:
print(tokenized_texts[0])

[101, 2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636, 674, 1036, 4997, 2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768, 7599, 3198, 8024, 791, 1921, 3300, 3119, 5966, 817, 966, 4638, 741, 872, 3766, 743, 8024, 3209, 3189, 2218, 1373, 872, 2637, 679, 2496, 1159, 8013, 102]


In [57]:
# 句子padding
# 句子最长长度
MAX_LEN = 128

# 输入padding
# 此函数在keras里面
input_ids = pad_sequences([txt for txt in tokenized_texts],
                          maxlen=MAX_LEN, 
                          dtype="long", 
                          truncating="post", 
                          padding="post")

In [58]:
print(len(input_ids[0]))
print(input_ids[0])

128
[ 101 2496 2361 3307 2339 4923 3131 1221 4638 4636  674 1036 4997 2768
 7270 6629 3341 8024 4906 3136 1069 1744 5917 4197 2768 7599 3198 8024
  791 1921 3300 3119 5966  817  966 4638  741  872 3766  743 8024 3209
 3189 2218 1373  872 2637  679 2496 1159 8013  102    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [59]:
# [3] 代表 O 实体
for label in all_labels:
    label.insert(len(label), 5)  # [SEP]
    label.insert(0, 4) # [CLS]
    if MAX_LEN > len(label) -1:
        for i in range(MAX_LEN - len(label)):
            label.append(3)  # [PAD]

In [60]:
print(len(all_labels[0]))
print(all_labels[0])

128
[4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [61]:
# 创建attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

In [62]:
# 第一句话的 attention_masks
print(np.array(attention_masks[0]))
print(len(np.array(attention_masks[0])))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
128


In [63]:
# train-test-split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, 
                                                                                    all_labels, 
                                                                                    random_state=2019, 
                                                                                    test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2019, 
                                                       test_size=0.1)

In [64]:
print(len(train_inputs))
print(len(validation_inputs))

print(train_inputs[0])
print(validation_inputs[0])

49760
5529
[ 101 3616 3828 6598 3315 2356 1767 4638 2600  817  966 8024 1315 2828
 3616 4673 8115 1744 2792 3300 4638 5500 4873  510 6395 1171 1469 7213
 6121 2100 3621 1217 1762  671 6629 8024 2347 5307 6631 6814 5401 1744
  511  102    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[ 101 3655 4904  510 7029 3941 1469 2166 3360 3918 3884 8024 3354 2768
  749 4263 3173 6230 5384  185 6825 5307  704 1744 4514 4638  712 6206
 7579 3332  511  102    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    

In [65]:
# tensor化
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

In [66]:
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

In [67]:
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [68]:
# dataloader
# batch size
batch_size = 64

# 形成训练数据集
train_data = TensorDataset(train_inputs, train_masks, train_labels)  
# 随机采样
train_sampler = RandomSampler(train_data) 
# 读取数据
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


# 形成验证数据集
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# 随机采样
validation_sampler = SequentialSampler(validation_data)
# 读取数据
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [76]:
model = BertForTokenClassification.from_pretrained("./bert-chinese/", num_labels=7)
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [77]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

# 权重衰减
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
     'weight_decay': 0.0}]

In [78]:
# 优化器
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=5e-5)

In [81]:
# 保存loss
train_loss_set = []
# epochs 
epochs = 5

In [82]:
# BERT training loop
for _ in trange(epochs): 
    ## 训练
    print(f"当前epoch： {_}")
    # 开启训练模式
    model.train()
    tr_loss = 0  # train loss
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # 把batch放入GPU
        batch = tuple(t.to(device) for t in batch)
        # 解包batch
        b_input_ids, b_input_mask, b_labels = batch
        # 梯度归零
        optimizer.zero_grad()
        # 前向传播loss计算
        output = model(input_ids=b_input_ids, 
                       attention_mask=b_input_mask, 
                       labels=b_labels)  
        loss = output[0]
        # print(loss)
        # 反向传播
        loss.backward()
        # Update parameters and take a step using the computed gradient
        # 更新模型参数
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    print(f"当前 epoch 的 Train loss: {tr_loss/nb_tr_steps}")


  0%|          | 0/5 [00:00<?, ?it/s][A

当前epoch： 0



 20%|██        | 1/5 [47:05<3:08:21, 2825.32s/it][A

当前 epoch 的 Train loss: 0.04894553860200276
当前epoch： 1



 40%|████      | 2/5 [1:14:22<2:03:26, 2468.86s/it][A

当前 epoch 的 Train loss: 0.04709512379523546
当前epoch： 2



 60%|██████    | 3/5 [1:33:40<1:09:11, 2075.67s/it][A

当前 epoch 的 Train loss: 0.04669651953545414
当前epoch： 3



 80%|████████  | 4/5 [1:52:58<30:00, 1800.46s/it]  [A

当前 epoch 的 Train loss: 0.044560085826963663
当前epoch： 4



100%|██████████| 5/5 [2:12:17<00:00, 1587.49s/it][A

当前 epoch 的 Train loss: 0.04462920781823888





In [83]:
# 验证状态
model.eval()

# 建立变量
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [84]:
# 验证集的读取也要batch
for batch in tqdm(validation_dataloader):
    # 元组打包放进GPU
    batch = tuple(t.to(device) for t in batch)
    # 解开元组
    b_input_ids, b_input_mask, b_labels = batch
    # 预测
    with torch.no_grad():
        # segment embeddings，如果没有就是全0，表示单句
        # position embeddings，[0,句子长度-1]
        outputs = model(input_ids=b_input_ids, 
                       attention_mask=b_input_mask,
                       token_type_ids=None,
                       position_ids=None)  
                       
    # print(logits[0])
    # Move logits and labels to CPU
    scores = outputs[0].detach().cpu().numpy()  # 每个字的标签的概率
    pred_flat = np.argmax(scores[0], axis=1).flatten()
    label_ids = b_labels.to('cpu').numpy()  # 真实labels
    # print(logits, label_ids)


  0%|          | 0/346 [00:00<?, ?it/s][A
  0%|          | 1/346 [00:00<00:43,  7.85it/s][A
  1%|          | 2/346 [00:00<00:44,  7.81it/s][A
  1%|          | 3/346 [00:00<00:42,  7.99it/s][A
  1%|          | 4/346 [00:00<00:41,  8.18it/s][A
  1%|▏         | 5/346 [00:00<00:40,  8.33it/s][A
  2%|▏         | 6/346 [00:00<00:40,  8.45it/s][A
  2%|▏         | 7/346 [00:00<00:39,  8.53it/s][A
  2%|▏         | 8/346 [00:00<00:39,  8.58it/s][A
  3%|▎         | 9/346 [00:01<00:39,  8.60it/s][A
  3%|▎         | 10/346 [00:01<00:38,  8.63it/s][A
  3%|▎         | 11/346 [00:01<00:38,  8.65it/s][A
  3%|▎         | 12/346 [00:01<00:38,  8.67it/s][A
  4%|▍         | 13/346 [00:01<00:38,  8.69it/s][A
  4%|▍         | 14/346 [00:01<00:38,  8.70it/s][A
  4%|▍         | 15/346 [00:01<00:38,  8.71it/s][A
  5%|▍         | 16/346 [00:01<00:37,  8.71it/s][A
  5%|▍         | 17/346 [00:01<00:37,  8.71it/s][A
  5%|▌         | 18/346 [00:02<00:37,  8.71it/s][A
  5%|▌         | 19/346 [00:0

 45%|████▌     | 156/346 [00:17<00:21,  8.71it/s][A
 45%|████▌     | 157/346 [00:18<00:21,  8.71it/s][A
 46%|████▌     | 158/346 [00:18<00:21,  8.70it/s][A
 46%|████▌     | 159/346 [00:18<00:21,  8.70it/s][A
 46%|████▌     | 160/346 [00:18<00:21,  8.69it/s][A
 47%|████▋     | 161/346 [00:18<00:21,  8.69it/s][A
 47%|████▋     | 162/346 [00:18<00:21,  8.69it/s][A
 47%|████▋     | 163/346 [00:18<00:21,  8.70it/s][A
 47%|████▋     | 164/346 [00:18<00:20,  8.70it/s][A
 48%|████▊     | 165/346 [00:18<00:20,  8.70it/s][A
 48%|████▊     | 166/346 [00:19<00:20,  8.70it/s][A
 48%|████▊     | 167/346 [00:19<00:20,  8.70it/s][A
 49%|████▊     | 168/346 [00:19<00:20,  8.70it/s][A
 49%|████▉     | 169/346 [00:19<00:20,  8.68it/s][A
 49%|████▉     | 170/346 [00:19<00:20,  8.69it/s][A
 49%|████▉     | 171/346 [00:19<00:20,  8.69it/s][A
 50%|████▉     | 172/346 [00:19<00:20,  8.70it/s][A
 50%|█████     | 173/346 [00:19<00:19,  8.70it/s][A
 50%|█████     | 174/346 [00:20<00:19,  8.70it

 90%|████████▉ | 310/346 [00:35<00:04,  8.70it/s][A
 90%|████████▉ | 311/346 [00:35<00:04,  8.70it/s][A
 90%|█████████ | 312/346 [00:35<00:03,  8.70it/s][A
 90%|█████████ | 313/346 [00:35<00:03,  8.70it/s][A
 91%|█████████ | 314/346 [00:36<00:03,  8.70it/s][A
 91%|█████████ | 315/346 [00:36<00:03,  8.70it/s][A
 91%|█████████▏| 316/346 [00:36<00:03,  8.70it/s][A
 92%|█████████▏| 317/346 [00:36<00:03,  8.70it/s][A
 92%|█████████▏| 318/346 [00:36<00:03,  8.69it/s][A
 92%|█████████▏| 319/346 [00:36<00:03,  8.69it/s][A
 92%|█████████▏| 320/346 [00:36<00:02,  8.69it/s][A
 93%|█████████▎| 321/346 [00:36<00:02,  8.70it/s][A
 93%|█████████▎| 322/346 [00:37<00:02,  8.70it/s][A
 93%|█████████▎| 323/346 [00:37<00:02,  8.70it/s][A
 94%|█████████▎| 324/346 [00:37<00:02,  8.70it/s][A
 94%|█████████▍| 325/346 [00:37<00:02,  8.70it/s][A
 94%|█████████▍| 326/346 [00:37<00:02,  8.70it/s][A
 95%|█████████▍| 327/346 [00:37<00:02,  8.70it/s][A
 95%|█████████▍| 328/346 [00:37<00:02,  8.70it

In [89]:
# 保存模型
# They can then be reloaded using `from_pretrained()`

import os

output_dir = "./model_save"
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
torch.save(model_to_save.state_dict(), os.path.join(output_dir, 'training_args.bin'))

In [None]:
# 读取模型
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(output_dir)
tokenizer = tokenizer_class.from_pretrained(output_dir)
model.to(device)