# 中文序列标注任务（Hugging Face Bert）
常见的序列标注任务有：
- 命名实体识别 (Named Entity Recognition, NER)：识别出文本中诸如人物、地点、组织等实体，即为所有的 token 都打上实体标签，将其划分到某类实体，或者判定为“非实体”；
- 词性标注 (Part-Of-Speech tagging, POS)：为文本中的每一个词语标注上对应的词性，例如名词、动词、形容词等。
下面我们以 NER 为例，运用 Transformers 库微调一个基于 BERT 的模型来完成任务。

## 数据准备
我们选择 1998 年人民日报语料库作为数据集，在其之上训练一个 NER 模型：每次输入一个句子，识别出其中的人物 (PER)、地点 (LOC) 和组织 (ORG)。人民日报语料库标注了大量的语言学信息，可以使用处理工具从中抽取出数据用于分词、NER 等任务，这里我们直接使用处理好的 NER 语料：

http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz

该语料已经划分好了训练集、验证集和测试集，分别对应 example.train、example.dev 和 example.test 文件，其中训练集 / 验证集 / 测试集分别包含 20864 / 2318 / 4636 个句子。语料采用我们在上一篇《快速分词器》中介绍的 IOB2 格式进行标注，一行对应一个字：

```
海 O
钓 O
比 O
赛 O
地 O
点 O
在 O
厦 B-LOC
门 I-LOC
与 O
金 B-LOC
门 I-LOC
之 O
间 O
的 O
海 O
域 O
。 O
```

## 构建数据集
我们首先编写继承自 Dataset 类的自定义数据集用于组织样本和标签：

In [3]:
from torch.utils.data import Dataset

# Entity categories (e.g. 'LOC') seen while loading any corpus file. Filled as
# a side effect of PeopleDaily.load_data and read afterwards to build the
# label vocabulary.
categories = set()


class PeopleDaily(Dataset):
    """Sentence-level NER dataset read from an IOB2-tagged corpus file.

    The file holds one "<token> <tag>" pair per line, with sentences separated
    by blank lines. Each sample is a dict:

        {'sentence': str, 'tags': [[char_start, char_end, text, category], ...]}

    where ``char_start``/``char_end`` are inclusive character offsets into
    ``sentence``.
    """

    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` into a dict mapping 0..n-1 to samples.

        Every category met in a ``B-``/``I-`` tag is added to the module-level
        ``categories`` set.

        Raises:
            ValueError: if a non-blank line has no space between token and tag.
        """
        Data = {}
        with open(data_file, 'rt', encoding='utf-8') as f:
            for chunk in f.read().split('\n\n'):
                pieces, tags = [], []
                offset = 0            # character offset of the next token
                open_category = None  # category of the entity being extended
                for line in chunk.split('\n'):
                    # Blank lines appear at chunk edges (trailing newline,
                    # runs of 3+ newlines); they carry no token.
                    if not line.strip():
                        continue
                    # rsplit keeps a token that is itself a space intact.
                    word, tag = line.rsplit(' ', 1)
                    start, end = offset, offset + len(word) - 1
                    offset += len(word)
                    pieces.append(word)

                    prefix, _, category = tag.partition('-')
                    if prefix == 'I' and open_category == category:
                        # Continuation of the entity currently open.
                        tags[-1][1] = end
                        tags[-1][2] += word
                    elif prefix in ('B', 'I'):
                        # 'B-' always opens an entity; an 'I-' with no matching
                        # open entity is treated as an opening tag too, rather
                        # than crashing or extending an unrelated entity.
                        tags.append([start, end, word, category])
                        categories.add(category)
                        open_category = category
                    else:
                        open_category = None
                if not pieces:
                    # Empty chunk: skip it without ending the whole read.
                    continue
                # Keys stay contiguous even when chunks are skipped.
                Data[len(Data)] = {
                    'sentence': ''.join(pieces),
                    'tags': tags
                }
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [4]:
# Load the three corpus splits. Constructing PeopleDaily also fills the
# module-level `categories` set as a side effect of parsing.
train_data = PeopleDaily('china-people-daily-ner-corpus/example.train')
valid_data = PeopleDaily('china-people-daily-ner-corpus/example.dev')
test_data = PeopleDaily('china-people-daily-ner-corpus/example.test')

# Show the first training sample: the sentence plus its entity spans.
print(train_data[0])

{'sentence': '海钓比赛地点在厦门与金门之间的海域。', 'tags': [[7, 8, '厦门', 'LOC'], [10, 11, '金门', 'LOC']]}


In [5]:
# Id 0 is reserved for the "outside" tag; each category (in sorted order) then
# takes the next two ids for its B- and I- tags.
id2label = {0: 'O'}
for c in sorted(categories):
    # Both keys are computed before update() runs, so they are n and n + 1.
    id2label.update({
        len(id2label): f"B-{c}",
        len(id2label) + 1: f"I-{c}",
    })
# Reverse lookup: tag string -> integer id.
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

print(id2label)
print(label2id)

{0: 'O', 1: 'B-LOC', 2: 'I-LOC', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-PER', 6: 'I-PER'}
{'O': 0, 'B-LOC': 1, 'I-LOC': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-PER': 5, 'I-PER': 6}


In [6]:
from transformers import AutoTokenizer
import numpy as np

# Pretrained checkpoint whose tokenizer is used throughout this demo.
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# One sample in the dataset's format: entity spans are
# [char_start, char_end (inclusive), text, category].
sentence = '海钓比赛地点在厦门与金门之间的海域。'
tags = [[7, 8, '厦门', 'LOC'], [10, 11, '金门', 'LOC']]

encoding = tokenizer(sentence, truncation=True)
tokens = encoding.tokens()
# One label id per token, initialised to 0 ('O').
label = np.zeros(len(tokens), dtype=int)
for char_start, char_end, word, tag in tags:
    # Map character positions in the raw sentence to token indices.
    # NOTE(review): char_to_token presumably returns None for a character
    # that has no token (e.g. cut off by truncation) — not handled here.
    token_start = encoding.char_to_token(char_start)
    token_end = encoding.char_to_token(char_end)
    # First token of the span gets B-<tag>, the remaining ones I-<tag>.
    label[token_start] = label2id[f"B-{tag}"]
    label[token_start+1:token_end+1] = label2id[f"I-{tag}"]

print(tokens)
print(label)
print([id2label[id] for id in label])

['[CLS]', '海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。', '[SEP]']
[0 0 0 0 0 0 0 0 1 2 0 1 2 0 0 0 0 0 0 0]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


可以看到，通过 char_to_token() 函数，我们成功地将实体标签从原文位置映射到切分后的 token 索引，并且将对应的标签设置为实体编号。

实际编写 DataLoader 的批处理函数 collate_fn() 时，我们处理的是一批中的多个样本，因此需要对上面的操作进行扩展。而且由于最终会通过交叉熵损失来优化模型参数，我们还需要将 [CLS]、[SEP]、[PAD] 等特殊 token 对应的标签设为 -100，以便在计算损失时忽略它们：

In [7]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import numpy as np

# Same checkpoint/tokenizer as in the single-sentence demo; rebound here so
# this cell can run on its own. `tokenizer` is read as a global by collote_fn.
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    """Tokenize a batch of samples and build token-level label ids.

    Args:
        batch_samples: list of dicts with keys 'sentence' (str) and 'tags'
            (list of [char_start, char_end, text, category], with inclusive
            character offsets into the sentence).

    Returns:
        (batch_inputs, batch_label): the padded tokenizer output and an
        integer tensor of the same shape as ``input_ids``. Special and padding
        positions hold -100 so the loss ignores them; every other position
        holds a ``label2id`` value (0 for 'O').
    """
    batch_sentence = [sample['sentence'] for sample in batch_samples]
    batch_tags = [sample['tags'] for sample in batch_samples]
    batch_inputs = tokenizer(
        batch_sentence,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    batch_label = np.zeros(batch_inputs['input_ids'].shape, dtype=int)
    # Real tokens per row, [CLS] and [SEP] included (padding is masked out).
    seq_lens = batch_inputs['attention_mask'].sum(dim=1).tolist()
    for s_idx, (seq_len, tags) in enumerate(zip(seq_lens, batch_tags)):
        batch_label[s_idx][0] = -100             # [CLS]
        batch_label[s_idx][seq_len - 1:] = -100  # [SEP] and all padding
        for char_start, char_end, _, tag in tags:
            # The batch encoding already holds the char->token alignment, so
            # no second tokenization is needed. Characters without a token
            # (truncated away or dropped by the tokenizer) map to None and
            # are left out, which clips the span to what the model sees.
            span_tokens = [
                token_idx
                for token_idx in (
                    batch_inputs.char_to_token(s_idx, char_idx)
                    for char_idx in range(char_start, char_end + 1)
                )
                if token_idx is not None
            ]
            if not span_tokens:
                # Entity lies entirely outside the encoded text.
                continue
            token_start, token_end = span_tokens[0], span_tokens[-1]
            batch_label[s_idx][token_start] = label2id[f"B-{tag}"]
            batch_label[s_idx][token_start + 1:token_end + 1] = label2id[f"I-{tag}"]
    return batch_inputs, torch.tensor(batch_label)

# Only the training split is shuffled; validation and test keep file order.
train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=4, shuffle=False, collate_fn=collote_fn)
test_dataloader = DataLoader(test_data, batch_size=4, shuffle=False, collate_fn=collote_fn)

# Pull one batch to inspect what collote_fn produces: the tokenizer output
# (batch_X) and the aligned label tensor (batch_y).
batch_X, batch_y = next(iter(train_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([4, 91]), 'token_type_ids': torch.Size([4, 91]), 'attention_mask': torch.Size([4, 91])}
batch_y shape: torch.Size([4, 91])
{'input_ids': tensor([[ 101,  671,  702,  782, 2339,  868,  749, 1914, 2399, 8024, 4960, 4197,
         6206, 1440, 1166, 1333, 1044, 4225, 2634, 4638, 2266,  855, 8024, 1079,
         2552, 4638, 5736, 2630,  510, 2659, 2580, 1469, 2606, 2575, 8024, 2600,
         3221, 7410, 1048, 4638,  511,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0],
        [ 101, 3219, 1921,  677, 1286, 8024,  809, 2548, 1744, 8131,  686, 5279,
         5865, 1399, 4289, 4415, 2110, 2157, 2014, 2442,  185, 2434, 2861, 2548,
          185,  840, 4433, 1462, 1399, 463

## 训练模型
---------------------
### 构建模型
采用手工编写 PyTorch 模型的方式来完成：首先利用 Transformers 库加载 BERT 模型，然后接一个全连接层完成分类：

In [10]:
from torch import nn
from transformers import AutoModel

# Use the second GPU when there is one (as in the original run); otherwise
# fall back to the first GPU, then to the CPU. A bare 'cuda:1' fails on a
# single-GPU machine even though CUDA is available.
if torch.cuda.device_count() > 1:
    device = 'cuda:1'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(f'Using {device} device')

class NeuralNetwork(nn.Module):
    """Token classifier: a pretrained encoder followed by one linear layer.

    The encoder is loaded from the module-level ``checkpoint``; the linear
    layer maps each token's hidden state to ``len(id2label)`` logits.
    """

    def __init__(self):
        super().__init__()
        self.bert_encoder = AutoModel.from_pretrained(checkpoint)
        # Take the width from the loaded config rather than hard-coding 768,
        # so a checkpoint with a different hidden size still works.
        hidden_size = self.bert_encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_size, len(id2label))

    def forward(self, x):
        """Return per-token logits for the tokenizer output ``x``.

        ``x`` is unpacked as keyword arguments into the encoder, and the
        classifier is applied to ``last_hidden_state``.
        """
        bert_output = self.bert_encoder(**x)
        logits = self.classifier(bert_output.last_hidden_state)
        return logits

# Build the model, move its parameters to the selected device, and print the
# module tree.
model = NeuralNetwork().to(device)
print(model)

Using cuda:1 device


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NeuralNetwork(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

### 优化模型参数
与之前一样，我们将每一轮 (Epoch) 分为“训练循环”和“验证/测试循环”，在训练循环中计算损失、优化模型的参数，在验证/测试循环中评估模型的性能。下面我们首先实现训练循环：

In [11]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    """Run one training epoch and return the updated cumulative loss.

    Args:
        dataloader: yields (inputs, labels) batches.
        model: network called as ``model(inputs)``.
        loss_fn: loss applied to the permuted predictions and the labels.
        optimizer: stepped once per batch.
        lr_scheduler: stepped once per batch, after the optimizer.
        epoch: 1-based epoch number, used to count batches from earlier epochs.
        total_loss: sum of batch losses accumulated before this epoch.

    Returns:
        ``total_loss`` plus the sum of this epoch's batch losses.
    """
    num_batches = len(dataloader)
    progress_bar = tqdm(range(num_batches))
    progress_bar.set_description(f'loss: {0:>7f}')
    # Batches already processed in previous epochs; the bar shows the mean
    # loss over every batch seen so far, not only this epoch's.
    batches_before = num_batches * (epoch - 1)

    model.train()
    step = 0
    for inputs, labels in dataloader:
        step += 1
        inputs = inputs.to(device)
        labels = labels.to(device)
        logits = model(inputs)
        # Swap the last two axes so the class axis comes second for the loss.
        loss = loss_fn(logits.permute(0, 2, 1), labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        mean_loss = total_loss / (batches_before + step)
        progress_bar.set_description(f'loss: {mean_loss:>7f}')
        progress_bar.update(1)
    return total_loss

注意，与文本分类任务对于每个样本只输出一个预测张量不同，token 分类任务会输出一个预测张量的序列（可以看成是对每个 token 都进行了一次分类），因此使用交叉熵计算模型损失时，不能像之前一样直接将模型的预测结果与标签送入到 CrossEntropyLoss 中进行计算。

对于高维度的输出（例如 2D 图像需要按像素计算交叉熵），CrossEntropyLoss 需要将输入维度调整为 $(batch, C, d_1, d_2, \dots, d_K)$，其中 $C$ 是类别个数，$K$ 是除 batch 维和类别维之外的额外维度个数。对于 token 分类任务，就是在 token 序列维度（Keras 中称时间步）上计算交叉熵，因此我们通过 pred.permute(0, 2, 1) 交换后两维，将模型预测结果从 (batch, seq, 7) 调整为 (batch, 7, seq)。

验证/测试循环负责评估模型的性能。这里我们借助 seqeval 库进行评估，seqeval 是一个专门用于序列标注评估的 Python 库，支持 IOB1、IOB2、IOBES 等多种标注格式以及多种评估策略，例如：

In [12]:
!pip install seqeval
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

# Toy example: two sentences of gold tags and predicted tags. The first
# prediction starts its first LOC span one token too early.
y_true = [['O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'O'], ['B-PER', 'I-PER', 'O']]
y_pred = [['O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'O'], ['B-PER', 'I-PER', 'O']]

# Entity-level report, evaluated in strict mode under the IOB2 scheme.
print(classification_report(y_true, y_pred, mode='strict', scheme=IOB2))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
              precision    recall  f1-score   support

         LOC       0.50      0.50      0.50         2
         PER       1.00      1.00      1.00         1

   micro avg       0.67      0.67      0.67         3
   macro avg       0.75      0.75      0.75         3
weighted avg       0.67      0.67      0.67         3



In [14]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

def test_loop(dataloader, model):
    """Evaluate ``model`` on ``dataloader`` and print a seqeval report.

    Positions whose gold label is -100 are dropped from both the gold and the
    predicted sequences before the report is computed.

    Args:
        dataloader: yields (inputs, labels) batches; labels is an integer
            tensor of label ids.
        model: network called as ``model(inputs)``, returning per-token logits.
    """
    true_labels, true_predictions = [], []

    model.eval()
    with torch.no_grad():
        for X, y in tqdm(dataloader):
            X = X.to(device)
            pred = model(X)
            # Convert whole tensors to Python lists once. Iterating a device
            # tensor element by element costs one host sync per token, and
            # the labels never need to be on the device at all.
            predictions = pred.argmax(dim=-1).tolist()
            labels = y.tolist()
            for pred_row, label_row in zip(predictions, labels):
                kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
                true_labels.append([id2label[l] for _, l in kept])
                true_predictions.append([id2label[p] for p, _ in kept])
    print(classification_report(true_labels, true_predictions, mode='strict', scheme=IOB2))

In [15]:
from transformers import AdamW, get_scheduler

# Hyper-parameters for fine-tuning.
learning_rate = 1e-5
epoch_num = 3

loss_fn = nn.CrossEntropyLoss()
# NOTE(review): the AdamW exported by transformers is reportedly deprecated
# upstream in favour of torch.optim.AdamW — confirm against the installed
# transformers version before reusing this cell.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Linear schedule with no warm-up over the total number of optimizer steps
# (one step per training batch, for every epoch).
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

# total_loss is carried across epochs so train_loop can show a running mean
# over all batches seen so far.
total_loss = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    # Evaluate on the validation split after every epoch.
    test_loop(valid_dataloader, model)
print("Done!")



Epoch 1/3
-------------------------------


loss: 0.049675: 100%|██████████| 5216/5216 [03:52<00:00, 22.47it/s]
100%|██████████| 580/580 [00:20<00:00, 28.71it/s]


              precision    recall  f1-score   support

         LOC       0.95      0.95      0.95      1951
         ORG       0.89      0.87      0.88       984
         PER       0.97      0.98      0.98       884

   micro avg       0.94      0.93      0.94      3819
   macro avg       0.94      0.93      0.93      3819
weighted avg       0.94      0.93      0.94      3819

Epoch 2/3
-------------------------------


loss: 0.032391: 100%|██████████| 5216/5216 [03:53<00:00, 22.38it/s]
100%|██████████| 580/580 [00:20<00:00, 28.67it/s]


              precision    recall  f1-score   support

         LOC       0.96      0.96      0.96      1951
         ORG       0.92      0.92      0.92       984
         PER       0.99      0.98      0.99       884

   micro avg       0.96      0.95      0.96      3819
   macro avg       0.96      0.95      0.96      3819
weighted avg       0.96      0.95      0.96      3819

Epoch 3/3
-------------------------------


loss: 0.023824: 100%|██████████| 5216/5216 [03:53<00:00, 22.34it/s]
100%|██████████| 580/580 [00:20<00:00, 28.59it/s]


              precision    recall  f1-score   support

         LOC       0.97      0.96      0.96      1951
         ORG       0.94      0.93      0.93       984
         PER       0.98      0.98      0.98       884

   micro avg       0.96      0.96      0.96      3819
   macro avg       0.96      0.96      0.96      3819
weighted avg       0.96      0.96      0.96      3819

Done!


In [16]:
sentence = '日本外务省3月18日发布消息称，日本首相岸田文雄将于19至21日访问印度和柬埔寨。'
results = []
# Inference must run in eval mode; set it here instead of relying on the last
# call made by the training cell.
model.eval()
with torch.no_grad():
    inputs = tokenizer(sentence, truncation=True, return_tensors="pt")
    inputs = inputs.to(device)
    pred = model(inputs)
    probabilities = torch.nn.functional.softmax(pred, dim=-1)[0].tolist()
    predictions = pred.argmax(dim=-1)[0].tolist()

    pred_label = []
    # Re-tokenize with the same truncation as the model input so offsets and
    # predictions line up position by position. Also ask which positions are
    # special tokens.
    inputs_with_offsets = tokenizer(
        sentence,
        truncation=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )
    tokens = inputs_with_offsets.tokens()
    offsets = inputs_with_offsets["offset_mapping"]
    special_tokens_mask = inputs_with_offsets["special_tokens_mask"]

    idx = 0
    while idx < len(predictions):
        label_id = predictions[idx]
        label = id2label[label_id]
        # Special tokens are excluded from the training loss (label -100), so
        # whatever the model predicts there is meaningless: treat it as "O".
        if special_tokens_mask[idx] or label == "O":
            idx += 1
            continue

        label = label[2:]  # Remove the B- or I-
        start, end = offsets[idx]
        all_scores = [probabilities[idx][label_id]]
        # Absorb the following tokens tagged I-<label>, stopping at the first
        # special token.
        while (
            idx + 1 < len(predictions)
            and not special_tokens_mask[idx + 1]
            and id2label[predictions[idx + 1]] == f"I-{label}"
        ):
            idx += 1
            all_scores.append(probabilities[idx][predictions[idx]])
            _, end = offsets[idx]

        # The entity score is the mean probability of its tokens' predicted tags.
        score = np.mean(all_scores).item()
        word = sentence[start:end]
        pred_label.append(
            {
                "entity_group": label,
                "score": score,
                "word": word,
                "start": start,
                "end": end,
            }
        )
        idx += 1
    print(pred_label)

[{'entity_group': 'ORG', 'score': 0.9995534420013428, 'word': '日本外务省', 'start': 0, 'end': 5}, {'entity_group': 'LOC', 'score': 0.9995562434196472, 'word': '日本', 'start': 16, 'end': 18}, {'entity_group': 'PER', 'score': 0.9996896684169769, 'word': '岸田文雄', 'start': 20, 'end': 24}, {'entity_group': 'LOC', 'score': 0.9996272623538971, 'word': '印度', 'start': 34, 'end': 36}, {'entity_group': 'LOC', 'score': 0.9991130828857422, 'word': '柬埔寨', 'start': 37, 'end': 40}]
