# 使用LSTM做序列标注（以中文分词为例）

我们之前讲了使用LSTM做文本分类的任务，如果不使用Attention的话，我们将只会用到最终的cell状态。但是大家有没有发现，RNN实际上在每个时刻都会有一个输出，所以只用RNN来做文本分类确实是有点大材小用了。我们接下来介绍一类更广泛的任务，称为“序列标注”，这类任务的目的是对序列中的每个字都给出一个标注。以下这些任务都可以归纳成“序列标注”任务。
1. 中文分词/分句
2. 命名实体识别
3. 组块分析
4. 句法分析

我们今天来介绍一个使用《左传》语料进行分词的任务。

In [1]:
import torch
import numpy as np
from utils.tokenizer import Tokenizer
from utils.get_emb import *

现在在notebook里调用就很简洁了，总之我们力求主文件能够比较简单，这样出BUG的时候排查起来也方便。

In [2]:
# Load the embedding table along with its vocabulary size and vector width
# (presumably pretrained vectors -- confirm against utils.get_emb).
emb, dict_length, emb_size = get_emb()
# The tokenizer vocabulary is taken from the keys of the embedding table.
tokenizer = Tokenizer(emb.keys())
# Matrix form of the embeddings; LSTMTaggerNet loads it via nn.Embedding.from_pretrained.
emb_matrix = get_emb_matrix(emb, tokenizer, dict_length, emb_size)

dict_length:  9109
emb_size:  300
UNK 0
， 1
的 2
。 3
、 4
和 5
在 6
年 7
“ 8
了 9


然后我们来建立一个LSTM的标注网络。

In [3]:
from torch import nn

class LSTMTaggerNet(nn.Module):
    """LSTM sequence tagger: embedding -> (Bi)LSTM -> small MLP -> per-token logits.

    Args:
        seq_length: Maximum sequence length. It is only stored on the module;
            the forward pass works on whatever length it is given.
        label_len: Number of tag classes.
        hidden_dims: LSTM hidden size per direction. Defaults to the embedding
            size when ``None``.
        bidirectional: Whether to use a bidirectional LSTM.
        num_layers: Number of stacked LSTM layers.
        pretrained_emb: Optional ``(vocab_size, emb_size)`` embedding weights
            (tensor, array or nested list). When ``None`` the module-level
            ``emb_matrix`` is used, which is the original behaviour.
    """

    def __init__(self, seq_length, label_len, hidden_dims=None, bidirectional=False,
                 num_layers=1, pretrained_emb=None):
        super(LSTMTaggerNet, self).__init__()
        self.seq_length = seq_length
        self.label_len = label_len
        # Controls whether the LSTM runs in both directions.
        self.bidirectional = bidirectional
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is combined with a single layer.
        self.lstm_dropout = 0.0 if num_layers == 1 else 0.1
        self.fc_dropout = 0.1

        # Fall back to the notebook-level matrix so existing calls keep working.
        if pretrained_emb is None:
            pretrained_emb = emb_matrix
        # Force float32 so the embedding output always matches the LSTM weights.
        if isinstance(pretrained_emb, torch.Tensor):
            weights = pretrained_emb.detach().clone().to(torch.float32)
        else:
            weights = torch.tensor(pretrained_emb, dtype=torch.float32)
        # from_pretrained keeps the embedding frozen by default.
        self.emb = nn.Embedding.from_pretrained(weights)
        self.emb_size = self.emb.embedding_dim
        self.hidden_dims = hidden_dims if hidden_dims is not None else self.emb_size

        # Recurrent layer. Input is (seq_len, batch, input_size); h_0 and c_0
        # default to zeros when omitted. Output is
        # (seq_len, batch, num_directions * hidden_size) plus (h_final, c_final).
        # The hidden and cell states can be thought of as "short-term" and
        # "long-term" memory respectively.
        self.lstm = nn.LSTM(self.emb_size, self.hidden_dims,
                            num_layers=num_layers, dropout=self.lstm_dropout,
                            bidirectional=self.bidirectional)

        # Output head: (batch_size, seq_len, num_directions * hidden_dims)
        # -> (batch_size, seq_len, label_len).
        lstm_out_dims = self.hidden_dims * (2 if self.bidirectional else 1)
        self.FC_out = nn.Sequential(
            nn.Linear(lstm_out_dims, 50),
            nn.ReLU(inplace=True),
            nn.Dropout(self.fc_dropout),
            nn.Linear(50, self.label_len),
        )

        # Softmax layer; not applied in forward(), kept as a module attribute.
        self.softmax = nn.Softmax(dim=-1)
        # Cross-entropy loss over the flattened token positions.
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        """Tag every position of a batch of id sequences.

        Args:
            x: Long tensor of token ids, shape ``(batch_size, seq_len)``.
            y: Optional long tensor of tag ids with the same number of
                elements as ``x``.

        Returns:
            When ``y`` is ``None``: logits flattened to
            ``(batch_size * seq_len, label_len)``. Otherwise the scalar
            cross-entropy loss averaged over every position (no position is
            masked out).
        """
        # Embedding lookup: (batch_size, seq_len, emb_size).
        x = self.emb(x)
        # The LSTM was built without batch_first, so it expects
        # (seq_len, batch_size, input_size).
        x = x.permute(1, 0, 2)
        # Per-step hidden states of the last layer:
        # (seq_len, batch_size, num_directions * hidden_size).
        x, _ = self.lstm(x)
        # Back to batch-major: (batch_size, seq_len, num_directions * hidden_size).
        x = x.permute(1, 0, 2)

        # reshape (rather than view) also accepts non-contiguous tensors.
        logits = self.FC_out(x).reshape(-1, self.label_len)
        if y is None:
            return logits
        return self.loss_fct(logits, y.reshape(-1))

In [4]:
# Word segmentation on the Zuozhuan corpus; sequences are capped at 20 characters.
seq_length = 20
# Tags follow a BIO scheme with O mapped to B, which leaves two classes.
label_len = 2
model = LSTMTaggerNet(seq_length, label_len, bidirectional=True)
# Printing an nn.Module shows its layer structure.
print(model)

# Count only the parameters that require gradients.
total_trainable_params = 0
for param in model.parameters():
    if param.requires_grad:
        total_trainable_params += param.numel()
print(str(total_trainable_params), 'parameters is trainable.')

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model.to(torch.device('cuda'))

LSTMTaggerNet(
  (emb): Embedding(9110, 300)
  (lstm): LSTM(300, 300, bidirectional=True)
  (FC_out): Sequential(
    (0): Linear(in_features=600, out_features=50, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=50, out_features=2, bias=True)
  )
  (softmax): Softmax(dim=-1)
  (loss_fct): CrossEntropyLoss()
)
1474952 parameters is trainable.


同样的，这里是已经封装好了的`dataset_readers`包，用于存放读取数据的类。之前我们都是只跑训练没有跑eval，从现在开始我们的数据都会进行“train/dev”划分，根据开发集（dev）上的表现来评估模型的效果，这也是一般神经网络工程的常规做法。

In [5]:
from dataset_readers.single_sent_clf import *
from dataset_readers.cws import *

# Reader class for the Zuozhuan data.
reader = Zuozhuan_Cws()
# Training split.
train_examples = reader.get_train_examples()
# Development split.
dev_examples = reader.get_dev_examples()
# Show the first three examples of each split (text followed by its labels).
for split in (train_examples, dev_examples):
    for idx in range(3):
        sample = split[idx]
        print(sample.text, sample.label)

春秋左传定公 ['B', 'I', 'B', 'I', 'B', 'I']
元年 ['B', 'I']
春 ['B']
春秋左传隐公 ['B', 'I', 'B', 'I', 'B', 'I']
惠公元妃孟子 ['B', 'I', 'B', 'I', 'B', 'I']
孟子卒 ['B', 'I', 'B']


还是一样生成dataloader，只不过这次有两个，一个是train，一个是dev。

In [6]:
from torch.utils.data import TensorDataset, DataLoader

def convert_example_to_feature(examples, tokenizer, seq_length):
    """Convert raw examples into fixed-length id / label features.

    Each example's text is mapped to ids with ``tokenizer.tokens_to_ids``.
    Ids and labels are truncated to ``seq_length`` and right-padded with 0.
    Examples whose ids are all 0 (nothing recognised, or empty) are dropped.

    Args:
        examples: Iterable of objects exposing ``.text`` and ``.label``
            (a sequence of ``'B'`` / ``'I'`` tags).
        tokenizer: Object providing ``tokens_to_ids(text)``.
        seq_length: Length every output sequence is cut or padded to.

    Returns:
        A list of ``data_feature(ids, label_ids)`` objects.

    Raises:
        KeyError: If a label other than ``'B'`` or ``'I'`` is encountered.
        AssertionError: If ids and labels end up with different lengths
            (i.e. the tokenizer output and the label sequence disagree).
    """
    # Tag encoding: B -> 1, I -> 0.
    label_map = {'B': 1, 'I': 0}
    features = []
    for example in examples:
        # Slicing both truncates over-long inputs and copies the lists, so the
        # tokenizer's return value and the example's labels are never mutated.
        ids = list(tokenizer.tokens_to_ids(example.text))[:seq_length]
        labels = list(example.label)[:seq_length]
        # Give up on strings in which no character is recognised.
        if sum(ids) == 0:
            continue
        # Encode the *truncated* labels; using the untruncated ones made every
        # example longer than seq_length fail the length check below.
        label_ids = [label_map[tag] for tag in labels]

        # NOTE: the pad value 0 coincides with label 'I' (and with whatever
        # token id 0 stands for), so padding is not distinguishable downstream.
        pad_len = seq_length - len(ids)
        ids = ids + [0] * pad_len
        label_ids = label_ids + [0] * pad_len

        assert len(ids) == seq_length
        assert len(label_ids) == seq_length
        features.append(data_feature(ids, label_ids))
    return features

def generate_dataloader(examples, tokenizer, seq_length, batch_size=16, shuffle=True):
    """Build a DataLoader of (ids, label_ids) long tensors from raw examples.

    Args:
        examples: Raw examples, passed on to ``convert_example_to_feature``.
        tokenizer: Tokenizer passed on to ``convert_example_to_feature``.
        seq_length: Fixed length of every id / label sequence.
        batch_size: Number of sequences per batch (default 16, as before).
        shuffle: Whether to reshuffle the data every epoch (default True, as
            before). Pass False for evaluation data if a stable order is needed.

    Returns:
        A ``DataLoader`` yielding ``(ids, label_ids)`` pairs, each a long
        tensor of shape ``(batch, seq_length)``.
    """
    features = convert_example_to_feature(examples, tokenizer, seq_length)
    ids = torch.tensor([f.ids for f in features], dtype=torch.long)
    label = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(ids, label)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# One loader per split, both built with the same tokenizer and seq_length.
train_dataloader = generate_dataloader(train_examples, tokenizer, seq_length)
dev_dataloader = generate_dataloader(dev_examples, tokenizer, seq_length)

依然使用Adam优化器。

In [7]:
from torch.optim import Adam

# Adam over the model's parameters with a learning rate of 1e-4.
optimizer = Adam(model.parameters(), lr=0.0001)
# Printing the optimizer lists its parameter groups and hyper-parameters.
print(optimizer)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.0001
    weight_decay: 0
)


开始训练，我们这里使用sklearn提供的评估方法来进行评估。

In [8]:
# 可以直接算p，r，f1
from sklearn.metrics import precision_score, recall_score, f1_score

# Resolve the device once instead of querying CUDA for every batch. Moving a
# CPU tensor to the CPU device is a no-op, so this matches the per-batch check.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epoch = 10
for i in range(epoch):
    # ---- training pass ----
    model.train()
    total_loss = []
    for ids, label_ids in train_dataloader:
        ids = ids.to(device)
        label_ids = label_ids.to(device)
        optimizer.zero_grad()
        # With labels supplied the model returns the scalar loss directly.
        loss = model(ids, label_ids)
        total_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    print("epoch: %d, loss: %.6f" % (i + 1, sum(total_loss) / len(total_loss)))

    # ---- evaluation pass ----
    model.eval()
    total_gold = []
    total_pred = []
    # No gradients are needed for evaluation; no_grad avoids building the
    # autograd graph for every dev batch.
    with torch.no_grad():
        for ids, label_ids in dev_dataloader:
            # Without labels the model returns logits flattened to
            # (batch_size * seq_length, label_len).
            logits = model(ids.to(device))
            # Index of the highest-scoring class at every position.
            pred_ids = np.argmax(logits.cpu().numpy(), axis=-1)
            total_pred.extend(pred_ids.tolist())
            # Flatten the gold labels the same way so positions line up.
            total_gold.extend(label_ids.view(-1).numpy().tolist())
    # NOTE(review): padded positions carry gold label 0 and are scored as well;
    # confirm whether they should be masked out of the metric.
    # precision_score / recall_score take the same (gold, pred) arguments.
    eval_f1 = f1_score(total_gold, total_pred)
    print("eval_f1: %.2f%%" % (eval_f1 * 100))

epoch: 1, loss: 0.176529
eval_f1: 91.93%
epoch: 2, loss: 0.068826
eval_f1: 93.50%
epoch: 3, loss: 0.055466
eval_f1: 93.87%
epoch: 4, loss: 0.047732
eval_f1: 93.85%
epoch: 5, loss: 0.043406
eval_f1: 94.16%
epoch: 6, loss: 0.040075
eval_f1: 93.70%
epoch: 7, loss: 0.037694
eval_f1: 94.35%
epoch: 8, loss: 0.035436
eval_f1: 93.63%
epoch: 9, loss: 0.033188
eval_f1: 94.09%
epoch: 10, loss: 0.031708
eval_f1: 94.14%
