# 项目描述：

在本项目中将利用LSTM和LSTM-CRF实现中文命名实体识别。

# 加载中文NER数据

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/ChineseNERData')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [2]:
!ls

bilstm_crf_zh.pt      source_data.txt	test_data.txt
bilstm_softmax_zh.pt  source_label.txt	test_label.txt


句子开始与结束的tags:

In [0]:
START = '<s>'
END = '<e>'

训练集：

In [0]:
with open('source_data.txt', encoding='utf-8') as f:
    lines = f.readlines()
train_text = [line.strip().split(' ') for line in lines]

In [0]:
with open('source_label.txt', encoding='utf-8') as f:
    lines = f.readlines()
train_label = [line.strip().split(' ') for line in lines]

测试集：

In [0]:
with open('test_data.txt', encoding='utf-8') as f:
    lines = f.readlines()
test_text = [line.strip().split(' ') for line in lines]

In [0]:
with open('test_label.txt', encoding='utf-8') as f:
    lines = f.readlines()
test_label = [line.strip().split(' ') for line in lines]

辅助函数与数据构造：

In [0]:
# 词典
voc = [word for line in train_text for word in line] + [word for line in test_text for word in line]
voc = list(set(voc))
# tag集合
tagset = [tag for line in train_label for tag in line] + [tag for line in test_label for tag in line] + [START] + [END]
tagset = list(set(tagset))

In [0]:
# 单词到index的映射
word_to_idx = {word: idx for idx, word in enumerate(voc)}
idx_to_word = voc
# tag到index的映射
tag_to_idx = {tag: idx for idx, tag in enumerate(tagset)}
idx_to_tag = tagset

将原来的数据改为index数据：

In [0]:
train_x = [[word_to_idx[word] for word in line] for line in train_text]
train_y = [[tag_to_idx[tag] for tag in line] for line in train_label]
test_x = [[word_to_idx[word] for word in line] for line in test_text]
test_y = [[tag_to_idx[tag] for tag in line] for line in test_label]

分出测试集和验证集：

In [0]:
num_data = len(train_x)
num_valid = int(num_data * 0.2)
valid_x = train_x[:num_valid]
valid_y = train_y[:num_valid]
train_x = train_x[num_valid:]
train_y = train_y[num_valid:]

# BaseLine模型

In [30]:
correct = 0
total = 0
for tag in test_y:
    for t in tag:
        if t == 4:
            correct += 1
        total += 1
print(f"BaseLine Test Set Accuracy: {correct/total:.3%}")

BaseLine Test Set Accuracy: 85.872%


# 模型构造

In [0]:
import math
import torch
import torch.nn as nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, tag_to_idx, embed_dim, hidden_size,
                 use_gpu=True, use_crf=True):
        super(BiLSTM_NER, self).__init__()
        self.vocab_size = vocab_size
        self.tag_to_idx = tag_to_idx
        self.tag_size = len(tag_to_idx)
        self.use_gpu = use_gpu
        self.use_crf = use_crf
        self.hidden_size = hidden_size

        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size,
                            bidirectional=True)
        self.hidden2Tag = nn.Linear(2 * hidden_size, self.tag_size)
        
        if use_crf:
            self.transitions = nn.Parameter(torch.zeros(self.tag_size, self.tag_size))
            self.transitions.data[self.tag_to_idx[START], :] = -10000.0
            self.transitions.data[:, self.tag_to_idx[END]] = -10000.0

    def _get_lstm_features(self, sentence):
        text_embed = self.embed(sentence) # [seq_len, embed_dim]
        text_embed = text_embed.unsqueeze(1) # [seq_len, 1, embed_dim]
        output, _ = self.lstm(text_embed) # [seq_len, 1, 2 * hidden_size]
        output = output.view(len(sentence), 2 * self.hidden_size) # [seq_len, 2 * hidden_size]
        lstm_feats = self.hidden2Tag(output) # [seq_len, tag_size]
        return lstm_feats

    def _score_sentence(self, feats, tags):
        r = torch.LongTensor(range(feats.size()[0]))
        if self.use_gpu:
            r = r.cuda()
            pad_start_tags = torch.cat([torch.cuda.LongTensor([self.tag_to_idx[START]]), tags])
            pad_stop_tags = torch.cat([tags, torch.cuda.LongTensor([self.tag_to_idx[END]])])
        else:
            pad_start_tags = torch.cat([torch.LongTensor([self.tag_to_idx[START]]), tags])
            pad_stop_tags = torch.cat([tags, torch.LongTensor([self.tag_to_idx[END]])])
        score = torch.sum(self.transitions[pad_stop_tags, pad_start_tags]) + torch.sum(feats[r, tags])
        return score
    
    def _forward_alg(self, feats):
        alphas = torch.ones(1, self.tag_size) * (-10000.)
        alphas[0, self.tag_to_idx[START]] = 0.
        if self.use_gpu:
            alphas = alphas.cuda()
        for feat in feats:
            alphas = alphas + self.transitions + feat.view(-1, 1)
            max_alphas, _ = torch.max(alphas, dim=1)
            alphas = alphas - max_alphas.view(-1, 1)
            alphas = max_alphas + torch.logsumexp(alphas, dim=1).view(1, -1)
        alphas = (alphas + self.transitions[self.tag_to_idx[END]]).view(1, -1)
        return torch.logsumexp(alphas, dim=-1)
    
    def viterbi_decode(self, feats):
        backpointers = []
        v = torch.ones(1, self.tag_size) * (-10000.0)
        v[0, self.tag_to_idx[START]] = 0
        if self.use_gpu:
            v = v.cuda()
        
        for feat in feats:
            v = v + self.transitions
            _, bptrs_t = torch.max(v, dim=1)
            bptrs_t = bptrs_t.squeeze(0).data.cpu().numpy()
            v = v[range(len(bptrs_t)), bptrs_t] + feat
            backpointers.append(bptrs_t)

        v = v + self.transitions[self.tag_to_idx[END]]
        v.data[self.tag_to_idx[END]] = -10000.
        v.data[self.tag_to_idx[START]] = -10000.
        
        best_tag_id = torch.argmax(v.unsqueeze(0)).item()
        path_score = v[best_tag_id]
        best_path = [best_tag_id]
        for bptrs_t in backpointers[::-1]:
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.tag_to_idx[START]
        return path_score, best_path[::-1]
    
    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        if self.use_crf:
            forward_score = self._forward_alg(feats)
            score = self._score_sentence(feats, tags)
            return forward_score - score
        else:
            scores = nn.functional.cross_entropy(feats, tags)
            return scores
    
    def forward(self, sentence):
        feats = self._get_lstm_features(sentence)
        if self.use_crf:
            score, tag_seq = self.viterbi_decode(feats)
        else:
            score, tag_seq = torch.max(feats, 1)
            tag_seq = list(tag_seq.cpu().data)
        return score, tag_seq

# 模型评价与比较

模型评价函数：

In [0]:
def eva(model, data_x, data_y, use_cuda):
    correct = 0.
    total = 0.
    for sentence, tag in zip(data_x, data_y):
        if use_cuda:
            sentence = torch.cuda.LongTensor(sentence)
        else:
            sentence = torch.LongTensor(sentence)
        _, tag_seq = model(sentence)
        for i in range(len(tag_seq)):
            if tag_seq[i] == tag[i]:
                correct += 1
            total += 1
    return correct / total

超参数：

In [0]:
vocab_size = len(voc)
embed_dim = 100
hidden_size = 100
learning_rate = 0.01
use_gpu = True
num_epoch = 10

## BiLSTM-CRF模型

In [0]:
bilstm_crf = BiLSTM_NER(vocab_size, tag_to_idx, embed_dim, hidden_size,
                        use_gpu=use_gpu, use_crf=True).to(device)
optimizer_crf = torch.optim.Adam(bilstm_crf.parameters(), lr=learning_rate)

模型训练：

In [24]:
best_valid_acc = 0.
print(f"BiLSTM-CRF Traning Begin.")
for epoch in range(num_epoch):
    total_loss = 0.
    for i, sentence, tag in zip(range(len(train_x)), train_x, train_y):
        sentence = torch.cuda.LongTensor(sentence)
        tag = torch.cuda.LongTensor(tag)
        optimizer_crf.zero_grad()
        loss = bilstm_crf.neg_log_likelihood(sentence, tag)
        loss.backward()
        optimizer_crf.step()
        total_loss += loss.item()
        if (i + 1) % 500 == 0:
            print(f"Epoch: {epoch+1}/{num_epoch}. Sentence: {i+1}/{len(train_x)}. Total_loss: {total_loss:.3f}")
    valid_acc = eva(bilstm_crf, valid_x, valid_y, use_gpu)
    if valid_acc > best_valid_acc:
        torch.save(bilstm_crf.state_dict(), 'bilstm_crf_zh.pt')
        print(f"Valid Acc: {valid_acc:.3%}.")
        print(f"Epoch: {epoch+1}/{num_epoch}. Total_loss: {total_loss:.2f}")
        best_valid_acc = valid_acc
    else:
        print(f"Early Stop!")
        break

BiLSTM-CRF Traning Begin.
Epoch: 1/10. Sentence: 500/14947. Total_loss: 7027.164
Epoch: 1/10. Sentence: 1000/14947. Total_loss: 9950.858
Epoch: 1/10. Sentence: 1500/14947. Total_loss: 12525.010
Epoch: 1/10. Sentence: 2000/14947. Total_loss: 14693.253
Epoch: 1/10. Sentence: 2500/14947. Total_loss: 16961.802
Epoch: 1/10. Sentence: 3000/14947. Total_loss: 20533.348
Epoch: 1/10. Sentence: 4000/14947. Total_loss: 26961.955
Epoch: 1/10. Sentence: 4500/14947. Total_loss: 29840.735
Epoch: 1/10. Sentence: 5000/14947. Total_loss: 32655.028
Epoch: 1/10. Sentence: 5500/14947. Total_loss: 34434.245
Epoch: 1/10. Sentence: 6000/14947. Total_loss: 36415.131
Epoch: 1/10. Sentence: 6500/14947. Total_loss: 38361.449
Epoch: 1/10. Sentence: 7000/14947. Total_loss: 40410.552
Epoch: 1/10. Sentence: 7500/14947. Total_loss: 43107.889
Epoch: 1/10. Sentence: 8000/14947. Total_loss: 45153.216
Epoch: 1/10. Sentence: 8500/14947. Total_loss: 47061.030
Epoch: 1/10. Sentence: 9000/14947. Total_loss: 50150.599
Epoch: 1

In [26]:
print(f"Test Set Accuracy: {eva(bilstm_crf, test_x, test_y, True):.3%}")

Test Set Accuracy: 91.587%


## BiLSTM-Softmax模型

In [0]:
bilstm_softmax = BiLSTM_NER(vocab_size, tag_to_idx, embed_dim, hidden_size,
                            use_gpu=True, use_crf=False).to(device)
optimizer_softmax = torch.optim.Adam(bilstm_softmax.parameters(), lr=learning_rate)

模型训练：

In [20]:
best_valid_acc = 0.
print(f"BiLSTM-softmax Traning Begin.")
for epoch in range(num_epoch):
    total_loss = 0.
    for i, sentence, tag in zip(range(len(train_x)), train_x, train_y):
        sentence = torch.cuda.LongTensor(sentence)
        tag = torch.cuda.LongTensor(tag)
        optimizer_softmax.zero_grad()
        loss = bilstm_softmax.neg_log_likelihood(sentence, tag)
        loss.backward()
        optimizer_softmax.step()
        total_loss += loss.item()
        if (i + 1) % 500 == 0:
            print(f"Epoch: {epoch+1}/{num_epoch}. Sentence: {i+1}/{len(train_x)}. Total_loss: {total_loss:.3f}")
    valid_acc = eva(bilstm_softmax, valid_x, valid_y, use_gpu)
    if valid_acc > best_valid_acc:
        torch.save(bilstm_softmax.state_dict(), 'bilstm_softmax_zh.pt')
        print(f"Valid Acc: {valid_acc:.3%}.")
        print(f"Epoch: {epoch+1}/{num_epoch}. Total_loss: {total_loss:.2f}")
        best_valid_acc = valid_acc
    else:
        print(f"Early Stop!")
        break

BiLSTM-softmax Traning Begin.
Epoch: 1/10. Sentence: 500/14947. Total_loss: 184.490
Epoch: 1/10. Sentence: 1000/14947. Total_loss: 309.602
Epoch: 1/10. Sentence: 1500/14947. Total_loss: 430.790
Epoch: 1/10. Sentence: 2000/14947. Total_loss: 528.846
Epoch: 1/10. Sentence: 2500/14947. Total_loss: 633.139
Epoch: 1/10. Sentence: 3000/14947. Total_loss: 789.079
Epoch: 1/10. Sentence: 3500/14947. Total_loss: 939.427
Epoch: 1/10. Sentence: 4000/14947. Total_loss: 1080.247
Epoch: 1/10. Sentence: 4500/14947. Total_loss: 1195.494
Epoch: 1/10. Sentence: 5000/14947. Total_loss: 1317.981
Epoch: 1/10. Sentence: 5500/14947. Total_loss: 1400.260
Epoch: 1/10. Sentence: 6000/14947. Total_loss: 1499.892
Epoch: 1/10. Sentence: 6500/14947. Total_loss: 1590.566
Epoch: 1/10. Sentence: 7000/14947. Total_loss: 1696.542
Epoch: 1/10. Sentence: 7500/14947. Total_loss: 1815.777
Epoch: 1/10. Sentence: 8000/14947. Total_loss: 1917.189
Epoch: 1/10. Sentence: 8500/14947. Total_loss: 2025.414
Epoch: 1/10. Sentence: 900

In [28]:
print(f"Test Set Accuracy: {eva(bilstm_softmax, test_x, test_y, True):.3%}")

Test Set Accuracy: 92.049%


# Conclusion

模型准确率比较为：

In [32]:
import pandas as pd
pd.DataFrame(data=[0.85872, 0.92049, 0.91587], index=['Baseline', 'BiLSTM', 'BiLSTM-CRF'], columns=['Test Set Acc'])

Unnamed: 0,Test Set Acc
Baseline,0.85872
BiLSTM,0.92049
BiLSTM-CRF,0.91587
