In [1]:
import torch
import torch.utils.data as Data
import torch.nn.functional as F
from torch import nn
import torch
import torch.optim as optim
from keras.preprocessing.sequence import pad_sequences
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

embeded_path = "E:/data/NER_dataset/word2vec/cn_char_fastnlp_100d.txt"
train_path = "E:/data/NER_dataset/Weibo/weiboNER_2nd_conll.train"
test_path = "E:/data/NER_dataset/Weibo/weiboNER_2nd_conll.test"
dev_path = "E:/data/NER_dataset/Weibo/weiboNER_2nd_conll.dev"
MAX_SEQ_LEN = 128
BATCH_SIZE = 16
EPOCH = 6
embedding_size = 100
hidden_size = 100

In [2]:
def build_corpus(data_dir):
    word_lists = []
    tag_lists = []
    with open(data_dir, 'r', encoding='utf-8') as f:
        words = []
        tags = []
        for line in f:
            if line != '\n':
                word, tag = line.strip("\n").split()
                word = word[0]
                words.append(word)
                tags.append(tag)
            else:
                word_lists.append(words)
                tag_lists.append(tags)
                words = []
                tags = []
    return word_lists, tag_lists

train_words_lists, train_tag_lists = build_corpus(train_path)
test_words_lists, test_tag_lists = build_corpus(test_path)
dev_words_lists, dev_tag_lists = build_corpus(dev_path)

In [3]:
def get_word_tag(train_words_lists, test_words_lists,dev_words_lists, train_tag_lists):
    words_lists = []
    words_lists.extend(train_words_lists)
    words_lists.extend(test_words_lists)
    words_lists.extend(dev_words_lists)
    words_map = {}
    for list in words_lists:
        for e in list:
            if e not in words_map:
                words_map[e] = len(words_map)+2
    words_map['<pad>'] = 0
    words_map['<unk>'] = 1
    
    id2word = {}
    for x in words_map:
        id2word[words_map[x]] = x
    
    tags_map = {}
    for list in train_tag_lists:
        for e in list:
            if e not in tags_map:
                tags_map[e] = len(tags_map)
    id2tag = {}
    for x in tags_map:
        id2tag[tags_map[x]] = x
    return words_map,id2word, tags_map, id2tag

## 得到单词和标签到id的映射
word2id, id2word, tag2id, id2tag = get_word_tag(train_words_lists, test_words_lists, dev_words_lists, train_tag_lists)

In [4]:
'''
方法1：
将数据集进行截断和填充，得到每一个batch的大小都一样
'''

## 将单词映射到id
def tokenize2id(words_list, tag_list, word2id, tag2id):
    words2id_list = []
    tags2id_list = []
    for i in range(len(words_list)):
        words2id_list.append(list(map(lambda x: word2id[x], words_list[i] )))
        tags2id_list.append(list(map(lambda x: tag2id[x], tag_list[i])))
    return  words2id_list, tags2id_list

train_words_id, train_tags_id = tokenize2id(train_words_lists, train_tag_lists, word2id, tag2id)
# test_words_id, test_tags_id = tokenize2id(test_words_lists, test_tag_lists, word2id, tag2id)
# dev_words_id, dev_tags_id = tokenize2id(dev_words_lists, dev_tag_lists, word2id, tag2id)

## 将语料进行对齐和截断
def get_padded_seq(words, tags):
    padded_wordsid = pad_sequences(words, maxlen= MAX_SEQ_LEN, dtype="long",
              truncating='post', padding='post')
    padded_tagsid = pad_sequences(tags, maxlen= MAX_SEQ_LEN, dtype="long",
                                   truncating='post', padding='post')
    padded_wordsid = torch.LongTensor(padded_wordsid)
    padded_tagsid = torch.LongTensor(padded_tagsid)
    return padded_wordsid, padded_tagsid

train_x, train_y = get_padded_seq(train_words_id, train_tags_id)
# test_x, test_y = get_padded_seq(test_words_id, test_tags_id)
# dev_x, dev_y = get_padded_seq(dev_words_id, dev_tags_id)

In [5]:
'''
方法2, 由于直接对测试集进行截断，会损失一部分预测的值，因此不能截断，只能去每一个batch里面的最大长度
利用collect_fn,让每一个batch的大小都和最大的长度一样
'''

## 将单词映射到id
def tokenize2id(words_list, tag_list, word2id, tag2id):
    words2id_list = []
    tags2id_list = []
    for i in range(len(words_list)):
        words2id_list.append(torch.tensor(list(map(lambda x: word2id[x], words_list[i] ))))
        tags2id_list.append(torch.tensor(list(map(lambda x: tag2id[x], tag_list[i]))))
    return  words2id_list, tags2id_list


import torch.nn.utils.rnn as rnn_utils
def collate_fn(data):
    data.sort(key=lambda x: len(x), reverse=True)
    data = rnn_utils.pad_sequence(data, batch_first=True, padding_value=0)
    return data

# train_words_id, train_tags_id = tokenize2id(train_words_lists, train_tag_lists, word2id, tag2id)
test_words_id, test_tags_id = tokenize2id(test_words_lists, test_tag_lists, word2id, tag2id)
dev_words_id, dev_tags_id = tokenize2id(dev_words_lists, dev_tag_lists, word2id, tag2id)


test_x = Data.DataLoader(test_words_id, batch_size=3, shuffle=False, 
                             collate_fn=collate_fn)
test_y = Data.DataLoader(test_tags_id, batch_size=3, shuffle=False, 
                             collate_fn=collate_fn)
dev_x = Data.DataLoader(dev_words_id, batch_size=3, shuffle=False, 
                             collate_fn=collate_fn)
dev_y = Data.DataLoader(dev_tags_id, batch_size=3, shuffle=False, 
                             collate_fn=collate_fn)

In [193]:
# ## 将语料进行对齐和截断
# def get_padded_seq(words, tags):
#     padded_wordsid = pad_sequences(words, maxlen= MAX_SEQ_LEN, dtype="long",
#               truncating='post', padding='post')
#     padded_tagsid = pad_sequences(tags, maxlen= MAX_SEQ_LEN, dtype="long",
#                                    truncating='post', padding='post')
#     padded_wordsid = torch.LongTensor(padded_wordsid)
#     padded_tagsid = torch.LongTensor(padded_tagsid)
#     return padded_wordsid, padded_tagsid

# train_x, train_y = get_padded_seq(train_words_id, train_tags_id)
# test_x, test_y = get_padded_seq(test_words_id, test_tags_id)
# dev_x, dev_y = get_padded_seq(dev_words_id, dev_tags_id)

In [7]:
train_dataset = Data.TensorDataset(train_x, train_y)
# test_dataset = Data.TensorDataset(test_x, test_y)
# dev_dataset = Data.TensorDataset(dev_x, dev_y)
train_loader = Data.DataLoader(
            dataset=train_dataset,
            batch_size = BATCH_SIZE,
            shuffle = True)
# test_loader = Data.DataLoader(
#         dataset=test_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=True)
# dev_loader = Data.DataLoader(
#         dataset=dev_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=True)

In [8]:
## 读取词向量
def pretrained_embedding(embed_path):
    tmp_file = get_tmpfile(embed_path)
    wvmodel = KeyedVectors.load_word2vec_format(tmp_file)
    embed_size = len(wvmodel.get_vector(wvmodel.index2word[3]))
    vocab_size = len(word2id)

    weight = torch.zeros(vocab_size, embed_size)
    for i in range(vocab_size):
        try:
            index = word2id[wvmodel.index2word[i]]
        except:
            continue
        weight[index,:] = torch.from_numpy(wvmodel.get_vector(
            wvmodel.index2word[i]
        ))
    return weight
weight = pretrained_embedding(embeded_path)



In [9]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, out_size, weight=None,drop_out=0.5):
        super(BiLSTM, self).__init__()
        if weight!=None:
            self.embedding = nn.Embedding.from_pretrained(weight)
        else:
            self.embedding = nn.Embedding(vocab_size, emb_size)
        self.bilstm = nn.LSTM(emb_size, hidden_size,
                              batch_first=True,
                              bidirectional=True)
        self.dropout = nn.Dropout(drop_out)
        self.fc = nn.Linear(2*hidden_size, out_size)

    def forward(self, sentence):
        emb = self.embedding(sentence)  # [B, L, emb_size]
#         packed = pack_padded_sequence(emb, lengths, batch_first=True)
        lstm_out, _ = self.bilstm(emb)
#         lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        scores = self.fc(lstm_out)
        scores = self.dropout(scores)
        return scores

In [10]:
def cal_loss(pred, target):
    '''
    :param pred:  [Batch_size, seqlen , tagsnum]
    :param target:  [Batch_size, seqlen ]
    :return:
    '''
    pred = pred.permute(0,2,1)
    criterion = nn.CrossEntropyLoss()
    loss = criterion(pred, target)
    return loss


def metrics(pred, batch_y):
#     print("met",pred.shape,batch_y.shape)
    pred = torch.argmax(F.log_softmax(pred,dim=2),dim=2).view(-1).detach().numpy()
    target = batch_y.view(-1).numpy()
    correct = 0  ## 识别正确的
    recongized = 0   ## 识别出来的
    real = 0   ### 实际实体数
    for i in range(len(pred)):
        if target[i]!=0:
            real+=1
        if target[i] == pred[i] and target[i]!=0:
            correct+=1
            recongized+=1
        elif pred[i]!=0:
            recongized+=1
#     print("met",pred.shape,target.shape)
    return correct, recongized, real


def conlleval(pred, batch_y):
    pred = list(torch.argmax(F.log_softmax(pred,dim=2),dim=2).view(-1).detach().numpy())
    target = list(batch_y.view(-1).numpy())
    real = 0
    recongnized = 0
    correct = 0
    target = [0]+ target+ [0] ## 前后增加0 防止数组越界
    pred = [0]+ pred+ [0]
    for i in range(len(target)-1):
        if target[i]!=0 and target[i+1]==0:
            real+=1
        if pred[i]!=0 and pred[i+1]==0:
            recongnized+=1
    pred_loc = []   ## 记录预测的实体位置
    start,end =1,1
    while start<len(pred) and end<len(pred):
        if pred[start]!=0 and pred[start-1]==0:
            end = start
            while pred[end]!=0:
                end+=1
            if start!=end:
                pred_loc.append((start,end))
            start = end
            start +=1
        else:
            start+=1
    for x in pred_loc:
        start,end = x
        if target[start:end] == pred[start:end] and target[start-1]==0 and target[end]==0:
            correct += 1
    return correct, recongnized, real

def evaluate(model, data_loader):   
    model.eval()
    epoch_loss = 0
    correct_all = 0  ## 识别正确得 总的
    recongized_all = 0   ## 识别出来得 总的
    real_all = 0   ### 实际样本数 总的
    with torch.no_grad():
        for step,(batch_x, batch_y) in enumerate(data_loader):
            pred = model(batch_x)
#             print("eva",pred.shape,batch_y.shape)
            loss = cal_loss(pred, batch_y)
            epoch_loss+=loss.item()
            correct, recongized, real = conlleval(pred, batch_y)
            correct_all +=correct
            recongized_all+=recongized
            real_all+=real
    print(correct_all, recongized_all, real_all)
    precision = 0 if recongized_all==0 else (correct_all/recongized_all)
    recall = 0 if real_all ==0 else (correct_all/real_all)
    f1 = 0 if recall + precision == 0 else (2 * precision * recall) / (precision + recall)
    print("loss:",epoch_loss,",||precision:",precision,',||recall:',recall,",||F1:",f1)
    
def evaluate1(model, x,y):   
    model.eval()
    epoch_loss = 0
    correct_all = 0  ## 识别正确得 总的
    recongized_all = 0   ## 识别出来得 总的
    real_all = 0   ### 实际样本数 总的
    with torch.no_grad():
        for step,(batch_x, batch_y) in enumerate(zip(x,y)):
            pred = model(batch_x)
#             print("eva",pred.shape,batch_y.shape)
            loss = cal_loss(pred, batch_y)
            epoch_loss+=loss.item()
            correct, recongized, real = conlleval(pred, batch_y)
            correct_all +=correct
            recongized_all+=recongized
            real_all+=real
    print(correct_all, recongized_all, real_all)
    precision = 0 if recongized_all==0 else (correct_all/recongized_all)
    recall = 0 if real_all ==0 else (correct_all/real_all)
    f1 = 0 if recall + precision == 0 else (2 * precision * recall) / (precision + recall)
    print("loss:",epoch_loss,",||precision:",precision,',||recall:',recall,",||F1:",f1)

In [11]:
"""
BI-LSTM  train
"""
vocab_size = len(word2id)
out_size = len(tag2id)
model = BiLSTM(vocab_size, embedding_size, hidden_size, out_size, weight=None)
optimizer = optim.Adam(model.parameters(), lr=3e-2)
criterion = nn.CrossEntropyLoss()
for epoch in range(5):
    train_loss = 0
    model.train()
    for step, (batch_x, batch_y) in enumerate(train_loader):
        optimizer.zero_grad()
        predictions = model(batch_x)
        loss = cal_loss(predictions, batch_y)
        loss.backward()
        train_loss+=loss.item()
        optimizer.step()
    print("epoch:",epoch)
    evaluate1(model, test_x, test_y)

epoch: 0
31 271 375
loss: 23.585842177271843 ,||precision: 0.11439114391143912 ,||recall: 0.08266666666666667 ,||F1: 0.09597523219814243
epoch: 1
29 324 375
loss: 19.298527613282204 ,||precision: 0.08950617283950617 ,||recall: 0.07733333333333334 ,||F1: 0.08297567954220315
epoch: 2
62 289 375
loss: 15.843255652114749 ,||precision: 0.21453287197231835 ,||recall: 0.16533333333333333 ,||F1: 0.18674698795180725
epoch: 3
69 384 375
loss: 16.602137187495828 ,||precision: 0.1796875 ,||recall: 0.184 ,||F1: 0.18181818181818182
epoch: 4
82 346 375
loss: 15.154858459718525 ,||precision: 0.23699421965317918 ,||recall: 0.21866666666666668 ,||F1: 0.22746185852981968
