In [1]:
import torch
import torch.utils.data as Data
import torch.nn.functional as F
from torch import nn
import torch
import torch.optim as optim
from keras.preprocessing.sequence import pad_sequences
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

embeded_path = "E:/data/NER_dataset/word2vec/cn_char_fastnlp_100d.txt"
# train_path = "E:/data/NER_dataset/Weibo/weiboNER_2nd_conll.train"
# test_path = "E:/data/NER_dataset/Weibo/weiboNER_2nd_conll.test"
# dev_path = "E:/data/NER_dataset/Weibo/weiboNER_2nd_conll.dev"
train_path = "E:/data/NER_dataset/Weibo/NAM/weibo_train_NAM.txt"
test_path = "E:/data/NER_dataset/Weibo/NAM/weibo_test_NAM.txt"
dev_path = "E:/data/NER_dataset/Weibo/NAM/weibo_dev_NAM.txt"
MAX_SEQ_LEN = 100
BATCH_SIZE = 64
EPOCH = 6
embedding_size = 100
hidden_size = 200

In [2]:
def build_corpus(data_dir):
    word_lists = []
    tag_lists = []
    with open(data_dir, 'r', encoding='utf-8') as f:
        words = []
        tags = []
        for line in f:
            if line != '\n':
                word, tag = line.strip("\n").split()
#                 word = word[0]
                words.append(word)
                tags.append(tag)
            else:
                word_lists.append(words)
                tag_lists.append(tags)
                words = []
                tags = []
    return word_lists, tag_lists

train_words_lists, train_tag_lists = build_corpus(train_path)
test_words_lists, test_tag_lists = build_corpus(test_path)
dev_words_lists, dev_tag_lists = build_corpus(dev_path)

In [3]:
def get_word_tag(train_words_lists, test_words_lists,dev_words_lists, train_tag_lists):
    words_lists = []
    words_lists.extend(train_words_lists)
    words_lists.extend(test_words_lists)
    words_lists.extend(dev_words_lists)
    words_map = {}
    for list in words_lists:
        for e in list:
            if e not in words_map:
                words_map[e] = len(words_map)+2
    words_map['<pad>'] = 0
    words_map['<unk>'] = 1
    
    id2word = {}
    for x in words_map:
        id2word[words_map[x]] = x
    
    tags_map = {}
    for list in train_tag_lists:
        for e in list:
            if e not in tags_map:
                tags_map[e] = len(tags_map)
    id2tag = {}
    for x in tags_map:
        id2tag[tags_map[x]] = x
    return words_map,id2word, tags_map, id2tag

## 得到单词和标签到id的映射
word2id, id2word, tag2id, id2tag = get_word_tag(train_words_lists, test_words_lists, dev_words_lists, train_tag_lists)

In [4]:
'''
方法1：
将数据集进行截断和填充，得到每一个batch的大小都一样
'''

## 将单词映射到id
def tokenize2id(words_list, tag_list, word2id, tag2id):
    words2id_list = []
    tags2id_list = []
    for i in range(len(words_list)):
        words2id_list.append(list(map(lambda x: word2id[x], words_list[i] )))
        tags2id_list.append(list(map(lambda x: tag2id[x], tag_list[i])))
    return  words2id_list, tags2id_list

train_words_id, train_tags_id = tokenize2id(train_words_lists, train_tag_lists, word2id, tag2id)
# test_words_id, test_tags_id = tokenize2id(test_words_lists, test_tag_lists, word2id, tag2id)
# dev_words_id, dev_tags_id = tokenize2id(dev_words_lists, dev_tag_lists, word2id, tag2id)

## 将语料进行对齐和截断
def get_padded_seq(words, tags):
    padded_wordsid = pad_sequences(words, maxlen= MAX_SEQ_LEN, dtype="long",
              truncating='post', padding='post')
    padded_tagsid = pad_sequences(tags, maxlen= MAX_SEQ_LEN, dtype="long",
                                   truncating='post', padding='post')
    masks = torch.zeros(len(words),MAX_SEQ_LEN).type(torch.ByteTensor)
    for i,x in enumerate(padded_wordsid):
        if 0 in x:
            zero_index = list(x).index(0)
            masks[i,:zero_index] = torch.ones(zero_index)
        else:
            masks[i] = torch.ones(MAX_SEQ_LEN)

    padded_wordsid = torch.LongTensor(padded_wordsid)
    padded_tagsid = torch.LongTensor(padded_tagsid)
#     lengths = torch.LongTensor(lengths)
    return padded_wordsid, padded_tagsid, masks

train_x, train_y, train_mask = get_padded_seq(train_words_id, train_tags_id)
# test_x, test_y = get_padded_seq(test_words_id, test_tags_id)
# dev_x, dev_y = get_padded_seq(dev_words_id, dev_tags_id)

In [5]:
'''
方法2, 由于直接对test测试集进行截断，会损失一部分预测的值，因此不能截断，只能去每一个batch里面的最大长度
利用collate_fn,让每一个batch的大小都和最大的长度一样
'''

## 将单词映射到id
def tokenize2id(words_list, tag_list, word2id, tag2id):
    words2id_list = []
    tags2id_list = []
    for i in range(len(words_list)):
        words2id_list.append(torch.tensor(list(map(lambda x: word2id[x], words_list[i] ))))
        tags2id_list.append(torch.tensor(list(map(lambda x: tag2id[x], tag_list[i]))))
    return  words2id_list, tags2id_list


import torch.nn.utils.rnn as rnn_utils
def collate_fn(data):
#     data.sort(key=lambda x: len(x), reverse=True)
    data = rnn_utils.pad_sequence(data, batch_first=True, padding_value=0)
    return data

# train_words_id, train_tags_id = tokenize2id(train_words_lists, train_tag_lists, word2id, tag2id)
test_words_id, test_tags_id = tokenize2id(test_words_lists, test_tag_lists, word2id, tag2id)
dev_words_id, dev_tags_id = tokenize2id(dev_words_lists, dev_tag_lists, word2id, tag2id)


def get_mask(words_id):
    test_mask = []
    for e in words_id:
        elen = len(e)
        t = torch.ones(elen).type(torch.ByteTensor)
        test_mask.append(t)
    return test_mask

test_masks = get_mask(test_words_id)
dev_masks = get_mask(dev_words_id)

test_x = Data.DataLoader(test_words_id, batch_size=BATCH_SIZE, shuffle=False, 
                             collate_fn=collate_fn)
test_y = Data.DataLoader(test_tags_id, batch_size=BATCH_SIZE, shuffle=False, 
                             collate_fn=collate_fn)
dev_x = Data.DataLoader(dev_words_id, batch_size=BATCH_SIZE, shuffle=False, 
                             collate_fn=collate_fn)
dev_y = Data.DataLoader(dev_tags_id, batch_size=BATCH_SIZE, shuffle=False, 
                             collate_fn=collate_fn)
test_masks = Data.DataLoader(test_masks, batch_size=BATCH_SIZE, shuffle=False, 
                             collate_fn=collate_fn)
dev_masks = Data.DataLoader(dev_masks, batch_size=BATCH_SIZE, shuffle=False, 
                             collate_fn=collate_fn)

In [6]:
train_dataset = Data.TensorDataset(train_x, train_y, train_mask)
# test_dataset = Data.TensorDataset(test_x, test_y)
# dev_dataset = Data.TensorDataset(dev_x, dev_y)
train_loader = Data.DataLoader(
            dataset=train_dataset,
            batch_size = BATCH_SIZE,
            shuffle = True)
# test_loader = Data.DataLoader(
#         dataset=test_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=True)
# dev_loader = Data.DataLoader(
#         dataset=dev_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=True)

In [7]:
## 读取词向量
def pretrained_embedding(embed_path):
    tmp_file = get_tmpfile(embed_path)
    wvmodel = KeyedVectors.load_word2vec_format(tmp_file)
    embed_size = len(wvmodel.get_vector(wvmodel.index2word[3]))
    vocab_size = len(word2id)

    weight = torch.randn(vocab_size, embed_size)
    for i in range(vocab_size):
        try:
            index = word2id[wvmodel.index2word[i]]
        except:
            continue
        weight[index,:] = torch.from_numpy(wvmodel.get_vector(
            wvmodel.index2word[i]
        ))
    return weight
weight = pretrained_embedding(embeded_path)



In [8]:
from torchcrf import CRF
## reference : https://github.com/kmkurn/pytorch-crf/issues/40
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, num_tags, emb_size, hidden_size, weight=None,drop_out=0.5):
        super(BiLSTMCRF, self).__init__()
        if weight!=None:
            self.embedding = nn.Embedding.from_pretrained(weight)
        else:
            self.embedding = nn.Embedding(vocab_size, emb_size)
        self.bilstm = nn.LSTM(emb_size, hidden_size,
                              batch_first=True,
                              bidirectional=True)
        self.dropout = nn.Dropout(drop_out)
        self.fc = nn.Linear(2*hidden_size, num_tags)
        self.crf = CRF(num_tags,batch_first =True)

        
    def neglikelihood(self, sentence, batch_y, masks):
        feats = self.get_lstm_features(sentence)
        crf_loss = self.crf(feats, batch_y, mask = masks)
        return -crf_loss
        
    def get_lstm_features(self, sentence):
        emb = self.embedding(sentence)  # [B, L, emb_size]
        lstm_out, _ = self.bilstm(emb)
        scores = self.fc(lstm_out)
        scores = self.dropout(scores)
        return scores
        
    def forward(self, sentence, masks):
        emb = self.embedding(sentence)  # [B, L, emb_size]
        lstm_out, _ = self.bilstm(emb)
        scores = self.fc(lstm_out)
        scores = self.dropout(scores)
        ## 不需要对scores做softmax
#         scores = F.log_softmax(scores, dim=-1)
        crf_out = self.crf.decode(scores, mask =masks)
        
        return crf_out
    

In [9]:
def conlleval(predictions, batch_y):
    pred = []
    target = []
    for i,p in enumerate(predictions):
        pred.extend(p)
        target.extend(list(batch_y[i][:len(p)]))
    real = 0
    recongnized = 0
    correct = 0
    target = [0]+ target+ [0] ## 前后增加0 防止数组越界
    pred = [0]+ pred+ [0]
    for i in range(len(target)-1):
        if target[i]!=0 and target[i+1]==0:
            real+=1
        if pred[i]!=0 and pred[i+1]==0:
            recongnized+=1
    pred_loc = []   ## 记录预测的实体位置
    start,end =1,1
    while start<len(pred) and end<len(pred):
        if pred[start]!=0 and pred[start-1]==0:
            end = start
            while pred[end]!=0:
                end+=1
            if start!=end:
                pred_loc.append((start,end))
            start = end
            start +=1
        else:
            start+=1
    for x in pred_loc:
        start,end = x
        if target[start:end] == pred[start:end] and target[start-1]==0 and target[end]==0:
            correct += 1
    return correct, recongnized, real

def evaluate(model, x, y, masks):   
    model.eval()
    epoch_loss = 0
    correct_all = 0  ## 识别正确得 总的
    recongized_all = 0   ## 识别出来得 总的
    real_all = 0   ### 实际实体数 总的
    with torch.no_grad():
        for step,(batch_x, batch_y,mask) in enumerate(zip(x,y,masks)):
            pred = model(batch_x,mask)
#             print("eva",pred.shape,batch_y.shape)
            loss = model.neglikelihood(batch_x, batch_y, mask)
            epoch_loss+=loss.item()
            correct, recongized, real = conlleval(pred, batch_y)
            correct_all +=correct
            recongized_all+=recongized
            real_all+=real
#     print(correct_all, recongized_all, real_all)
    precision = 0 if recongized_all==0 else (correct_all/recongized_all)
    recall = 0 if real_all ==0 else (correct_all/real_all)
    f1 = 0 if recall + precision == 0 else (2 * precision * recall) / (precision + recall)
    print("loss:",epoch_loss,",||precision:",precision,',||recall:',recall,",||F1:",f1)

In [10]:
"""
BI-LSTM+CRF  train
"""
vocab_size = len(word2id)
out_size = len(tag2id)
crfmodel = BiLSTMCRF(vocab_size,out_size, embedding_size, hidden_size, weight)
optimizer = optim.Adam(crfmodel.parameters(), lr=0.015)
criterion = nn.CrossEntropyLoss()
for epoch in range(20):
    train_loss = 0
    crfmodel.train()
    for step, (batch_x, batch_y, masks) in enumerate(train_loader):
        optimizer.zero_grad()
        predictions = crfmodel(batch_x, masks)
        loss = crfmodel.neglikelihood(batch_x, batch_y, masks)
        loss.backward()
        train_loss+=loss.item()
        optimizer.step()
    print("epoch:",epoch,"train_loss:",train_loss)
    evaluate(crfmodel, test_x, test_y, test_masks)

epoch: 0 train_loss: 62990.64846801758
loss: 2874.759735107422 ,||precision: 0.0 ,||recall: 0.0 ,||F1: 0
epoch: 1 train_loss: 32930.607482910156
loss: 2155.957633972168 ,||precision: 0.5 ,||recall: 0.004975124378109453 ,||F1: 0.009852216748768473
epoch: 2 train_loss: 19250.55931854248
loss: 1769.231430053711 ,||precision: 0.8333333333333334 ,||recall: 0.04975124378109453 ,||F1: 0.09389671361502348
epoch: 3 train_loss: 12701.864795684814
loss: 1524.0867309570312 ,||precision: 0.5769230769230769 ,||recall: 0.07462686567164178 ,||F1: 0.13215859030837002
epoch: 4 train_loss: 9629.169967651367
loss: 1388.4080352783203 ,||precision: 0.6666666666666666 ,||recall: 0.08955223880597014 ,||F1: 0.15789473684210525
epoch: 5 train_loss: 7677.850845336914
loss: 1280.6250762939453 ,||precision: 0.6585365853658537 ,||recall: 0.13432835820895522 ,||F1: 0.2231404958677686
epoch: 6 train_loss: 6665.474365234375
loss: 1236.2692565917969 ,||precision: 0.5681818181818182 ,||recall: 0.12437810945273632 ,||F1:

In [15]:
for x in test_x:
    print(len(x[0]))

133
138
146
145
140
