# ner-torch模型功能测试

In [1]:
from collections import defaultdict
from operator import itemgetter
import numpy as np
import torch
import torch.nn.functional as F # pytorch 激活函数的类
import pickle as pk
import pandas as pd
from torch import nn
from tqdm import tqdm
from torchcrf import CRF


# 构建基于bilstm+crf实现ner
class bilstm_crf(nn.Module):
    """BiLSTM encoder with a linear tag projection and a CRF layer for NER.

    ``parameter`` is a dict read for the keys ``vocab_size``, ``d_model``,
    ``hid_dim``, ``n_layers``, ``dropout`` and ``num_tags``.
    """

    def __init__(self, parameter):
        super().__init__()
        # Token ids -> dense vectors; index 0 is reserved as the padding id.
        self.embedding = nn.Embedding(
            parameter['vocab_size'],
            parameter['d_model'],
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=parameter['d_model'],
            hidden_size=parameter['hid_dim'],
            num_layers=parameter['n_layers'],
            bidirectional=True,
            batch_first=True,
            dropout=parameter['dropout'],
        )
        tag_count = parameter['num_tags']
        # The bidirectional LSTM concatenates both directions, hence 2x.
        self.fc = nn.Linear(2 * parameter['hid_dim'], tag_count)
        self.crf = CRF(tag_count, batch_first=True)

    def forward(self, x):
        """Return per-token tag scores; the CRF layer is not applied here."""
        encoded, _state = self.lstm(self.embedding(x))
        return self.fc(encoded)

# 此处是加载对应的模型和配置文件
def load_model(mode_path):
    """Load the pickled configuration and trained BiLSTM-CRF weights on CPU.

    Args:
        mode_path: Path prefix (including the trailing separator) under which
            ``parameter.pkl`` and ``bilstm_crf.h5`` are stored.

    Returns:
        A ``(model, parameter)`` tuple. The model is in eval mode and
        ``parameter['device']`` is set to the CPU device.
    """
    # NOTE: pickle can execute arbitrary code; only load trusted files.
    with open(mode_path + 'parameter.pkl', 'rb') as config_file:
        parameter = pk.load(config_file)
    parameter['device'] = torch.device('cpu')
    model = bilstm_crf(parameter).to(parameter['device'])
    # Read the weights from the location passed in by the caller rather
    # than from the module-level ``model_path`` global.
    state = torch.load(mode_path + 'bilstm_crf.h5', map_location='cpu')
    model.load_state_dict(state)
    model.eval()
    return model, parameter

def keyword_predict(input):
    """Extract named entities from a piece of text.

    Reads the module-level ``model`` and ``parameter`` objects:
    ``parameter['vocab']`` maps characters to ids (unknown characters use the
    ``'<UNK>'`` entry), ``parameter['tag2label']`` maps tag names to ids and
    ``parameter['device']`` is the device the input tensor is moved to.

    Args:
        input: The text to analyse; it is processed character by character.

    Returns:
        A list of ``[text, label, indices]`` entries, one per entity, where
        ``indices`` are the character positions that make up ``text``.
        Empty input yields an empty list.
    """
    chars = list(input)
    if not chars:
        # Nothing to tag; avoid running the model on an empty tensor.
        return []

    vocab = parameter['vocab']
    token_ids = [vocab[ch] if ch in vocab else vocab['<UNK>'] for ch in chars]
    id_to_tag = {tag_id: tag for tag, tag_id in parameter['tag2label'].items()}

    batch = torch.tensor([token_ids], dtype=torch.long).to(parameter['device'])
    with torch.no_grad():  # inference only, no gradients needed
        best_path = model.crf.decode(model(batch))[0]

    # Explicit list so a one-character input still yields a list of tags
    # (itemgetter with a single key would return a bare string instead).
    tags = [id_to_tag[tag_id] for tag_id in best_path]
    return _decode_entities(chars, tags)


def _decode_entities(chars, tags):
    """Group per-character B/I/E/S/O tags into ``[text, label, indices]``."""
    # Each working span is [text, tags, indices, closed].
    spans = []
    for pos, tag in enumerate(tags):
        prefix = tag[0]
        if prefix in ('B', 'S'):
            # A new entity starts: discard a previous one that never closed.
            if spans and not spans[-1][3]:
                spans.pop()
            spans.append([chars[pos], [tag], [pos], prefix == 'S'])
        elif prefix in ('I', 'E'):
            current = spans[-1] if spans else None
            is_open = current is not None and not current[3]
            if is_open and current[1][0].split('-')[1] == tag.split('-')[1]:
                current[0] += chars[pos]
                current[1].append(tag)
                current[2].append(pos)
                if prefix == 'E':
                    current[3] = True
            elif is_open:
                # Continuation of a different type invalidates the open span.
                # A completed span is left alone: a stray I/E tag must not
                # delete an entity that was already closed.
                spans.pop()
    # An entity still open at the end of the text never reached its E tag.
    if spans and not spans[-1][3]:
        spans.pop()
    return [[text, span_tags[0].split('-')[1], indices]
            for text, span_tags, indices, _closed in spans]

# Path prefix under which the NER configuration and weights are stored.
model_path = 'model/ner/'
# keyword_predict reads these two module-level names.
model,parameter = load_model(model_path)

# Smoke test: extract the entities from a sample question.
keyword_predict('李白写过哪些诗句，杜甫写过哪些诗句')

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


[['李白', 'author', [0, 1]], ['杜甫', 'author', [9, 10]]]

In [2]:
# Example entity in the [text, label, character indices] layout.
entity = ['李白', 'author', [0, 1]]
# Fill the entity label and text into a query template.
# NOTE(review): this looks like a graph-database (Cypher-style) query - confirm.
# The values are interpolated directly into the string, so this is only safe
# for trusted entity text.
"match p = (n:%s)-[]->(m:introduce) where n.name = '%s' return m.name"%(entity[1],entity[0])

"match p = (n:author)-[]->(m:introduce) where n.name = '李白' return m.name"

In [3]:
# 基于ner重建后的提问

from operator import itemgetter

def takelong(ins):
    """Sort key: the length of the first element of ``ins``."""
    first = ins[0]
    return len(first)

def rebuildiins(ins, entity_list):
    """Replace every entity span in ``ins`` with its entity label.

    Args:
        ins: An indexable sequence (e.g. a string) of the original tokens.
        entity_list: Entities as ``[text, label, indices]``; the first and
            last values of ``indices`` delimit the span to replace.

    Returns:
        A tuple of the remaining tokens in position order, with each entity
        span collapsed to a single label placed at the span's first index.
        The result is always a tuple, including when it has one element or
        none.
    """
    rebuilt = {}
    covered = set()
    for entity in entity_list:
        span = entity[-1]
        covered.update(range(span[0], span[-1] + 1))
        rebuilt[span[0]] = entity[1]
    for pos in range(len(ins)):
        if pos not in covered:
            rebuilt[pos] = ins[pos]
    # Built explicitly: itemgetter would return a bare value for a single
    # key and raise for zero keys.
    return tuple(rebuilt[pos] for pos in sorted(rebuilt))

# Sample question together with the entities found in it.
question = '李白写过哪些诗句，杜甫写过哪些诗句'
entity_list = [['李白', 'author', [0, 1]], ['杜甫', 'author', [9, 10]]]
# Sort ascending by entity text length, then reverse: longest text first.
entity_list.sort(key = takelong)
entity_list = entity_list[::-1]
# Replace each entity span in the question with its label.
new_question = rebuildiins(question,entity_list)
new_question

('author',
 '写',
 '过',
 '哪',
 '些',
 '诗',
 '句',
 '，',
 'author',
 '写',
 '过',
 '哪',
 '些',
 '诗',
 '句')

# 意图识别功能测试

In [4]:
import torch
import numpy as np
import pandas as pd
import pickle as pk
from tqdm import tqdm
import torch.nn.functional as F # pytorch 激活函数的类
from torch import nn,optim # 构建模型和优化器
from operator import itemgetter
from collections import defaultdict

# 构建分类模型
class TextRNN(nn.Module):
    """Bidirectional LSTM classifier that scores the last time step.

    ``parameter`` is a dict read for the keys ``embedding_dim``,
    ``hidden_size``, ``output_size``, ``num_layers`` and ``dropout``.
    """

    def __init__(self, parameter):
        super().__init__()
        input_width = parameter['embedding_dim']
        units = parameter['hidden_size']
        class_count = parameter['output_size']
        self.lstm = nn.LSTM(
            input_size=input_width,
            hidden_size=units,
            num_layers=parameter['num_layers'],
            bidirectional=True,
            batch_first=True,
            dropout=parameter['dropout'],
        )
        # Both LSTM directions are concatenated, hence 2x the hidden size.
        self.fc = nn.Linear(2 * units, class_count)

    def forward(self, x):
        """Run the LSTM over ``x`` and classify its final time step."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)
    
def load_model(path):
    """Load a pickled intent-model configuration and its TextRNN weights.

    Args:
        path: Path of the pickled parameter dict. The dict supplies the
            target device under ``'cuda'`` and the weight directory prefix
            under ``'model_path'``.

    Returns:
        A ``(parameter, model)`` tuple; the model is in eval mode on
        ``parameter['cuda']``.
    """
    # NOTE: pickle can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as config_file:
        parameter = pk.load(config_file)
    parameter['dropout'] = 0  # inference only: disable dropout
    device = parameter['cuda']
    model = TextRNN(parameter).to(device)
    # map_location lets a checkpoint saved on another device load here.
    state = torch.load(parameter['model_path'] + 'model-rnn.h5',
                       map_location=device)
    model.load_state_dict(state)
    model.eval()
    return parameter, model

def batch_predict(chars, parameter):
    """Convert a batch of character sequences into a padded embedding tensor.

    Args:
        chars: A list of sentences, each a list of characters. The input is
            not modified.
        parameter: Dict providing ``'char2ind'`` (character -> id, with a
            ``'<unk>'`` entry used for unknown characters),
            ``'ind2embeding'`` (id -> embedding vector; entry 0 is used as
            padding) and ``'cuda'`` (the target device).

    Returns:
        A tensor on ``parameter['cuda']`` holding one padded embedding
        sequence per sentence.
    """
    char2ind = parameter['char2ind']
    embeddings = parameter['ind2embeding']

    batch_ids = []
    for sentence in chars:
        # Map unknown characters to '<unk>' without touching the caller's list.
        ids = [char2ind[ch] if ch in char2ind else char2ind['<unk>']
               for ch in sentence]
        if len(ids) == 1:
            # Kept for backward compatibility: a single-character sentence
            # has always been followed by one padding id (0).
            ids.append(0)
        batch_ids.append(ids)

    max_len = max((len(ids) for ids in batch_ids), default=0)
    batch_x = [
        np.array([embeddings[i] for i in ids]
                 + [embeddings[0]] * (max_len - len(ids)))
        for ids in batch_ids
    ]
    return torch.from_numpy(np.array(batch_x)).to(parameter['cuda'])
    
def predict(ins,model,parameter):
    """Return the highest-scoring class index for every sentence in ``ins``.

    ``ins`` is converted to a tensor by ``batch_predict``, passed through
    ``model``, and the per-row argmax of the softmax output is returned as
    a NumPy array.
    """
    logits = model(batch_predict(ins, parameter))
    probabilities = F.softmax(logits, 1)
    _best_prob, best_index = torch.max(probabilities, 1)
    return best_index.cpu().numpy()


# Load the two intent classifiers; load_model returns (parameter, model).
intent0_parameter,intent0_model = load_model('model/intent0/parameter.pkl')
intent1_parameter,intent1_model = load_model('model/intent1/parameter.pkl')

In [7]:
# Show the device stored in the loaded intent1 configuration.
intent1_parameter['cuda']

device(type='cpu')

In [8]:
import pickle as pk
# Load the intent0 data as an (x, y) pair and display three x entries
# followed by the y values at the same positions.
x,y = pk.load(open('data/data-intent0.pkl','rb'))
x[1],x[300],x[500],y[1],y[300],y[500]

(['老', '师', 'km1', '有', '哪', '些', '重', '要', '的', '课'],
 ['说', '你', '的', '工', '作'],
 ['唱', '歌', '吧'],
 0,
 1,
 2)

In [9]:
# Run the intent0 classifier on the three samples shown above.
predict([['老', '师', 'km1', '有', '哪', '些', '重', '要', '的', '课'],
        ['说', '你', '的', '工', '作'],
         ['唱', '歌', '吧'],
        ],intent0_model,intent0_parameter)

array([0, 1, 2], dtype=int64)

In [10]:
import pickle as pk
# Load the intent1 data (only the first two of the four pickled items are
# used) and display three x entries with the y values at the same positions.
x,y,_,_ = pk.load(open('data/data-intent1-ner.pkl','rb'))
x[1],x[20],x[100],y[1],y[20],y[100]

(['老', '师', 'km1', '有', '哪', '些', '重', '要', '的', '课'],
 ['老', '师', 'km1', '有', '哪', '些', '重', '要', '的', '知', '识', '点'],
 ['老', '师', 'km2', '有', '哪', '些', '重', '要', '的', '例', '题', '需', '要', '掌', '握'],
 0,
 1,
 5)

In [11]:
# Run the intent1 classifier on the three samples shown above.
predict([['老', '师', 'km1', '有', '哪', '些', '重', '要', '的', '课'],
 ['老', '师', 'km1', '有', '哪', '些', '重', '要', '的', '知', '识', '点'],
 ['老', '师', 'km2', '有', '哪', '些', '重', '要', '的', '例', '题', '需', '要', '掌', '握'],
        ],intent1_model,intent1_parameter)

array([0, 1, 5], dtype=int64)