In [10]:
from collections import Counter
import json
import numpy as np
import os
import re
import pandas as pd
import copy
import pickle
from sklearn.model_selection import KFold
from sklearn.metrics import hamming_loss
import os
from gensim.models.word2vec import Word2Vec
import torch
from torch.nn import functional as F
from torch import optim
from torch import nn
from torch.utils.data import TensorDataset,DataLoader

In [26]:
# label_dic = {'inform_theater':0,'inform_starttime':1,'inform_numberofpeople':2,'greeting':3,'thanks':4,'inform_other':5,'request_moviename':6
#             ,'inform_genre':7,'request_ticket':8,'inform_city':9,'inform_state':10,'inform_date':11,'inform_moviename':12,'confirm_answer':13,
#             'inform_zip':14,'inform_video_format':15}
class Config():
    label_dic = {'inform_starttime':0,'inform_numberofpeople':1,'thanks':2,'confirm_answer':3}##设置抽取的类
    embedding_size = 8  ##词编码维度
    hidden_size = 8    ##隐藏层大小
    label_dim = len(label_dic.keys())  #标签维度（标签的种类）
    epoch = 100  #最大训练轮数
    batch_size = 256  #每个batch的大小
    lr = 1e-2 #学习率
    ealy_stop = 3  #早停轮数（多少轮不下降就停止）
    n_splits = 5 #多少折交叉验证
    train = True ##训练标记，false不训练，使用现成模型预测
config = Config()

In [27]:
def pad(x,length=10):##补全句子长度以及增加开头结尾
    le = length-2
    if len(x)>le:
        x = x[0:8]
    x.insert(0,'/u')
    x.append('/s')
    while len(x)<length:
        x.append('/s')
    return x

def data_process(data_path,length=10):###输入文件路径，读取文件的句子及标签，返回句子列表，标签列表
    sent_list = []
    label_list = []
    cnt = 0
    with open(data_path,'r',encoding='utf8') as f:
        for line in f:
            cnt += 1
            label_onehot = [0 for _ in range(len(config.label_dic.keys()))]
            tmp = []
            for item in line.split('\t'):
                tmp.append(item.replace('\n',''))
            label = []
            for item in tmp:
                if item in config.label_dic.keys():
                    label_onehot[config.label_dic[item]] = 1
            if sum(label_onehot)!=0:           
                sent_list.append(pad(re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+',tmp[0],re.S),length=length))##正则去除所有标点符号
                label_list.append(label_onehot)
    return sent_list,label_list

In [28]:
train_sent,train_label = data_process('train_data.tsv')##读取原始训练文件，返回切分好的句子，返回label onehot
test_sent,test_label = data_process('test_data.tsv')##原始测试文件，返回切分好的句子，返回label onehot

In [29]:
###合并训练和测试文件，后续做五折交叉
all_sent = []
all_sent.extend(train_sent)
all_sent.extend(test_sent)
all_label = []
all_label.extend(train_label)
all_label.extend(test_label)

In [30]:
###使用word2vec预训练，并获取向量
if config.train:
    w2v = Word2Vec(all_sent,window=5,vector_size=config.embedding_size,min_count=1,seed=2021)
    w2v.save('w2v.model')
    w2v = Word2Vec.load('w2v.model')
    weight = w2v.wv.vectors
    vocab = w2v.wv.key_to_index 
else:
    w2v = Word2Vec.load('w2v.model')##载入词向量模型
    weight = w2v.wv.vectors
    vocab = w2v.wv.key_to_index 

In [31]:
###将文字转换为词表的索引
def vac2idx(lst,vocab):
    tmp_list = []
    for sent in lst:
        s = []
        for item in sent:
            s.append(vocab[item])
        tmp_list.append(s)
    return tmp_list
all_sent_idx = vac2idx(all_sent,vocab)
# test_sent_idx = vac2idx(test_sent,vocab)

In [32]:
###句子索引及标签转换→tensor→dataset→迭代器
def list2tensor(train_sent_idx,test_sent_idx,train_label,test_label):
    train_set = torch.LongTensor(train_sent_idx)
    test_set = torch.LongTensor(test_sent_idx)
    train_label_set = torch.LongTensor(train_label)
    test_label_set = torch.LongTensor(test_label)
    train_set = TensorDataset(train_set,train_label_set)
    test_set = TensorDataset(test_set,test_label_set)
    train_iter = DataLoader(train_set,shuffle=True,batch_size=config.batch_size)
    test_iter = DataLoader(test_set,shuffle=False,batch_size=config.batch_size)
    return train_iter,test_iter

In [33]:
###神经网络结构
class Net(nn.Module):
    def __init__(self,weight):
        super(Net,self).__init__()
        self.weight = torch.FloatTensor(weight)
        self.embedding = nn.Embedding.from_pretrained(self.weight)
        self.lstm = nn.LSTM(input_size=config.embedding_size,hidden_size=config.hidden_size,num_layers=2,bidirectional=True)
        self.linear1 = nn.Linear(config.hidden_size*2,config.hidden_size)
        self.linear2 = nn.Linear(config.hidden_size,config.label_dim)
    def forward(self,x):
        embed = self.embedding(x) ###embedding层
        out, _ = self.lstm(embed) ###lstm层，out为lstm所有输出，维度，batch_size,句子长度，隐藏层维度*2
        out = out.permute(0, 2, 1) ###转换维度为了下面的maxpool
        out = F.adaptive_max_pool1d(out,output_size=1).squeeze()
        x = self.linear1(out) ###dense+relu
        x = F.relu(x)
        x = self.linear2(x)##dense+sigmoid
        x = torch.sigmoid(x)
        return x


In [34]:
##定义损失函数
loss_fn = torch.nn.BCELoss()
#####验证训练模型效果，返回验证集损失
def evalute(data,net,label=None):
    loss_g = 0
    cnt = 0
    hm_loss_tot = 0
    for i, (trains, labels) in enumerate(data):
        cnt += trains.shape[0]
        outputs = net(trains)
        hm_loss = hamming_loss(np.round(outputs.data.numpy()),labels.numpy())##汉明损失
        hm_loss_tot += hm_loss * trains.shape[0]
        net.zero_grad()
        loss = loss_fn(outputs, labels.float())
        loss_g += loss
    return loss_g,hm_loss_tot/cnt##返回bce损失和汉明损失

In [35]:
####定义多折切分器
kfold = KFold(n_splits=config.n_splits,random_state=2021,shuffle=True)

In [36]:
###根据索引返回训练集和验证集
def get_data(data,label,train_idx,test_idx):
    data_train = [data[_] for _ in train_idx]
    data_test = [data[_] for _ in test_idx]
    label_train = [label[_] for _ in train_idx]
    label_test = [label[_] for _ in test_idx]
    return data_train,data_test,label_train,label_test

In [37]:
#模型训练
if config.train:
    hm_loss_list = [] #储存每折的汉明损失
    for k, (train_index, test_index) in enumerate(kfold.split(all_sent_idx)):   
        train_sent_idx,test_sent_idx,train_label,test_label = get_data(all_sent_idx,all_label,train_index,test_index)##切分训练集和验证集
        train_iter,test_iter = list2tensor(train_sent_idx,test_sent_idx,train_label,test_label)##返回迭代器
        net = Net(weight) #初始化网络
        net.train() #开启训练
        optimizer = optim.Adam(net.parameters(),lr=config.lr) #定义优化器
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9) #减少学习速率
        dev_best_loss = float('inf')
        cnt = 0 #loss没有下降的轮数
        hm_loss_best = 0
        for epoch in range(config.epoch):
            loss_g = 0
            for i, (trains, labels) in enumerate(train_iter):
                outputs = net(trains)
                loss = loss_fn(outputs, labels.float())
                optimizer.zero_grad() 
                loss.backward()
                optimizer.step()
                loss_g += loss
            loss_dev,hm_loss = evalute(test_iter,net) ##验证集输出损失，汉明损失
            print('best loss:{} current loss:{} epoch:{},k_flod:{}'.format(dev_best_loss,loss_dev,epoch,k))
            if loss_dev<dev_best_loss:##损失减小，保存模型
                hm_loss_best = hm_loss
                dev_best_loss = loss_dev
                torch.save(net.state_dict(),'./model/model{}.pth'.format(k))
                cnt = 0 
            else:
                scheduler.step()
                cnt += 1
            if cnt >= config.ealy_stop:
                print('第{}折汉明损失：{}'.format(k,hm_loss_best))
                break
        hm_loss_list.append(hm_loss_best)
    print('五折平均汉明损失：',sum(hm_loss_list)/len(hm_loss_list))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


best loss:inf current loss:2.9504072666168213 epoch:0,k_flod:0
best loss:2.9504072666168213 current loss:2.9145865440368652 epoch:1,k_flod:0
best loss:2.9145865440368652 current loss:2.9085423946380615 epoch:2,k_flod:0
best loss:2.9085423946380615 current loss:2.833122730255127 epoch:3,k_flod:0
best loss:2.833122730255127 current loss:2.568243980407715 epoch:4,k_flod:0
best loss:2.568243980407715 current loss:2.39871883392334 epoch:5,k_flod:0
best loss:2.39871883392334 current loss:2.2442452907562256 epoch:6,k_flod:0
best loss:2.2442452907562256 current loss:2.111478328704834 epoch:7,k_flod:0
best loss:2.111478328704834 current loss:1.9757719039916992 epoch:8,k_flod:0
best loss:1.9757719039916992 current loss:1.834974765777588 epoch:9,k_flod:0
best loss:1.834974765777588 current loss:1.7347509860992432 epoch:10,k_flod:0
best loss:1.7347509860992432 current loss:1.6789418458938599 epoch:11,k_flod:0
best loss:1.6789418458938599 current loss:1.6295115947723389 epoch:12,k_flod:0
best loss:

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


best loss:inf current loss:2.961393117904663 epoch:0,k_flod:1
best loss:2.961393117904663 current loss:2.870081901550293 epoch:1,k_flod:1
best loss:2.870081901550293 current loss:2.603938102722168 epoch:2,k_flod:1
best loss:2.603938102722168 current loss:2.3023524284362793 epoch:3,k_flod:1
best loss:2.3023524284362793 current loss:2.1656265258789062 epoch:4,k_flod:1
best loss:2.1656265258789062 current loss:2.0713157653808594 epoch:5,k_flod:1
best loss:2.0713157653808594 current loss:1.9681131839752197 epoch:6,k_flod:1
best loss:1.9681131839752197 current loss:1.7933900356292725 epoch:7,k_flod:1
best loss:1.7933900356292725 current loss:1.7013081312179565 epoch:8,k_flod:1
best loss:1.7013081312179565 current loss:1.6241227388381958 epoch:9,k_flod:1
best loss:1.6241227388381958 current loss:1.5298504829406738 epoch:10,k_flod:1
best loss:1.5298504829406738 current loss:1.4767357110977173 epoch:11,k_flod:1
best loss:1.4767357110977173 current loss:1.4600605964660645 epoch:12,k_flod:1
best

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


best loss:inf current loss:2.999257802963257 epoch:0,k_flod:2
best loss:2.999257802963257 current loss:2.9136204719543457 epoch:1,k_flod:2
best loss:2.9136204719543457 current loss:2.6956140995025635 epoch:2,k_flod:2
best loss:2.6956140995025635 current loss:2.360269069671631 epoch:3,k_flod:2
best loss:2.360269069671631 current loss:2.258441925048828 epoch:4,k_flod:2
best loss:2.258441925048828 current loss:2.2448697090148926 epoch:5,k_flod:2
best loss:2.2448697090148926 current loss:2.2274999618530273 epoch:6,k_flod:2
best loss:2.2274999618530273 current loss:2.1813509464263916 epoch:7,k_flod:2
best loss:2.1813509464263916 current loss:2.1746082305908203 epoch:8,k_flod:2
best loss:2.1746082305908203 current loss:2.1321403980255127 epoch:9,k_flod:2
best loss:2.1321403980255127 current loss:2.0488622188568115 epoch:10,k_flod:2
best loss:2.0488622188568115 current loss:1.9963557720184326 epoch:11,k_flod:2
best loss:1.9963557720184326 current loss:1.942668080329895 epoch:12,k_flod:2
best 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


best loss:inf current loss:3.030233383178711 epoch:0,k_flod:3
best loss:3.030233383178711 current loss:2.8751840591430664 epoch:1,k_flod:3
best loss:2.8751840591430664 current loss:2.8849453926086426 epoch:2,k_flod:3
best loss:2.8751840591430664 current loss:2.85640811920166 epoch:3,k_flod:3
best loss:2.85640811920166 current loss:2.7817230224609375 epoch:4,k_flod:3
best loss:2.7817230224609375 current loss:2.541343927383423 epoch:5,k_flod:3
best loss:2.541343927383423 current loss:2.3765580654144287 epoch:6,k_flod:3
best loss:2.3765580654144287 current loss:2.269014596939087 epoch:7,k_flod:3
best loss:2.269014596939087 current loss:2.1248726844787598 epoch:8,k_flod:3
best loss:2.1248726844787598 current loss:2.0034070014953613 epoch:9,k_flod:3
best loss:2.0034070014953613 current loss:1.8757959604263306 epoch:10,k_flod:3
best loss:1.8757959604263306 current loss:1.7639520168304443 epoch:11,k_flod:3
best loss:1.7639520168304443 current loss:1.6587669849395752 epoch:12,k_flod:3
best los

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


best loss:inf current loss:2.928699016571045 epoch:0,k_flod:4
best loss:2.928699016571045 current loss:2.908864974975586 epoch:1,k_flod:4
best loss:2.908864974975586 current loss:2.902780532836914 epoch:2,k_flod:4
best loss:2.902780532836914 current loss:2.863394021987915 epoch:3,k_flod:4
best loss:2.863394021987915 current loss:2.641883134841919 epoch:4,k_flod:4
best loss:2.641883134841919 current loss:2.3492140769958496 epoch:5,k_flod:4
best loss:2.3492140769958496 current loss:2.2401745319366455 epoch:6,k_flod:4
best loss:2.2401745319366455 current loss:2.1738576889038086 epoch:7,k_flod:4
best loss:2.1738576889038086 current loss:2.1339406967163086 epoch:8,k_flod:4
best loss:2.1339406967163086 current loss:2.082793712615967 epoch:9,k_flod:4
best loss:2.082793712615967 current loss:2.022919178009033 epoch:10,k_flod:4
best loss:2.022919178009033 current loss:1.9482300281524658 epoch:11,k_flod:4
best loss:1.9482300281524658 current loss:1.8376824855804443 epoch:12,k_flod:4
best loss:1.

In [68]:
##以下用来自行测试数据
net = Net(weight)

def predict(sent,model_path,net):##输入句子，模型路径，网络
    test_data = test_data_process(sent)
    sent_idx = vac2idx(test_data,vocab)
    sent = torch.LongTensor(sent_idx)    
    model_file = os.listdir(model_path)
    output_list = []
    for model in model_file:
        net.load_state_dict(torch.load(os.path.join(model_path,model)))
        output_list.append(net(sent).data.numpy())
    out = np.zeros_like(output_list[0])
    for output in output_list:
        out += output
    return out/len(output_list)
        
def test_data_process(sents,length=10):  ##返回切分后的句子
    sent_list = []
    for sent in sents:
        sent_list.append(pad(re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+',sent,re.S),length=length))##正则去除所有标点符号
    return sent_list
    

In [73]:
#####sent为你想测试的句子
sent = ['thank you so much!','930 would be perfect: 4 tickets?']
out = predict(sent,'model',net)

In [75]:
for i in range(out.shape[0]):
    print('第{}个句子的预测结果'.format(i))
    for j in range(out.shape[1]):
        if out[i][j]>0.5:
            print(list(config.label_dic.keys())[j])

第0个句子的预测结果
thanks
第1个句子的预测结果
inform_starttime
inform_numberofpeople
