In [3]:
import os
import time
import random
import numpy as np
import pickle as pk
import pandas as pd
from tqdm import tqdm
from operator import itemgetter
from collections import defaultdict
import torch
import jieba


# 准备好模型的参数
parameter = {
    'epoch':100,
    'batch_size':300,
    'embedding_dim':300,
    'hidden_size':128,
    'num_layers':2, 
    'dropout':0.1,
    'cuda':torch.device('cpu'),
    'lr':0.001,
    'max_len':50,
}

def build_dataSet(parameter):
    data_src = pd.read_csv('../../dataSet/data_src.csv')
    data_src = data_src[data_src['关系'] == 'question2answer']
    q,a = list(data_src.实体1),list(data_src.实体2)
    word2id = defaultdict(int)
    word2id['<PAD>'] = 0
    word2id['<UNK>'] = 0
    # 构建qa字典，存储单个q多个a的数据  一个答案多个问题
    qa_list = {}
    for ind in range(len(q)):
        q_cut = list(q[ind])#按字切割
        a_cut = list(a[ind])
        # 处理得到单个q多个a的数据
        if q[ind] not in qa_list:
            qa_list[q[ind]] = [q_cut,a_cut]
        else:
            qa_list[q[ind]] += [a_cut]
        # 构建qa字典，注意qa字典共用
        for i in q_cut:
            word2id[i] += 1
        for i in a_cut:
            word2id[i] += 1
    qa_list = list(qa_list.values())
    parameter['qa_list'] = qa_list
    parameter['word2id'] = dict(zip(word2id.keys(),range(len(word2id))))
    parameter['id2word'] = dict(zip(range(len(word2id)),word2id.keys()))
    parameter['word_size'] = len(word2id)
    
def sample(n,parameter,neg_sample_num):#neg_sample_num采样几个的变量
    # 负采样：
    neg_sample = []
    q_size = len(parameter['qa_list'])
    while 1:
        sample_id = random.randint(0,q_size-1)
        if sample_id == n:
            continue
        neg_sample_answer = parameter['qa_list'][sample_id]
        a_id = random.randint(1,len(neg_sample_answer)-1)
        neg_sample.append(neg_sample_answer[a_id])
        if len(neg_sample) >= neg_sample_num:
            return neg_sample
        
def list2torch(a):
    return torch.from_numpy(np.array(a)).long().to(parameter['cuda'])
    
def batch_yield(parameter,shuffle = True):
    for train_epoch in range(parameter['epoch']):
        qa_list = parameter['qa_list']
        data = []
        # 构建比方说3个同样的q，3个正确的ans，3个错误的ans
        for ind,i in enumerate(qa_list):
            q = i[0]
            p_a = i[1:]
            n_a = sample(ind,parameter,len(p_a))
            q = [q] * len(p_a)
            data += list(zip(q,p_a,n_a))
        if shuffle:
            random.shuffle(data)
        batch_q,batch_a,batch_n = [],[],[]
        seq_len_q,seq_len_a,seq_len_n = 0,0,0
        for (q,a,n) in tqdm(data):
            # id化
            q = itemgetter(*q)(parameter['word2id'])
            a = itemgetter(*a)(parameter['word2id'])
            n = itemgetter(*n)(parameter['word2id'])
            # 异常处理
            q = list(q) if type(q) == type(()) else [q,0]
            a = list(a) if type(a) == type(()) else [a,0]
            n = list(n) if type(n) == type(()) else [n,0]
            # 截断
            q = q[:parameter['max_len']]
            a = a[:parameter['max_len']]
            n = n[:parameter['max_len']]
            if len(q) > seq_len_q:
                seq_len_q = len(q)
            if len(a) > seq_len_a:
                seq_len_a = len(a)
            if len(n) > seq_len_n:
                seq_len_n = len(n)
            batch_q.append(q)
            batch_a.append(a)
            batch_n.append(n)
            if len(batch_q) >= parameter['batch_size']:
                # 数据对齐
                batch_q = [i+[0]*(seq_len_q-len(i)) for i in batch_q]
                batch_a = [i+[0]*(seq_len_a-len(i)) for i in batch_a]
                batch_n = [i+[0]*(seq_len_n-len(i)) for i in batch_n]
                yield list2torch(batch_q),list2torch(batch_a),list2torch(batch_n),None,False
                batch_q,batch_a,batch_n = [],[],[]
                seq_len_q,seq_len_a,seq_len_n = 0,0,0
        batch_q = [i+[0]*(seq_len_q-len(i)) for i in batch_q]
        batch_a = [i+[0]*(seq_len_a-len(i)) for i in batch_a]
        batch_n = [i+[0]*(seq_len_n-len(i)) for i in batch_n]
        yield list2torch(batch_q),list2torch(batch_a),list2torch(batch_n),train_epoch,False
        batch_q,batch_a,batch_n = [],[],[]
        seq_len_q,seq_len_a,seq_len_n = 0,0,0
    yield None,None,None,None,True
            
build_dataSet(parameter)
pk.dump(parameter,open('parameter.pkl','wb'))

In [2]:
parameter['qa_list']

[[['A', 'u', 't', 'o', 'M', 'L', '问', '题', '构', '成', '?'],
  ['特', '征', '选', '择'],
  ['模', '型', '选', '择'],
  ['算', '法', '选', '择']],
 [['特', '征', '工', '程', '选', '择', '思', '路', '？'],
  ['有', '监', '督', '的', '特', '征', '选', '择'],
  ['基',
   '于',
   '模',
   '型',
   '，',
   'l',
   'r',
   '的',
   '系',
   '数',
   '，',
   '树',
   '模',
   '型',
   '的',
   'i',
   'm',
   'p',
   'o',
   'r',
   't',
   'a',
   'n',
   'c',
   'e',
   '等',
   '等'],
  ['基', '于', '选', '择', '，', '前', '项', '后', '项', '选', '择'],
  ['无', '监', '督', '的', '特', '征', '选', '择'],
  ['基',
   '于',
   '统',
   '计',
   '信',
   '息',
   '的',
   '，',
   '熵',
   '、',
   '相',
   '关',
   '性',
   '、',
   'K',
   'L',
   '系',
   '数'],
  ['基',
   '于',
   '方',
   '差',
   '，',
   '因',
   '子',
   '分',
   '解',
   '，',
   'P',
   'C',
   'A',
   '主',
   '成',
   '分',
   '分',
   '享',
   '，',
   '方',
   '差',
   '系',
   '数']],
 [['模', '型', '相', '关', '的', '选', '择', '思', '路', '?'],
  ['模', '型', '选', '择'],
  ['各',
   '自',
   '模',
   '型',
   '的',
   '优'

In [2]:
train_yield = batch_yield(parameter)
test_q,test_a,test_n,_,_ = next(train_yield)
test_q,test_a,test_n

  0%|                                                                                         | 0/1390 [00:00<?, ?it/s]

(tensor([[ 44,  33,  39,  ...,   0,   0,   0],
         [ 98,  44,  34,  ...,   0,   0,   0],
         [405, 185,  58,  ...,   0,   0,   0],
         ...,
         [721, 361,  29,  ...,   0,   0,   0],
         [462, 149, 129,  ...,   0,   0,   0],
         [ 26, 233, 162,  ...,   0,   0,   0]], device='cuda:0'),
 tensor([[ 13,  14, 257,  ..., 293, 454, 195],
         [206, 121, 803,  ...,   0,   0,   0],
         [570,  44,  89,  ...,   0,   0,   0],
         ...,
         [306, 154, 176,  ...,   0,   0,   0],
         [667, 643, 166,  ...,   0,   0,   0],
         [ 49, 340, 341,  ...,   0,   0,   0]], device='cuda:0'),
 tensor([[218,   5,   5,  ...,   0,   0,   0],
         [712, 713, 714,  ...,  89,  89,   0],
         [255, 256, 242,  ...,   0,   0,   0],
         ...,
         [457, 268,  67,  ...,   0,   0,   0],
         [ 39,  41,  89,  ...,   0,   0,   0],
         [524, 310, 311,  ...,   0,   0,   0]], device='cuda:0'))

In [4]:
import torch.nn.functional as F # pytorch 激活函数的类
from torch import nn,optim # 构建模型和优化器

# 构建分类模型
class TextRNN(nn.Module):
    def __init__(self, parameter):
        super(TextRNN, self).__init__()
        embedding_dim = parameter['embedding_dim']
        hidden_size = parameter['hidden_size']
        num_layers = parameter['num_layers']
        dropout = parameter['dropout']
        word_size = parameter['word_size']
        self.embedding = nn.Embedding(word_size, embedding_dim, padding_idx=0)
        # q的特征空间应该和a的特征空间是不一致的，所以分别使用不同的lstm进行特征提取
        self.lstm_q = nn.LSTM(embedding_dim, hidden_size, num_layers, bidirectional=True, batch_first=True, dropout=dropout)
        self.lstm_a = nn.LSTM(embedding_dim, hidden_size, num_layers, bidirectional=True, batch_first=True, dropout=dropout)

        
    def forward(self, q, a1,a2 = None):
        q_emd = self.embedding(q)
        q_emd,(h, c)= self.lstm_q(q_emd)#问题的特征提取器
        q_emd = torch.max(q_emd,1)[0]

        a1_emd = self.embedding(a1)
        a1_emd,(h, c)= self.lstm_a(a1_emd)#答案的特征提取器
        a1_emd = torch.max(a1_emd,1)[0]
        if a2 is not None:
            a2_emd = self.embedding(a2)
            a2_emd,(h, c)= self.lstm_a(a2_emd)
            a2_emd = torch.max(a2_emd,1)[0]
            return q_emd,a1_emd,a2_emd#有三个输出，采用三元组损失
        return F.cosine_similarity(q_emd,a1_emd,1,1e-8)#A*B

In [9]:
test_model = TextRNN(parameter).cuda()
test_model(test_q,test_a)

tensor(0.0061, device='cuda:0', grad_fn=<MeanBackward0>)

In [5]:
import os
import shutil
import pickle as pk
from torch.utils.tensorboard import SummaryWriter

# 构建模型
model = TextRNN(parameter).to(parameter['cuda'])

# 确定训练模式
model.train()

# 确定优化器和损失
optimizer = torch.optim.SGD(model.parameters(),lr=0.1, momentum=0.95, nesterov=True)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.9)

# 准备迭代器
train_yield = batch_yield(parameter)

# 开始训练
loss_cal = []
min_loss = float('inf')
while 1:
        q,a,n,epoch,keys = next(train_yield)
        if keys:
            break
        q_emd,a_emd,n_emd = model(q,a,n)
        loss = nn.functional.triplet_margin_loss(q_emd, a_emd, n_emd,reduction='mean')#三元组损失
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_cal.append(loss.item())
        if epoch is not None:
            if (epoch+1)%1 == 0:
                loss_cal = sum(loss_cal)/len(loss_cal)
                if loss_cal < min_loss:
                    min_loss = loss_cal
                    torch.save(model.state_dict(), 'grade.h5')
                print('epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, \
                                                       parameter['epoch'],loss_cal))
                optimizer.step()
            loss_cal = [loss.item()]


AssertionError: Torch not compiled with CUDA enabled