In [1]:
import random
import re
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LENGTH=200
NUM=500

In [2]:
# 读取文件内容并划分训练集，验证集
all_x ,all_y,pairs = [],[],[]
with open('short_text.txt','r',encoding='utf8') as f1:
    before_x = f1.readlines()
    for i in range(NUM):
        all_x.append(before_x[i].replace('\n',''))
f1.close()

with open('summary.txt','r',encoding='utf8') as f2:
    before_y = f2.readlines()
    for i in range(NUM):
        all_y.append(before_y[i].replace('\n',''))
f2.close()


In [3]:
def normalize(s):
    s = s.replace(' ','').replace('”','').replace('“','').replace('‘','').replace('’','').replace('：','').replace('，','').replace('。','').replace('《','').replace('》','').replace('（','').replace('）','')
    if len(s) > MAX_LENGTH:
        s=s[:200]
    return s

all_x = [normalize(x) for x in all_x]

for i in range(len(all_x)):
    pairs.append([all_x[i],all_y[i]])
    
print(random.choice(pairs))

['银行卡含有大量信息例如公民个人身份信息、账户信息等能否收藏他人银行卡收藏都有哪些法律风险以及容许自己的卡进入收藏流通领域对于持卡人会不会具有不利法律后果等问题？戳长微博了解↓↓↓', '收藏银行卡隐藏多重法律风险5张以上就有可能犯罪']


In [4]:
#表示开始和表示结束的token
SOS_token = 0
EOS_token = 1

class allDic:
    def __init__(self):
        self.word2index = {'SOS':0,'EOS':1}
        self.index2word = {0:'SOS',1:'EOS'}
        self.wordcount = {}
        self.num_words = 2
    def getDic(self,lists):  # 对包含所有文字的列表
        for i in range(len(lists)):
            self.addDic(lists[i])
    def addDic(self,sentence):  #对每个列表里面的一句话
        for i in range(len(sentence)):
            if sentence[i] not in self.word2index:
                self.word2index[sentence[i]] = self.num_words
                self.index2word[self.num_words] = sentence[i]
                self.wordcount[sentence[i]] = 1
                self.num_words += 1
            else:
                self.wordcount[sentence[i]] += 1

In [5]:
alldic_x = allDic()
alldic_x.getDic(all_x)

alldic_y = allDic()
alldic_y.getDic(all_y)

In [6]:
def getPairTensor(pair,alldic_x,alldic_y):
    indexes_x = []
    indexes_y = []
    pairs_tensor = []
    
    for token in pair[0]:
        indexes_x.append(alldic_x.word2index[token])
    
    indexes_x.append(EOS_token)
    input_tensor = torch.tensor(indexes_x, dtype=torch.long, device=device).view(-1, 1)
   
    for token in pair[1]:
        indexes_y.append(alldic_y.word2index[token])
    
    indexes_y.append(EOS_token)
    output_tensor = torch.tensor(indexes_y, dtype=torch.long, device=device).view(-1, 1)
       
    return (input_tensor, output_tensor)

In [7]:
class Encoder(nn.Module):  #输入是一句话
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size,hidden_size)
        self.gru = nn.GRU(hidden_size,hidden_size)
    def forward(self,input,hidden):
        layer_1 = self.embedding(input).view(1, 1, -1)
        output,hidden = self.gru(layer_1,hidden)
        return output,hidden
    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size,device = device)

In [8]:
class Decoder(nn.Module):
    def __init__(self,output_size,hidden_size,dropout_p=0.1, max_length=MAX_LENGTH):
        super(Decoder,self).__init__()
        self.hidden_size = hidden_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(output_size,hidden_size)
        self.gru = nn.GRU(hidden_size,hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.out = nn.Linear(hidden_size,output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self,input,hidden,encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights
    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size,device = device)
        

In [9]:
teacher_forcing_ratio = 0.5
def train(encoder,decoder,input_tensor,output_tensor,encoder_optimizer,decoder_optimizer,criterion,max_length = MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    
    # 梯度清零
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # 文本和摘要的长度
    input_length = input_tensor.size(0)
    output_length = output_tensor.size(0)
    
#     print('input',input_length)
#     print('output',output_length)
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    loss = 0
    
    # 对某个句子的每个字都放入encoder，中得到结果
    for i in range(input_length):
        encoder_output ,encoder_hidden = encoder(input_tensor[i],encoder_hidden)
    
    # decoder相关
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    # 第一个decoder的输入是SOS
    decoder_input = torch.tensor([[SOS_token]],device = device)
    decoder_hidden = encoder_hidden
     
    if use_teacher_forcing:
        for i in range(output_length):
#             decoder_output,decoder_hidden = decoder(decoder_input,decoder_hidden)
            decoder_output,decoder_hidden,attention = decoder(decoder_input,decoder_hidden,encoder_outputs)
            loss += criterion(decoder_output, output_tensor[i])
            decoder_input = output_tensor[i]
    else:
        for i in range(output_length):
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            decoder_output,decoder_hidden,attention = decoder(decoder_input,decoder_hidden,encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, output_tensor[i])
            if decoder_input.item() == EOS_token:
                break
        
            
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / output_length

In [10]:
def train_iters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) 
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [getPairTensor(random.choice(pairs),alldic_x,alldic_y) for i in range(n_iters)]
    
    criterion = nn.NLLLoss()
    
    for iter in range(1,n_iters+1):
        train_pair = training_pairs[iter-1]
        input_tensor = train_pair[0]
        output_tensor = train_pair[1]
        
        loss = train(encoder,decoder,input_tensor,output_tensor,encoder_optimizer,decoder_optimizer,criterion)
        
        print_loss_total += loss
        
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('(%d %d%%) %.4f' % (iter, iter / n_iters * 100, print_loss_avg))


In [11]:
def tensorFromSentence(alldic_x, sentence):
    x = []
    for token in sentence:
        x.append(alldic_x.word2index[token])
    x.append(EOS_token)
    tensor = torch.tensor(x, dtype=torch.long, device=device).view(-1, 1)
    return tensor
   
        
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(alldic_x, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden,attention= decoder(decoder_input, decoder_hidden,encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(alldic_y.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [12]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ''.join(output_words)
        print('<', output_sentence)
        print('')

In [13]:
hidden_size = 256
encoder = Encoder(alldic_x.num_words,hidden_size).to(device)
decoder = Decoder(alldic_y.num_words,hidden_size).to(device)

train_iters(encoder,decoder,10000,print_every=500)

(500 5%) 4.6466
(1000 10%) 4.9592
(1500 15%) 5.0526
(2000 20%) 5.0207
(2500 25%) 4.9587
(3000 30%) 4.8238
(3500 35%) 4.7550
(4000 40%) 4.4187
(4500 45%) 4.3265
(5000 50%) 3.9943
(5500 55%) 3.5765
(6000 60%) 3.2727
(6500 65%) 3.2561
(7000 70%) 2.9829
(7500 75%) 2.7124
(8000 80%) 2.3443
(8500 85%) 2.0338
(9000 90%) 1.8786
(9500 95%) 1.7770
(10000 100%) 1.5766


In [14]:
evaluateRandomly(encoder, decoder)

> 由于希腊资金将可能在4月20日耗尽如果希腊在此之前无法筹措到资金那么将有可能退出欧元区;希腊退出欧元区对欧元体系所造成的冲击将是无法想象的欧美经济均在逐步走强但由于希腊退欧风险仍在
= 希腊退欧风险仍在美元可能进一步上涨
< 希腊退欧风险仍在美元可能进一步上涨涨<EOS>

> 非农及昨晚公布的多项美国经济数据走弱但美元抵住压力自低位持稳反弹金银未显著突破上方阻力短期可能维持高位整理欧洲盘口欧元区及多国的PMI终值将公布将检验欧元区经济的复苏活力
= 美元绝处迎生机金银上方阻重深
< 美元绝处迎生机金银上方阻重深<EOS>

> 中国有个习大大做梦都想见到他!可以说海南是在习大大心目中占据着特殊位置的他曾前后三次来海南每次都给岛民们带来不同的惊喜和问候这不再过两个多月博鳌亚洲论坛2015年年会举行习大大又要来海南看我们了岛民们不要太激动哦！
= 习大大又要来海南看我们了
< 习大大大量南来看大大量<EOS>

> 1月5日上午三沙1号首航仪式在文昌市清澜港隆重举行南海网记者登上三沙1号揭开它神秘的面纱三沙1号可谓是吃、住、行、娱配套完善设计先进安全舒适带给您不一样的感受
= 走！看看“三沙1号”多豪气
< 走！看看“三沙1号”多豪气<EOS>

> 经海南省民政厅批准三沙市近日在永兴(镇)管委会设立三沙市婚姻登记点不久后三沙市干部职工、社区居委会居民和驻市部队官兵均可在永兴岛办理婚姻登记业务哇！再远的距离我们也能相爱
= 再远的距离我们也能相爱
< 再远的距离我们吃不起<EOS>

> 国际货币基金组织(IMF)周二发布的最新官方外汇储备货币构成(COFER)数据显示去年四季度美元在全球货币储备的占比增至62.9%四季度欧元在全球货币储备的占比萎缩至22.2%这意味着全球央行正在吸纳美元
= IMF数据显示全球央行正增持美元、减持欧元
< IMF数据显示全球央行正增持美元、减持欧元<EOS>

> 公安部会同国家网信办、工信部、环保部、工商总局、安监总局制定出台互联网危险物品信息发布管理规定进一步加强对互联网危险物品信息的管理规范危险物品从业单位信息发布行为规定自2015年3月1日起执行
= 新规：禁止个人在互联网上发布危险物品信息
< 新规：禁止在新增长不及预期<EOS>

> 鳌亚洲论坛2015年年会于3月26日至29日在中国海南博鳌举行3月28日早上将召开