In [1]:
import random
import re
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LENGTH=200
NUM=500

In [2]:
# 读取文件内容并划分训练集，验证集
all_x ,all_y,pairs = [],[],[]
with open('short_text.txt','r',encoding='utf8') as f1:
    before_x = f1.readlines()
    for i in range(NUM):
        all_x.append(before_x[i].replace('\n',''))
f1.close()

with open('summary.txt','r',encoding='utf8') as f2:
    before_y = f2.readlines()
    for i in range(NUM):
        all_y.append(before_y[i].replace('\n',''))
f2.close()


In [3]:
def normalize(s):
    s = s.replace(' ','').replace('”','').replace('“','').replace('‘','').replace('’','').replace('：','').replace('，','').replace('。','').replace('《','').replace('》','').replace('（','').replace('）','')
    if len(s) > MAX_LENGTH:
        s=s[:200]
    return s

all_x = [normalize(x) for x in all_x]

for i in range(len(all_x)):
    pairs.append([all_x[i],all_y[i]])
    
print(random.choice(pairs))

['一周前69岁的王初爱在小区和邻居玩纸牌忽然一辆警车下来几名警员强行将她带入三亚月川派出所进入派出所后不久王初爱就不省人事随后被送往医院救治经诊断王初爱腰骨骨折身体多处受伤', '三亚69岁老妇被强行带入派出所后骨折多处受伤']


In [4]:
#表示开始和表示结束的token
SOS_token = 0
EOS_token = 1

class allDic:
    def __init__(self):
        self.word2index = {'SOS':0,'EOS':1}
        self.index2word = {0:'SOS',1:'EOS'}
        self.wordcount = {}
        self.num_words = 2
    def getDic(self,lists):  # 对包含所有文字的列表
        for i in range(len(lists)):
            self.addDic(lists[i])
    def addDic(self,sentence):  #对每个列表里面的一句话
        for i in range(len(sentence)):
            if sentence[i] not in self.word2index:
                self.word2index[sentence[i]] = self.num_words
                self.index2word[self.num_words] = sentence[i]
                self.wordcount[sentence[i]] = 1
                self.num_words += 1
            else:
                self.wordcount[sentence[i]] += 1

In [5]:
alldic_x = allDic()
alldic_x.getDic(all_x)

alldic_y = allDic()
alldic_y.getDic(all_y)

In [6]:
def getPairTensor(pair,alldic_x,alldic_y):
    indexes_x = []
    indexes_y = []
    pairs_tensor = []
    
    for token in pair[0]:
        indexes_x.append(alldic_x.word2index[token])
    
    indexes_x.append(EOS_token)
    input_tensor = torch.tensor(indexes_x, dtype=torch.long, device=device).view(-1, 1)
   
    for token in pair[1]:
        indexes_y.append(alldic_y.word2index[token])
    
    indexes_y.append(EOS_token)
    output_tensor = torch.tensor(indexes_y, dtype=torch.long, device=device).view(-1, 1)
       
    return (input_tensor, output_tensor)

In [7]:
class Encoder(nn.Module):  #输入是一句话
    def __init__(self, input_size, hidden_size,c_size,dropout_p = 0.1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.c_size = c_size
        self.dropout_p = dropout_p
        
        self.embedding = nn.Embedding(input_size,hidden_size)
        self.lstm = nn.LSTM(hidden_size,hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
    def forward(self,input,hidden,c):
        layer_1 = self.embedding(input).view(1, 1, -1)
        output,(hidden,c) = self.lstm(layer_1,(hidden,c))
        output = F.relu(output)
        output = self.dropout(output)
        output,(hidden,c) = self.lstm(output,(hidden,c))
#         output = self.dropout(output)
#         output,(hidden,c) = self.lstm(output,(hidden,c))
#         output = self.dropout(output)
#         output,(hidden,c) = self.lstm(output,(hidden,c))
        return output,(hidden,c)
    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size,device = device)
    def initC(self):
        return torch.zeros(1,1,self.c_size,device = device)

In [8]:
class Decoder(nn.Module):
    def __init__(self,output_size,hidden_size,c_size,dropout_p=0.1, max_length=MAX_LENGTH):
        super(Decoder,self).__init__()
        self.hidden_size = hidden_size
        self.c_size = c_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(output_size,hidden_size)
        self.lstm = nn.LSTM(hidden_size,hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.out = nn.Linear(hidden_size,output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self,input,hidden,c,encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output,(hidden,c) = self.lstm(output,(hidden,c))
        output = self.dropout(output)
        output,(hidden,c) = self.lstm(output,(hidden,c))
#         output = self.dropout(output)
#         output,(hidden,c) = self.lstm(output,(hidden,c))
#         output = self.dropout(output)
#         output,(hidden,c) = self.lstm(output,(hidden,c))

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output,(hidden,c), attn_weights
    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size,device = device)
        

In [9]:
teacher_forcing_ratio = 0.5
def train(encoder,decoder,input_tensor,output_tensor,encoder_optimizer,decoder_optimizer,criterion,max_length = MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_c = encoder.initC()
    
    # 梯度清零
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # 文本和摘要的长度
    input_length = input_tensor.size(0)
    output_length = output_tensor.size(0)
    
#     print('input',input_length)
#     print('output',output_length)
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    loss = 0
    
    # 对某个句子的每个字都放入encoder，中得到结果
    for i in range(input_length):
        encoder_output ,(encoder_hidden,encoder_c) = encoder(input_tensor[i],encoder_hidden,encoder_c)
    
    # decoder相关
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    # 第一个decoder的输入是SOS
    decoder_input = torch.tensor([[SOS_token]],device = device)
    decoder_hidden = encoder_hidden
    decoder_c = encoder_c
     
    if use_teacher_forcing:
        for i in range(output_length):
#             decoder_output,decoder_hidden = decoder(decoder_input,decoder_hidden)
            decoder_output,(decoder_hidden,decoder_c),attention = decoder(decoder_input,decoder_hidden,decoder_c,encoder_outputs)
            loss += criterion(decoder_output, output_tensor[i])
            decoder_input = output_tensor[i]
    else:
        for i in range(output_length):
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            decoder_output,(decoder_hidden,decoder_c),attention = decoder(decoder_input,decoder_hidden,decoder_c,encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, output_tensor[i])
            if decoder_input.item() == EOS_token:
                break
        
            
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / output_length

In [10]:
def train_iters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) 
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [getPairTensor(random.choice(pairs),alldic_x,alldic_y) for i in range(n_iters)]
    
    criterion = nn.NLLLoss()
    
    for iter in range(1,n_iters+1):
        train_pair = training_pairs[iter-1]
        input_tensor = train_pair[0]
        output_tensor = train_pair[1]
        
        loss = train(encoder,decoder,input_tensor,output_tensor,encoder_optimizer,decoder_optimizer,criterion)
        
        print_loss_total += loss
        
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('(%d %d%%) %.4f' % (iter, iter / n_iters * 100, print_loss_avg))


In [11]:
def tensorFromSentence(alldic_x, sentence):
    x = []
    for token in sentence:
        x.append(alldic_x.word2index[token])
    x.append(EOS_token)
    tensor = torch.tensor(x, dtype=torch.long, device=device).view(-1, 1)
    return tensor
   
        
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(alldic_x, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        encoder_c = encoder.initC()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, (encoder_hidden,encoder_c) = encoder(input_tensor[ei],
                                                     encoder_hidden,encoder_c)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden
        decoder_c = encoder_c

        decoded_words = []

        for di in range(max_length):
            decoder_output, (decoder_hidden,decoder_c),attention= decoder(decoder_input, decoder_hidden,decoder_c,encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(alldic_y.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [12]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ''.join(output_words)
        print('<', output_sentence)
        print('')

In [13]:
hidden_size = 256
c_size = 256
encoder = Encoder(alldic_x.num_words,hidden_size,c_size).to(device)
decoder = Decoder(alldic_y.num_words,hidden_size,c_size).to(device)

train_iters(encoder,decoder,50000,print_every=5000)

(5000 10%) 4.8619
(10000 20%) 4.5146
(15000 30%) 3.9118
(20000 40%) 3.5861
(25000 50%) 3.1339
(30000 60%) 2.6881
(35000 70%) 2.6095
(40000 80%) 2.5194
(45000 90%) 2.3994
(50000 100%) 2.3628


In [14]:
evaluateRandomly(encoder, decoder)

> 最近中国载人航天网发布了长征7号火箭矗立在文昌新型移动发射平台上的照片据悉长征7号火箭可能在2016年进行首次发射身处海南亲们到时候记得到文昌围观火箭发射哦~
= 长征7号来了可以在文昌围观火箭发射咯
< 长征7号来了可以观年人出在<EOS>

> 由于北京时间4月9日凌晨2点将公布美联储会议纪要内容所以我们预计在内容公布之前黄金市场可能不会有太大波动从当前糟糕的美国经济表现来看此次会议美联储可能会对加息采取更加谨慎的态度整体内容可能会偏向鸽派
= 非农糟糕美联储会议或显鸽派
< 非农糟糕美联储会议或显鸽<EOS>

> 1月21日起海南省厉行节约反对食品浪费的实施方案印发规定公务用餐原则是实行自助餐不提供高档菜肴酒水和香烟；单位食堂浪费要通报；餐饮企业主动帮助打包等等公务员们不能再大吃大喝了！
= 公务员们，要管好你们的嘴了
< 公务员们，要管管好转们象天象事<EOS>

> 2014海南十大网络新闻评选活动已圆满结束南海网也选出了投稿入围十大网络新闻并且多个推荐理由结合写得最精彩的十名网友他们将获得最佳参与奖每人奖金500元
= 蔬菜大棚整治登榜首
< 歌手尹相杰治程入程入<EOS>

> 家住海口市兴丹路的李先生每天都要开车经过红城湖路延长线他对红城湖路延长线限速50公里表示非常不理解认为周围热闹路段限速60公里、70公里到这条车少人稀的路段突然限速50公里许多车主过红绿灯后来不及换挡很容易就超速
= 海口红城湖路延长线限速不合理
< 海口红城湖路延线线路干新理止分<EOS>

> 三亚市崖州区大出水村一采石场因施工造成扬尘污染严重被当地环保部门下文责令改正但下达责令改正通知书至今已经过去一年多时间了该采石场仍在生产作业粉尘污染与噪声让附近居民和种植户苦不堪言
= 三亚一采石场粉尘污染严重芒果种植户苦不堪言
< 三亚一采石场粉尘污染严重芒果种户苦宾苦宾堪言<EOS>

> 12月30日南海网记者了解到12月20日第24期的互联网周刊上南海网位列2014中国区域门户网站排行第八据了解互联网周刊创刊于1998年是目前中国互联网和IT业界最成功的主流商业杂志之一
= 恭贺！南海网位列2014中国区域门户网站排行第八
< 恭贺！南网网为列204年工工工仍工<EOS>

> 随着美联储加息的预期和中国经济的低迷国际资本从中国出逃的情况比较严重周日中国央行再度降准正是