In [1]:
import os
import sys
import math

from collections import Counter
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk
nltk.download()

import jieba

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt


    Downloading package punkt to /root/nltk_data...
      Unzipping tokenizers/punkt.zip.



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [2]:
def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r') as f:
        for line in f:
            line = line.strip().split('\t')      # 每一行是英文+翻译的形式
            #print(line)   # ['Anyone can do that.', '任何人都可以做到。']
            #print(nltk.word_tokenize(line[0].lower()))    # ['anyone', 'can', 'do', 'that', '.']
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            #print([c for c in line[1]])   ['任', '何', '人', '都', '可', '以', '做', '到', '。']
            #print(list(jieba.cut(line[1])))        ['任何人', '都', '可以', '做到', '。']
            #cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])
            cn.append(['BOS'] + list(jieba.cut(line[1])) + ['EOS'])
    return en, cn

train_file = 'nmt/en-cn/train.txt'
dev_file = 'nmt/en-cn/dev.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.817 seconds.
DEBUG:jieba:Loading model cost 0.817 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [3]:
print(dev_cn[:5])

[['BOS', '她', '把', '雜誌', '放在', '桌上', '。', 'EOS'], ['BOS', '嘿', '，', '你', '在', '這做', '什麼', '？', 'EOS'], ['BOS', '請', '保守', '這個', '秘密', '。', 'EOS'], ['BOS', '事情', '怎麼', '變糟', '的', '？', 'EOS'], ['BOS', '京都', '和', '波士顿', '是', '姐妹', '城市', '。', 'EOS']]


In [4]:
UNK_IDX = 0
PAD_IDX = 1
def build_dict(sentences, max_words=50000):
    word_count = Counter()
    for sentence in sentences:
        for s in sentence:
            word_count[s] += 1
    ls = word_count.most_common(max_words)
    total_words = len(ls) + 2    # 两个特殊的字符UNK和PAD
    word_dict = {w[0]: index+2 for index, w in enumerate(ls)}   # 字典的前两个位置放特殊字符
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX
    return word_dict, total_words

en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)

inv_en_dict = {v:k for k, v in en_dict.items()}
inv_cn_dict = {v:k for k, v in cn_dict.items()}

In [5]:
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]
    # 根据英语句子的长度排序
    def len_argsort(seq):   # 这个seq是一个二维矩阵， 每一行是一个句子， 且都已经用单词在字典中的位置进行了编码
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]

    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

In [6]:
print(dev_cn[:5])

[[2, 0, 239, 4, 3], [2, 5019, 613, 3], [2, 587, 265, 4, 3], [2, 6819, 111, 4, 3], [2, 5052, 3420, 4, 3]]


In [7]:
def get_minibatches(n, minibatch_size, shuffle=True):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx+minibatch_size, n)))
    return minibatches      # 这个会返回多批连着的bath_size个索引
#get_minibatches(len(train_en), 32)

# 这个函数是在做数据预处理， 由于每个句子都不是一样长， 所以通过这个函数就可以把句子进行补齐， 不够长的在句子后面添加0
def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]    # 得到每个句子的长度
    n_samples = len(seqs)       # 得到一共有多少个句子
    max_len = np.max(lengths)              # 找出最大的句子长度

    x = np.zeros((n_samples, max_len)).astype('int32')    # 按照最大句子长度生成全0矩阵
    x_lengths = np.array(lengths).astype('int32')
    for idx, seq in enumerate(seqs):        # 把有句子的位置填充进去
        x[idx, :lengths[idx]] = seq
    return x, x_lengths      # x_mask

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)   # 得到batch个索引
    all_ex = []
    for minibatch in minibatches:   # 每批数据的索引
        mb_en_sentences = [en_sentences[t] for t in minibatch]   # 取数据
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]  # 取数据
        mb_x, mb_x_len = prepare_data(mb_en_sentences) # 填充成一样的长度， 但是要记录一下句子的真实长度， 这个在后面输入网络的时候得用
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    return all_ex

batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)   # 产生训练集
random.shuffle(train_data)
dev_data = gen_examples(dev_en, dev_cn, batch_size)   # 产生验证集

In [8]:
print(train_data[1][0].shape, train_data[1][1].shape, train_data[1][2].shape, train_data[1][3].shape)
# 第一个维度表示第1个batch， 第二个维度[0]代表每个每个句子单词个数， [1]代表每个句子的长度， [2]代表中文词个数， [3]代表每个句子的中文长度
# 注意每个batch里面的句子长度是不一样的， 同一batch里面的句子长度由于填充0使得一样了

(64, 12) (64,) (64, 15) (64,)


In [9]:
from torch.autograd import Variable
tensor_in = torch.FloatTensor([[1, 2, 3], [5, 0, 0]]).resize_(2, 3, 1)
tensor_in = Variable(tensor_in)
seq_lenghs = [3, 1]
tensor_in

tensor([[[1.],
         [2.],
         [3.]],

        [[5.],
         [0.],
         [0.]]])

In [10]:
torch.tensor(seq_lenghs).sort(0, descending=True)

torch.return_types.sort(
values=tensor([3, 1]),
indices=tensor([0, 1]))

In [11]:
pack = nn.utils.rnn.pack_padded_sequence(tensor_in, seq_lenghs, batch_first=True)
pack

PackedSequence(data=tensor([[1.],
        [5.],
        [2.],
        [3.]]), batch_sizes=tensor([2, 1, 1]), sorted_indices=None, unsorted_indices=None)

In [12]:
rnn = nn.RNN(1, 2, 3, batch_first=True)   # 输入维度是1(embed_dim)， 输出维度是2(2个隐藏单元), 3层
h0 = Variable(torch.randn(3, 2, 2))  # h0的初始状态， (layers_num*direction_nums, batch_size, hidden_size)

out, h = rnn(pack, h0)
out[0].shape   # [4, 2]
out

PackedSequence(data=tensor([[ 0.7026, -0.1171],
        [ 0.6928,  0.0010],
        [ 0.4563, -0.1514],
        [ 0.5318, -0.2126]], grad_fn=<CatBackward0>), batch_sizes=tensor([2, 1, 1]), sorted_indices=None, unsorted_indices=None)

In [19]:
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        #这里需要输入lengths, 因为每个句子是不一样长的，我们需要每个句子最后一个时间步的隐藏状态，
        #所以需要知道句子有多长，x表示一个batch里面的句子

        #把batch里面的seq按照长度排序
        sorted_len, sorted_idx = lengths.sort(0, descending=True) #sorten表示排好序的数组，sorted_index表示每个元素再原数组位置
        x_sorted = x[sorted_idx.long()] #句子已经按照seq长度排好序
        embedded = self.dropout(self.embed(x_sorted)) #[batch_size, seq_len, embed_size]

        #下面一段代码处理变长序列
        #这里的data.numpy()是原始张量的克隆，然后转成了numpy数组，相当于clone().numpy()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        #上面这句话之后，会把变长序列的0都给去掉，之前填充的字符都给压扁
        packed_out, hid = self.rnn(packed_embedded)#通过这句话就可以得到batch中每个样本的真实隐藏状态
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)#这里是再填充回去，看下面的例子就懂了
        _, original_idx = sorted_idx.sort(0, descending=False) #这里是为了还是让短的句子再前面
        out = out[original_idx.long()].contiguous() #contiguous是为了把不连续的内存单元连续起来
        hid = hid[:, original_idx.long()].contiguous()

        return out, hid[[-1]] #把最后一层的his给拿出来，这个具体看上面的简单演示

In [25]:
#这个基本上和Encoder是一致得，无非就是初始化得h换成了Encoder之后得h
class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
    def forward(self, y, y_lengths, hid):
        #y:[batch_size, seq_len-1]
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True) #依然是句子从长到短排序
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]

        y_sorted = self.dropout(self.embed(y_sorted)) #[batch_size, output_length, embed_size]

        pack_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(pack_seq, hid) #这个计算得是每个有效时间步单词得最后一层得隐藏状态
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True) #[batch,seq_len-1, hidden_size]
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous() #[batch, seq_len-1, hidden_size]

        hid = hid[:, original_idx.long()].contiguous() #[1,batch, hidden_size]
        output = F.log_softmax(self.out(output_seq), -1)
        #[batch, seq_len-1, vocab_size] 表示每个样本每个时间步长都有一个vocab_size得维度长度，表示每个单词得概率

        return output, hid

In [26]:
class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths) #encoder进行编码
        output, hid = self.decoder(y, y_lengths, hid) #decoder负责解码
        return output, None
    def translate(self, x, x_lengths, y, max_length=10):#这个是进来一个句子进行翻译 max_length句子得最大长度
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid = self.decoder(y, torch.ones(batch_size).long().to(y.device), hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
        return torch.cat(preds, 1), None

In [27]:
# masked cross entropy loss
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        # input: [batch_size, seq_len, vocab_size]    每个单词的可能性
        input = input.contiguous().view(-1, input.size(2))   # [batch_size*seq_len-1, vocab_size]
        target = target.contiguous().view(-1, 1)    #  [batch_size*seq_len-1, 1]

        mask = mask.contiguous().view(-1, 1)   # [batch_size*seq_len-1, 1]
        output = -input.gather(1, target) * mask # 在每个vocab_size维度取正确单词的索引， 但是里面有很多是填充进去的， 所以mask去掉这些填充的
        # 这个其实在写一个NLloss ， 也就是sortmax的取负号
        output = torch.sum(output) / torch.sum(mask)

        return output  # [batch_size*seq_len-1, 1]

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dropout = 0.2
hidden_size = 100
encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)

model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

In [29]:
def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()    # 这个是一个batch的英文句子 大小是[batch_size, seq_len]
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()    # 每个句子的长度
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()  # 解码器那边的输入， 输入一个单词去预测另外一个单词
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()   # 解码器那边的输出  [batch_size, seq_len-1]
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()  # 这个减去1， 因为没有了最后一个  [batch_size, seq_len-1]
            mb_y_len[mb_y_len<=0] =  1   # 这句话是为了以防出错

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            # [batch_size, mb_y_len.max()], 上面是bool类型， 下面是float类型， 只计算每个句子的有效部分， 填充的那部分去掉
            mb_out_mask = mb_out_mask.float()  # [batch_size, seq_len-1]  因为mb_y_len.max()就是seq_len-1

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print('Evaluation loss', total_loss / total_num_words)

def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
            mb_y_len[mb_y_len<=0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

            # 更新
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)     # 这里防止梯度爆炸， 这是和以往不太一样的地方
            optimizer.step()

            if it % 100 == 0:
                print('Epoch', epoch, 'iteration', it, 'loss', loss.item())

        print('Epoch', epoch, 'Training loss', total_loss / total_num_words)
        if epoch % 5 == 0:
            evaluate(model, dev_data)

# 训练
train(model, train_data, num_epochs=20)

Epoch 0 iteration 0 loss 9.331954956054688
Epoch 0 iteration 100 loss 6.044123649597168
Epoch 0 iteration 200 loss 5.020909786224365
Epoch 0 Training loss 5.850677881439716
Evaluation loss 5.202617210244227
Epoch 1 iteration 0 loss 4.993743419647217
Epoch 1 iteration 100 loss 5.517873764038086
Epoch 1 iteration 200 loss 4.523971080780029
Epoch 1 Training loss 4.933376639293118
Epoch 2 iteration 0 loss 4.642943382263184
Epoch 2 iteration 100 loss 5.179933071136475
Epoch 2 iteration 200 loss 4.205909729003906
Epoch 2 Training loss 4.601179413637688
Epoch 3 iteration 0 loss 4.386545181274414
Epoch 3 iteration 100 loss 4.938098430633545
Epoch 3 iteration 200 loss 3.9619898796081543
Epoch 3 Training loss 4.355408479520593
Epoch 4 iteration 0 loss 4.157600402832031
Epoch 4 iteration 100 loss 4.773762226104736
Epoch 4 iteration 200 loss 3.767699956893921
Epoch 4 Training loss 4.152598270888499
Epoch 5 iteration 0 loss 3.964405059814453
Epoch 5 iteration 100 loss 4.622564792633057
Epoch 5 iter

Attention model

In [36]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size*2, dec_hidden_size)
    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted)) #[batch_size, seq_len, embed_size]

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)#[batch_size,seq_len,2*enc_hidden_size]
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous() #[batch_size, seq_len, 2*enc_hidden_size]
        hid = hid[:, original_idx.long()].contiguous() #[2,batch_size, enc_hidden_size]

        hid = torch.cat([hid[-2], hid[-1]], dim=1) #双向的GRU，这里是最后一个状态，联结起来 [batch_size, 2*enc_hidden_size]
        hid = torch.tanh(self.fc(hid)).unsqueeze(0) #[1, batch_size, dec_hidden_size]

        return out, hid

In [37]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()

        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size

        self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size*2+dec_hidden_size, dec_hidden_size)

    def forward(self, output, encoder_output, mask):
        # output: [batch_size, seq_len_y-1, dec_hidden_size]  这个output 是decoder的每个时间步输出的隐藏状态
        # encoder_output: [batch_size, seq_len_x, 2*enc_hidden_size]
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = encoder_output.size(1)

        context_in = self.linear_in(encoder_output.view(batch_size*input_len, -1))  # [batch_size*seq_len_x,dec_hidden_size]
        context_in = context_in.view(batch_size, input_len, -1)  # [batch_size, seq_len_x, dec_hidden_size]
        context_in = context_in.transpose(1, 2)   # [batch_size, dec_hidden_size, seq_len_x]

        attn = torch.bmm(output, context_in)  # [batch_size, seq_len_y-1, seq_len_x]
        # 这个东西就是求得当前时间步的输出output和所有输入相似性关系的一个得分score , 下面就是通过softmax把这个得分转成权重
        attn = F.softmax(attn, dim=2)    # 此时第二维度的数字全都变成了0-1之间的数， 越大表示当前的输出output与哪个相关程度越大

        context = torch.bmm(attn, encoder_output)   # [batch_size, seq_len_y-1, 2*enc_hidden_size]

        output = torch.cat((context, output), dim=2)  # [batch_size, seq_len_y-1, 2*enc_hidden_size+dec_hidden_size]

        output = output.view(batch_size*output_len, -1)   # [batch_size*seq_len_y-1, 2*enc_hidden_size+dec_hidden_size]
        output = torch.tanh(self.linear_out(output))     # [batch_size*seq_len_y-1, dec_hidden_size]
        output = output.view(batch_size, output_len, -1)  # [batch_size, seq_len_y-1, dec_hidden_size]

        return output, attn

In [38]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        # a mask of shape x_len*y_len
        x_mask = torch.arange(x_len.max(), device=x_len.device)[None, :] < x_len[:, None]
        y_mask = torch.arange(y_len.max(), device=x_len.device)[None, :] < y_len[:, None]

        x_mask = x_mask.float()
        y_mask = y_mask.float()
        mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte()
        return mask

    def forward(self, encoder_out, encoder_out_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]   # 句子从长到短排序
        hid = hid[:, sorted_idx.long()]

        y_sorted = self.dropout(self.embed(y_sorted))     # [batch_size, output_length, embed_size]

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()   # [batch_size, seq_len_y-1, dec_hidden_size]
        hid = hid[:, original_idx.long()].contiguous()

        mask = self.create_mask(y_lengths, encoder_out_lengths)

        output, attn = self.attention(output_seq, encoder_out, mask)
        output = F.log_softmax(self.out(output), -1)

        return output, hid, attn

In [39]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(encoder_out, x_lengths, y, y_lengths, hid)

        return output, attn

    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(encoder_out, x_lengths, y, torch.ones(batch_size).long().to(y.device), hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)

        return torch.cat(preds, 1), torch.cat(attns, 1)