In [1]:
import torch
import numpy as np
import nltk
import pkuseg
import torch.nn as nn
import torch.nn.functional as F

from collections import Counter
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence


# 数据预处理

## <font color='blue'>读取数据</font>

In [2]:
# 读取数据并处理成中英文列表，两个列表相同索引位置为对应译文
def load_data(filename):
    en = []
    cn = []
    seg = pkuseg.pkuseg()
    with open(filename) as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])
    return en, cn

In [3]:
# 划分训练集和验证集
def train_val_split(en, cn, split=0.1):
    val_len = int(len(en) * split)
    val_index = np.random.choice(len(en), val_len, replace=False)
    train_en = []
    train_cn = []
    val_en = []
    val_cn = []
    for i in range(len(en)):
        if i in val_index:
            val_en.append(en[i])
            val_cn.append(cn[i])
        else:
            train_en.append(en[i])
            train_cn.append(cn[i])
    
    return train_en, train_cn, val_en, val_cn

In [4]:
# 自己找的数据集
filename = 'datasets/cmn.txt'
en, cn = load_data(filename)
train_en, train_cn, val_en, val_cn = train_val_split(en, cn)

In [5]:
print('train_en len:', len(train_en))
print('train_cn len:', len(train_cn))
print('val_en len:', len(val_en))
print('val_cn len:', len(val_cn))

train_en len: 18120
train_cn len: 18120
val_en len: 2013
val_cn len: 2013


## <font color='blue'>构建词表</font>

In [6]:
UNK_ID = 1
PAD_ID = 0

def build_vocab(text, max_words=None):
    word_count = Counter()
    for sentence in text:
        word_count.update(sentence)
    if not max_words:
        max_words = len(word_count)
    word_count = word_count.most_common(max_words)
    
    vocab_size = max_words + 2
    word2ix = {item[0] : i+2 for i, item in enumerate(word_count)}
    word2ix['UNK'] = UNK_ID
    word2ix['PAD'] = PAD_ID
    
    return word2ix, vocab_size

en_word2ix, en_vocab_size = build_vocab(en)
cn_word2ix, cn_vocab_size = build_vocab(cn)

en_ix2word = {ix:word for word, ix in en_word2ix.items()}
cn_ix2word = {ix:word for word, ix in cn_word2ix.items()}

## <font color='blue'>构建Dataset</font>

In [7]:
# 转换成数字编码
def encode(en, cn):
    encode_en = [[en_word2ix[word] for word in sentence] for sentence in en]
    encode_cn = [[cn_word2ix[word] for word in sentence] for sentence in cn]
    return encode_en, encode_cn

In [8]:
# 数字编码后的数据
train_data_en, train_data_cn = encode(en,cn)
val_data_en, val_data_cn = encode(en, cn)

In [9]:
class translateDataset(Dataset):
    def __init__(self, en, cn):
        self.en = en
        self.cn = cn
    
    def __getitem__(self, index):
        en_sentence = torch.LongTensor(self.en[index])
        cn_sentence = torch.LongTensor(self.cn[index])
        return en_sentence, cn_sentence
    
    def __len__(self):
        return len(en)

In [10]:
def collate_fn(data):
    en, cn = zip(*data)
    en_lengths = [len(sentence) for sentence in en]
    cn_lengths = [len(sentence) for sentence in cn]
    
    en_batch_length = max(en_lengths)
    cn_batch_length = max(cn_lengths)
    
    target_en = torch.zeros(len(en), en_batch_length).long()
    target_cn = torch.zeros(len(cn), cn_batch_length).long()
    
    for i in range(len(en)):
        en_text = en[i]
        cn_text = cn[i]
        
        en_len = en_lengths[i]
        cn_len = cn_lengths[i]
        
        target_en[i, :en_len] = en_text
        target_cn[i, :cn_len] = cn_text
    en_lengths = torch.LongTensor(en_lengths)
    cn_lengths = torch.LongTensor(cn_lengths)
    return target_en, en_lengths, target_cn, cn_lengths

In [11]:
def get_dataloader(batch_size=100, shuffle=True):
    train_dataset = translateDataset(train_data_en, train_data_cn)
    val_dataset = translateDataset(val_data_en, val_data_cn)
    
    train_dataloader = DataLoader(dataset=train_dataset
                                 ,batch_size=batch_size
                                 ,shuffle=shuffle
                                 ,pin_memory=True
                                 ,collate_fn=collate_fn)
    
    val_dataloader = DataLoader(dataset=val_dataset
                                 ,batch_size=batch_size
                                 ,shuffle=shuffle
                                 ,pin_memory=True
                                 ,collate_fn=collate_fn)
    return train_dataloader, val_dataloader

In [12]:
train_dataloader, val_dataloader = get_dataloader()

# 构建encoder-decoder模型(没有attention)

## <font color='blue'>encoder部分</font>

In [13]:
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.2):
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, lengths):
        '''
        x: (batch_size , seq_len)
        lengths: (batch_size,)
        '''
        sorted_len, sorted_index = lengths.sort(0, descending=True)
        
        sorted_x = x[sorted_index.long()]
        embed = self.dropout(self.embed(sorted_x))
        # embed: (batch_size , seq_len , embed_size)
        
        packed_embed = pack_padded_sequence(embed, sorted_len, batch_first=True)
        
        out, hidden = self.gru(packed_embed)
        # hidden: h_n = (1 , batch_size , hidden_size)
        
        out, _ = pad_packed_sequence(out, batch_first=True) # 其实encoder部分可以不用out，只需要hidden就行了，但是加了attention就必须要了
        # out: (batch_size , seq_len , hidden_size)
        
        # 下面的操作就是还原index
        _, orginal_index = sorted_index.sort(0)
        out = out[orginal_index.long()].contiguous()
        hidden = hidden[:,orginal_index.long()].contiguous()
        # out: (batch_size , seq_len , embed_size)
        # hidden: h_n = (1 , batch_size , hidden_size)
        
        return out, hidden[[-1]]      # [-1]表示取最后一个元素并保持维度数，
                                      # 例如hidden的shape为(3,4,5)，那么hidden[[-1]]的维度为(1,4,5)；而hidden[-1]为(4,5)
        

## <font color='blue'>decoder部分</font>

In [14]:
class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, y, lengths, hidden):
        sorted_len, sorted_index = lengths.sort(0, descending=True)
        
        sorted_y = y[sorted_index.long()]
        hidden = hidden[:, sorted_index.long()]
        
        embed = self.dropout(self.embed(sorted_y))
        
        packed_embed = pack_padded_sequence(embed, sorted_len, batch_first=True)
        
        out, hidden = self.gru(packed_embed, hidden)
        # h_n = (1 , batch_size , hidden_size)
        out, _ = pad_packed_sequence(out,batch_first=True)
        # out: (batch_size , seq_len , embed_size)
        
        _, orginal_index = sorted_index.sort(0)
        
        out = out[orginal_index.long()].contiguous()
        hidden = hidden[:,orginal_index.long()].contiguous()
        
        output = self.fc(out)
        output = F.log_softmax(output, -1)
        # output: (batch_size, seq_len, vocab_size)
        
        return output, hidden

## <font color='blue'>Seq2seq模型</font>

In [15]:
class PlainSeq2seq(nn.Module):
    def __init__(self, en_vocab_size, cn_vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.2):
        super(PlainSeq2seq, self).__init__()
        self.encoder = PlainEncoder(en_vocab_size, embed_size, hidden_size, num_layers, dropout)
        self.decoder = PlainDecoder(cn_vocab_size, embed_size, hidden_size, num_layers, dropout)
        
    def forward(self, en_data, en_lengths, cn_data, cn_lengths):
        _, hidden = self.encoder(en_data, en_lengths)
        output, _ = self.decoder(cn_data, cn_lengths, hidden)
        
        return output
    
    def translate(self, en, cn_word2ix, cn_ix2word, max_len=10):
        '''
        输入：一句经过编码的句子,shape为(seq_len, )
        '''
        en_lengths = torch.LongTensor([len(en)]).to(en.device)
        # en_lengths: (1,) , 即batch_size = 1
        en = en.unsqueeze(0)
        # en: (1, seq_len) , 即batch_size = 1
        _, hidden = self.encoder(en, en_lengths)
        # hidden: (1, 1, hidden_size)
        y = torch.LongTensor([[cn_word2ix['BOS']]]).to(en.device)
        # y:(1, 1)
        res = []
        for i in range(max_len):
            output, hidden = self.decoder(y, torch.ones(1,).long().to(y.device),hidden=hidden)
            # output: (1,1,vocab_size), 经过log_softmax后的output
            y = output.max(2, keepdim=True)[1].view(-1,1)
            index = y.item()
            res.append(index)
            if index==cn_word2ix['EOS']:
                break
        preds = [cn_ix2word[word] for word in res]
        
        return preds
        

## <font color='blue'>定义损失函数</font>

In [16]:
class LanguageModelLoss(nn.Module):
    def __init__(self):
        super(LanguageModelLoss, self).__init__()
    
    def forward(self, outputs, targets, mask):
        '''
        outputs: (batch_size, max_seq_len, vocab_size)
        targets: (batch_size, max_seq_len)
        mask: (batch_size, max_seq_len)
        '''
        outputs = outputs.contiguous().view(-1, outputs.size(2))
        # outputs: (batch_size * max_seq_len,  vocab_size)
        targets = targets.contiguous().view(-1, 1)
        # targets: (batch_size * max_seq_len, 1)
        mask = mask.contiguous().view(-1,1)
        # mask: (batch_size * max_seq_len, 1)
        
        losses = -outputs.gather(1, targets) * mask
        
        loss = torch.sum(losses) / torch.sum(mask)
        
        return loss

# 开始训练

## <font color='blue'>训练函数</font>

In [17]:
def train(train_data_loader, dev_data_loader, model, optimizer, loss_fn, device, max_epochs=2):
    best_loss = float('inf')
    for epoch in range(max_epochs):
        model.train()
        for iteration, (en_data, en_lengths, cn_data, cn_lengths) in enumerate(train_data_loader):
            en_data = en_data.to(device)
            en_lengths = en_lengths.to(device)
            cn_input = cn_data[:, :-1].to(device)
            cn_target = cn_data[:, 1:].to(device)
            cn_lengths = (cn_lengths-1).to(device)
            
            preds = model(en_data, en_lengths, cn_input, cn_lengths)
            
            mask = torch.arange(cn_lengths.max().item(), device=device)[None,:] < cn_lengths[:,None]
            mask = mask.float()
            
            loss = loss_fn(preds, cn_target, mask)
            optimizer.zero_grad()
            loss.backward()
            # 为了防止梯度过大
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            
            optimizer.step()
            
            if iteration%100 == 0:
                print('Epoch: ', epoch, ' |  Iteration', iteration, ' |  loss: ', loss.item())
        if epoch % 3 == 0:
            dev_loss = evaluate(dev_data_loader, model, loss_fn, device)
            if dev_loss < best_loss:
                best_model = model
    return best_model

## <font color='blue'>验证函数</font>

In [18]:
def evaluate(dev_data_loader, model, loss_fn, deivce):
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for iteration, (en_data, en_lengths, cn_data, cn_lengths) in enumerate(dev_data_loader):
            en_data = en_data.to(device)
            en_lengths = en_lengths.to(device)
            cn_data = cn_data.to(device)
            cn_input = cn_data[:, :-1].to(device)
            cn_target = cn_data[:, 1:].to(device)
            cn_lengths = (cn_lengths-1).to(device)
            
            preds = model(en_data, en_lengths, cn_input, cn_lengths)
            
            mask = torch.arange(cn_lengths.max().item(), device=device)[None,:] < cn_lengths[:,None]
            mask = mask.float()
            
            loss = loss_fn(preds, cn_target, mask)
            total_loss += loss.item()
    print('Dev Loss: ', total_loss / len(dev_data_loader))
    return total_loss / len(dev_data_loader)

## <font color='blue'>实例化模型、优化器、损失函数等</font>

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

en_vocab_size = len(en_word2ix)
cn_vocab_size = len(cn_word2ix)

embed_size = 100
hidden_size = 100
num_layers = 1
dropout = 0.2
lr = 0.01

model = PlainSeq2seq(en_vocab_size, cn_vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

optimizer = torch.optim.Adam(model.parameters())
loss_fn = LanguageModelLoss().to(device)

best_model = train(train_dataloader, val_dataloader, model, optimizer, loss_fn, device, max_epochs=30)

Epoch:  0  |  Iteration 0  |  loss:  8.153417587280273
Epoch:  0  |  Iteration 100  |  loss:  5.07241153717041
Epoch:  0  |  Iteration 200  |  loss:  4.898135185241699
Dev Loss:  4.741683891504118
Epoch:  1  |  Iteration 0  |  loss:  4.577436923980713
Epoch:  1  |  Iteration 100  |  loss:  4.515731334686279
Epoch:  1  |  Iteration 200  |  loss:  4.224242210388184
Epoch:  2  |  Iteration 0  |  loss:  4.005097389221191
Epoch:  2  |  Iteration 100  |  loss:  4.192634105682373
Epoch:  2  |  Iteration 200  |  loss:  3.9371724128723145
Epoch:  3  |  Iteration 0  |  loss:  3.823545217514038
Epoch:  3  |  Iteration 100  |  loss:  3.886296510696411
Epoch:  3  |  Iteration 200  |  loss:  3.599013328552246
Dev Loss:  3.6594991412493263
Epoch:  4  |  Iteration 0  |  loss:  3.8164820671081543
Epoch:  4  |  Iteration 100  |  loss:  3.551103115081787
Epoch:  4  |  Iteration 200  |  loss:  3.482710123062134
Epoch:  5  |  Iteration 0  |  loss:  3.647207021713257
Epoch:  5  |  Iteration 100  |  loss:  3

## <font color='blue'>实例验证</font>

In [20]:
def translate_dev(model, val_en, val_cn, i, device):
    en_sent = " ".join([en_ix2word[idx] for idx in val_en[i]])
    print(en_sent)
    cn_sent = " ".join([cn_ix2word[idx] for idx in val_cn[i]])
    print(cn_sent)
    
    input_en = torch.LongTensor(val_en[i]).to(device)
    trans = model.translate(input_en, cn_word2ix, cn_ix2word, max_len=10)
    if trans[-1] == 'EOS':
        trans.pop()
    print(" ".join(trans))
    print("-------------------")

In [21]:
for i in range(100,125):
    translate_dev(best_model, val_data_en, val_data_cn, i, device)

BOS try some . EOS
BOS 试 试 吧 。 EOS
试 试 。
-------------------
BOS who died ? EOS
BOS 谁 死 了 ？ EOS
谁 还 是 个 ？
-------------------
BOS birds fly . EOS
BOS 鳥 類 飛 行 。 EOS
去 了 醫 院 。
-------------------
BOS call home ! EOS
BOS 打 电 话 回 家 ！ EOS
有 人 好 ！
-------------------
BOS catch him . EOS
BOS 抓 住 他 。 EOS
抓 住 他 。
-------------------
BOS come home . EOS
BOS 回 家 吧 。 EOS
回 来 。
-------------------
BOS do it now . EOS
BOS 現 在 就 做 。 EOS
現 在 正 在 吃 。
-------------------
BOS dogs bark . EOS
BOS 狗 会 叫 。 EOS
猫 吧 ！
-------------------
BOS do n't cry . EOS
BOS 别 哭 。 EOS
别 人 了 。
-------------------
BOS excuse me . EOS
BOS 对 不 起 。 EOS
我 不 要 打 網 球 。
-------------------
BOS feel this . EOS
BOS 来 感 受 一 下 这 个 。 EOS
這 可 能 。
-------------------
BOS follow me . EOS
BOS 请 跟 我 来 。 EOS
請 給 我 。
-------------------
BOS follow us . EOS
BOS 请 跟 着 我 们 。 EOS
别 人 来 说 了 。
-------------------
BOS good luck . EOS
BOS 祝 你 好 运 。 EOS
祝 贺 你 。
-------------------
BOS grab that . EOS
BOS 抓 住 那 个 。 EOS
繼 續 工 作 。
-------------------
BOS

# 构建encoder-decoder模型(attention)

<font color='red'>**实现注意事项**</font>  
1. Luong Attetion模型和Bahdanau Attention模型的不同  
2. 注意训练过程rnn的输出并不需要输入进下一个time step中，而是仅仅在预测阶段才输入下一个time step中。这其实就是<font color='red'>**teacher forcing**</font>  
3. 在rnn的输出h_n中,h_n[0]表示第一层的前向输出的隐藏状态, h_n[1]表示第一层的后向输出的隐藏状态；h_n[2]表示第二层...前向...,h_n[3]...第二层...后向...

## <font color='blue'>encoder部分</font>  
1. 双向GRU  
2. 两层堆叠

In [22]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, en_hidden_size, de_hidden_size, num_layers=2, dropout=0.2):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        
        self.gru = nn.GRU(embed_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2 * en_hidden_size, de_hidden_size)
        
    def forward(self, en, en_lengths):
        # en : (batch_size, max_seq_len)
        # en_lengths : (batch_size, )
        sorted_len, sorted_idx = en_lengths.sort(0, descending=True)
        
        sorted_en = en[sorted_idx.long()]
        
        embed = self.dropout(self.embed(sorted_en))
        # embed : (batch_size, max_seq_len, embed_size)
        
        packed_embed = pack_padded_sequence(embed, sorted_len, batch_first=True)
        
        packed_output, hidden = self.gru(packed_embed)
        
        # hidden : (num_layers * num_directions, batch_size, en_hidden_size)
        #          (2, batch_size, en_hidden_size) (由于gru的bidirectional为True，所以这里有个2)
        
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        
        # output: (batch_size, seq_len, 2 * hidden_size) (由于gru的bidirectional为True，所以这里有个2)
        # 这个output在attention中需要
        _, orginal_idx = sorted_idx.sort(0)
        output = output[orginal_idx.long()].contiguous()
        
        ##############################
        # 错误代码： hidden = hidden[orginal_idx.long()].contiguous()
        # 这句代码卡了一个晚上加一个白天，居然是维度弄错了。。。真是醉了，用的测试用例因为数量太少居然通过了
        
        hidden = hidden[:, orginal_idx.long()].contiguous()
        
        
        ###############################
        
        
        
        # 由于在Luong Attention模型中decoder为单向的RNN，所以encoder输出的hidden要作一些变换
        hidden = torch.cat((hidden[0::2], hidden[1::2]), dim=2)
        # hidden: (num_layers, batch_size, en_hidden_size * 2)
        hidden = self.fc(hidden)
        # hidden: (num_layers, batch_size, de_hidden_size)
        return output, hidden

## <font color='blue'>Attention部分</font>

<font color='red'>**实现注意事项**</font>  
1. 计算context的时候，需要将encoder的output的隐状态转换为与decoder隐状态相同的维度(这是fc_in为（en_hidden_size * 2, de_hidden_size）的原因)，但是在计算attention后的向量的时候用的还是原来的encoder的output(这是为什么fc\_out为（en_hidden_size * 2 + de_hidden_size, de_hidden_size）的原因)

In [23]:
class Attention(nn.Module):
    def __init__(self, en_hidden_size, de_hidden_size):
        # 由于encoder是bidirectional，所以先要处理encoder的output
        super(Attention, self).__init__()
        self.fc_in = nn.Linear(en_hidden_size * 2, de_hidden_size)
        self.fc_out = nn.Linear(en_hidden_size * 2 + de_hidden_size, de_hidden_size)
    
    def forward(self, ht, en_output, mask):
        # ht: (batch_size, de_seq_len, de_hidden_size)
        # en_output: (batch_size, en_seq_len, 2 * en_hidden_size)
        
        hs = self.fc_in(en_output)
        # hs: (batch_size, en_seq_len, de_hidden_size)
        
        score = ht.bmm(hs.transpose(1,2))
        # socre:(batch_size, de_seq_len, en_seq_len)
        # score[i,j] 表示decoder的第i个单词隐藏状态的输出和encoder第j个单词隐藏状态输出的得分
        
        score.data.masked_fill(mask, 1e-6) # 主要是为了防止0经过softmax后占掉一部分比例，使其它有效的区域占的比例下降
        
        attn = F.softmax(score, dim=2)
        # attn:(batch_size, de_seq_len, en_seq_len)
        
        context = torch.bmm(attn, en_output)
        # context: (batch_size, de_seq_len, 2*en_hidden_size)
        ht_hat = self.fc_out(torch.cat((context, ht), dim=2))
        ht_hat = torch.tanh(ht_hat)
        # ht_hat: (batch_size, de_seq_len, de_hidden_size)
        
        return ht_hat, attn

## <font color='blue'>Decoder部分</font>

1. 单向GRU(与encoder不同)  
2. 两层堆叠

In [24]:
class LoungAttnDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, en_hidden_size, de_hidden_size, num_layers=2, dropout=0.2):
        super(LoungAttnDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, de_hidden_size, num_layers=num_layers, batch_first=True)
        self.attention = Attention(en_hidden_size, de_hidden_size)
        self.fc = nn.Linear(de_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    
    def create_mask(self, en_lengths, cn_lengths, device):
        en_max_len = en_lengths.max()
        cn_max_len = cn_lengths.max()
        
        mask_en = torch.arange(en_max_len, device=device)[None,:] < en_lengths[:, None]
        # mask_en: (batch_size, en_seq_len)
        mask_cn = torch.arange(cn_max_len, device=device)[None,:] < cn_lengths[:, None]
        # mask_cn: (batch_size, cn_seq_len)
        mask = (mask_en[:,None,:] * mask_cn[:,:,None]).logical_not()
        # mask : (batch_size, cn_seq_len, en_seq_len)
        
        return mask
        
    def forward(self, en_output,en_lengths, cn, cn_lengths, hidden):
        sorted_len, sorted_idx = cn_lengths.sort(0, descending=True)
        
        sorted_cn = cn[sorted_idx.long()]
        hidden = hidden[:, sorted_idx.long()]
        
        embed = self.dropout(self.embed(sorted_cn))
        packed_embed = pack_padded_sequence(embed, sorted_len, batch_first=True)
        
        packed_out, hidden = self.gru(packed_embed, hidden)
        # hidden: (num_layers * num_directions, batch_size, de_hidden_size)    这个hidden似乎没用到
        #         (2, batch_size, de_hidden_size) 
        
        output, _ = pad_packed_sequence(packed_out, batch_first=True)
        # output : (batch_size, max_seq_len, de_hidden_size)
        _, original_idx = sorted_idx.sort(0)
        
        output = output[original_idx.long()].contiguous()
        hidden = hidden[:, original_idx.long()].contiguous()
        # output : (batch_size, max_seq_len, de_hidden_size)
        # hidden: (num_layers * num_directions, batch_size, de_hidden_size)
        
        # 由于padding的存在，不是所有的句子的max_seq_len长度都要用到，所以此时需要用一个mask来屏蔽padding的影响。
        mask = self.create_mask(en_lengths, cn_lengths, cn.device)
        
        ht_hat, attn = self.attention(output, en_output, mask)
        # ht_hat: (batch_size, de_seq_len, de_hidden_size)
        # attn:(batch_size, de_seq_len, en_seq_len)
        
        output = F.log_softmax(self.fc(ht_hat), -1)
        
        return output, hidden, attn

## <font color='blue'>Seq2seq部分</font>

In [25]:
class Seq2seq(nn.Module):
    def __init__(self, en_vocab_size, cn_vocab_size, embed_size, en_hidden_size,de_hidden_size, num_layers=2, dropout=0.2):
        super(Seq2seq, self).__init__()
        self.encoder = Encoder(en_vocab_size, embed_size, en_hidden_size, de_hidden_size, num_layers, dropout)
        self.decoder = LoungAttnDecoder(cn_vocab_size, embed_size, en_hidden_size, de_hidden_size, num_layers, dropout)
        
    def forward(self, en_data, en_lengths, cn_data, cn_lengths):
        en_output, hidden = self.encoder(en_data, en_lengths)
        output, _, attn = self.decoder(en_output, en_lengths, cn_data, cn_lengths, hidden)
        
        return output, attn
    
    def translate(self, en, cn_word2ix, cn_ix2word, max_len=100):
        '''
        输入：一句经过编码的句子,shape为(seq_len, )
        '''
        en_lengths = torch.LongTensor([len(en)]).to(en.device)
        # en_lengths: (1,) , 即batch_size = 1
        en = en.unsqueeze(0)
        # en: (1, seq_len) , 即batch_size = 1
        en_output, hidden = self.encoder(en, en_lengths)
        # hidden: (1, 1, hidden_size)
        y = torch.LongTensor([[cn_word2ix['BOS']]]).to(en.device)
        # y:(1, 1)
        res = []
        for i in range(max_len):
            output, hidden, _ = self.decoder(en_output, en_lengths, y, torch.ones(1,).long().to(y.device),hidden=hidden)
            # output: (1,1,vocab_size), 经过log_softmax后的output
            y = output.max(2, keepdim=True)[1].view(-1,1)
            index = y.item()
            res.append(index)
            if index==cn_word2ix['EOS']:
                break
        preds = [cn_ix2word[word] for word in res]
        
        return preds

## <font color='blue'>训练函数</font>

In [26]:
def train(train_data_loader, dev_data_loader, model, optimizer, loss_fn, device, max_epochs=2):
    best_loss = float('inf')
    for epoch in range(max_epochs):
        model.train()
        for iteration, (en_data, en_lengths, cn_data, cn_lengths) in enumerate(train_data_loader):
            en_data = en_data.to(device)
            en_lengths = en_lengths.to(device)
            cn_input = cn_data[:, :-1].to(device)
            cn_target = cn_data[:, 1:].to(device)
            cn_lengths = (cn_lengths-1).to(device)
            
            preds, attn = model(en_data, en_lengths, cn_input, cn_lengths)
            
            mask = torch.arange(cn_lengths.max().item(), device=device)[None,:] < cn_lengths[:,None]
            mask = mask.float()
            
            loss = loss_fn(preds, cn_target, mask)
            optimizer.zero_grad()
            loss.backward()
            # 为了防止梯度过大
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            
            optimizer.step()
            
            if iteration%100 == 0:
                print('Epoch: ', epoch, ' |  Iteration', iteration, ' |  loss: ', loss.item())
        if epoch % 3 == 0:
            dev_loss = evaluate(dev_data_loader, model, loss_fn, device)
            if dev_loss < best_loss:
                best_model = model
    return best_model

## <font color='blue'>验证函数</font>

In [27]:
def evaluate(dev_data_loader, model, loss_fn, deivce):
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for iteration, (en_data, en_lengths, cn_data, cn_lengths) in enumerate(dev_data_loader):
            en_data = en_data.to(device)
            en_lengths = en_lengths.to(device)
            cn_data = cn_data.to(device)
            cn_input = cn_data[:, :-1].to(device)
            cn_target = cn_data[:, 1:].to(device)
            cn_lengths = (cn_lengths-1).to(device)
            
            preds, attn = model(en_data, en_lengths, cn_input, cn_lengths)
            
            mask = torch.arange(cn_lengths.max().item(), device=device)[None,:] < cn_lengths[:,None]
            mask = mask.float()
            
            loss = loss_fn(preds, cn_target, mask)
            total_loss += loss.item()
    print('Dev Loss: ', total_loss / len(dev_data_loader))
    return total_loss / len(dev_data_loader)

## 定义损失函数

In [28]:
class LanguageModelLoss(nn.Module):
    def __init__(self):
        super(LanguageModelLoss, self).__init__()
    
    def forward(self, outputs, targets, mask):
        '''
        outputs: (batch_size, max_seq_len, vocab_size)
        targets: (batch_size, max_seq_len)
        mask: (batch_size, max_seq_len)
        '''
        outputs = outputs.contiguous().view(-1, outputs.size(2))
        # outputs: (batch_size * max_seq_len,  vocab_size)
        targets = targets.contiguous().view(-1, 1)
        # targets: (batch_size * max_seq_len, 1)
        mask = mask.contiguous().view(-1,1)
        # mask: (batch_size * max_seq_len, 1)
        
        losses = -outputs.gather(1, targets) * mask
        
        loss = torch.sum(losses) / torch.sum(mask)
        
        return loss

## <font color='blue'>实例化模型、优化器、损失函数等</font>

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

en_vocab_size = len(en_word2ix)
cn_vocab_size = len(cn_word2ix)

embed_size = 100
hidden_size = 128
num_layers = 2
dropout = 0.2
lr = 0.01

model = Seq2seq(en_vocab_size, cn_vocab_size, embed_size, hidden_size, hidden_size, num_layers, dropout).to(device)

optimizer = torch.optim.Adam(model.parameters())
loss_fn = LanguageModelLoss().to(device)

best_model = train(train_dataloader, val_dataloader, model, optimizer, loss_fn, device, max_epochs=30)

Epoch:  0  |  Iteration 0  |  loss:  8.137531280517578
Epoch:  0  |  Iteration 100  |  loss:  5.218105792999268
Epoch:  0  |  Iteration 200  |  loss:  5.214974880218506
Dev Loss:  5.083258926278294
Epoch:  1  |  Iteration 0  |  loss:  5.079915523529053
Epoch:  1  |  Iteration 100  |  loss:  4.762298107147217
Epoch:  1  |  Iteration 200  |  loss:  4.696444034576416
Epoch:  2  |  Iteration 0  |  loss:  4.776860237121582
Epoch:  2  |  Iteration 100  |  loss:  4.378644943237305
Epoch:  2  |  Iteration 200  |  loss:  4.4215779304504395
Epoch:  3  |  Iteration 0  |  loss:  4.393805503845215
Epoch:  3  |  Iteration 100  |  loss:  4.339521408081055
Epoch:  3  |  Iteration 200  |  loss:  4.180355072021484
Dev Loss:  4.011054868745331
Epoch:  4  |  Iteration 0  |  loss:  4.12247896194458
Epoch:  4  |  Iteration 100  |  loss:  3.9052987098693848
Epoch:  4  |  Iteration 200  |  loss:  3.684964656829834
Epoch:  5  |  Iteration 0  |  loss:  3.602047920227051
Epoch:  5  |  Iteration 100  |  loss:  3.

## <font color='blue'>实例验证</font>

In [30]:
def translate_dev(model, val_en, val_cn, i, device):
    en_sent = " ".join([en_ix2word[idx] for idx in val_en[i]])
    print(en_sent)
    cn_sent = " ".join([cn_ix2word[idx] for idx in val_cn[i]])
    print(cn_sent)
    
    input_en = torch.LongTensor(val_en[i]).to(device)
    trans = model.translate(input_en, cn_word2ix, cn_ix2word, max_len=10)
    if trans[-1] == 'EOS':
        trans.pop()
    print(" ".join(trans))
    print("-------------------")

<font color='red'>效果明显比不带attention的好多了</font>

In [31]:
for i in range(100,125):
    translate_dev(best_model, val_data_en, val_data_cn, i, device)

BOS try some . EOS
BOS 试 试 吧 。 EOS
试 试 了 。
-------------------
BOS who died ? EOS
BOS 谁 死 了 ？ EOS
谁 死 了 ？
-------------------
BOS birds fly . EOS
BOS 鳥 類 飛 行 。 EOS
鳥 鳥 類 動 了 。
-------------------
BOS call home ! EOS
BOS 打 电 话 回 家 ！ EOS
叫 回 家 ！
-------------------
BOS catch him . EOS
BOS 抓 住 他 。 EOS
抓 住 了 。
-------------------
BOS come home . EOS
BOS 回 家 吧 。 EOS
回 家 。
-------------------
BOS do it now . EOS
BOS 現 在 就 做 。 EOS
现 在 做 。
-------------------
BOS dogs bark . EOS
BOS 狗 会 叫 。 EOS
狗 叫 。
-------------------
BOS do n't cry . EOS
BOS 别 哭 。 EOS
别 哭 。
-------------------
BOS excuse me . EOS
BOS 对 不 起 。 EOS
对 我 来 。
-------------------
BOS feel this . EOS
BOS 来 感 受 一 下 这 个 。 EOS
来 感 到 这 个 。
-------------------
BOS follow me . EOS
BOS 请 跟 我 来 。 EOS
请 帮 我 。
-------------------
BOS follow us . EOS
BOS 请 跟 着 我 们 。 EOS
我 們 想 要 我 們 的 。
-------------------
BOS good luck . EOS
BOS 祝 你 好 运 。 EOS
祝 你 好 。
-------------------
BOS grab that . EOS
BOS 抓 住 那 个 。 EOS
抓 住 那 个 。
-------------------
BOS g