In [0]:
import os
import sys
import math
from collections import Counter #计数器
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk

### load_data函数：

- 输入的是一个数据文件的路径（每行是用制表符分隔的一对英文/中文句子）
- 输出的是英文和中文两个句子列表
  - 列表中每个元素是一个句子，即由词组成的列表，首尾分别加上 BOS 和 EOS

In [12]:
!pip install nltk

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
from snownlp import SnowNLP
import jieba
def load_data(in_file):
    """Read a tab-separated parallel corpus and tokenize both sides.

    Every usable line of ``in_file`` holds an English sentence in the first
    tab-separated column and a Chinese sentence in the second one.

    Args:
        in_file: Path of the UTF-8 encoded corpus file.

    Returns:
        A pair ``(en, cn)`` of parallel lists. Each element is one sentence
        as a list of tokens wrapped in ``'BOS'`` / ``'EOS'`` markers.
        English is lower-cased and tokenized with ``nltk.word_tokenize``;
        Chinese is passed through ``SnowNLP(...).han`` and segmented with
        ``jieba.cut``.
    """
    en = []
    cn = []
    with open(in_file, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.strip().split('\t')
            # Blank or malformed lines have no Chinese column; skip them
            # instead of failing with IndexError on fields[1].
            if len(fields) < 2:
                continue
            en.append(['BOS'] + nltk.word_tokenize(fields[0].lower()) + ['EOS'])
            # Convert the Chinese text to Simplified Chinese
            cn_simple = SnowNLP(fields[1]).han
            # Segment the Chinese sentence into words
            cn.append(['BOS'] + list(jieba.cut(cn_simple)) + ['EOS'])
    return en, cn

# Locations of the English/Chinese training and dev corpus files
train_file = "/content/nmt/nmt/en-cn/train.txt"
dev_file = "/content/nmt/nmt/en-cn/dev.txt"
# Tokenize both splits; load_data returns (english_sentences, chinese_sentences)
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.132 seconds.
Prefix dict has been built succesfully.


In [14]:
# Show the first five tokenized sentences of each language
print(train_en[:5])
print(train_cn[:5])

[['BOS', 'anyone', 'can', 'do', 'that', '.', 'EOS'], ['BOS', 'how', 'about', 'another', 'piece', 'of', 'cake', '?', 'EOS'], ['BOS', 'she', 'married', 'him', '.', 'EOS'], ['BOS', 'i', 'do', "n't", 'like', 'learning', 'irregular', 'verbs', '.', 'EOS'], ['BOS', 'it', "'s", 'a', 'whole', 'new', 'ball', 'game', 'for', 'me', '.', 'EOS']]
[['BOS', '任何人', '都', '可以', '做到', '。', 'EOS'], ['BOS', '要', '不要', '再', '来', '一块', '蛋糕', '？', 'EOS'], ['BOS', '她', '嫁给', '了', '他', '。', 'EOS'], ['BOS', '我', '不', '喜欢', '学习', '不规则', '动词', '。', 'EOS'], ['BOS', '这', '对', '我', '来说', '是', '个', '全新', '的', '球类', '游戏', '。', 'EOS']]


## build_dict函数构建单词表

- 输入的是上一步得到的句子列表

- 输出的是word2ix字典（例如{"UNK":0, "PAD":1, "word1":2, "word2":3}，索引0和1分别保留给UNK和PAD）, 以及字典的长度

- 最后还构建一个ix2word字典

In [15]:
"""构建单词表, 重点"""
UNK_IDX = 0
PAD_IDX = 1
def build_dict(sentences, max_words=100000):
    """Build a word-to-index vocabulary from tokenized sentences.

    Args:
        sentences: Iterable of sentences, each an iterable of tokens.
        max_words: Keep at most this many of the most frequent tokens.

    Returns:
        ``(word_dict, total_words)``. ``word_dict`` maps each kept token to
        an index starting at 2 (most frequent first) and additionally maps
        ``"UNK"`` to ``UNK_IDX`` and ``"PAD"`` to ``PAD_IDX``.
        ``total_words`` is the number of kept tokens plus two.
    """
    frequencies = Counter(token for sentence in sentences for token in sentence)
    ranked = frequencies.most_common(max_words)
    # Two extra slots are reserved for the UNK and PAD entries
    total_words = len(ranked) + 2

    # Indices 0 and 1 are reserved, so real tokens start at 2
    word_dict = {}
    for position, (token, _count) in enumerate(ranked, start=2):
        word_dict[token] = position
    word_dict["UNK"] = UNK_IDX
    word_dict["PAD"] = PAD_IDX

    return word_dict, total_words

# Build the English and Chinese vocabularies from the training sentences
en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)

# Inverse dictionaries mapping an index back to its token
inv_en_dict = {v: k for k,v in en_dict.items()}
inv_cn_dict = {v: k for k,v in cn_dict.items()}

print(f"中文字典的长度：{cn_total_words}")
print(f"英语字典的长度：{en_total_words}")

中文字典的长度：8883
英语字典的长度：5493


In [0]:
"""把文字全部转变为数字"""

def encode(en_sen, cn_sen, en_dict, cn_dict, sort_by_len=True):
    """Convert parallel tokenized sentences into lists of vocabulary indices.

    Args:
        en_sen: English sentences, each a list of tokens, e.g.
            ``[['BOS', 'anyone', 'can', 'do', 'that', '.', 'EOS'], ...]``.
        cn_sen: Chinese sentences aligned with ``en_sen``.
        en_dict: English word-to-index dictionary.
        cn_dict: Chinese word-to-index dictionary.
        sort_by_len: When true, reorder both outputs by ascending length of
            the encoded English sentence (stable for equal lengths), keeping
            the pairs aligned.

    Returns:
        ``(out_en_sen, out_cn_sen)``: the sentences with every token replaced
        by its index. Tokens missing from a dictionary become 0.
    """
    # Keep the sentence/token nesting; unknown tokens fall back to index 0
    out_en_sen = [[en_dict.get(w, 0) for w in sent] for sent in en_sen]
    out_cn_sen = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sen]

    if sort_by_len:
        # Positions of the English sentences ordered from shortest to longest
        order = sorted(range(len(out_en_sen)), key=lambda i: len(out_en_sen[i]))
        out_en_sen = [out_en_sen[i] for i in order]
        out_cn_sen = [out_cn_sen[i] for i in order]

    return out_en_sen, out_cn_sen

# Replace the token lists with index lists (sorted by English sentence length).
# NOTE: this rebinds train_en/train_cn/dev_en/dev_cn, so the cell is not
# idempotent: running it a second time would look up integers in the
# string-keyed dictionaries and map every entry to 0.
train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

### 看看处理好的句子

In [17]:
# Sanity check: turn the k-th encoded sentence pair back into tokens
k=123
print(" ".join([inv_cn_dict[i] for i in train_cn[k]])) # look tokens up through the inverse dictionary
print(" ".join([inv_en_dict[i] for i in train_en[k]])) 

BOS 抓住 他 。 EOS
BOS grab him . EOS


### 把全部句子处理成为batch

In [0]:
def get_minibatch(n, minibatch_size, shuffle=True):
    """Split the index range ``[0, n)`` into consecutive minibatches.

    Args:
        n: Total number of examples.
        minibatch_size: Maximum number of indices per minibatch.
        shuffle: When true, shuffle the order of the minibatches with
            ``np.random.shuffle``; indices inside a batch stay consecutive.

    Returns:
        List of 1-D index arrays; the batch covering the end of the range
        may be shorter than ``minibatch_size``.
    """
    batch_starts = np.arange(0, n, minibatch_size)
    if shuffle:
        # Only the batch order is randomized, not the batch contents
        np.random.shuffle(batch_starts)
    return [np.arange(start, min(start + minibatch_size, n))
            for start in batch_starts]
        
def prepare_data(seqs):
    """Pad a batch of index sequences into one rectangular int32 matrix.

    Args:
        seqs: Non-empty list of sequences of integer indices.

    Returns:
        ``(x, x_lengths)``: ``x`` has shape ``(len(seqs), longest length)``
        with each sequence written from column 0 and the remaining columns
        left at 0; ``x_lengths`` holds the original sequence lengths. Both
        arrays are int32.
    """
    lengths = list(map(len, seqs))
    x_lengths = np.array(lengths).astype('int32')

    # Start from an all-zero matrix as wide as the longest sequence
    x = np.zeros((len(seqs), np.max(lengths))).astype('int32')

    # Copy every sequence into the leading columns of its row; the
    # trailing columns keep the value 0
    for row, (seq, seq_len) in enumerate(zip(seqs, lengths)):
        x[row, :seq_len] = seq

    return x, x_lengths

def gen_examples(en_sentences, cn_sentences, batch_size):
    """Group parallel encoded sentences into padded minibatches.

    Args:
        en_sentences: Encoded English sentences (lists of indices).
        cn_sentences: Encoded Chinese sentences aligned with ``en_sentences``.
        batch_size: Maximum number of sentence pairs per batch.

    Returns:
        List of tuples ``(mb_x, mb_x_len, mb_y, mb_y_len)`` holding the
        padded English matrix and its lengths followed by the padded
        Chinese matrix and its lengths, as produced by ``prepare_data``.
        The batch order comes from ``get_minibatch`` with its default
        shuffling.
    """
    examples = []
    for batch_indices in get_minibatch(len(en_sentences), batch_size):
        # Pad each side independently to the longest sentence of the batch
        mb_x, mb_x_len = prepare_data([en_sentences[i] for i in batch_indices])
        mb_y, mb_y_len = prepare_data([cn_sentences[i] for i in batch_indices])
        examples.append((mb_x, mb_x_len, mb_y, mb_y_len))
    return examples

batch_size = 64
# Build the padded training batches once up front
train_data = gen_examples(train_en, train_cn, batch_size)
# Randomize the order in which the prepared batches are stored
random.shuffle(train_data)
dev_data = gen_examples(dev_en, dev_cn, batch_size)

In [19]:
# Inspect the first batch tuple: (mb_x, mb_x_len, mb_y, mb_y_len)
train_data[0]

(array([[   2,   31,   14,    8,   53,   11,    3],
        [   2,    6,  524,   10, 1614,    4,    3],
        [   2,   71,  229,   34,   61,  126,    3],
        [   2,   32,  100,   10,  452,    4,    3],
        [   2,   18,  563,  288,  856,    4,    3],
        [   2,   44,   30,   71,  773,    4,    3],
        [   2,   14,    8,  260, 2600,   11,    3],
        [   2, 3613,   30,  120, 1087,    4,    3],
        [   2,   87,   30,   32,  344,   11,    3],
        [   2,    5,  273,   25,  469,    4,    3],
        [   2,    5,   63,    9, 2104,    4,    3],
        [   2,    5,  113,    9,  249,    4,    3],
        [   2,   30,   44,  116,  405,   11,    3],
        [   2,   18,   85,   13,  282,    4,    3],
        [   2,   67,    5,  110,    8,   11,    3],
        [   2,    6, 2488, 2613, 1105,    4,    3],
        [   2, 1518, 1676,   34,  106,    4,    3],
        [   2,    5,   56, 2614,    8,    4,    3],
        [   2,    5,   42,  130,  726,    4,    3],
        [   

## 数据处理完毕，开始建模

In [0]:
"""luong Attention版本"""
class Encoder(nn.Module):
    """Bidirectional GRU encoder that also produces the decoder's initial state.

    Args:
        vocab_size: Number of rows of the source embedding table.
        embed_size: Embedding dimension.
        enc_hidden_size: Hidden size of each GRU direction.
        dec_hidden_size: Size of the hidden state handed to the decoder.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, embed_size,
                 enc_hidden_size, dec_hidden_size,
                 dropout=0.2):
        super(Encoder, self).__init__()
        # Token indices -> dense vectors
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Bidirectional GRU over the embedded sequence
        self.rnn = nn.GRU(embed_size, enc_hidden_size,
                          batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Projects the concatenated forward/backward final states to the
        # decoder's hidden size
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        """Encode a padded batch.

        Args:
            x: Padded index tensor of shape ``(batch, seq_len)``.
            lengths: Tensor with the true length of every row of ``x``.

        Returns:
            ``(out, hid)``: ``out`` holds the per-step GRU outputs with shape
            ``(batch, max(lengths), 2 * enc_hidden_size)`` in the original
            batch order; ``hid`` has shape ``(1, batch, dec_hidden_size)``.
        """
        # Packing below expects sequences ordered from longest to shortest
        lengths_desc, perm = lengths.sort(0, descending=True)
        perm = perm.long()

        embedded = self.dropout(self.embed(x[perm]))

        # Pack so the GRU skips the padded positions
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths_desc.long().cpu().data.numpy(),
            batch_first=True)
        packed_out, hid = self.rnn(packed)

        # Back to a padded tensor (padded steps are filled with zeros)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Invert the sorting permutation to restore the caller's batch order
        _, inverse_perm = perm.sort(0, descending=False)
        inverse_perm = inverse_perm.long()
        out = out[inverse_perm].contiguous()
        hid = hid[:, inverse_perm].contiguous()

        # hid holds one final state per direction; join the last two and
        # squash them into the decoder's hidden size
        joined = torch.cat([hid[-2], hid[-1]], dim=1)
        hid = torch.tanh(self.fc(joined)).unsqueeze(0)

        return out, hid

In [0]:
class Attention(nn.Module):
    """Multiplicative (Luong-style) attention over the encoder outputs.

    Scores are the dot product between each decoder output and a linear
    projection of each encoder output. The attended context is concatenated
    with the decoder output and projected back to ``dec_hidden_size``.

    Args:
        enc_hidden_size: Hidden size of one encoder direction; the encoder
            outputs are expected to have ``2 * enc_hidden_size`` features.
        dec_hidden_size: Feature size of the decoder outputs.
    """

    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()

        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size

        self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size)
        self.linear_out = nn.Linear(enc_hidden_size*2+dec_hidden_size,
                                   dec_hidden_size)

    def forward(self, output, context, mask):
        """Attend from decoder outputs to encoder outputs.

        Args:
            output: Decoder outputs, ``(batch, output_len, dec_hidden_size)``.
            context: Encoder outputs,
                ``(batch, context_len, 2 * enc_hidden_size)``.
            mask: Tensor of shape ``(batch, output_len, context_len)`` whose
                true / non-zero entries mark score positions to suppress, or
                ``None`` to attend everywhere.

        Returns:
            ``(output, attn)``: the attended decoder representation of shape
            ``(batch, output_len, dec_hidden_size)`` and the attention
            weights of shape ``(batch, output_len, context_len)``.
        """
        # Project the encoder outputs into the decoder's feature space:
        # (batch, context_len, dec_hidden_size)
        context_in = self.linear_in(context)

        # Raw scores: (batch, output_len, context_len)
        attn = torch.bmm(output, context_in.transpose(1, 2))

        # masked_fill is out-of-place, so its result must be kept; the
        # original discarded it and the mask was never applied.
        if mask is not None:
            attn = attn.masked_fill(mask.to(torch.bool), -1e6)

        # Normalize the scores over the source positions
        attn = F.softmax(attn, dim=2)

        # Weighted sum of encoder outputs:
        # (batch, output_len, 2 * enc_hidden_size)
        context = torch.bmm(attn, context)

        # Combine context and decoder output, then project back:
        # (batch, output_len, dec_hidden_size)
        output = torch.cat((context, output), dim=2)
        output = torch.tanh(self.linear_out(output))
        return output, attn
        

In [0]:
class Decoder(nn.Module):
    """GRU decoder with attention over the encoder outputs.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and
            output classes).
        embed_size: Embedding dimension.
        enc_hidden_size: Hidden size of one encoder direction, forwarded to
            the attention layer.
        dec_hidden_size: Hidden size of the decoder GRU.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        # Size the GRU from the constructor argument; the original read an
        # undeclared global named hidden_size here.
        self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        """Build a padding mask from two length vectors.

        Args:
            x_len: Lengths along the first (row) axis, shape ``(batch,)``.
            y_len: Lengths along the second (column) axis, shape ``(batch,)``.

        Returns:
            Boolean tensor of shape ``(batch, max(x_len), max(y_len))`` that
            is True wherever the row index or the column index falls in the
            padding of the corresponding batch element.
        """
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        # True for positions that hold a real token
        x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None]
        y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None]
        # A cell is valid only if both its row and its column are valid.
        # Boolean ops replace `1 - mask`, which bool tensors do not support.
        mask = ~(x_mask[:, :, None] & y_mask[:, None, :])
        return mask

    def forward(self, ctx, ctx_lengths, y, y_lengths, hid):
        """Decode a padded batch of target tokens.

        Args:
            ctx: Encoder outputs passed to the attention layer.
            ctx_lengths: True source lengths, shape ``(batch,)``.
            y: Padded target index tensor, ``(batch, output_len)``.
            y_lengths: True target lengths, shape ``(batch,)``.
            hid: Initial GRU hidden state, ``(1, batch, dec_hidden_size)``.

        Returns:
            ``(output, hid, attn)``: log-probabilities over the vocabulary
            for every target position, the final GRU hidden state in the
            original batch order, and the attention weights.
        """
        # Packing expects sequences ordered from longest to shortest
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        sorted_idx = sorted_idx.long()
        y_sorted = y[sorted_idx]
        hid = hid[:, sorted_idx].contiguous()

        y_sorted = self.dropout(self.embed(y_sorted)) # batch_size, output_length, embed_size

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        # Restore the caller's batch order
        _, original_idx = sorted_idx.sort(0, descending=False)
        original_idx = original_idx.long()
        output_seq = unpacked[original_idx].contiguous()
        hid = hid[:, original_idx].contiguous()

        # Rows index target positions, columns index source positions
        mask = self.create_mask(y_lengths, ctx_lengths)

        output, attn = self.attention(output_seq, ctx, mask)
        output = F.log_softmax(self.out(output), -1)

        return output, hid, attn

### 最后构建seq2seq模型把encoder, attention, decoder串在一起

In [0]:
"""完整的模型"""
class Seq2Seq(nn.Module):
    """Wrap an encoder and a decoder into one translation model.

    Args:
        encoder: Module called as ``encoder(x, x_lengths)`` returning
            ``(encoder_out, hid)``.
        decoder: Module called with keyword arguments ``ctx``,
            ``ctx_lengths``, ``y``, ``y_lengths`` and ``hid`` returning
            ``(output, hid, attn)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        """Run the encoder, then the decoder over the whole target batch.

        Returns:
            ``(output, attn)`` as produced by the decoder; the decoder's
            final hidden state is dropped.
        """
        encoder_out, hid = self.encoder(x, x_lengths)
        output, _, attn = self.decoder(
            ctx=encoder_out,
            ctx_lengths=x_lengths,
            y=y,
            y_lengths=y_lengths,
            hid=hid,
        )
        return output, attn

    def translate(self, x, x_lengths, y, max_length=100):
        """Greedily decode ``max_length`` tokens, one step at a time.

        Args:
            x: Source batch.
            x_lengths: True source lengths.
            y: First decoder input, one token per batch row
                (shape ``(batch, 1)``).
            max_length: Number of decoding steps; decoding never stops early.

        Returns:
            ``(preds, attns)``: predicted indices of shape
            ``(batch, max_length)`` and the per-step attention weights
            concatenated along dimension 1.
        """
        encoder_out, hid = self.encoder(x, x_lengths)
        batch_size = x.shape[0]
        preds, attns = [], []
        for _ in range(max_length):
            # Every step feeds exactly one token per sentence
            step_lengths = torch.ones(batch_size).long().to(y.device)
            output, hid, attn = self.decoder(
                ctx=encoder_out,
                ctx_lengths=x_lengths,
                y=y,
                y_lengths=step_lengths,
                hid=hid,
            )
            # The most likely token becomes the next decoder input
            _, best = output.max(2)
            y = best.view(batch_size, 1)
            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)

In [0]:
# masked cross entropy loss
class LanguageModelCriterion(nn.Module):
    """Masked negative log-likelihood averaged over the unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, input, target, mask):
        """Compute the masked mean negative log-likelihood.

        Args:
            input: Log-probabilities, ``(batch, seq_len, vocab_size)``.
            target: Gold indices, ``(batch, seq_len)``.
            mask: Weights of shape ``(batch, seq_len)``: 1 for real tokens
                and 0 for padding.

        Returns:
            Scalar tensor: the sum of the masked per-token losses divided by
            the sum of the mask.
        """
        vocab_size = input.size(2)
        # Flatten so that every row is one (sentence, position) pair
        flat_log_probs = input.contiguous().view(-1, vocab_size)
        flat_target = target.contiguous().view(-1, 1)
        flat_mask = mask.contiguous().view(-1, 1)

        # Pick the log-probability of the gold token at each position; the
        # input is already log-softmaxed, so its negation is the
        # cross-entropy term
        token_nll = -flat_log_probs.gather(1, flat_target)

        # Padded positions also gather a non-zero value; the mask removes it
        masked_nll = token_nll * flat_mask

        # Average over the real tokens only
        return torch.sum(masked_nll) / torch.sum(flat_mask)

In [0]:
"""训练配置"""
# Hyper-parameters: the embedding size and the hidden size share one value
dropout = 0.2
embed_size = hidden_size = 100
# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The encoder reads English (source) indices
encoder = Encoder(vocab_size=en_total_words,
                       embed_size=embed_size,
                      enc_hidden_size=hidden_size,
                       dec_hidden_size=hidden_size,
                      dropout=dropout)
# The decoder produces Chinese (target) indices
decoder = Decoder(vocab_size=cn_total_words,
                      embed_size=embed_size,
                      enc_hidden_size=hidden_size,
                       dec_hidden_size=hidden_size,
                      dropout=dropout)
# Assemble the model and move it to the selected device
model = Seq2Seq(encoder, decoder)
model = model.to(device)
# Loss function
loss_fn = LanguageModelCriterion().to(device)
# Optimizer (Adam with its default settings)
optimizer = torch.optim.Adam(model.parameters())

In [0]:
"""定义训练函数"""
def train(model, data, num_epochs=2):
    """Train ``model`` on pre-batched data with teacher forcing.

    Relies on the module-level ``device``, ``loss_fn``, ``optimizer``,
    ``evaluate`` and ``dev_data``.

    Args:
        model: Model called as ``model(x, x_lengths, y, y_lengths)`` and
            returning ``(predictions, attention)``.
        data: Iterable of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` numpy tuples
            (English batch, English lengths, Chinese batch, Chinese lengths).
        num_epochs: Number of passes over ``data``.
    """
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, batch in enumerate(data):
            src_np, src_len_np, tgt_np, tgt_len_np = batch

            mb_x = torch.from_numpy(src_np).to(device).long()
            mb_x_len = torch.from_numpy(src_len_np).to(device).long()

            # Teacher forcing: the first n-1 target tokens are the decoder
            # input and the last n-1 tokens are what it has to predict
            mb_input = torch.from_numpy(tgt_np[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(tgt_np[:, 1:]).to(device).long()

            # Shifting shortens every target by one; keep lengths >= 1
            mb_y_len = torch.from_numpy(tgt_len_np - 1).to(device).long()
            mb_y_len = mb_y_len.clamp(min=1)

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            # 1.0 on real target positions, 0.0 on padding
            positions = torch.arange(mb_y_len.max().item(), device=device)
            mb_out_mask = (positions.unsqueeze(0) < mb_y_len.unsqueeze(1)).float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            # Parameter update, with the gradient norm clipped to 5
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            # loss is a per-token mean, so weight it by the number of
            # tokens in the batch when accumulating the epoch statistics
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

            if it % 100 == 0:
                print("Epoch", epoch, "iteration", it, "loss", loss.item())

        print("Epoch", epoch, "Training loss", total_loss/total_num_words)
        # Check the dev loss on every fifth epoch
        if epoch % 5 == 0:
            evaluate(model, dev_data)

"""定义评价函数"""
def evaluate(model, data):
    """Print the per-token loss of ``model`` over ``data`` without training.

    Relies on the module-level ``device`` and ``loss_fn``.

    Args:
        model: Model called as ``model(x, x_lengths, y, y_lengths)``.
        data: Iterable of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` numpy tuples.
    """
    model.eval()
    total_num_words = total_loss = 0.
    # No parameter update happens here, so gradients are not needed
    with torch.no_grad():
        for src_np, src_len_np, tgt_np, tgt_len_np in data:
            mb_x = torch.from_numpy(src_np).to(device).long()
            mb_x_len = torch.from_numpy(src_len_np).to(device).long()

            # Decoder input / expected output are the target shifted by one
            mb_input = torch.from_numpy(tgt_np[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(tgt_np[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(tgt_len_np - 1).to(device).long()
            mb_y_len = mb_y_len.clamp(min=1)

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            # 1.0 on real target positions, 0.0 on padding
            positions = torch.arange(mb_y_len.max().item(), device=device)
            mb_out_mask = (positions.unsqueeze(0) < mb_y_len.unsqueeze(1)).float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            # Weight the per-token mean by the number of tokens in the batch
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print("Evaluation loss", total_loss/total_num_words)

In [27]:
"""开始训练"""
# Train for 30 epochs on the prepared training batches
train(model, train_data, num_epochs=30)

Epoch 0 iteration 0 loss 9.110408782958984
Epoch 0 iteration 100 loss 5.517004489898682
Epoch 0 iteration 200 loss 5.55142068862915
Epoch 0 Training loss 5.638189344529993
Evaluation loss 4.96490495137933
Epoch 1 iteration 0 loss 4.367475986480713
Epoch 1 iteration 100 loss 4.888298511505127
Epoch 1 iteration 200 loss 5.1074700355529785
Epoch 1 Training loss 4.715048927505989
Epoch 2 iteration 0 loss 3.866223096847534
Epoch 2 iteration 100 loss 4.579425811767578
Epoch 2 iteration 200 loss 4.786312580108643
Epoch 2 Training loss 4.364312815570024
Epoch 3 iteration 0 loss 3.5176448822021484
Epoch 3 iteration 100 loss 4.323484420776367
Epoch 3 iteration 200 loss 4.583402633666992
Epoch 3 Training loss 4.102632323583446
Epoch 4 iteration 0 loss 3.245243787765503
Epoch 4 iteration 100 loss 4.094987869262695
Epoch 4 iteration 200 loss 4.393760681152344
Epoch 4 Training loss 3.881842375799318
Epoch 5 iteration 0 loss 3.0458548069000244
Epoch 5 iteration 100 loss 3.877596616744995
Epoch 5 iter

## 模型性能预览

In [29]:
"""定义翻译函数"""
#翻译个句子看看结果咋样
def translate_dev(i):
    """Translate the i-th dev sentence and print source, reference and output.

    Relies on the module-level ``model``, ``device``, ``dev_en``, ``dev_cn``,
    ``cn_dict``, ``inv_en_dict`` and ``inv_cn_dict``.

    Args:
        i: Index of the sentence pair in ``dev_en`` / ``dev_cn``.
    """
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])
    print('待预测：',en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])
    print('正确答案：',cn_sent)

    # Shape the single sentence as a batch of one, plus its length
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)

    # The BOS index is the first decoder input; build it directly as an
    # integer tensor instead of round-tripping through float
    bos = torch.tensor([[cn_dict["BOS"]]], dtype=torch.long, device=device)

    # Decode in eval mode (dropout off) and without autograd bookkeeping,
    # then put the model back into whatever mode it was in before.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            translation, attn = model.translate(mb_x, mb_x_len, bos)
    finally:
        model.train(was_training)

    # Convert indices back to words, stopping at the first EOS
    trans = []
    for token_idx in translation.cpu().numpy().reshape(-1):
        word = inv_cn_dict[token_idx]
        if word == "EOS":
            break
        trans.append(word)
    print('机器翻译：',"".join(trans))

# Translate dev sentences 20..29, leaving a blank line between results
for i in range(20,30):
    translate_dev(i)
    print()

待预测： BOS anything else ? EOS
正确答案： BOS 还有 别的 吗 ？ EOS
机器翻译： 有什么人吃什么？

待预测： BOS i 'm sleepy . EOS
正确答案： BOS UNK 了 。 EOS
机器翻译： 我是芬兰人。

待预测： BOS i ate UNK . EOS
正确答案： BOS 我 吃 了 UNK 。 EOS
机器翻译： 我在吃面包。

待预测： BOS i like sports . EOS
正确答案： BOS 我 喜欢 运动 。 EOS
机器翻译： 我喜欢足球。

待预测： BOS she may come . EOS
正确答案： BOS 她 可以 来 。 EOS
机器翻译： 她现在可能来。

待预测： BOS everybody will die . EOS
正确答案： BOS 人 UNK UNK 。 EOS
机器翻译： 每个人都在谈论了。

待预测： BOS answer the question . EOS
正确答案： BOS 回答 问题 。 EOS
机器翻译： 问题是狗。

待预测： BOS is that better ? EOS
正确答案： BOS 那 更好 吗 ？ EOS
机器翻译： 那是对吗？

待预测： BOS i like you . EOS
正确答案： BOS 我 喜欢 你 。 EOS
机器翻译： 我喜欢你。

待预测： BOS let him in . EOS
正确答案： BOS 让 他 进来 。 EOS
机器翻译： 让他把盐吧。



In [0]:
"""模型保存"""

# Persist only the model parameters (state_dict), not the module object
torch.save(model.state_dict(), "/content/luongAttention_seq2seq.pth")

In [34]:
# Display the module tree of the model
model

Seq2Seq(
  (encoder): Encoder(
    (embed): Embedding(5493, 100)
    (rnn): GRU(100, 100, batch_first=True, bidirectional=True)
    (dropout): Dropout(p=0.2)
    (fc): Linear(in_features=200, out_features=100, bias=True)
  )
  (decoder): Decoder(
    (embed): Embedding(8883, 100)
    (attention): Attention(
      (linear_in): Linear(in_features=200, out_features=100, bias=True)
      (linear_out): Linear(in_features=300, out_features=100, bias=True)
    )
    (rnn): GRU(100, 100, batch_first=True)
    (out): Linear(in_features=100, out_features=8883, bias=True)
    (dropout): Dropout(p=0.2)
  )
)