# Transformer 实现序列到序列的翻译

参考代码：
https://pytorch.org/tutorials/beginner/translation_transformer.html

这篇Notebook里会使用我们在model.py中定义的Seq2seqTransformer类来实现一个德语到英语的翻译，双语数据集来自Torchtext中的Multi30k dataset。首先我们先获取一下需要的vocab和tokenizer。
可能需要在bash中运行以下内容并重启jupyter来安装Spacy的依赖。
```bash
pip install spacy
python -m spacy download en_core_web_sm
python -m spacy download de_core_news_sm
```

In [58]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SRC_LAN = 'de'
TGT_LAN = 'en'
LAN_PAIR = (SRC_LAN, TGT_LAN)

# Tokenizer
token_transform = {}
token_transform[SRC_LAN] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LAN] = get_tokenizer('spacy', language='en_core_web_sm')

# A generator yield all words in training dataset
def yield_tokens(data_iter, ln):
    for sample in data_iter:
        yield token_transform[ln](sample[LAN_PAIR.index(ln)])

# Define special tokens
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']

# Build vocab by build_vocab_from_iterator()
vocab_transform = {}
for ln in LAN_PAIR:
    train_set = Multi30k(split='train', language_pair=LAN_PAIR)
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train_set, ln), specials=special_symbols)
    vocab_transform[ln].set_default_index(UNK_IDX)

设计出字符串转词符(Token)列表，词符(Token)列表转数字索引列表，索引连接成Tensor的三个功能后，把它们连在一起得到一个字符串到张量的处理流水线。

In [69]:
# Turn integer list into tensor and add SOS/EOS token
def tensor_transform(input_ids):
    return torch.cat((torch.tensor([SOS_IDX]),
                     torch.tensor(input_ids),
                     torch.tensor([EOS_IDX])))

# Combine all transforms into a pipeline
def sequential_transform(*transforms):
    def func(input):
        for transform in transforms:
            input = transform(input)
        return input
    return func

# Text process pipeline:
# 1) Remove all '\n' in the end of text
# 2) Break text from string to List[str]
# 3) Convert List[str] to List[int] using vocab
# 4) Concatenate SOS_IDX, List[int], EOS_IDX into tensor
text_pipeline = {}
for ln in LAN_PAIR:
    text_pipeline[ln] = sequential_transform(lambda s: s.rstrip('\n'),
                                             token_transform[ln],
                                             vocab_transform[ln],
                                             tensor_transform)

# Simple test
print(text_pipeline['en']("Hello every one\n"))                                

tensor([   2, 6731, 4221,   55,    3])


接着，我们需要把字符串处理的流水线集成到Pytorch的Dataloader中，并且使用pad_sequence把一个batch中的不同长度的tensor延长并拼接。

In [70]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Function to collate data samples into batch tensors
# Used when constructing Dataloader
def collate_batch(batch):
    src_batch = []
    tgt_batch = []
    for src, tgt in batch:
        src_batch.append(text_pipeline[SRC_LAN](src))
        tgt_batch.append(text_pipeline[TGT_LAN](tgt))
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

# Return dataloader instance
def get_loader(split='train', batch_size=128):
    dataset = Multi30k(split=split, language_pair=LAN_PAIR)
    return DataLoader(dataset, batch_size, collate_fn=collate_batch)

# Simple test
print("Batch shape:", next(iter(get_loader()))[0].shape)

Batch shape: torch.Size([27, 128])


现在有了数据了, 但是还缺少Transformer输入需要的Mask，输入Transformer对于解码和编码分别需要两种Mask，具体可以看model.ipynb中的介绍。让我们定义一下。

In [61]:
# Get all mask
def create_mask(src, tgt):
    src_len = src.size(0)
    tgt_len = tgt.size(0)

    src_mask = torch.zeros(src_len, src_len)
    tgt_mask = (1 - torch.triu(torch.ones(tgt_len, tgt_len)).T) * -1e9

    # The padding mask must be batch_first
    src_padding_mask = (src == PAD_IDX).T
    tgt_padding_mask = (tgt == PAD_IDX).T

    return src_mask.to(DEVICE), tgt_mask.to(DEVICE), \
           src_padding_mask.to(DEVICE), tgt_padding_mask.to(DEVICE)

我们定义一个训练一个Epoch的train函数以供食用。

In [62]:
import torch.nn as nn

def train_epoch(model: nn.Module, batch_size: int, optimizer: torch.optim.Optimizer, criterion: nn.Module):
    model.train()
    losses = 0

    loader = get_loader('train', batch_size)
    for src, tgt in loader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        
        tgt_in = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_in)
        output = model(src, tgt_in, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)
        
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses += loss.item()

    return losses / len(loader)

万事具备，可以开始训练咯。如果你现在没有显卡来支持CUDA，请不要运行这段代码，尽管这已经是一个小型Transformer，但是在CPU上运行一个Batch仍然需要6-7秒钟，训练18个Epoch需要7个小时左右。将这段代码迁移到Google Colab上后大约用了11分钟完成训练。这和Pytorch原生的Transformer速度是相当的（其实比Pytorch实现还快了不少？），可见model.py中的实现速度还是可以的。后面的内容需要基于一个已经训练好的模型来进行。

In [None]:
from model import Seq2seqTransformer
from utils import Timer

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LAN])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LAN])
D_MODEL = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_EPOCHS = 18
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2seqTransformer(src_vocab_size=SRC_VOCAB_SIZE,
                                 tgt_vocab_size=TGT_VOCAB_SIZE,
                                 num_encoder_layers=NUM_ENCODER_LAYERS,
                                 d_model=D_MODEL,
                                 num_heads=NHEAD,
                                 dim_feedforward=FFN_HID_DIM).to(DEVICE)

criterion = torch.nn.NLLLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

Timer.Start()
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train_epoch(transformer, BATCH_SIZE, optimizer, criterion)
    print("Epoch: %d\tLoss: %.3f\tTime: %s" % (epoch, loss, Timer.Remain(percent=epoch/NUM_EPOCHS)))

训练好了以后，我们希望可以测试一下模型的表现，那么面对全新的数据，我们需要使用Greedy Decode来让模型不依赖target序列输出一段翻译。Greedy Encode本质上就是一个宽度为1的beam search，每次都把拥有最高概率的预测词语添加到Target序列作为输入。

In [64]:
# Compute output with greedy algorithm for all source sequence in a batch
def greedy_decode(model: Seq2seqTransformer, src, src_padding_mask=None, max_length=50):
    src = src.to(DEVICE)
    if src_padding_mask is not None:
        src_padding_mask = src_padding_mask.to(DEVICE)
    batch_size = src.size(1)
    model.eval()

    # Forward propagation on encoder
    src = model.positional_encoding(model.src_embedding(src))
    memory = model.transformer.encoder.forward(src, None, src_padding_mask)

    # Initialize output sequence
    output = torch.empty(1, batch_size).fill_(SOS_IDX).long().to(DEVICE)
    output_states = torch.zeros(batch_size)

    for i in range(max_length - 1):
        _, out_mask, _, _ = create_mask(src, output)

        # Forward propagation on decoder
        out_embedded = model.tgt_embedding(output)
        probs = model.transformer.decoder(out_embedded, memory, out_mask)
        probs = model.generator(probs[-1])

        # Pick out most likely next word
        next_words = probs.argmax(dim=-1)
        output = torch.cat((output,
                            next_words.unsqueeze(0)),
                           dim=0)
        
        # Update and Checking finishing state
        output_states[next_words == EOS_IDX] = 1
        if not 0 in output_states:
            break
    
    return output

把输出的预测序列编码回字符串就可以完成翻译，在Colab上经历了54个Epoches的训练以后，这段德语对应的翻译为："A group of people standing in front of an igloo ."

虽然我们不一定懂德语，但是可以通过翻译软件先把英语翻译成德语后再输入回模型观察效果。比如：

`"I am very happy"` -> `"Ich bin sehr glücklich"` -> `"I very happy"`

`"Today is a good day"` -> `"Heute ist ein guter Tag"` -> `"This day are beautiful day of beautiful"`

`"Does this steak taste good?"` -> `"Schmeckt dieses Steak?"` -> `"We are kicking each other."` -> `"Wir treten uns gegenseitig."` -> `"We are making a hockey."`

`"I was born in china"` -> `"Ich wurde in China geboren"` -> `"I see in China are competing in China ."`

`"Please value your time" -> "Bitte schätzen Sie Ihre Zeit" -> "She is taking pictures ."`

好吧。可能模型还是比较小。

In [74]:
def translate(model: Seq2seqTransformer, text):
    # Regard input text as batch with batch_size=1
    src = text_pipeline[SRC_LAN](text).reshape(-1, 1)
    pred = greedy_decode(model, src, max_length=src.size(0) + 6)
    indexes = list(pred.cpu().view(-1).long().numpy())
    raw_str = " ".join(vocab_transform[TGT_LAN].lookup_tokens(indexes))
    return raw_str.replace('<sos>', '').replace('<eos>', '')
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))

 Army railings curled avrovulcan.com World toys Younger patrol fresco playmat way album unfinished gestures credit holds
