## Data

In [1]:
from torchtext.data import Field
from torchtext.data import BucketIterator
# from torchtext.datasets import IWSLT
from torchtext.datasets import Multi30k

def prepare_data():
    """Load the Multi30k German->English corpus and wrap it in batch iterators.

    Reads the module-level ``BATCH_SIZE`` and ``DEVICE`` settings.

    Returns:
        A tuple ``(src_field, trg_field, train_iter, valid_iter, test_iter)``.
        The two fields carry the vocabularies built from the training split.
    """
    # Source and target share the same preprocessing; only the spaCy tokenizer
    # language differs.
    # (Alternative corpus: IWSLT with exts=('.fr', '.en') and tokenizer_language='fr'
    # for the source field.)
    shared_options = dict(init_token="<sos>", eos_token="<eos>", lower=True,
                          tokenize="spacy")
    src_field = Field(tokenizer_language='de', **shared_options)
    trg_field = Field(tokenizer_language="en", **shared_options)

    # Split the corpus into train / validation / test sets.
    splits = Multi30k.splits(exts=('.de', '.en'), fields=(src_field, trg_field))
    train_set, valid_set, test_set = splits

    # Vocabularies are built from the training split only.
    for field in (src_field, trg_field):
        field.build_vocab(train_set)

    # One iterator per split.
    train_iter, valid_iter, test_iter = BucketIterator.splits(
        datasets=(train_set, valid_set, test_set),
        batch_size=BATCH_SIZE,
        device=DEVICE,
    )

    return src_field, trg_field, train_iter, valid_iter, test_iter

## Model

### Encoder

In [2]:
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the two final directional hidden states to the decoder's hidden
    size. Layer sizes come from the module-level ``ENC_EMB_DIM``,
    ``ENC_HID_DIM`` and ``DEC_HID_DIM`` settings.
    """

    def __init__(self, src_vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(src_vocab_size, ENC_EMB_DIM)
        self.dropout = nn.Dropout()

        # Note: the hidden size is a required argument of nn.GRU.
        self.gru = nn.GRU(ENC_EMB_DIM, ENC_HID_DIM, bidirectional=True)

        # Maps the concatenated [forward; backward] final states to dec_hid_dim.
        self.fc = nn.Linear(ENC_HID_DIM * 2, DEC_HID_DIM)

    def forward(self, inputs):
        """Encode a batch of source sequences.

        Args:
            inputs: token ids, ``[seq_len, batch_size]``.

        Returns:
            outputs: per-position GRU outputs,
                ``[seq_len, batch_size, enc_hid_dim * 2]``.
            hidden: initial decoder hidden state, ``[batch_size, dec_hid_dim]``.
        """
        # [seq_len, batch_size] -> [seq_len, batch_size, enc_emb_dim]
        embedded = self.embedding(inputs)
        embedded = self.dropout(embedded)

        # outputs: [seq_len, batch_size, enc_hid_dim * 2]
        # hiddens: [2, batch_size, enc_hid_dim] (one final state per direction)
        outputs, hiddens = self.gru(embedded)

        # Join both directions, then squash through tanh after the projection.
        first_state, second_state = hiddens[0], hiddens[1]
        joined = torch.cat((first_state, second_state), dim=1)
        hidden = torch.tanh(self.fc(joined))

        return outputs, hidden

### Attention

In [3]:
class Attn(nn.Module):
    """Additive (concat) attention over the encoder outputs.

    Scores every source position against the current decoder hidden state and
    returns the attention-weighted sum of the encoder outputs. Layer sizes come
    from the module-level ``DEC_HID_DIM``, ``ENC_HID_DIM`` and ``ATTN_V_DIM``
    settings.
    """

    def __init__(self):
        super().__init__()

        self.fc = nn.Linear(DEC_HID_DIM + ENC_HID_DIM * 2, ATTN_V_DIM)
        self.v = nn.Parameter(torch.rand(1, ATTN_V_DIM))

    def forward(self, decoder_hidden, encoder_outputs):
        """Compute the context vector for one decoding step.

        Args:
            decoder_hidden: ``[batch_size, dec_hid_dim]``.
            encoder_outputs: ``[seq_len, batch_size, enc_hid_dim * 2]``.

        Returns:
            context: ``[batch_size, enc_hid_dim * 2]``. The batch axis is kept
            even when ``batch_size == 1``.
        """
        src_len = encoder_outputs.size(0)

        # [seq_len, batch_size, enc_hid_dim * 2] -> [batch_size, seq_len, enc_hid_dim * 2]
        enc_batch_first = encoder_outputs.permute(1, 0, 2)

        # Pair the decoder state with every source position:
        # [batch_size, dec_hid_dim] -> [batch_size, seq_len, dec_hid_dim]
        # (the expansion must use the source length, not the batch size).
        query = decoder_hidden.unsqueeze(1).expand(-1, src_len, -1)

        # energy: [batch_size, seq_len, attn_v_dim]
        energy = torch.tanh(self.fc(torch.cat((query, enc_batch_first), dim=2)))

        # v is [1, attn_v_dim]; matmul broadcasts it over the actual batch size,
        # which may differ from BATCH_SIZE on the last batch.
        # scores / attn: [batch_size, 1, seq_len]
        # NOTE(review): padded source positions are not masked here, so they
        # receive attention weight like any other position.
        scores = torch.matmul(self.v, energy.transpose(1, 2))
        attn = F.softmax(scores, dim=2)

        # [batch_size, 1, seq_len] x [batch_size, seq_len, enc_hid_dim * 2]
        # -> [batch_size, 1, enc_hid_dim * 2]. Squeeze ONLY the singleton
        # middle axis: a bare squeeze() would also drop the batch axis when
        # batch_size == 1 and break the decoder's concatenation.
        context = attn.bmm(enc_batch_first).squeeze(1)

        return context

### Decoder

In [4]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Layer sizes come from the module-level ``DEC_EMB_DIM``, ``ENC_HID_DIM`` and
    ``DEC_HID_DIM`` settings.
    """

    def __init__(self, trg_vocab_size):
        super().__init__()

        context_dim = ENC_HID_DIM * 2

        self.embedding = nn.Embedding(trg_vocab_size, DEC_EMB_DIM)
        self.dropout = nn.Dropout()

        self.attn = Attn()

        # The GRU consumes the embedded token joined with the context vector.
        self.gru = nn.GRU(DEC_EMB_DIM + context_dim, DEC_HID_DIM)

        # The output layer sees the embedding, the context and the new hidden state.
        self.fc = nn.Linear(DEC_EMB_DIM + context_dim + DEC_HID_DIM, trg_vocab_size)

    def forward(self, last_output, decoder_hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            last_output: previous target token ids, ``[batch_size]``.
            decoder_hidden: ``[batch_size, dec_hid_dim]``.
            encoder_outputs: ``[seq_len, batch_size, enc_hid_dim * 2]``.

        Returns:
            decoder_outputs: vocabulary scores, ``[batch_size, trg_vocab_size]``.
            decoder_hidden: updated hidden state, ``[batch_size, dec_hid_dim]``.
        """
        # [batch_size] -> [batch_size, dec_emb_dim]
        embedded = self.dropout(self.embedding(last_output))

        # context: [batch_size, enc_hid_dim * 2]
        context = self.attn(decoder_hidden, encoder_outputs)

        # Build a length-1 sequence for the GRU:
        # [1, batch_size, dec_emb_dim + enc_hid_dim * 2]
        step_input = torch.cat((embedded, context), dim=1).unsqueeze(0)
        _, next_hidden = self.gru(step_input, decoder_hidden.unsqueeze(0))

        # [1, batch_size, dec_hid_dim] -> [batch_size, dec_hid_dim]
        next_hidden = next_hidden.squeeze(0)

        # decoder_outputs: [batch_size, trg_vocab_size]
        features = torch.cat((embedded, context, next_hidden), dim=1)
        decoder_outputs = self.fc(features)

        return decoder_outputs, next_hidden

### Seq2Seq

In [5]:
import random

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step."""

    def __init__(self, src_vocab_size, trg_vocab_size):
        super().__init__()

        self.encoder = Encoder(src_vocab_size)
        self.decoder = Decoder(trg_vocab_size)

        self.trg_vocab_size = trg_vocab_size

    def forward(self, inputs, trgs, teacher_forcing_ratio=0.5):
        """Encode ``inputs`` and decode one step per target position.

        Args:
            inputs: source token ids, ``[src_len, batch_size]``.
            trgs: target token ids, ``[trg_len, batch_size]``. ``trgs[0]`` is
                fed to the decoder as the first input token.
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.
                Pass 0 to always feed back the prediction.

        Returns:
            outputs: ``[trg_len, batch_size, trg_vocab_size]``. ``outputs[0]``
            is never written and stays all zeros, so callers skip it.
        """
        # Source and target lengths may differ.

        # encoder_outputs: [src_len, batch_size, enc_hid_dim * 2]
        # decoder_hidden: [batch_size, dec_hid_dim]
        encoder_outputs, decoder_hidden = self.encoder(inputs)

        # Read the batch size from the data: the last batch of an epoch may be
        # smaller than BATCH_SIZE.
        trg_len, batch_size = trgs.size(0), trgs.size(1)

        # Allocate on the device the tensors already live on instead of relying
        # on a module-level DEVICE global.
        outputs = torch.zeros(trg_len, batch_size, self.trg_vocab_size,
                              device=encoder_outputs.device)

        step_input = trgs[0]  # [batch_size]
        for t in range(1, trg_len):  # position 0 is the start token
            # logits: [batch_size, trg_vocab_size]
            logits, decoder_hidden = self.decoder(step_input, decoder_hidden, encoder_outputs)

            outputs[t] = logits

            # One coin flip per step, shared by the whole batch.
            use_teacher = random.random() < teacher_forcing_ratio
            step_input = trgs[t] if use_teacher else logits.argmax(dim=1)

        return outputs

## Training

In [6]:
def _train(train_iter, model, criterion, optimizer):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        train_iter: iterable of batches exposing ``.src`` and ``.trg``; must
            support ``len()``.
        model: called as ``model(src, trg)``; its output is indexed like a
            ``[trg_len, batch_size, vocab]`` tensor.
        criterion: loss called as ``criterion(flat_outputs, flat_targets)``.
        optimizer: optimizer stepped once per batch.
    """
    model.train()  # enable training-mode layers such as dropout

    running_loss = 0

    for batch in train_iter:
        sources, targets = batch.src, batch.trg

        # Forward pass with the model's default teacher forcing ratio.
        predictions = model(sources, targets)

        # Position 0 is skipped on both sides ([1:]); the remaining positions
        # are flattened so the criterion sees one row per target token.
        vocab_size = predictions.size()[-1]
        flat_predictions = predictions[1:].view(-1, vocab_size)
        flat_targets = targets[1:].view(-1)
        loss = criterion(flat_predictions, flat_targets)

        # Backward pass, parameter update, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()

    return running_loss / len(train_iter)

In [7]:
def _evaluate(data_iter, model, criterion):
    """Return the mean per-batch loss of ``model`` over ``data_iter``.

    Runs in eval mode with gradient tracking disabled and with the teacher
    forcing ratio set to 0.

    Args:
        data_iter: iterable of batches exposing ``.src`` and ``.trg``; must
            support ``len()``.
        model: called as ``model(src, trg, 0)``.
        criterion: loss called as ``criterion(flat_outputs, flat_targets)``.
    """
    model.eval()  # disable training-only layers such as dropout

    total_loss = 0

    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in data_iter:
            sources, targets = batch.src, batch.trg

            # Third argument 0 = never feed ground-truth tokens to the decoder.
            predictions = model(sources, targets, 0)

            # Position 0 is skipped on both sides ([1:]) and the rest flattened.
            vocab_size = predictions.size()[-1]
            flat_predictions = predictions[1:].view(-1, vocab_size)
            flat_targets = targets[1:].view(-1)

            total_loss += criterion(flat_predictions, flat_targets).item()

    return total_loss / len(data_iter)

In [8]:
def time_track(start, end):
    """Format the interval ``end - start`` (in seconds) as minutes and seconds.

    Both parts are truncated to integers and right-aligned in a field of
    width 2, e.g. ``" 2mins  5secs"``.
    """
    duration = end - start

    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration % 60)

    return f"{whole_minutes:>2}mins {leftover_seconds:>2}secs"

In [9]:
import time
import math

def train(train_iter, valid_iter, model, criterion, optimizer):
    """Train for ``N_EPOCHS`` epochs, validating and reporting after each one.

    For every epoch this prints the elapsed wall-clock time plus the loss and
    perplexity (``exp(loss)``) on the training and validation data.
    """
    for epoch in range(N_EPOCHS):

        # Time the training pass and the validation pass together.
        started_at = time.time()

        train_loss = _train(train_iter, model, criterion, optimizer)
        valid_loss = _evaluate(valid_iter, model, criterion)

        finished_at = time.time()

        print(f"epoch: {epoch + 1:02}, time: {time_track(started_at, finished_at)}")
        print(f"train loss: {train_loss:.3f}, train ppl: {math.exp(train_loss):.3f}")
        print(f"valid loss: {valid_loss:.3f}, valid ppl: {math.exp(valid_loss):.3f}")

## Testing

In [10]:
def test(test_iter, model, criterion):
    """Evaluate ``model`` on ``test_iter`` and print its loss and perplexity."""
    loss = _evaluate(test_iter, model, criterion)
    perplexity = math.exp(loss)  # perplexity is exp(loss)

    print(f"test loss: {loss:.3f}, test ppl: {perplexity:.3f}")

## Main

In [11]:
import torch
import torch.optim as optim

if __name__ == '__main__':

    # Seed the RNGs this run draws from: Python's `random` drives the
    # teacher-forcing coin flips, and PyTorch's generator drives weight
    # initialisation and dropout. Without this every run differs.
    SEED = 1234
    random.seed(SEED)
    torch.manual_seed(SEED)

    # --- Configuration (read as module-level globals by the cells above) ---
    BATCH_SIZE = 512
    # The run is pinned to the CPU. To use a GPU when one is available:
    #     DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    DEVICE = torch.device("cpu")

    N_EPOCHS = 10
    LEARNING_RATE = 0.003

    ENC_EMB_DIM = 256
    ENC_HID_DIM = 512

    ATTN_V_DIM = 64

    DEC_EMB_DIM = 256
    DEC_HID_DIM = 512

    # Gets data.
    src_field, trg_field, train_iter, valid_iter, test_iter = prepare_data()

    # Gets a model instance; it must live on the same device as the batches.
    src_vocab_size = len(src_field.vocab)
    trg_vocab_size = len(trg_field.vocab)
    model = Seq2Seq(src_vocab_size, trg_vocab_size).to(DEVICE)

    # Criterion: padding positions in the target do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=trg_field.vocab.stoi['<pad>'])

    # Optimizer.
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Trains and validates the model.
    train(train_iter, valid_iter, model, criterion, optimizer)

    # Tests the model.
    test(test_iter, model, criterion)

epoch: 01, time: 22mins 38secs
train loss: 5.348, train ppl: 210.142
valid loss: 4.922, valid ppl: 137.286
epoch: 02, time: 22mins  2secs
train loss: 3.919, train ppl: 50.375
valid loss: 3.985, valid ppl: 53.801
epoch: 03, time: 21mins 45secs
train loss: 3.144, train ppl: 23.201
valid loss: 3.613, valid ppl: 37.081
epoch: 04, time: 21mins 54secs
train loss: 2.704, train ppl: 14.940
valid loss: 3.581, valid ppl: 35.902
epoch: 05, time: 21mins 21secs
train loss: 2.368, train ppl: 10.680
valid loss: 3.602, valid ppl: 36.679
epoch: 06, time: 21mins 39secs
train loss: 2.220, train ppl: 9.209
valid loss: 3.562, valid ppl: 35.220
epoch: 07, time: 28mins 39secs
train loss: 2.055, train ppl: 7.808
valid loss: 3.700, valid ppl: 40.440
epoch: 08, time: 23mins 57secs
train loss: 1.971, train ppl: 7.181
valid loss: 3.721, valid ppl: 41.323
epoch: 09, time: 22mins 47secs
train loss: 1.868, train ppl: 6.474
valid loss: 3.718, valid ppl: 41.199
epoch: 10, time: 22mins 25secs
train loss: 1.796, train p