## Data

In [1]:
from torchtext.data import Field
from torchtext.data import BucketIterator
# from torchtext.datasets import IWSLT
from torchtext.datasets import Multi30k

def prepare_data():
    
    # Sets up fields.
#     src_field = Field(init_token="<sos>", eos_token="<eos>", lower=True, 
#                       tokenize="spacy", tokenizer_language='fr')
#     trg_field = Field(init_token="<sos>", eos_token="<eos>", lower=True, 
#                       tokenize="spacy", tokenizer_language="en")
    src_field = Field(init_token="<sos>", eos_token="<eos>", lower=True, 
                      tokenize="spacy", tokenizer_language='de')
    trg_field = Field(init_token="<sos>", eos_token="<eos>", lower=True, 
                      tokenize="spacy", tokenizer_language="en")
    
#     # Makes splits for data.
#     train_set, valid_set, test_set = IWSLT.splits(exts=('.fr', '.en'), 
#                                                   fields=(src_field, trg_field))
    # Makes splits for data.
    train_set, valid_set, test_set = Multi30k.splits(exts=('.de', '.en'), 
                                                  fields=(src_field, trg_field))
    
    # Builds the vocab.
    src_field.build_vocab(train_set)
    trg_field.build_vocab(train_set)
    
    # Makes iterator for splits.
    train_iter, valid_iter, test_iter = BucketIterator.splits(
        datasets=(train_set, valid_set, test_set), batch_size=BATCH_SIZE, 
        device=DEVICE)

    return src_field, trg_field, train_iter, valid_iter, test_iter

## Model

### Encoder

In [2]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, src_vocab_size):
        super().__init__()
        
        self.embedding = nn.Embedding(src_vocab_size, ENC_EMB_DIM)
        self.dropout = nn.Dropout()
        
#         self.gru = nn.GRU(ENC_EMB_DIM, bidirectional=True)
        self.gru = nn.GRU(ENC_EMB_DIM, ENC_HID_DIM, bidirectional=True)
        
        self.fc = nn.Linear(ENC_HID_DIM * 2, DEC_HID_DIM)
        
    def forward(self, inputs):
        # (in)  inputs: [seq_len, batch_size]
        # (out) outputs: [seq_len, batch_size, enc_hid_dim * 2]
        # (out) hidden: [batch_size, dec_hid_dim]
        
        # (in)  inputs: [seq_len, batch_size]
        # (out) embedded: [seq_len, batch_size, enc_emb_dim]
        embedded = self.dropout(
            self.embedding(inputs))
        
        # (in)  embedded
        # (out) outputs: [seq_len, batch_size, enc_hid_dim * 2]
        # (out) hiddens: [2, batch_size, enc_hid_dim]
        outputs, hiddens = self.gru(embedded)
        
        # (in)  hiddens
        # (out) hidden: [batch_size, dec_hid_dim]
        hidden = self.fc(
#             torch.cat((hiddens[0], hiddens[1]), dim=2))
            torch.cat((hiddens[0], hiddens[1]), dim=1))
        
        return outputs, hidden

### Attention

In [3]:
class Attn(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc = nn.Linear(DEC_HID_DIM + ENC_HID_DIM * 2, ATTN_V_DIM)
        self.v = nn.Parameter(torch.rand(1, ATTN_V_DIM))
        
    def forward(self, decoder_hidden, encoder_outputs):
        # (in)  decoder_hidden: [batch_size, dec_hid_dim]
        # (in)  encoder_outputs: [seq_len, batch_size, enc_hid_dim * 2]
        # (out) context: [batch, enc_hid_dim * 2]
        
        # (in)  decoder_hidden
        # (in)  encoder_outputs
        # (out) energy: [batch_size, seq_len, attn_v_dim]
        energy = torch.tanh(
            self.fc(
                torch.cat((
#                     decoder_hidden.unsqueeze(0).repeat(encoder_outputs.size()[0]).permute(1, 0, 2), 
                    decoder_hidden.unsqueeze(0).repeat(encoder_outputs.size()[0], 1, 1).permute(1, 0, 2), 
                    encoder_outputs.permute(1, 0, 2)), dim=2)))
        
        # (in)  v: [1, attn_v_dim]
        # (in)  energy
        # (out) [batch_size, 1, seq_len]
#         attn = nn.Softmax(self.v.unsqueeze(0).repeat(BATCH_SIZE, 1, 1).bmm(energy.permute(0, 2, 1)))
        attn = nn.Softmax(dim=2)(self.v.unsqueeze(0).repeat(BATCH_SIZE, 1, 1).bmm(energy.permute(0, 2, 1)))
        
        # (in)  attn
        # (in)  encoder_outputs
        # (out) context: [batch, enc_hid_dim]
        context = attn.bmm(encoder_outputs.permute(1, 0, 2)).squeeze()
        
        return context

### Decoder

In [4]:
class Decoder(nn.Module):
    def __init__(self, trg_vocab_size):
        super().__init__()
        
        self.embedding = nn.Embedding(trg_vocab_size, DEC_EMB_DIM)
        self.dropout = nn.Dropout()
        
        self.attn = Attn()
        
        self.gru = nn.GRU(DEC_EMB_DIM + ENC_HID_DIM * 2, DEC_HID_DIM)
        
        self.fc = nn.Linear(DEC_EMB_DIM + ENC_HID_DIM * 2 + DEC_HID_DIM, trg_vocab_size)
        
    def forward(self, last_output, decoder_hidden, encoder_outputs):
        # (in)  last_output: [batch_size]
        # (in)  decoder_hidden: [batch_size, dec_hid_dim]
        # (in)  encoder_outputs: [seq_len, batch_size, enc_hid_dim * 2]
        #!(out) decoder_outputs: [batch_size, trg_vocab_size]
        # (out) decoder_hidden: [batch_size, dec_hid_dim]
        
        # (in)  last_output
        # (out) embedded: [batch_size, dec_emb_dim]
        embedded = self.dropout(
            self.embedding(last_output))
        
        # (in)  decoder_hidden
        # (in)  encoder_outputs
        #!(out) context: [batch, enc_hid_dim * 2]
        context = self.attn(decoder_hidden, encoder_outputs)
        
        # (in)  embedded
        # (in)  context
        # (in)  decoder_hidden
        # (out) outputs: [1, batch_size, dec_hid_dim]
        # (out) decoder_hidden: [1, batch_size, dec_hid_dim]
        outputs, decoder_hidden = self.gru(
            torch.cat((embedded.unsqueeze(0), context.unsqueeze(0)), dim=2), 
            decoder_hidden.unsqueeze(0))
        
        # (in)  embedded
        # (in)  context
        # (in)  decoder_hidden
        # (out) decoder_outputs: [batch_size, trg_vocab_size]
        decoder_outputs = self.fc(
            torch.cat((embedded, context, decoder_hidden.squeeze(0)), dim=1))
        
        return decoder_outputs, decoder_hidden.squeeze()

### Seq2Seq

In [5]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size):
        super().__init__()
        
        self.encoder = Encoder(src_vocab_size)
        self.decoder = Decoder(trg_vocab_size)
        
        self.trg_vocab_size = trg_vocab_size
        
    def forward(self, inputs, trgs, teacher_forcing_ratio=0.5):
        
        # inputs_seq_len == trgs_seq_len?
                
        # Encode.
        # (in)  inputs: [seq_len, batch_size]
        # (out) encoder_outputs: [seq_len, batch_size, enc_hid_dim * 2]
        # (out) decoder_hidden: [batch_size, dec_hid_dim]
        encoder_outputs, decoder_hidden = self.encoder(inputs)
        
        # Decode.
        trg_len = trgs.size()[0]
        
#         outputs = torch.zeros(trg_len, batch ,device=DEVICE)
        outputs = torch.zeros(trg_len, BATCH_SIZE, self.trg_vocab_size, device=DEVICE)
#         outputs[0] = trgs[0]
        decoder_output = trgs[0]
        for i in range(1, trg_len):
            # (in)  decoder_output: [batch_size]
            # (in)  decoder_hidden
            # (in)  encoder_outputs
            #!(out) decoder_outputs: [batch_size, trg_vocab_size]
            # (out) decoder_hidden: [batch_size, dec_hid_dim]
            decoder_outputs, decoder_hidden = self.decoder(decoder_output, decoder_hidden, encoder_outputs)
            
#             decoder_output = decoder_outputs.argmax(1, dim=1) if teacher_forcing_ratio <= random.random() else trgs[i]
            decoder_output = decoder_outputs.argmax(dim=1) if teacher_forcing_ratio <= random.random() else trgs[i]
            
#             outputs[i] = decoder_outputs.argmax(1, dim=1)
            outputs[i] = decoder_outputs
            
        return outputs

## Training

In [6]:
def _train(train_iter, model, criterion, optimizer):
    
    train_loss = 0
    train_acc = 0
    
#     for _, batch in train_iter:
    for batch in train_iter:
    
        # Zeros grad.
        optimizer.zero_grad()
        
        # Gets data.
        srcs = batch.src
        trgs = batch.trg
        
        # Forward.
        outputs = model(srcs, trgs)
        
        # Loss.
        trgs = trgs.view(-1)
        outputs = outputs.view(-1, outputs.size()[-1])
        loss = criterion(outputs, trgs)
        
        # Backward.
        loss.backward()
        
        # Updates params
        optimizer.step()
        
        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == trgs).sum().item()
        
    return train_loss / len(train_iter), train_acc / len(train_iter)

In [7]:
def _evaluate(data_iter, model, criterion):
    
    eval_loss = 0
    eval_acc = 0
    
#     for _, batch in test_iter:
    for batch in test_iter:
        
        # Gets data.
        srcs = batch.src
        trgs = batch.trg
        
        # Forward.
        outputs = model(srcs, trgs, 0)
        
        # Loss.
        trgs = trgs.view(-1)
        outputs = outputs.view(-1, outputs.size()[-1])
        
        eval_loss += loss.item()
        eval_acc += (ooutputs.argmax(1) == trgs).sum().item()
        
    return eval_loss / len(test_iter), eval_acc / len(test_iter)

In [8]:
def time_track(start, end):
    
    elapsed_time = end - start
    
    mins = int(elapsed_time / 60)
    secs = int(elapsed_time % 60)
    
    return f"{mins:>2}mins {secs:>2}secs"

In [9]:
import time

def train(train_iter, valid_iter, model, criterion, optimizer):
        
    for epoch in range(N_EPOCHS):
        
        start = time.time()
        
        train_loss, train_acc = _train(train_iter, model, criterion, optimizer)
        valid_loss, valid_acc = _evaluate(valid_iter, model, criterion)
    
        end = time.time()
        
        print(f"epoch: {epoch:02}, time: {time_track(start, end)}")
        print(f"train loss: {train_loss:.3f}, train acc: {train_acc:.3%}")
        print(f"valid loss: {valid_loss:.3f}, valid acc: {valid_acc:.3%}")

## Testing

In [10]:
def test(test_iter, model, criterion):
    
    with torch.no_grad():
        test_loss, test_acc = _evaluate(test_iter, model, criterion)
        
        print(f"test loss: {test_loss:.3f}, test acc: {test_acc:.3f}")

## Main

In [11]:
import torch
import torch.optim as optim

if __name__ == '__main__':
    
    BATCH_SIZE = 128
    # DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    DEVICE = torch.device("cpu")

    N_EPOCHS = 10

    ENC_EMB_DIM = 256
    ENC_HID_DIM = 512

    ATTN_V_DIM = 64

    DEC_EMB_DIM = 256
    DEC_HID_DIM = 512

    # Gets data.
    src_field, trg_field, train_iter, valid_iter, test_iter = prepare_data()
        
    # Gets a model instance.
    src_vocab_size = len(src_field.vocab)
    trg_vocab_size = len(trg_field.vocab)
    # model = Seq2Seq(src_vocab_size, trg_vocab_size)
    model = Seq2Seq(src_vocab_size, trg_vocab_size).to(DEVICE)

    # Criterion.
#     criterion = nn.CrossEntropyLoss(ignore_index=src_field.vocab.stoi['<pad>'])
    criterion = nn.CrossEntropyLoss(ignore_index=trg_field.vocab.stoi['<pad>'])

    # Optimizer.
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    # Trains and validates the model.
    train(train_iter, valid_iter, model, criterion, optimizer)

    # Tests the model.
    test(test_iter, model, criterion)

RuntimeError: Expected tensor to have size 128 at dimension 0, but got size 72 for argument #2 'batch2' (while checking arguments for bmm)