In [1]:
import torch

# Auto-select variant, kept for reference: picks CUDA when available, else CPU.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Active setting: everything in this notebook is pinned to the CPU.
device = torch.device("cpu")

## Data

In [2]:
from torchtext.datasets import Multi30k
from torchtext.data import Field

# German source field: spaCy tokenisation, lower-cased, wrapped in <sos>/<eos>.
SRC = Field(
    tokenize="spacy",
    tokenizer_language="de",
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
)

# English target field: same settings apart from the tokenizer language.
TRG = Field(
    tokenize="spacy",
    tokenizer_language="en",
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
)

# Multi30k splits; the ".de" side is bound to SRC and the ".en" side to TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(SRC, TRG),
)

In [3]:
# Build both vocabularies from the training split only. min_freq=2 is passed so
# that (per torchtext's vocabulary builder) tokens seen fewer than twice are
# left out of the vocabulary.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [4]:
from torchtext.data import BucketIterator

BATCH_SIZE = 128

# One BucketIterator per split, all sharing the same batch size and device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

## Model

In [5]:
from typing import Tuple

import torch.nn as nn
from torch import Tensor

class Encoder(nn.Module):
    """Bidirectional GRU encoder that also derives the decoder's initial hidden state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Width of the token embeddings.
        enc_hid_dim: Hidden size of each GRU direction.
        dec_hid_dim: Output size of the projection applied to the final hidden states.
        dropout: Dropout probability applied to the embedded input.
    """

    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        # `self.dropout` is the nn.Dropout module itself; the raw probability is
        # available as `self.dropout.p`.
        self.dropout = nn.Dropout(dropout)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)

        # Maps the concatenated forward/backward final states to the decoder size.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self,
                src: Tensor) -> Tuple[Tensor, Tensor]:
        """Encode a batch of source token indices.

        Args:
            src: Token indices, expected as (src_len, batch) because the GRU is
                built without ``batch_first``.

        Returns:
            A pair ``(outputs, hidden)``:
            ``outputs`` is the GRU output for every position, i.e.
            (src_len, batch, 2 * enc_hid_dim); ``hidden`` is the tanh-squashed
            projection of the two final directional states, i.e.
            (batch, dec_hid_dim).
        """
        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        # `hidden` is stacked [forward_1, backward_1, forward_2, backward_2, ...],
        # while `outputs` always comes from the last layer.
        # hidden[-2] is the final state of the forward direction and
        # hidden[-1] the final state of the backward direction.
        final_states = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(final_states))

        return outputs, hidden

In [6]:
import torch.nn.functional as F

class Attention(nn.Module):
    """Additive attention scoring encoder positions against a decoder state.

    Args:
        enc_hid_dim: Hidden size of one encoder direction.
        dec_hid_dim: Hidden size of the decoder state.
        attn_dim: Size of the intermediate energy vector.
    """

    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Width of the concatenation: forward enc hid + backward enc hid + dec hid.
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)
        # Learned vector that collapses each energy vector to a scalar score.
        self.v = nn.Parameter(torch.rand(attn_dim))

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:
        """Return softmax-normalised attention weights of shape (batch, src_len).

        Args:
            decoder_hidden: Current decoder state, expected as (batch, dec_hid_dim).
            encoder_outputs: Encoder outputs, expected as
                (src_len, batch, 2 * enc_hid_dim).
        """
        src_len, batch_size = encoder_outputs.shape[0], encoder_outputs.shape[1]

        # Copy the decoder state once per source position:
        # (batch, dec_hid) -> (batch, src_len, dec_hid).
        tiled_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)

        # Put the batch axis first so it lines up with `tiled_hidden`.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        attn_input = torch.cat((tiled_hidden, batch_first_outputs), dim=2)

        # (batch, src_len, attn_dim) -> (batch, attn_dim, src_len) for the bmm below.
        energy = torch.tanh(self.attn(attn_input)).permute(0, 2, 1)

        # One copy of v per batch element, shaped (batch, 1, attn_dim).
        v_rows = self.v.repeat(batch_size, 1).unsqueeze(1)

        scores = torch.bmm(v_rows, energy).squeeze(1)

        return F.softmax(scores, dim=1)

In [7]:
class Decoder(nn.Module):
    """Single-step GRU decoder that conditions on an attention-weighted encoder summary.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and logits width).
        emb_dim: Width of the target token embeddings.
        enc_hid_dim: Hidden size of one encoder direction.
        dec_hid_dim: Hidden size of the decoder GRU.
        dropout: Dropout probability applied to the embedded input token.
        attention: Module called as ``attention(decoder_hidden, encoder_outputs)``
            that returns per-position weights; it must expose ``attn_in``.
    """

    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # `self.dropout` is the nn.Dropout module itself; the raw probability is
        # available as `self.dropout.p`.
        self.dropout = nn.Dropout(dropout)

        # GRU input = attention-weighted encoder summary + embedded token.
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        # Output layer input = GRU output + encoder summary + embedded token.
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

    def forward(self,
                input_: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor, Tensor]:
        """Run one decoding step.

        Args:
            input_: Token indices for the current step, expected as (batch,).
            decoder_hidden: Previous decoder state, expected as (batch, dec_hid_dim).
            encoder_outputs: Encoder outputs, expected as
                (src_len, batch, 2 * enc_hid_dim).

        Returns:
            A pair ``(logits, hidden)``: the vocabulary logits for this step,
            (batch, output_dim), and the updated decoder state,
            (batch, dec_hid_dim).
        """
        # Add a length-1 sequence axis so the token can be fed to the GRU.
        input_ = input_.unsqueeze(0)

        embedded = self.dropout(self.embedding(input_))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim=2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        # Sanity check: a single-layer, unidirectional GRU fed exactly one time
        # step yields an output sequence equal to its final hidden state.
        assert (output == decoder_hidden).all()

        # Drop the length-1 sequence axis again before the output projection.
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim=1))

        return output, decoder_hidden.squeeze(0)

    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:
        """Return the attention-weighted sum of encoder outputs, shaped (1, batch, feat)."""
        # (batch, src_len) -> (batch, 1, src_len) so it can left-multiply in bmm.
        a = self.attention(decoder_hidden, encoder_outputs).unsqueeze(1)

        # Batch-first view of the encoder outputs: (batch, src_len, feat).
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        # Back to sequence-first layout to match the embedded token.
        return weighted_encoder_rep.permute(1, 0, 2)

In [8]:
import random

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: Module called as ``encoder(src)`` returning
            ``(encoder_outputs, hidden)``.
        decoder: Module called as ``decoder(token, hidden, encoder_outputs)``
            returning ``(logits, hidden)``; it must expose ``output_dim``.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:
        """Decode `trg.shape[0]` steps and return the per-step logits.

        Args:
            src: Source batch; the batch size is read from ``src.shape[1]``.
            trg: Target batch; ``trg[0]`` seeds the decoder and ``trg[t]`` is
                the teacher-forced input after step ``t``.
            teacher_forcing_ratio: Probability, drawn once per step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape (trg_len, batch, output_dim). Row 0 is never
            written and therefore stays zero.
        """
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # Allocate on the device given at construction time, not on a
        # module-level global that may be missing or different.
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size,
                              device=self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the first target row.
        output = trg[0, :]

        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio

            # Index of the highest-scoring vocabulary entry per batch element.
            top1 = output.max(1)[1]

            output = trg[t] if teacher_force else top1

        return outputs

In [9]:
# Vocabulary sizes, read from the fields built above.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Larger configuration, kept for reference (currently disabled):
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# ENC_HID_DIM = 512
# DEC_HID_DIM = 512
# ATTN_DIM = 64
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5

# Active configuration: smaller embedding, hidden and attention sizes, same dropout.
ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [10]:
# Assemble the network from the hyperparameters defined above.
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=ENC_DROPOUT,
)
attn = Attention(
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    attn_dim=ATTN_DIM,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=DEC_DROPOUT,
    attention=attn,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

In [11]:
def init_weights(m: nn.Module):
    """Re-initialise every parameter reachable from `m`, in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2).
    Every other parameter is set to 0: this covers biases, and also any
    parameter registered under a name without 'weight' in it (for example a
    bare ``nn.Parameter`` attribute called ``v``).
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)

# Module.apply calls the given function on every submodule (as returned by
# .children()) and on the module itself; it is typically used to initialise
# the parameters of a model.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 32)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(32, 64, bidirectional=True)
    (fc): Linear(in_features=128, out_features=64, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=192, out_features=8, bias=True)
    )
    (embedding): Embedding(5893, 32)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(160, 64)
    (out): Linear(in_features=224, out_features=5893, bias=True)
  )
)

In [12]:
def count_parameters(model: nn.Module):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 1,856,693 trainable parameters


## Optimizer

In [13]:
import torch.optim as optim

# Adam over all model parameters, using the optimizer's default hyperparameters.
optimizer = optim.Adam(model.parameters())

## Criterion

In [14]:
# When scoring a translation model, nn.CrossEntropyLoss has to be told to
# ignore the target positions that are only padding, so look up the index of
# the padding token in the target vocabulary.
PAD_IDX = TRG.vocab.stoi["<pad>"]

# Debugging aid (disabled): print the first four entries of each vocabulary.
# print(SRC.vocab.itos[0], SRC.vocab.itos[1], SRC.vocab.itos[2], SRC.vocab.itos[3])
# print(TRG.vocab.itos[0], TRG.vocab.itos[1], TRG.vocab.itos[2], TRG.vocab.itos[3])
# Recorded output of the two prints above:
# <unk> <pad> <sos> <eos>
# <unk> <pad> <sos> <eos>

In [15]:
# Cross-entropy over target tokens; positions whose target equals PAD_IDX are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

## Training and Validation

In [16]:
def train(model: nn.Module,
          iterator: BucketIterator,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: Called as ``model(src, trg)``; its output is indexed by time
            step along the first axis.
        iterator: Yields batches exposing ``.src`` and ``.trg``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called on the flattened logits and targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    # Training mode only matters for layers that behave differently between
    # training and evaluation (e.g. Dropout, BatchNorm).
    model.train()

    running_loss = 0

    for batch in iterator:
        optimizer.zero_grad()

        src, trg = batch.src, batch.trg

        logits = model(src, trg)

        # Slice off the first time step of both tensors, then flatten so every
        # remaining (step, batch element) pair becomes one row for the loss.
        vocab_size = logits.shape[-1]
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()

        # Rescale gradients so their overall norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [17]:
def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):
    """Compute the mean batch loss over `iterator` without updating the model.

    Args:
        model: Called as ``model(src, trg, 0)``, i.e. with a teacher-forcing
            ratio of zero.
        iterator: Yields batches exposing ``.src`` and ``.trg``.
        criterion: Loss called on the flattened logits and targets.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # Third argument 0 turns off teacher forcing.
            logits = model(src, trg, 0)

            # Slice off the first time step of both tensors, then flatten.
            vocab_size = logits.shape[-1]
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

In [18]:
def epoch_time(start_time: float,
               end_time: float) -> Tuple[int, int]:
    """Split the elapsed time between two timestamps into minutes and seconds.

    Args:
        start_time: Start timestamp in seconds (e.g. from ``time.time()``).
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers; any fractional second is truncated.
    """
    elapsed_time = end_time - start_time

    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time % 60)

    return elapsed_mins, elapsed_secs

In [19]:
import time
import math

# Number of passes over the training split, and the gradient-norm ceiling
# handed to train().
N_EPOCHS = 10
CLIP = 1

for epoch in range(N_EPOCHS):
    
    # Wall-clock timing covers both the training and the validation pass.
    start_time = time.time()
    
    # Training and validating.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Perplexity (PPL) is reported as exp(loss).
    print(f"Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}")

Epoch: 01 | Time: 4m 38s
	Train Loss: 5.687 | Train PPL: 295.067
	 Val. Loss: 5.257 |  Val. PPL: 191.870
Epoch: 02 | Time: 4m 42s
	Train Loss: 5.009 | Train PPL: 149.764
	 Val. Loss: 5.087 |  Val. PPL: 161.981
Epoch: 03 | Time: 4m 26s
	Train Loss: 4.717 | Train PPL: 111.805
	 Val. Loss: 4.926 |  Val. PPL: 137.782
Epoch: 04 | Time: 4m 41s
	Train Loss: 4.599 | Train PPL:  99.390
	 Val. Loss: 4.888 |  Val. PPL: 132.682
Epoch: 05 | Time: 4m 21s
	Train Loss: 4.508 | Train PPL:  90.770
	 Val. Loss: 4.791 |  Val. PPL: 120.474
Epoch: 06 | Time: 4m 15s
	Train Loss: 4.407 | Train PPL:  82.026
	 Val. Loss: 4.713 |  Val. PPL: 111.406
Epoch: 07 | Time: 4m 24s
	Train Loss: 4.316 | Train PPL:  74.885
	 Val. Loss: 4.680 |  Val. PPL: 107.803
Epoch: 08 | Time: 4m 54s
	Train Loss: 4.232 | Train PPL:  68.887
	 Val. Loss: 4.685 |  Val. PPL: 108.342
Epoch: 09 | Time: 4m 14s
	Train Loss: 4.155 | Train PPL:  63.748
	 Val. Loss: 4.704 |  Val. PPL: 110.440
Epoch: 10 | Time: 4m 12s
	Train Loss: 4.075 | Train PPL

In [20]:
# Final score on the test split; evaluate() runs the model with teacher forcing
# disabled, and perplexity is reported as exp(loss).
test_loss = evaluate(model, test_iterator, criterion)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |")

| Test Loss: 4.710 | Test PPL: 111.013 |
