In [1]:
import torch

# Device on which the iterators place batches and on which the model is built.
# The automatic CUDA selection below is deliberately disabled; this notebook
# runs on the CPU. Re-enable it to use a GPU when one is available:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

## Data

In [2]:
from torchtext.data import Field

# Source-side field: spaCy tokenisation with the "de" tokenizer, lower-cased,
# each example wrapped in <sos> ... <eos>.
SRC = Field(tokenize="spacy", tokenizer_language="de", init_token="<sos>", 
           eos_token="<eos>", lower=True)

# Target-side field: same pipeline with the "en" tokenizer.
TRG = Field(tokenize="spacy", tokenizer_language="en", init_token="<sos>", 
           eos_token="<eos>", lower=True)

In [3]:
from torchtext.datasets import Multi30k

# Load the Multi30k train/validation/test splits. The ".de" side is processed
# by SRC and the ".en" side by TRG (the extensions and fields are paired by
# position).
train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"), 
                                                    fields=(SRC, TRG))

In [4]:
# Build both vocabularies from the training split only, so validation/test
# tokens never influence the vocabulary.
# min_freq=2: presumably tokens seen fewer than twice are excluded and end up
# mapped to <unk> — confirm against the torchtext Vocab documentation.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [5]:
from torchtext.data import BucketIterator

BATCH_SIZE = 256
# Trade-off when choosing the batch size:
#   too large -> CUDA out-of-memory errors
#   too small -> training takes much longer

# One iterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((
    train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)
# Sentences in a batch can differ in length. Each batch arrives as a single
# [seq_len, batch] tensor, so shorter sentences are presumably padded with the
# Field's <pad> token (TODO confirm in the torchtext docs). The loss later
# ignores those positions via ignore_index=PAD_IDX.

## Model

In [6]:
from typing import Tuple

import torch.nn as nn
from torch import Tensor

class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the concatenated final forward/backward hidden states to the
    decoder's hidden size, so the result can seed the decoder.

    Args:
        input_dim: Size of the source vocabulary.
        emb_dim: Dimension of the token embeddings.
        enc_hid_dim: Hidden size of the GRU (per direction).
        dec_hid_dim: Hidden size of the decoder the encoder feeds.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        # `self.dropout` is the Dropout module itself (the raw probability is
        # available as `self.dropout.p`). Dropout limits overfitting on this
        # small dataset.
        self.dropout = nn.Dropout(dropout)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)  # bi-GRU

        # Maps [forward ; backward] final hidden states to the decoder's
        # hidden size: the encoder's last hidden becomes the decoder's first.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self, src: Tensor) -> Tuple[Tensor, Tensor]:
        """Encode a batch of source sentences.

        Args:
            src: Token ids, shape [seq_len, batch].

        Returns:
            A pair ``(outputs, last_hidden)`` where ``outputs`` has shape
            [seq_len, batch, 2 * enc_hid_dim] and ``last_hidden`` has shape
            [batch, dec_hid_dim].
        """
        # [seq_len, batch] -> [seq_len, batch, emb_dim]
        embedded = self.dropout(self.embedding(src))

        # outputs: [seq_len, batch, 2 * enc_hid_dim]
        # hidden:  [2, batch, enc_hid_dim]  (forward direction, then backward)
        outputs, hidden = self.rnn(embedded)

        # The last two entries of `hidden` are the final forward and backward
        # states; for this single-layer GRU they are hidden[0] and hidden[1].
        # [batch, enc_hid_dim] x 2 -> [batch, 2 * enc_hid_dim]
        last_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)

        # [batch, 2 * enc_hid_dim] -> [batch, dec_hid_dim]
        last_hidden = torch.tanh(self.fc(last_hidden))

        return outputs, last_hidden

In [7]:
import torch.nn.functional as F

class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Scores every source position against the current decoder hidden state
    with a one-layer tanh network followed by a dot product with the learned
    vector ``v``, then normalises the scores with a softmax.

    Args:
        enc_hid_dim: Hidden size of the encoder GRU (per direction).
        dec_hid_dim: Hidden size of the decoder.
        attn_dim: Size of the intermediate attention representation.
    """

    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.attn_dim = attn_dim
        # Forward + backward encoder states plus the decoder state.
        self.attn_in_dim = (enc_hid_dim * 2) + dec_hid_dim

        self.attn = nn.Linear(self.attn_in_dim, attn_dim)
        self.v = nn.Parameter(torch.rand(attn_dim))

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:
        """Compute attention weights.

        Args:
            decoder_hidden: [batch, dec_hid_dim].
            encoder_outputs: [seq_len, batch, enc_hid_dim * 2].

        Returns:
            Attention weights of shape [batch, seq_len]; each row sums to 1.
        """
        src_len, n_batch = encoder_outputs.shape[0], encoder_outputs.shape[1]

        # [seq_len, batch, enc_hid_dim * 2] -> [batch, seq_len, enc_hid_dim * 2]
        keys = encoder_outputs.permute(1, 0, 2)
        # [batch, dec_hid_dim] -> [batch, seq_len, dec_hid_dim]
        query = decoder_hidden.unsqueeze(1).expand(-1, src_len, -1)

        # [batch, seq_len, enc_hid_dim * 2 + dec_hid_dim]
        features = torch.cat((query, keys), dim=2)
        # [batch, seq_len, attn_dim] -> [batch, attn_dim, seq_len]
        energy = torch.tanh(self.attn(features)).permute(0, 2, 1)

        # [attn_dim] -> [batch, 1, attn_dim]
        v = self.v.view(1, 1, -1).repeat(n_batch, 1, 1)
        # [batch, 1, attn_dim] x [batch, attn_dim, seq_len] -> [batch, seq_len]
        scores = torch.bmm(v, energy).squeeze(1)

        return F.softmax(scores, dim=1)

In [8]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Each call consumes one target token per batch element, attends over the
    encoder outputs and returns the unnormalised vocabulary scores for the
    next token together with the updated hidden state.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Dimension of the token embeddings.
        enc_hid_dim: Hidden size of the encoder GRU (per direction).
        dec_hid_dim: Hidden size of the decoder GRU.
        dropout: Dropout probability applied to the embeddings.
        attention: Module called as ``attention(decoder_hidden,
            encoder_outputs)``; its result is used as [batch, seq_len]
            weights.
    """

    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # `self.dropout` is the Dropout module itself (the raw probability is
        # available as `self.dropout.p`).
        self.dropout = nn.Dropout(dropout)

        # GRU input = attention-weighted encoder context + token embedding.
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        # The output layer sees GRU output, context and embedding together.
        self.out = nn.Linear(enc_hid_dim * 2 + dec_hid_dim + emb_dim, output_dim)

    def forward(self,
                input_: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor, Tensor]:
        """Run one decoding step.

        Args:
            input_: Current target token ids, shape [batch].
            decoder_hidden: Previous decoder hidden state, [batch, dec_hid_dim].
            encoder_outputs: [seq_len, batch, enc_hid_dim * 2].

        Returns:
            A pair ``(output, decoder_hidden)`` where ``output`` has shape
            [batch, output_dim] and ``decoder_hidden`` has shape
            [batch, dec_hid_dim].
        """
        # [batch] -> [1, batch] -> [1, batch, emb_dim]
        embedded = self.dropout(self.embedding(input_.unsqueeze(0)))

        # Attention weights: [batch, seq_len] -> [batch, 1, seq_len]
        a = self.attention(decoder_hidden, encoder_outputs).unsqueeze(1)

        # [seq_len, batch, enc_hid_dim * 2] -> [batch, seq_len, enc_hid_dim * 2]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # Weighted sum of encoder states: [batch, 1, enc_hid_dim * 2],
        # then moved to time-major layout [1, batch, enc_hid_dim * 2].
        weighted = torch.bmm(a, encoder_outputs).permute(1, 0, 2)

        # [1, batch, emb_dim + enc_hid_dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim=2)

        # The GRU expects the hidden state as [1, batch, dec_hid_dim].
        # output:         [1, batch, dec_hid_dim]
        # decoder_hidden: [1, batch, dec_hid_dim]
        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        # Drop the length-1 time dimension everywhere before the output layer.
        embedded = embedded.squeeze(0)   # [batch, emb_dim]
        output = output.squeeze(0)       # [batch, dec_hid_dim]
        weighted = weighted.squeeze(0)   # [batch, enc_hid_dim * 2]

        # [batch, dec_hid_dim + enc_hid_dim * 2 + emb_dim] -> [batch, output_dim]
        output = self.out(torch.cat((output, weighted, embedded), dim=1))

        # [1, batch, dec_hid_dim] -> [batch, dec_hid_dim]
        decoder_hidden = decoder_hidden.squeeze(0)

        return output, decoder_hidden

In [9]:
import random

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: Module called as ``encoder(src)``; returns
            ``(encoder_outputs, hidden)``.
        decoder: Module called as ``decoder(input, hidden, encoder_outputs)``;
            returns ``(output, hidden)`` and exposes ``output_dim`` (the
            target vocabulary size).
        device: Device on which the tensor of collected outputs is allocated.
    """

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:
        """Translate ``src`` while being guided by ``trg``.

        Args:
            src: Source token ids, shape [src_len, batch].
            trg: Target token ids, shape [trg_len, batch]; ``trg[0]`` is <sos>.
            teacher_forcing_ratio: Probability, per decoding step, of feeding
                the ground-truth token instead of the model's own prediction
                as the next input. Pass 0 to turn teacher forcing off.

        Returns:
            Tensor of shape [trg_len, batch, trg_vocab_size]. Row 0 is never
            written and stays all zeros, because decoding starts from the
            given <sos> token; callers skip it with ``outputs[1:]``.
        """
        # Encode.
        encoder_outputs, hidden = self.encoder(src)

        trg_len = trg.shape[0]
        batch_size = src.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate on the device handed to the constructor, not on a
        # module-level global that happens to share the name `device`.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size,
                              device=self.device)

        input_ = trg[0]  # <sos>
        for t in range(1, trg_len):  # position 0 (<sos>) is given, not predicted
            output, hidden = self.decoder(input_, hidden, encoder_outputs)
            outputs[t] = output

            # Decide once per step (for the whole batch) whether the next
            # input is the ground-truth token or the model's best guess.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_ = trg[t] if teacher_force else top1

        return outputs

In [10]:
# Model hyper-parameters.
INPUT_DIM = len(SRC.vocab)    # source vocabulary size
OUTPUT_DIM = len(TRG.vocab)   # target vocabulary size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ATTN_DIM = 64
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Smaller alternative configuration (disabled):
# ENC_EMB_DIM = 32
# DEC_EMB_DIM = 32
# ENC_HID_DIM = 64
# DEC_HID_DIM = 64
# ATTN_DIM = 8
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5

In [11]:
# Assemble the model: the attention module is owned by the decoder, and the
# encoder/decoder pair is wrapped by Seq2Seq.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

# Move all parameters to `device`; the same device is also handed to Seq2Seq.
model = Seq2Seq(enc, dec, device).to(device)

In [12]:
def init_weights(m: nn.Module):
    """Initialise every parameter reachable from ``m`` in place.

    Bias parameters (any parameter whose name contains ``"bias"``) are set to
    zero; every other parameter is drawn from N(0, 0.01^2). Matching on
    "bias" instead of "weight" matters for bare ``nn.Parameter`` attributes
    such as an attention vector named ``v``: it is not a bias and must not be
    zeroed, otherwise all attention scores start out as exactly zero.

    Args:
        m: Module whose parameters (including those of its submodules) are
            re-initialised.
    """
    for name, param in m.named_parameters():
        if 'bias' in name:
            nn.init.constant_(param, 0)
        else:
            nn.init.normal_(param, mean=0, std=0.01)

# nn.Module.apply calls the given function on every submodule (as returned by
# .children()) as well as on the module itself; it is typically used to
# initialise a model's parameters. As the last expression of the cell, the
# returned module is echoed as the cell output.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=64, bias=True)
    )
    (embedding): Embedding(5893, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(1280, 512)
    (out): Linear(in_features=1792, out_features=5893, bias=True)
  )
)

In [13]:
def count_parameters(model: nn.Module):
    """Return the total number of elements in ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 19,829,893 trainable parameters


## Optimizer

In [14]:
import torch.optim as optim

# Adam over all model parameters; no hyper-parameters are overridden, so the
# library defaults apply.
optimizer = optim.Adam(model.parameters())

## Criterion

In [15]:
# Note: when scoring a translation model, nn.CrossEntropyLoss must be told to
# ignore the positions where the target is only padding, otherwise the model
# would be rewarded for predicting <pad>.
PAD_IDX = TRG.vocab.stoi["<pad>"]

# Field.__init__ defaults include pad_token='<pad>' and unk_token='<unk>', so
# both vocabularies start with the same four special tokens:
# print(SRC.vocab.itos[0], SRC.vocab.itos[1], SRC.vocab.itos[2], SRC.vocab.itos[3])
# print(TRG.vocab.itos[0], TRG.vocab.itos[1], TRG.vocab.itos[2], TRG.vocab.itos[3])
# <unk> <pad> <sos> <eos>
# <unk> <pad> <sos> <eos>

In [16]:
# Cross-entropy over the target vocabulary; targets equal to PAD_IDX are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

## Training and Validating.

In [17]:
def train(model: nn.Module, 
         iterator: BucketIterator, 
         optimizer: optim.Optimizer, 
         criterion: nn.Module, 
         clip: float):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Called as ``model(src, trg)``; returns scores shaped
            [trg_len, batch, vocab].
        iterator: Yields batches exposing ``.src`` and ``.trg`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss taking ``(N, C)`` scores and ``(N,)`` class indices.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    # Training mode switches on train-time behaviour such as Dropout.
    model.train()

    running_loss = 0

    for batch in iterator:
        optimizer.zero_grad()

        # Each batch carries time-major tensors, e.g. src 24x32 and trg 25x32.
        src, trg = batch.src, batch.trg

        logits = model(src, trg)

        # Position 0 is <sos> on both sides, so it is dropped; the remaining
        # steps are flattened to (N, C) scores and (N,) targets.
        vocab_size = logits.shape[-1]
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [18]:
def evaluate(model: nn.Module, 
            iterator: BucketIterator, 
            criterion: nn.Module):
    """Compute the mean per-batch loss without updating the model.

    Args:
        model: Called as ``model(src, trg, 0)``, i.e. with teacher forcing
            turned off.
        iterator: Yields batches exposing ``.src`` and ``.trg`` tensors.
        criterion: Loss taking ``(N, C)`` scores and ``(N,)`` class indices.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    total_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # Teacher-forcing ratio 0: the decoder always consumes its own
            # predictions.
            logits = model(src, trg, 0)

            # Drop the <sos> position and flatten to (N, C) / (N,).
            flat_logits = logits[1:].view(-1, logits.shape[-1])
            flat_targets = trg[1:].view(-1)

            total_loss += criterion(flat_logits, flat_targets).item()

    return total_loss / len(iterator)

In [19]:
def epoch_time(start_time: int, 
              end_time: int):
    """Split the time between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers; any fractional part is truncated.
    """
    delta = end_time - start_time
    return int(delta / 60), int(delta % 60)

In [20]:
import time
import math

N_EPOCHS = 10  # number of full passes over the training data
CLIP = 1       # maximum gradient norm, forwarded to train()

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One training pass followed by one validation pass; the measured time
    # covers both.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Perplexity (PPL) is reported as exp(loss).
    print(f"Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}")

Epoch: 01 | Time: 16m 12s
	Train Loss: 5.307 | Train PPL: 201.833
	 Val. Loss: 5.068 |  Val. PPL: 158.916
Epoch: 02 | Time: 16m 21s
	Train Loss: 4.537 | Train PPL:  93.396
	 Val. Loss: 4.938 |  Val. PPL: 139.452
Epoch: 03 | Time: 16m 9s
	Train Loss: 4.181 | Train PPL:  65.460
	 Val. Loss: 4.676 |  Val. PPL: 107.308


KeyboardInterrupt: 

In [None]:
# Final check on the held-out test split (teacher forcing is off inside
# evaluate); perplexity is exp(loss).
test_loss = evaluate(model, test_iterator, criterion)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |")