<a href="https://colab.research.google.com/github/Dagobert42/NMT-Attention/blob/main/Neural_Machine_Translation_with_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Neural Machine Translation with Attention

The goal of this notebook is to implement the **RNNsearch-50** model, i.e., the encoder-decoder with attention system for any language pair different from English-German, German-English, English-French, and French-English. We choose **German-Italian** as an example language pair.

Furthermore we loosely follow **test-driven development** ("TDD") paradigms to replicate the original system based on the paper:

    Bahdanau, Cho & Bengio. Neural Machine Translation by Jointly Learning to Align and Translate. ICLR 2015.

An in-depth walk-through of the project is given in the accompanying report which can be found here ADD LINK

# 1. Setup

Please choose which dependencies to install to your environment. On re-runs you can ucheck the boxes to save some time.

In [1]:
from IPython.display import clear_output

torch = True #@param {type:"boolean"}
if torch:
    !pip install --upgrade torch

torchtext = True #@param {type:"boolean"}
if torchtext:
    !pip install --upgrade torchtext

spacy_nlp = True #@param {type:"boolean"}
if spacy_nlp:
    !pip install --upgrade spacy

spacy_packages = True #@param {type:"boolean"}
if spacy_packages:
    !python -m spacy download en_core_web_sm
    !python -m spacy download de_core_news_sm
    !python -m spacy download it_core_news_sm

clear_output()

Next let us import all the modules we are going to be using.

In [2]:
# access to translation datasets
import torchtext

# model implementation
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader

# tokenization
import spacy

# utilities
import math
import random
import time
from datetime import datetime
import gc
import uuid

# 2. Data

### 2.1 Vocabulary

The vocabulary converts words to indeces and vice versa. Unknown words are marked with the < U > token. The vocabulary can be filtered by word counts and provides methods for converting between sequences and sentences. It also takes an optional SpaCy NLP object which (when given) is used for tokenization.

In [3]:
class Vocab:
    """
    A vocabulary holding dictionaries for converting words to indeces and back.
    """
    class Entry:
        def __init__(self, id):
            """ An entry to the vocabulary. With index and count. """
            self.id = id
            self.count = 1

        def __repr__(self):
            """ String prepresentation for printing. """
            return str((self.id, self.count))

    def __init__(self, text=None, spacy_nlp=None):
        """
        Creates a vocabulary over an input text in the form of
        dictionaries for indeces and word counts. Hand in a
        spacy_nlp object to make use of a SpaCy for tokenization.
        """
        self.spacy_nlp = spacy_nlp
        self.words = {'<S>':self.Entry(0),
                      '<E>':self.Entry(1),
                      '<U>':self.Entry(2),
                      '<P>':self.Entry(3)}
        self.ids = {0:'<S>',
                    1:'<E>',
                    2:'<U>',
                    3:'<P>'}
        self.next_id = 4

        if text:
            self.append(text)

    def append(self, txt):
        """ Adds a string token by token to the vocabulary. """
        # use SpaCy for tokenization if requested
        if self.spacy_nlp:
            for tok in self.spacy_nlp.tokenizer(txt):
                word = tok.text
                if word not in self.words.keys():
                    self.words[word] = self.Entry(self.next_id)
                    self.ids[self.next_id] = word
                    self.next_id += 1
                else:
                    self.words[word].count += 1
        else:
            for word in txt.split():
                if word not in self.words.keys():
                    self.words[word] = self.Entry(self.next_id)
                    self.ids[self.next_id] = word
                    self.next_id += 1
                else:
                    self.words[word].count += 1

    def filter(self, n_samples, descending=True):
        """
        Reduces this vocabs dictionary to n_samples and
        the 4 special tokens after sorting by word count.
        Indeces remain untouched
        """
        #exclude specials from filter
        del self.words['<S>']
        del self.words['<E>']
        del self.words['<U>']
        del self.words['<P>']
        sorted_list = list(sorted(
                self.words.items(),
                key=lambda item: item[1].count,
                reverse=descending))
        self.words = {k: v for k, v in sorted_list[:n_samples]}
        # add specials back in, in case they were filtered
        self.words['<S>'] = self.Entry(0)
        self.words['<E>'] = self.Entry(1)
        self.words['<U>'] = self.Entry(2)
        self.words['<P>'] = self.Entry(3)

    def get_indeces(self, sentence):
        """ Produces a representation from the indeces in this vocabulary. """
        END = 1
        UNK = 2
        if self.spacy_nlp:
            seq = [self.words[tok.text].id if tok.text in self.words.keys() else UNK
                for tok in self.spacy_nlp.tokenizer(sentence)]
        else:
            seq = [self.words[word].id if word in self.words.keys() else UNK
                for word in sentence.split()]
        seq.append(END)
        return seq
    
    def get_sentence(self, indeces):
        """
        Converts a list of indeces into a readable sentence
        using words from this vocabulary.
        """
        return ' '.join([self.ids[id] for id in indeces])

We make sure the vocabulary works as intended by running tests with a tiny corpus. Note that during development these tests were written **first** and subsequently we implemented the related functionality in the class.

In [4]:
TEST_CORPUS_EN = """The final project should implement a system 
        related to deep learning for NLP using the Py- Torch library 
        and test it. The project is documented in an ACL-style paper 
        that adheres to the standards of practice in computational 
        linguistics."""

test_vocab = Vocab(TEST_CORPUS_EN)

# Basic Functionality
assert(test_vocab.ids[0] == '<S>') # <S> should always be first
assert(test_vocab.ids[4] == 'The')
assert(test_vocab.words['The'].id == 4)
assert(test_vocab.words['The'].count == 2)

# Spacy Option
english_nlp = spacy.load('en_core_web_sm')
spacy_vocab = Vocab(TEST_CORPUS_EN, spacy_nlp=english_nlp)

assert(spacy_vocab.ids[0] == '<S>')
assert(spacy_vocab.ids[6] == 'project')
assert(spacy_vocab.words['project'].id == 6)
assert(spacy_vocab.words['project'].count == 2)

# Vocabulary Filter
top_30 = test_vocab
top_30.filter(n_samples=30)
# new size should be n_samples + 4 specials
assert(len(top_30.words) == 34)

# Sentence Vector Conversion
vec = spacy_vocab.get_indeces("This is a test")
sent = spacy_vocab.get_sentence(vec)

assert(vec == [2, 27, 9, 24, 1])
assert(sent == '<U> is a test <E>')

print('OK. No errors or asserts were triggered during testing :)')

OK. No errors or asserts were triggered during testing :)


### 2.2 Dataset

For data we look to the Web Inventory of Transcribed and Translated Talks which comes as a torchtext dataset.  You can see some examples during the building process. On re-runs you can ucheck the box to save some time.

In [5]:
from torchtext.datasets import IWSLT2017
LOAD_DATA = True #@param {type:"boolean"} 
#@markdown (ETA 2:15 min)
if LOAD_DATA:
    train_iter, test_iter, val_iter = IWSLT2017(
        split=('train', 'test', 'valid'),
        language_pair=('de', 'it'))
    
    train_pairs = [(src, trg) for src, trg in train_iter]
    test_pairs = [(src, trg) for src, trg in test_iter]
    val_pairs = [(src, trg) for src, trg in val_iter]

print('sentence pairs in...')
print('...train:', len(train_pairs), end='\t')
print('...test:', len(test_pairs), end='\t')
print('...validation:', len(val_pairs))

2017-01-trnmted.tgz: 329MB [00:19, 16.5MB/s]


sentence pairs in...
...train: 205465	...test: 1567	...validation: 923


In [6]:
BUILD_VOCAB = True #@param {type:"boolean"}
if BUILD_VOCAB:

    random.shuffle(train_pairs)
    TRAIN_EXAMPLES = 50000 #@param {type:"integer"}
    if TRAIN_EXAMPLES < len(train_pairs):
        train_pairs = train_pairs[:TRAIN_EXAMPLES]

    de_nlp = spacy.load('de_core_news_sm')
    it_nlp = spacy.load('it_core_news_sm')
    de_vocab = Vocab(spacy_nlp=de_nlp)
    it_vocab = Vocab(spacy_nlp=it_nlp)

    PRINT_EVERY = 10000
    for n, (src, trg) in enumerate(train_pairs):
        if n % PRINT_EVERY == 0:
            clear_output()
            print("Building vocabularies...\n")
            print(f'Example: {n}\n')
            print('source ->', src)
            print('target ->', trg)
        de_vocab.append(src)
        it_vocab.append(trg)

    VOCAB_SIZE = 10000 #@param {type:"integer"}
    de_vocab.filter(VOCAB_SIZE)
    it_vocab.filter(VOCAB_SIZE)

    def get_tensors(sentence_pairs, src_vocab, trg_vocab):
        data = []
        for n, (src, trg) in enumerate(sentence_pairs):
            de_tensor = torch.tensor(src_vocab.get_indeces(src),
                                    dtype=torch.long)
            it_tensor = torch.tensor(trg_vocab.get_indeces(trg),
                                    dtype=torch.long)
            if n % PRINT_EVERY == 0:
                clear_output()
                print("Converting to tensors...\n")
                print(f'Example: {n}\n')
                print('source ->', de_tensor)
                print('target ->', it_tensor)
            data.append((de_tensor, it_tensor))
        return data
        
    train_data = get_tensors(train_pairs, de_vocab, it_vocab)
    test_data = get_tensors(test_pairs, de_vocab, it_vocab)
    val_data = get_tensors(val_pairs, de_vocab, it_vocab)
    clear_output()

# print a random example
#@markdown (ETA 1:45 min)
x = random.randrange(0, len(train_pairs))
print(f'Example: {x}\n')
print('source ->', train_pairs[x][0])
print('target ->', train_pairs[x][1])
print("As tensors...\n")
print('source ->', train_data[x][0], '\n')
print('target ->', train_data[x][1])

Example: 40582

source -> Und jetzt schließt sich der Kreis.

target -> Stiamo per chiudere il cerchio.

As tensors...

source -> tensor([   4,  331, 8826,   47,   66, 6464,   26,   27,    1]) 

target -> tensor([2018,   46,  425,  114, 6528,   27,   28,    1])


# 3. Model

The network consists of three main parts. These are the Encoder, Decoder and Alignment models which are ultimately combined to form the RNNsearch model.

## 3.1 Encoder

**Inputs:** sequence of one-hot vectors representing a sentence

**Outputs:** series of annotations, most recent GRU hidden states

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.biRNN = nn.GRU(embedding_dim, hidden_dim, bidirectional = True)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        outputs, hidden = self.biRNN(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        return outputs, hidden

## 3.2 Attention

**Inputs:** encoder annotations, recent encoder hidden states

**Outputs:** attention energies over the input sequence

In [8]:
class Attention(nn.Module):
    def __init__(self, hidden_dim, attention_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim * 3, attention_dim)

    def forward(self, decoder_hidden, encoder_outputs):
        src_len = encoder_outputs.shape[0]
        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attention(torch.cat((
            repeated_decoder_hidden,
            encoder_outputs),
            dim = 2)))
        attention = torch.sum(energy, dim=2)
        return F.softmax(attention, dim=1)

## 3.3 Decoder

**Input:** encoder annotations, recent encoder hidden states

**Outputs:** current prediction, hidden states

In [9]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, attention_dim, dropout):
        super().__init__()
        self.attention = Attention(hidden_dim, attention_dim)
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((hidden_dim * 2) + embedding_dim, hidden_dim)
        self.out = nn.Linear((hidden_dim * 3) + embedding_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def apply_attention(self, decoder_hidden, encoder_outputs):
        a = self.attention(decoder_hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted_encoder_rep = torch.bmm(a, encoder_outputs)
        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)
        return weighted_encoder_rep

    def forward(self, input, decoder_hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        weighted_encoder_rep = self.apply_attention(decoder_hidden,
                                                    encoder_outputs)
        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)
        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)
        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim = 1))
        return output, decoder_hidden.squeeze(0)

## 3.4 Final Model

**Input:** encoded source sequence, empty target sequence

**Outputs:** encoded translation

In [10]:
class Translator(nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, hidden_dim, attention_dim, dropout, device):
        super().__init__()
        self.target_vocab_size = output_dim
        self.encoder = Encoder(input_dim, embedding_dim, hidden_dim, dropout)
        self.decoder = Decoder(output_dim, embedding_dim, hidden_dim, attention_dim, dropout)
        self.device = device

    def forward(self, x, y, teacher_forcing_ratio=0.5):
        batch_size = x.shape[1]
        max_len = y.shape[0]
        outputs = torch.zeros(max_len, batch_size, self.target_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(x)
        
        # first input to the decoder is the <S> token
        output = y[0,:]
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (y[t] if teacher_force else top1)
        return outputs

# 4. Training

## 4.2 Batching

In the original training method 1600 sentence pairs are retrieved at every 20th update and sorted by length. Then 20 new mini-batches with 80 sentence pairs each are prepared from this data. This keeps the average length of batches comparably low since each batch is always the length of its longest sequence while all of its smaller sequences are being padded with the special token < P >.

In [11]:
RESORT_DATA = True #@param {type:"boolean"}
if RESORT_DATA:

    #filter input sentences for max length
    MAX_SENT_LENGTH = 50 #@param {type:"integer"}
    train_data_pruned = []
    for pair in train_data:
        if pair[0].size()[0] < MAX_SENT_LENGTH:
            train_data_pruned.append(pair)
    print(f'Sentence pairs of size < {MAX_SENT_LENGTH}:',
          len(train_data_pruned))

    train_sorted = list(sorted(
                    train_data_pruned,
                    key=lambda pair: pair[0].size()[0],
                    reverse=True))

    test_sorted = list(sorted(
                    test_data,
                    key=lambda pair: pair[0].size()[0],
                    reverse=True))

    val_sorted = list(sorted(
                    val_data,
                    key=lambda pair: pair[0].size()[0],
                    reverse=True))

Sentence pairs of size < 50: 47970


In [12]:
def prepare_batches(data):
    """ Returns mini-batches from sorted data. """
    STA = 0.0
    PAD = 3.0
    de_batch = []
    it_batch = []
    for de_item, it_item in data:
        de_batch.append(torch.cat([torch.tensor([STA]), de_item], dim=0))
        it_batch.append(torch.cat([torch.tensor([STA]), it_item], dim=0))
    de_batch_padded = pad_sequence(de_batch, padding_value=PAD)
    it_batch_padded = pad_sequence(it_batch, padding_value=PAD)
    return de_batch_padded, it_batch_padded


BATCH_SIZE = 16 #@param {type:"integer"}
train_batches = DataLoader(train_sorted, batch_size=BATCH_SIZE,
                           collate_fn=prepare_batches)
test_batches = DataLoader(test_sorted, batch_size=BATCH_SIZE,
                          collate_fn=prepare_batches)
val_batches = DataLoader(val_sorted, batch_size=BATCH_SIZE,
                         collate_fn=prepare_batches)

## 4.2 Hyperparameters

For our purposes we can adjust the hyperparameters to more reasonably expensive values in terms of training time.

In [13]:
#@markdown Model Parameters
EMBEDDING_DIM =  64#@param {type:"integer"}
HIDDEN_DIM =  128#@param {type:"integer"}
ATTENION_DIM =  64#@param {type:"integer"}
DROPOUT = 0.5 #@param {type:"number"}
#@markdown Training Parameters
EPOCHS = 5 #@param {type:"integer"}
RHO = 0.95 #@param {type:"number"}
EPSILON = 0.000001 #@param {type:"number"}
LEARNING_RATE = 0.01 #@param {type:"number"}
GRADIENT_CLIP = 1 #@param {type:"number"}

## 4.3 Model Declaration

Lets check the model that we can declare with these hyperparameters.

In [14]:
def init_weights(model):
    for name, param in model.named_parameters():
        # recurrent layer weights
        if 'weight_ih_l' in name or 'weight_hh_l' in name:
            nn.init.orthogonal_(param.data)
        # recurrent layer biases
        elif 'bias_ih_l' in name or 'bias_hh_l' in name:
            nn.init.constant_(param.data, 0)
        elif 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        elif 'bias' in name:
            nn.init.constant_(param.data, 0)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()

input_dim = len(de_vocab.words)
output_dim = len(it_vocab.words)

model = Translator(input_dim, output_dim, EMBEDDING_DIM, HIDDEN_DIM, ATTENION_DIM, DROPOUT, device).to(device)
model.apply(init_weights)
print("Initialized weights...")

n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {n_parameters} trainable parameters:')
print(model)

use_adadelta = False #@param {type:"boolean"}
if use_adadelta:
    optimizer = optim.Adadelta(model.parameters(), lr=1.0, rho=RHO, eps=EPSILON)
else:
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_func = nn.CrossEntropyLoss(ignore_index=de_vocab.words['<P>'].id)

Initialized weights...
The model has 6151636 trainable parameters:
Translator(
  (encoder): Encoder(
    (embedding): Embedding(10004, 64)
    (biRNN): GRU(64, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attention): Linear(in_features=384, out_features=64, bias=True)
    )
    (embedding): Embedding(10004, 64)
    (rnn): GRU(320, 128)
    (out): Linear(in_features=448, out_features=10004, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


## 4.4 Train Loop

We initialize the weights as proposed. Setting all biases to 0, the recurrent weight matrices to random orthogonal matrices and sampling all other weights from normal distribution.

In [16]:
def train(model, train_batches, optimizer, loss_func, gradient_clip):
    epoch_loss = 0
    n_batches = len(train_batches)
    model.train()

    for batch, (x, y) in enumerate(train_batches):
        x = x.to(device=device, dtype=torch.long)
        y = y.to(device=device, dtype=torch.long)

        optimizer.zero_grad()

        output = model(x, y)
        output = output[1:].view(-1, output.shape[-1])
        y = y[1:].view(-1)

        loss = loss_func(output, y)

        loss.backward()
        clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()
        epoch_loss += loss.item()
        
        # indicate progress
        BAR_SIZE = 20
        i = math.ceil(batch/n_batches * BAR_SIZE)
        print('\r', '#' * i, ' ' * (BAR_SIZE-i), '{:.2f}%'.format(batch/n_batches * 100.0), end=' ')

    return epoch_loss / n_batches

In [17]:
def evaluate(model, val_batches, loss_func):
    epoch_loss = 0
    model.eval()
    n_batches = len(train_batches)

    with torch.no_grad():
        for batch, (x, y) in enumerate(val_batches):
            x, trg = x.to(device), y.to(device)

            output = model(x, y, 0) # no teacher forcing

            output = output[1:].view(-1, output.shape[-1])
            y = y[1:].view(-1)

            loss = loss_func(output, y)
            epoch_loss += loss.item()

            # indicate progress
            BAR_SIZE = 20
            i = math.ceil(batch/n_batches * BAR_SIZE)
            print('\r', '#' * i, ' ' * (BAR_SIZE-i), '{:.2f}%'.format(batch/n_batches * 100.0), end=' ')

    return epoch_loss / len(val_batches)

In [18]:
def time_since(start):
    s = time.time() - start
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [19]:
print("Begin training...")
start_time = time.time()
for epoch in range(EPOCHS):
    print(f'[Epoch: {epoch+1}] @', datetime.now().strftime("%H:%M:%S"))
    train_loss = train(model, train_batches, optimizer, loss_func, GRADIENT_CLIP)
    print('\n[time elapsed: {} loss: {:.4f}]'.format(time_since(start_time), epoch, train_loss))
print("Training finished...")

Begin training...
[Epoch: 1] @ 15:42:50


RuntimeError: ignored

In [None]:
id = uuid.uuid1() + '.pt'
torch.save(model.state_dict(), id)

# 5. Evaluation

In [None]:
print("Begin evaluation...")
start_time = time.time()
test_loss = evaluate(model, test_batches, loss_func)
print(f'Test loss: {test_loss:.3f}')

In [None]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<E>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)