# Translating with Recurrent Neural Networks

This notebook provides a complete code example that implements a sequence-to-sequence (seq2seq) model for machine translation using recurrent neural networks.

## Building the Vocabularies

Implement a function to tokenize and standardize text ...

In [1]:
import contractions, re
from torchtext.data.utils import get_tokenizer

# spaCy-backed tokenizers, one per language; `tokenize` below selects between
# them. Presumably the named spaCy models must be installed beforehand.
tokenizer_eng = get_tokenizer("spacy", language="en_core_web_sm")
tokenizer_spa = get_tokenizer("spacy", language="es_core_news_sm")

def tokenize(text, lang):
    """Standardize, tokenize and filter text.

    Parameters
    ----------
    text : str
        Raw sentence.
    lang : str
        "eng" expands contractions and uses the English tokenizer; any other
        value (expected: "spa") uses the Spanish tokenizer.

    Returns
    -------
    list
        Tokens made only of the allowed characters, optionally followed by
        hyphen-joined and then underscore-joined groups of such characters.
        All other tokens are dropped.
    """
    # Normalize typographic quotes. The doubled acute accent has to be handled
    # BEFORE the single one; otherwise it is consumed as two apostrophes and
    # the double-quote rule can never match.
    text = (text.replace("´´", '"').replace("“", '"').replace("”", '"')
            .replace("’", "'").replace("‘", "'").replace("´", "'"))
    tokens = (tokenizer_eng(contractions.fix(text)) if lang == "eng"
              else tokenizer_spa(text))  # lang == "spa"
    filtered_tokens = [token for token in tokens if re.match(
        r"""
        ^[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+  # 1+ allowed characters.
        (-[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+)*  # Optional hyphen plus chars.
        (_[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+)*  # Optional underscore plus chars.
        $  # End of the string.
        """, token, re.VERBOSE)]
    return filtered_tokens

... a function to read and tokenize sentences by iterating through a corpus file ...

In [2]:
def corpus_iterator(filename, lang, lang_position):
    """Lazily yield one token list per line of a tab-separated corpus file.

    Every line of the UTF-8 file is stripped and split on tabs; the field at
    index `lang_position` is passed to `tokenize` together with `lang`.
    """
    with open(filename, "r", encoding="utf-8") as corpus:
        for raw_line in corpus:
            fields = raw_line.strip().split("\t")
            yield tokenize(fields[lang_position], lang)

... a function to build a vocabulary from a corpus file ...

In [3]:
from torchtext.vocab import build_vocab_from_iterator

def build_vocab(filename, lang, lang_position, specials="<unk>", min_freq=5):
    """Build a vocabulary from one language column of a corpus file.

    Parameters
    ----------
    filename : str
        Path of the tab-separated corpus file.
    lang : str
        Language code forwarded to the tokenizer.
    lang_position : int
        Index of the tab-separated field holding this language.
    specials : str or sequence of str
        Special token(s) passed to the vocabulary builder. The LAST one is
        used as the default (out-of-vocabulary) token.
    min_freq : int
        Minimum frequency forwarded to the vocabulary builder.

    Returns
    -------
    The vocabulary object, with its default index set to the last special
    token.
    """
    # A bare string must be wrapped: otherwise `specials[-1]` would be the
    # single character ">" instead of the whole token, and the builder would
    # receive a string where it expects a list of tokens.
    if isinstance(specials, str):
        specials = [specials]
    else:
        specials = list(specials)

    vocab = build_vocab_from_iterator(
        corpus_iterator(filename, lang, lang_position),
        min_freq=min_freq,
        specials=specials,
    )
    vocab.set_default_index(vocab[specials[-1]])
    return vocab

... and build the vocabularies.

In [4]:
# Source/target language codes and the tab-separated corpus file.
in_lang, out_lang = "eng", "spa"
filename = "eng-spa.txt"

# The last entry ("<unk>") becomes the default token inside build_vocab.
# Presumably the list order also fixes the indices 0..3 used further down
# (e.g. index 1 for <sos>) - confirm against the vocabulary builder.
special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]

in_vocab = build_vocab(
    filename, in_lang, lang_position=0, specials=special_tokens,
)
out_vocab = build_vocab(
    filename, out_lang, lang_position=1, specials=special_tokens,
)

## Preprocessing the Data

Implement a function to check if all words in a sentence are present in a vocabulary ...

In [5]:
def all_words_in_vocab(sentence, vocab):
    """Return True when every word of `sentence` is contained in `vocab`."""
    for word in sentence:
        if word not in vocab:
            return False
    return True

... a function to pad a sequence of tokens ...

In [6]:
def pad(tokens, max_length=10):
    """Wrap tokens in "<sos>"/"<eos>" and right-fill with "<pad>".

    The result has `max_length + 2` entries when `len(tokens) <= max_length`;
    longer inputs are not truncated and receive no padding.
    """
    padded_tokens = ["<sos>", *tokens, "<eos>"]
    padded_tokens.extend("<pad>" for _ in range(max_length - len(tokens)))
    return padded_tokens

... a function to process the language corpus ...

In [7]:
import numpy as np

def process(file, in_lang, out_lang, in_vocab, out_vocab, max_length=10):
    """Convert a parallel corpus into padded index sequences.

    Parameters
    ----------
    file : iterable of str
        Lines with the input sentence in the first tab-separated field and
        the output sentence in the second.
    in_lang, out_lang : str
        Language codes forwarded to `tokenize`.
    in_vocab, out_vocab :
        Vocabularies; must support `word in vocab` and `vocab(list_of_tokens)`.
    max_length : int
        Maximum number of tokens (before adding <sos>/<eos>) per sentence.

    Returns
    -------
    tuple of np.ndarray
        Input and output index sequences. A sentence pair is kept only when
        both sides have at most `max_length` tokens and contain no
        out-of-vocabulary word; each kept row has `max_length + 2` entries.
    """
    in_sequences, out_sequences = [], []
    for line in file:
        texts = line.strip().split("\t")
        if len(texts) < 2:
            # Blank or malformed line: there is no sentence pair to process.
            continue

        in_tokens = tokenize(texts[0], in_lang)
        out_tokens = tokenize(texts[1], out_lang)

        if len(in_tokens) > max_length or len(out_tokens) > max_length:
            continue
        if not (all_words_in_vocab(in_tokens, in_vocab)
                and all_words_in_vocab(out_tokens, out_vocab)):
            continue

        # Pad to the SAME max_length used by the filter above, so that every
        # row has equal length whatever value the caller chose.
        in_sequences.append(in_vocab(pad(in_tokens, max_length)))
        out_sequences.append(out_vocab(pad(out_tokens, max_length)))

    return np.array(in_sequences), np.array(out_sequences)

... and build the datasets and data loaders.

In [8]:
import deeptrack as dt
import torch
from torch.utils.data import DataLoader

# Tokenize, filter and index every sentence pair of the corpus.
with open(filename, "r", encoding="utf-8") as file:
    in_sequences, out_sequences = process(
        file, in_lang, out_lang, in_vocab, out_vocab,
    )

# Pair the sequences and split them 85 % / 15 % at random.
sources = dt.sources.Source(inputs=in_sequences, targets=out_sequences)
train_sources, test_sources = dt.sources.random_split(sources, [0.85, 0.15])

# Pipelines turning the stored sequences into integer tensors.
inputs_pip = dt.Value(sources.inputs) >> dt.pytorch.ToTensor(dtype=torch.int)
outputs_pip = dt.Value(sources.targets) >> dt.pytorch.ToTensor(dtype=torch.int)

train_dataset = dt.pytorch.Dataset(
    inputs_pip & outputs_pip, inputs=train_sources,
)
test_dataset = dt.pytorch.Dataset(
    inputs_pip & outputs_pip, inputs=test_sources,
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

## Implementing and Training the Sequence-to-Sequence Architecture

Implement the encoder ...

In [9]:
import deeplay as dl

class Seq2SeqEncoder(dl.DeeplayModule):
    """Sequence-to-sequence encoder: token embedding followed by a GRU."""

    def __init__(self, vocab_size, in_features=300, hidden_features=128,
                 hidden_layers=1, dropout=0.0, bidirectional=True):
        """Initialize sequence-to-sequence encoder.

        Parameters
        ----------
        vocab_size : int
            Number of rows of the embedding table.
        in_features : int
            Embedding dimension (GRU input size).
        hidden_features : int
            GRU hidden size.
        hidden_layers : int
            Number of stacked GRU layers.
        dropout : float
            Dropout between GRU layers; forced to 0 for a single layer.
        bidirectional : bool
            Whether the GRU reads the sequence in both directions.
        """
        super().__init__()

        self.hidden_features = hidden_features
        self.hidden_layers = hidden_layers
        self.bidirectional = bidirectional

        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            torch.nn.GRU, input_size=in_features,
            hidden_size=hidden_features, num_layers=hidden_layers,
            dropout=(0 if hidden_layers == 1 else dropout),
            bidirectional=bidirectional, batch_first=True,
        )

    def forward(self, in_sequences, hidden=None):
        """Perform forward pass.

        Returns the per-step GRU outputs and the final hidden state. In the
        bidirectional case the two directions of the output are summed so its
        last dimension is `hidden_features`, and only the forward-direction
        final state of each layer is returned.
        """
        in_embedding = self.embedding(in_sequences)
        encoder_output, hidden = self.rnn(in_embedding, hidden)
        if self.bidirectional:
            encoder_output = (encoder_output[:, :, :self.hidden_features]
                              + encoder_output[:, :, self.hidden_features:])
            # The final hidden state is ordered layer by layer with the two
            # directions interleaved (layer0 fwd, layer0 bwd, layer1 fwd, ...).
            # Taking every second row keeps the forward state of EACH layer;
            # a plain `[:hidden_layers]` slice is only right for one layer.
            hidden = hidden[0::2].contiguous()
        return encoder_output, hidden

... implement the decoder ...

In [10]:
class Seq2SeqDecoder(dl.DeeplayModule):
    """Sequence-to-sequence decoder: embedding, ReLU, GRU, dense and softmax."""

    def __init__(self, vocab_size, in_features=300, hidden_features=128, 
                 hidden_layers=1, dropout=0.0):
        """Create the decoder layers.

        `dropout` is only applied between GRU layers and is forced to 0 when
        there is a single layer.
        """
        super().__init__()

        rnn_dropout = 0 if hidden_layers == 1 else dropout

        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            torch.nn.GRU,
            input_size=in_features,
            hidden_size=hidden_features,
            num_layers=hidden_layers,
            bidirectional=False,
            batch_first=True,
            dropout=rnn_dropout,
        )
        # One score per vocabulary entry; the softmax below turns the scores
        # into a probability distribution (not a one-hot vector).
        self.dense = dl.Layer(torch.nn.Linear, hidden_features, vocab_size)
        self.softmax = dl.Layer(torch.nn.Softmax, dim=-1)
        self.relu = dl.Layer(torch.nn.ReLU)

    def forward(self, out_tokens, hidden):
        """Return token probabilities and the updated hidden state."""
        activated_embeddings = self.relu(self.embedding(out_tokens))
        rnn_output, hidden = self.rnn(activated_embeddings, hidden)
        token_probs = self.softmax(self.dense(rnn_output))
        return token_probs, hidden

... implement the full seq2seq model combining the encoder and decoder ...

In [11]:
class Seq2SeqModel(dl.DeeplayModule):
    """Sequence-to-sequence model combining an encoder and a decoder."""

    def __init__(self, in_vocab_size=None, out_vocab_size=None,
                 teacher_prob=1.0, embedding_dim=300, hidden_features=128,
                 hidden_layers=1, dropout=0.0, bidirectional=True):
        """Initialize the sequence-to-sequence model.

        Parameters
        ----------
        in_vocab_size, out_vocab_size : int
            Sizes of the input and output vocabularies.
        teacher_prob : float
            Probability, at each decoding step after the first, of feeding
            the ground-truth token instead of the previous prediction.
        embedding_dim, hidden_features, hidden_layers, dropout, bidirectional
            Forwarded to the encoder (and, except `bidirectional`, to the
            decoder).
        """
        super().__init__()

        self.in_vocab_size, self.out_vocab_size = in_vocab_size, out_vocab_size
        self.teacher_prob = teacher_prob
        self.encoder = Seq2SeqEncoder(
            in_vocab_size, embedding_dim, hidden_features, hidden_layers,
            dropout, bidirectional,
        )
        self.decoder = Seq2SeqDecoder(
            out_vocab_size, embedding_dim, hidden_features, hidden_layers,
            dropout,
        )

    def forward(self, batch):
        """Perform forward pass with (probabilistic) teacher forcing.

        `batch` is a pair `(in_sequences, out_sequences)` of 2-D index
        tensors. Returns a tensor of shape
        `(num_sequences, sequence_length, out_vocab_size)` holding the decoder
        output of every step.
        """
        in_sequences, out_sequences = batch
        num_sequences, sequence_length = in_sequences.size()
        device = next(self.encoder.parameters()).device

        # The encoder's final hidden state initializes the decoder.
        _, decoder_hidden = self.encoder(in_sequences)
        decoder_outputs = torch.zeros(
            num_sequences, sequence_length, self.out_vocab_size, device=device,
        )

        for t in range(sequence_length):
            if t == 0 or np.random.rand() < self.teacher_prob:
                # Teacher forcing: feed the ground-truth token of step t.
                decoder_input = out_sequences[:, t].unsqueeze(-1).to(device)
            else:
                # Feed back the most likely token of the previous step;
                # detached so no gradient flows through the choice.
                _, top_decoder_output = decoder_output.topk(1)
                decoder_input = \
                    top_decoder_output.squeeze(-1).detach().to(device)

            decoder_output, decoder_hidden = \
                self.decoder(decoder_input, decoder_hidden)
            decoder_outputs[:, t, :] = decoder_output.squeeze(1)
        return decoder_outputs

    def evaluate(self, in_sequences, sos_index=1):
        """Greedily decode a batch of input sequences.

        Parameters
        ----------
        in_sequences : torch.Tensor
            2-D tensor of input token indices.
        sos_index : int
            Index of the start-of-sequence token fed at the first step.

        Returns
        -------
        torch.Tensor
            Integer (long) tensor with the same shape as `in_sequences`
            holding the most likely output token at every step.
        """
        num_sequences, sequence_length = in_sequences.size()
        device = next(self.encoder.parameters()).device

        # Token indices are integers: store them in a long tensor rather
        # than in the default float tensor.
        out_sequences = torch.zeros(
            num_sequences, sequence_length, dtype=torch.long, device=device,
        )

        with torch.no_grad():
            _, decoder_hidden = self.encoder(in_sequences.to(device))
            decoder_input = torch.full(
                size=(num_sequences, 1), fill_value=sos_index,
                dtype=torch.long, device=device,
            )
            for t in range(sequence_length):
                decoder_output, decoder_hidden = \
                    self.decoder(decoder_input, decoder_hidden)
                _, top_decoder_output = decoder_output.topk(1)
                # Drop only the trailing top-k axis so the input keeps its
                # (num_sequences, 1) shape even for a single sentence.
                decoder_input = top_decoder_output.squeeze(-1)
                out_sequences[:, t] = decoder_input[:, 0]
        return out_sequences

... define the loss function ...

In [12]:
def maskedNLL(pred_sequences, target_sequences, padding=0):
    """Calculate the masked negative log-likelihood (NLL) loss.

    Parameters
    ----------
    pred_sequences : torch.Tensor
        Predicted probabilities; the last dimension indexes the vocabulary.
    target_sequences : torch.Tensor
        Integer target token indices, one per prediction.
    padding : int
        Target index to ignore in the loss.

    Returns
    -------
    torch.Tensor
        Scalar mean of `-log(p[target])` over the non-padding positions.
    """
    vocab_size = pred_sequences.shape[-1]
    # reshape (not view) so that non-contiguous inputs are accepted as well.
    flat_pred_sequences = pred_sequences.reshape(-1, vocab_size)
    # gather expects an int64 index; the targets may arrive as int32.
    flat_target_sequences = target_sequences.reshape(-1, 1).long()
    pred_probs = torch.gather(flat_pred_sequences, 1, flat_target_sequences)

    # Clamp away exact zeros so that log() cannot return -inf.
    smallest = torch.finfo(pred_probs.dtype).tiny
    nll = - torch.log(pred_probs.clamp_min(smallest))

    mask = flat_target_sequences != padding
    masked_nll = nll.masked_select(mask)

    return masked_nll.mean()  # Loss.

... implement the sequence-to-sequence application ...

In [13]:
class Seq2Seq(dl.Application):
    """Application wrapping the sequence-to-sequence model for training."""

    def __init__(self, in_vocab, out_vocab, teacher_prob=1.0):
        """Set up loss and optimizer, and size the model from the vocabularies."""
        super().__init__(loss=maskedNLL, optimizer=dl.Adam(lr=1e-3))
        self.model = Seq2SeqModel(
            in_vocab_size=len(in_vocab),
            out_vocab_size=len(out_vocab),
            teacher_prob=teacher_prob,
        )

    def train_preprocess(self, batch):
        """Build the training target: the output sequence moved one step ahead.

        The target at position t is the output token at position t + 1. The
        last column is repeated so the length is unchanged; presumably that
        repeated token is <pad> unless the sentence fills the whole sequence,
        in which case it is <eos>.
        """
        in_sequences, out_sequences = batch
        next_tokens = out_sequences[:, 1:]
        repeated_last_token = out_sequences[:, -1:]
        shifted_out_sequences = torch.cat(
            (next_tokens, repeated_last_token), dim=1,
        )
        return (in_sequences, out_sequences), shifted_out_sequences
    
    def forward(self, batch):
        """Delegate the forward pass to the wrapped model."""
        return self.model(batch)

... load some pretrained embeddings ...

In [14]:
from torchtext.vocab import GloVe

embedding_dim = 300
glove = GloVe(name="42B", dim=embedding_dim, cache="./.glove_cache")

# One pretrained vector per vocabulary entry, in vocabulary order.
glove_embeddings_in = glove.get_vecs_by_tokens(
    in_vocab.get_itos(), lower_case_backup=True,
)
glove_embeddings_out = glove.get_vecs_by_tokens(
    out_vocab.get_itos(), lower_case_backup=True,
)

# Rows 1 .. num_special_tokens - 1 are overwritten with small random vectors;
# row 0 is left as returned by GloVe.
num_special_tokens = len(special_tokens)
special_rows = slice(1, num_special_tokens)
special_shape = (num_special_tokens - 1, embedding_dim)
glove_embeddings_in[special_rows] = torch.rand(special_shape) * 0.01
glove_embeddings_out[special_rows] = torch.rand(special_shape) * 0.01

INFO:torchtext.vocab.vectors:Loading vectors from ./.glove_cache/glove.42B.300d.txt.pt


... instantiate the seq2seq model ...

In [15]:
seq2seq = Seq2Seq(
    in_vocab=in_vocab, out_vocab=out_vocab, teacher_prob=0.85,
).create()

# Load the pretrained vectors into both embedding layers and freeze them.
for embedding, pretrained in (
    (seq2seq.model.encoder.embedding, glove_embeddings_in),
    (seq2seq.model.decoder.embedding, glove_embeddings_out),
):
    embedding.weight.data = pretrained
    embedding.weight.requires_grad = False

... and train the model ...

In [16]:
# Train on the training loader only; no validation loader is passed.
trainer = dl.Trainer(
    max_epochs=25,
    accelerator="auto",
)
trainer.fit(seq2seq, train_loader)

/Users/giovannivolpe/Documents/GitHub/DeepLearningCrashCourse/py_env_dlcc/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/Users/giovannivolpe/Documents/GitHub/DeepLearningCrashCourse/py_env_dlcc/lib/python3.12/site-packages/lightning/pytorch/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.


Output()

/Users/giovannivolpe/Documents/GitHub/DeepLearningCrashCourse/py_env_dlcc/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.


## Testing the Model Performance

Implement a function to convert numerical sequences into their corresponding text ...

In [17]:
def unprocess(sequences, vocab, eos_index=2):
    """Convert numeric sequences to sentences.

    Parameters
    ----------
    sequences : iterable
        Index sequences (e.g. rows of a 2-D tensor); each must provide
        `.tolist()`.
    vocab :
        Vocabulary providing `lookup_token(index)`.
    eos_index : int
        Index of the end-of-sequence token. Decoding of a sequence stops at
        its first occurrence, and indices up to `eos_index` (assumed to be
        the <pad>/<sos>/<eos> special tokens) are never emitted.

    Returns
    -------
    list of str
        One space-joined sentence per input sequence.
    """
    sentences = []
    for sequence in sequences:
        words = []
        for idx in sequence.tolist():
            # Plain Python ints keep the lookup independent of the tensor
            # dtype (predicted sequences may be stored as floats).
            idx = int(idx)
            if idx == eos_index:
                # Whatever the decoder produced after <eos> is not part of
                # the sentence.
                break
            if idx > eos_index:
                words.append(vocab.lookup_token(idx))
        sentences.append(" ".join(words))
    return sentences

... a function to translate user-defined sentences ...

In [18]:
def translate(in_sentence, model, in_lang, in_vocab, out_vocab):
    """Print the model's translation of a single sentence.

    The sentence is tokenized, padded and indexed with `in_vocab`, decoded
    with `model.evaluate`, and converted back to text with `out_vocab`.
    Nothing is returned.
    """
    device = next(model.parameters()).device
    token_ids = in_vocab(pad(tokenize(in_sentence, in_lang)))
    # Add a leading batch axis of size 1 before moving to the model's device.
    in_sequence = torch.tensor(token_ids, dtype=torch.int).unsqueeze(0)
    pred_sequence = model.evaluate(in_sequence.to(device))
    pred_sentence = unprocess(pred_sequence, out_vocab)
    print(f"Predicted Translation: {pred_sentence[0]}\n")

... try to translate a simple sentence ...

In [19]:
# Short sentence as a first check of the trained model.
in_sentence = "I bought a book."
translate(in_sentence, seq2seq.model, in_lang, in_vocab, out_vocab)

Predicted Translation: Compré un libro .



... another simple sentence ...

In [20]:
# Second short sentence.
in_sentence = "This book is very interesting."
translate(in_sentence, seq2seq.model, in_lang, in_vocab, out_vocab)

Predicted Translation: Este libro es muy interesante .



... and a more complex one ...

In [21]:
# Longer sentence containing a relative clause.
in_sentence = "The book that I bought is very interesting."
translate(in_sentence, seq2seq.model, in_lang, in_vocab, out_vocab)

Predicted Translation: El libro que compré muy interesante .



## Evaluating the Model with the BLEU Score

In [22]:
from torchmetrics.text import BLEUScore

bleu_score = BLEUScore()

device = next(seq2seq.model.parameters()).device
for in_sequences, out_sequences in test_loader:
    # Move the batch to the model's device once and reuse it.
    batch_inputs = in_sequences.to(device)

    in_sentences = unprocess(batch_inputs, in_vocab)
    pred_sequences = seq2seq.model.evaluate(batch_inputs)
    pred_sentences = unprocess(pred_sequences, out_vocab)
    out_sentences = unprocess(out_sequences.to(device), out_vocab)

    # Each prediction is scored against a single reference translation.
    bleu_score.update(pred_sentences, [[s] for s in out_sentences])

    # Show the first example of every batch as a qualitative check.
    print(f"Input text: {in_sentences[0]}\n" 
          + f"Predicted Translation: {pred_sentences[0]}\n"
          + f"Actual Translation: {out_sentences[0]}\n")

# The score is accumulated over test_loader, so it is reported as a test
# score.
final_bleu = bleu_score.compute()
print(f"Test BLEU Score: {final_bleu:.3f}")

Input text: Tom did not come to the last meeting .
Predicted Translation: Tom no esperaba el fin de octubre .
Actual Translation: Tom no vino a la última reunión .

Input text: You need more practice .
Predicted Translation: Tienes más práctica .
Actual Translation: Ustedes necesitan más práctica .

Input text: Tom painted .
Predicted Translation: Tom pintó .
Actual Translation: Tom se puso a pintar .

Input text: Can I tell you something ?
Predicted Translation: ¿ Puedo decir algo ?
Actual Translation: ¿ Puedo decirte algo ?

Input text: This is not fair .
Predicted Translation: No es justo .
Actual Translation: Esto no es justo .

Input text: They usually sleep in this room .
Predicted Translation: En este mercado se puede dormir en esta habitación .
Actual Translation: Normalmente duermen en esta habitación .

Input text: The doctor might have said that .
Predicted Translation: El médico que debería haber dicho .
Actual Translation: Puede que el doctor haya dicho eso .

Input text: 

In [37]:
# NOTE(review): scratch shape check, executed out of order (In [37]). It
# depends on `in_sequences`, `out_sequences` and `device` as left behind by
# the BLEU loop (the last test batch) - consider removing it or making it
# self-contained.
a = seq2seq.model.forward((in_sequences.to(device), out_sequences.to(device)))

a.shape

torch.Size([109, 12, 9267])