In [None]:
import contractions, re
from torchtext.data.utils import get_tokenizer

tokenizer_eng = get_tokenizer("spacy", language="en_core_web_sm")
tokenizer_spa = get_tokenizer("spacy", language="es_core_news_sm")

# Compiled once so the verbose pattern is not re-parsed for every token.
_TOKEN_PATTERN = re.compile(
    r"""
    ^[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+  # 1+ allowed characters.
    (-[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+)*  # Optional hyphen plus chars.
    (_[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+)*  # Optional underscore plus chars.
    $  # End of the string.
    """,
    re.VERBOSE,
)

# Typographic quote variants and their ASCII replacements. Order matters: the
# doubled acute accent has to be replaced before the single one, otherwise the
# single-accent rule consumes both characters and the doubled rule never fires.
_QUOTE_REPLACEMENTS = (
    ("’", "'"),
    ("‘", "'"),
    ("“", '"'),
    ("”", '"'),
    ("´´", '"'),
    ("´", "'"),
)


def tokenize(text, lang):
    """Standardize, tokenize and filter text.

    Parameters
    ----------
    text : str
        Raw sentence.
    lang : str
        "eng" selects the English tokenizer (after expanding contractions);
        any other value selects the Spanish tokenizer.

    Returns
    -------
    list
        Tokens that consist only of the allowed characters, optionally
        joined by hyphens and then underscores. All other tokens are dropped.
    """
    for variant, replacement in _QUOTE_REPLACEMENTS:
        text = text.replace(variant, replacement)

    if lang == "eng":
        tokens = tokenizer_eng(contractions.fix(text))
    else:  # lang == "spa"
        tokens = tokenizer_spa(text)

    return [token for token in tokens if _TOKEN_PATTERN.match(token)]

In [None]:
def corpus_iterator(filename, lang, text_number):
    """Yield the tokens of one column for each line of a corpus file.

    Every line is stripped and split on tabs; the field at position
    `text_number` is tokenized as `lang` text and yielded.
    """
    with open(filename, mode="r", encoding="utf-8") as corpus:
        for raw_line in corpus:
            fields = raw_line.strip().split("\t")
            yield tokenize(fields[text_number], lang)

In [None]:
from torchtext.vocab import build_vocab_from_iterator

def build_vocab(filename, lang, text_number, specials="<unk>", min_freq=5):
    """Build a vocabulary from one column of a corpus file.

    Parameters
    ----------
    filename : str
        Path of the tab-separated corpus file.
    lang : str
        Language code forwarded to the tokenizer.
    text_number : int
        Index of the column to read on each line.
    specials : str or sequence of str
        Special token(s) to add to the vocabulary. A single string is treated
        as one token. The LAST special token becomes the default entry that
        is returned for out-of-vocabulary lookups.
    min_freq : int
        Minimum number of occurrences for a token to be kept.

    Returns
    -------
    The vocabulary object produced by `build_vocab_from_iterator`.
    """
    # A bare string would otherwise be iterated character by character, and
    # `specials[-1]` would be ">" rather than the intended token.
    if isinstance(specials, str):
        specials = [specials]
    else:
        specials = list(specials)

    vocab = build_vocab_from_iterator(
        corpus_iterator(filename, lang, text_number),
        min_freq=min_freq,
        specials=specials,
    )
    vocab.set_default_index(vocab[specials[-1]])
    return vocab

In [None]:
# Translation direction; the corpus file name is derived from it.
in_lang = "eng"
out_lang = "spa"
filename = f"{in_lang}-{out_lang}.txt"

# The last entry is used by `build_vocab` as the default (unknown) token.
special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]

# Column 0 of the corpus holds the input language, column 1 the output one.
in_vocab = build_vocab(
    filename, in_lang, text_number=0, specials=special_tokens,
)
out_vocab = build_vocab(
    filename, out_lang, text_number=1, specials=special_tokens,
)

In [None]:
import numpy as np

def all_words_in_vocab(text, vocab):
    """Return True when every word of `text` is a member of `vocab`.

    An empty `text` yields True.
    """
    for word in text:
        if word not in vocab:
            return False
    return True

In [None]:
def pad(tokens, max_length=10):
    """Frame tokens with "<sos>"/"<eos>" and right-pad with "<pad>".

    The result has `max_length + 2` entries when `tokens` has at most
    `max_length` items; longer inputs are framed but neither padded nor
    truncated.
    """
    framed = ["<sos>"] + tokens
    framed.append("<eos>")
    # A negative count multiplies to an empty list, i.e. no padding.
    framed.extend(["<pad>"] * (max_length - len(tokens)))
    return framed

In [None]:
def process(lines, in_lang, out_lang, in_vocab, out_vocab, max_length=10):
    """Convert corpus lines into padded index sequences.

    Each line is split on tabs; field 0 is the input text and field 1 the
    output text. A pair is kept only when both token lists have at most
    `max_length` tokens and contain only words present in their vocabulary.

    Parameters
    ----------
    lines : iterable of str
        Corpus lines (for example an open text file).
    in_lang, out_lang : str
        Language codes forwarded to `tokenize`.
    in_vocab, out_vocab
        Vocabularies used for the membership test and to map tokens to
        indices.
    max_length : int
        Maximum number of tokens per sentence, also used as padding length.

    Returns
    -------
    tuple of numpy.ndarray
        Input and output index sequences, one row per retained pair.
    """
    in_sequences, out_sequences = [], []
    for line in lines:
        texts = line.strip().split("\t")
        in_tokens = tokenize(texts[0], in_lang)
        out_tokens = tokenize(texts[1], out_lang)

        if len(in_tokens) > max_length or len(out_tokens) > max_length:
            continue
        if not (all_words_in_vocab(in_tokens, in_vocab)
                and all_words_in_vocab(out_tokens, out_vocab)):
            continue

        # Pad to the same `max_length` used for filtering so that every row
        # has identical length even for non-default values.
        in_sequences.append(in_vocab(pad(in_tokens, max_length)))
        out_sequences.append(out_vocab(pad(out_tokens, max_length)))

    return np.array(in_sequences), np.array(out_sequences)

In [None]:
import deeptrack as dt
import torch
from torch.utils.data import DataLoader

# Encode the whole corpus into padded index sequences.
with open(filename, "r", encoding="utf-8") as file:
    in_sequences, out_sequences = process(
        file, in_lang, out_lang, in_vocab, out_vocab,
    )

# Wrap the sequence pairs in a source and split it 85% / 15%.
sources = dt.sources.Source(inputs=in_sequences, targets=out_sequences)
train_sources, test_sources = dt.sources.random_split(sources, [0.85, 0.15])

# Pipelines that turn each source field into an integer tensor.
inputs_pip = dt.Value(sources.inputs) >> dt.pytorch.ToTensor(dtype=torch.int)
outputs_pip = dt.Value(sources.targets) >> dt.pytorch.ToTensor(dtype=torch.int)

train_dataset = dt.pytorch.Dataset(
    inputs_pip & outputs_pip, inputs=train_sources,
)
test_dataset = dt.pytorch.Dataset(
    inputs_pip & outputs_pip, inputs=test_sources,
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [None]:
import deeplay as dl

class Seq2SeqEncoder(dl.DeeplayModule):
    """Sequence-to-sequence encoder.

    Embeds token indices and feeds them to a GRU. When the GRU is
    bidirectional, the two directions of the outputs are summed and only the
    forward-direction final state of each layer is returned, so the results
    have the shapes of a unidirectional GRU.
    """

    def __init__(self, vocab_size, in_features=300, hidden_features=128,
                 hidden_layers=1, dropout=0.0, bidirectional=True):
        """Initialize sequence-to-sequence encoder.

        Parameters
        ----------
        vocab_size : int
            Number of rows of the embedding table.
        in_features : int
            Embedding dimension, also the GRU input size.
        hidden_features : int
            GRU hidden size.
        hidden_layers : int
            Number of stacked GRU layers.
        dropout : float
            Dropout between GRU layers; forced to 0 for a single layer.
        bidirectional : bool
            Whether the GRU is bidirectional.
        """
        super().__init__()

        self.hidden_features = hidden_features
        self.hidden_layers = hidden_layers
        self.bidirectional = bidirectional
        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            torch.nn.GRU,
            input_size=in_features,
            hidden_size=hidden_features,
            num_layers=hidden_layers,
            dropout=(0 if hidden_layers == 1 else dropout),
            bidirectional=bidirectional,
            batch_first=True,
        )

    def forward(self, in_sequences, hidden=None):
        """Perform forward pass.

        Returns the encoder outputs and the final hidden state. For a
        bidirectional GRU the outputs of both directions are summed.
        """
        in_sequences = self.embedding(in_sequences)
        encoder_outputs, hidden = self.rnn(in_sequences, hidden)
        if self.bidirectional:
            encoder_outputs = (
                encoder_outputs[:, :, :self.hidden_features]
                + encoder_outputs[:, :, self.hidden_features:]
            )
            # The GRU state is laid out layer by layer with the two
            # directions interleaved (layer0-fwd, layer0-bwd, layer1-fwd, ...).
            # Take the forward state of every layer; for a single layer this
            # is the same slice as `hidden[:1]`. `contiguous()` keeps the
            # strided slice usable as an initial state for another RNN.
            hidden = hidden[0::2].contiguous()
        return encoder_outputs, hidden

In [None]:
class Seq2SeqDecoder(dl.DeeplayModule):
    """Sequence-to-sequence decoder.

    Embeds token indices, applies a ReLU, runs a unidirectional GRU and maps
    its outputs to a softmax distribution over the vocabulary.
    """

    def __init__(self, vocab_size, in_features=300, hidden_features=128,
                 hidden_layers=1, dropout=0.0):
        """Initialize sequence-to-sequence decoder."""
        super().__init__()

        # Dropout is only passed to the GRU when it has more than one layer.
        rnn_dropout = 0 if hidden_layers == 1 else dropout

        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            torch.nn.GRU,
            input_size=in_features,
            hidden_size=hidden_features,
            num_layers=hidden_layers,
            bidirectional=False,
            batch_first=True,
            dropout=rnn_dropout,
        )
        self.dense = dl.Layer(torch.nn.Linear, hidden_features, vocab_size)
        self.softmax = dl.Layer(torch.nn.Softmax, dim=-1)
        self.relu = dl.Layer(torch.nn.ReLU)

    def forward(self, x, hidden):
        """Perform forward pass.

        Returns the softmax output over the vocabulary together with the
        updated GRU hidden state.
        """
        embedded = self.relu(self.embedding(x))
        rnn_output, hidden = self.rnn(embedded, hidden)
        probabilities = self.softmax(self.dense(rnn_output))
        return probabilities, hidden

In [None]:
class Seq2SeqModel(dl.DeeplayModule):
    """Sequence-to-sequence model.

    Combines a `Seq2SeqEncoder` and a `Seq2SeqDecoder`. The final encoder
    hidden state initializes the decoder, which is then run one time step
    at a time.
    """

    def __init__(self, in_vocab_size=None, out_vocab_size=None,
                 teacher_prob=1.0, embedding_dim=300, hidden_features=128,
                 hidden_layers=1, dropout=0.0, bidirectional=True):
        """Initialize the sequence-to-sequence model.

        Parameters
        ----------
        in_vocab_size, out_vocab_size : int
            Sizes of the input and output vocabularies.
        teacher_prob : float
            Probability, at each step after the first, of feeding the
            ground-truth token instead of the previous prediction.
        embedding_dim, hidden_features, hidden_layers, dropout, bidirectional
            Forwarded to the encoder and decoder (`bidirectional` only to
            the encoder).
        """
        super().__init__()

        self.out_vocab_size = out_vocab_size
        self.teacher_prob = teacher_prob
        self.encoder = Seq2SeqEncoder(
            in_vocab_size, embedding_dim, hidden_features, hidden_layers,
            dropout, bidirectional,
        )
        self.decoder = Seq2SeqDecoder(
            out_vocab_size, embedding_dim, hidden_features, hidden_layers,
            dropout,
        )

    def forward(self, batch):
        """Perform forward pass.

        `batch` is a pair `(in_sequences, out_sequences)`. Returns a tensor
        of shape (batch, target length, output vocabulary size) holding the
        decoder output for every target position.
        """
        in_sequences, out_sequences = batch

        _, encoder_hidden = self.encoder(in_sequences)
        decoder_hidden = encoder_hidden

        encoder_device = next(self.encoder.parameters()).device
        decoder_device = next(self.decoder.parameters()).device

        batch_size, out_length = out_sequences.size(0), out_sequences.size(1)
        decoder_outputs = torch.zeros(
            (batch_size, out_length, self.out_vocab_size),
            device=encoder_device,
        )

        topi = None
        # Iterate over the TARGET length: both the teacher-forced inputs and
        # the output buffer are indexed along the target sequence.
        for t in range(out_length):
            if t == 0 or np.random.rand() < self.teacher_prob:
                # Teacher forcing: feed the ground-truth token.
                decoder_input = out_sequences[:, t].unsqueeze(-1)
            else:
                # Feed back the most probable token of the previous step.
                decoder_input = topi.squeeze(-1).detach()

            decoder_input = decoder_input.to(decoder_device)
            decoder_output, decoder_hidden = \
                self.decoder(decoder_input, decoder_hidden)
            _, topi = decoder_output.topk(1)
            decoder_outputs[:, t, :] = decoder_output.squeeze(1)
        return decoder_outputs

    def evaluate(self, input):
        """Evaluate model.

        Greedily decodes as many steps as `input` has positions, without
        tracking gradients, and returns a tensor with the shape of `input`
        holding the predicted token indices.
        """
        encoder_device = next(self.encoder.parameters()).device
        decoder_device = next(self.decoder.parameters()).device

        with torch.no_grad():
            _, encoder_hidden = self.encoder(input)
            decoder_hidden = encoder_hidden
            outputs = torch.zeros(input.shape, device=encoder_device)

            topi = None
            for t in range(input.size(1)):
                if t == 0:
                    # Index 1 is the position of "<sos>" in the notebook's
                    # special-token list.
                    decoder_input = torch.full(
                        size=(input.size(0), 1),
                        fill_value=1,
                        device=encoder_device,
                    )
                else:
                    decoder_input = topi.squeeze(-1).detach()

                decoder_input = decoder_input.to(decoder_device)
                decoder_output, decoder_hidden = self.decoder(
                    decoder_input,
                    decoder_hidden,
                )
                _, topi = decoder_output.topk(1)
                # One predicted index per batch element.
                outputs[:, t] = topi.reshape(-1)

        return outputs

In [None]:
def maskedNLL(input, target, PADtoken=0):
    """Calculate the negative log-likelihood loss, ignoring padding.

    Parameters
    ----------
    input : torch.Tensor
        Probabilities whose last dimension indexes the classes.
    target : torch.Tensor
        Integer class indices, one per position of `input` (any integer
        dtype).
    PADtoken : int
        Target value whose positions are excluded from the loss.

    Returns
    -------
    torch.Tensor
        Mean of -log(probability of the target class) over the positions
        whose target differs from `PADtoken`.
    """
    flat_probs = input.reshape(-1, input.shape[-1])
    flat_target = target.reshape(-1, 1)

    # `gather` requires int64 indices; targets may arrive as int32.
    picked = torch.gather(flat_probs, 1, flat_target.long())
    # Guard against log(0) when a probability underflows to exactly zero.
    picked = picked.clamp_min(torch.finfo(picked.dtype).tiny)

    nll = -torch.log(picked)
    return nll.masked_select(flat_target != PADtoken).mean()


In [None]:
class Seq2Seq(dl.Application):
    """Application wrapping the sequence-to-sequence model for training."""

    def __init__(self, in_vocab, out_vocab, teacher_prob=1.0):
        """Initialize the application.

        The model's vocabulary sizes are taken from the lengths of
        `in_vocab` and `out_vocab`.
        """
        super().__init__(loss=maskedNLL, optimizer=dl.Adam(lr=1e-3))

        in_vocab_size = len(in_vocab)
        out_vocab_size = len(out_vocab)
        self.model = Seq2SeqModel(
            in_vocab_size=in_vocab_size,
            out_vocab_size=out_vocab_size,
            teacher_prob=teacher_prob,
        )

    def train_preprocess(self, batch):
        """Return the batch together with the shifted target sequences.

        The target is the last element of `batch` with its first column
        dropped and its last column repeated, so it keeps the same length.
        """
        targets = batch[-1]
        shifted_targets = torch.cat((targets[:, 1:], targets[:, -1:]), dim=1)
        return batch, shifted_targets

    def forward(self, input):
        """Perform forward pass by delegating to the wrapped model."""
        return self.model(input)

In [None]:
from torchtext.vocab import GloVe

embedding_dim = 300

# Look up a GloVe vector for every token of both vocabularies, falling back
# to the lower-cased token when the exact form is missing.
glove = GloVe(name="42B", dim=embedding_dim, cache="./.vector_cache")
glove_embeddings_input = glove.get_vecs_by_tokens(
    in_vocab.get_itos(), lower_case_backup=True,
)
glove_embeddings_target = glove.get_vecs_by_tokens(
    out_vocab.get_itos(), lower_case_backup=True,
)

# Overwrite the rows of the special tokens, except row 0, with small random
# values. Row 0 keeps whatever the GloVe lookup returned.
num_special_tokens = len(special_tokens)
glove_embeddings_input[1:num_special_tokens] = (
    torch.rand(num_special_tokens - 1, embedding_dim) * 0.01
)
glove_embeddings_target[1:num_special_tokens] = (
    torch.rand(num_special_tokens - 1, embedding_dim) * 0.01
)


In [None]:
# Build the application; after the first decoding step the ground-truth token
# is fed with probability 0.85 and the model's own prediction otherwise.
seq2seq = Seq2Seq(
    in_vocab=in_vocab,
    out_vocab=out_vocab,
    teacher_prob=0.85,
)
# NOTE(review): `create()` presumably instantiates the configured layers so
# that their weights can be accessed below — confirm against deeplay docs.
seq2seq = seq2seq.create()

# Replace both embedding tables with the GloVe-based matrices and exclude
# them from gradient updates.
seq2seq.model.encoder.embedding.weight.data = glove_embeddings_input
seq2seq.model.encoder.embedding.weight.requires_grad = False
seq2seq.model.decoder.embedding.weight.data = glove_embeddings_target
seq2seq.model.decoder.embedding.weight.requires_grad = False

In [None]:
# Train on the training loader and plot the recorded history.
# NOTE(review): the original trailing marker "### 25" suggests that 25 epochs
# are intended for a full run and 1 is a quick-test value — confirm.
trainer = dl.Trainer(max_epochs=1, accelerator="auto")  # 25 for a full run?
trainer.fit(seq2seq, train_loader)
trainer.history.plot()

In [None]:
def unprocess(sequences, vocab, eos_index=2):
    """Convert numeric sequences to texts.

    Each sequence is read up to, but not including, its first end-of-sequence
    index, so tokens generated after "<eos>" do not leak into the text.
    Indices less than or equal to `eos_index` are skipped; this assumes the
    special tokens to hide occupy indices 0..`eos_index`.

    Parameters
    ----------
    sequences : iterable
        Sequences of token indices (for example the rows of a 2D tensor).
        Float-valued indices are converted to integers.
    vocab
        Object providing `lookup_token(index)`.
    eos_index : int
        Index of the end-of-sequence token.

    Returns
    -------
    list of str
        One space-joined text per sequence.
    """
    texts = []
    for sequence in sequences:
        words = []
        # `tolist()` converts the row once instead of element by element.
        for idx in map(int, sequence.tolist()):
            if idx == eos_index:
                break
            if idx > eos_index:
                words.append(vocab.lookup_token(idx))
        texts.append(" ".join(words))
    return texts

In [None]:
def translate(input, model, in_lang, in_vocab, out_vocab):
    """Translate a text and print both the text and its translation.

    The text is tokenized, padded and mapped to indices with `in_vocab`;
    the model's predicted indices are mapped back to words with `out_vocab`.
    """
    token_ids = in_vocab(pad(tokenize(input, in_lang)))
    device = next(model.parameters()).device
    # Add a leading batch dimension of size one.
    input_sequence = torch.tensor(token_ids, dtype=torch.int).unsqueeze(0)
    input_sequence = input_sequence.to(device)
    print(f"Input text: {input}")

    pred_text = unprocess(model.evaluate(input_sequence), out_vocab)
    print(f"Predicted Translation: {pred_text[0]}\n")

In [None]:
# Translate two short example sentences.
for in_text in (
    "I bought a book.",
    "This book is very interesting.",
):
    translate(in_text, seq2seq.model, in_lang, in_vocab, out_vocab)


In [None]:
# Translate a longer sentence containing a relative clause.
in_text = "The book that I bought is very interesting."
translate(in_text, seq2seq.model, in_lang, in_vocab, out_vocab)

In [None]:
from torchmetrics.text import BLEUScore

# Metric object that accumulates BLEU statistics across `update` calls.
bleu_score = BLEUScore()

In [None]:
device = next(seq2seq.model.parameters()).device

# Start from a clean metric so that re-running this cell does not accumulate
# the statistics of a previous run.
bleu_score.reset()

for batch_index, (in_sequences, out_sequences) in enumerate(test_loader):
    in_sequences = in_sequences.to(device)
    pred_sequences = seq2seq.model.evaluate(in_sequences)
    out_sequences = out_sequences.to(device)

    in_texts = unprocess(in_sequences, in_vocab)
    pred_texts = unprocess(pred_sequences, out_vocab)
    out_texts = unprocess(out_sequences, out_vocab)

    # Each prediction is scored against a single reference translation.
    bleu_score.update(pred_texts, [[yi] for yi in out_texts])

    # Show up to three examples per batch.
    print(f"\nExamples from batch {batch_index + 1}:")
    for i in range(min(3, len(in_texts))):
        print(f"\nInput text: {in_texts[i]}")
        print(f"Predicted Translation: {pred_texts[i]}")
        print(f"Actual Translation: {out_texts[i]}")

final_bleu = bleu_score.compute()
print(f"\nValidation BLEU Score: {final_bleu:.3f}")