get corpus from https://www.manythings.org/anki/ 

In [1]:
import os

# Language codes of the pair to translate; they also determine the corpus file name.
input_lang = "eng"
target_lang = "spa"

# The corpus may be stored as "<input>-<target>.txt" or "<target>-<input>.txt".
# `reverse` records that the file on disk has the two languages swapped.
forward_name = f"{input_lang}-{target_lang}.txt"
swapped_name = f"{target_lang}-{input_lang}.txt"

if os.path.isfile(forward_name):
    filepath, reverse = forward_name, False
elif os.path.isfile(swapped_name):
    filepath, reverse = swapped_name, True
else:
    raise FileNotFoundError("Missing training set for specified language pair")

In [2]:
from torchtext.data.utils import get_tokenizer
import re
import contractions

# spaCy-backed tokenizers obtained through torchtext, one per supported language.
# The named spaCy models must be available in the environment.
tokenizer_eng = get_tokenizer("spacy", language="en_core_web_sm")
tokenizer_spa = get_tokenizer("spacy", language="es_core_news_sm")


# Characters allowed inside a kept token: ASCII letters and digits, Spanish
# accented letters, and basic sentence punctuation.
_TOKEN_CHARS = "a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿"

# A kept token is a run of allowed characters, optionally followed by
# hyphen-joined runs and then underscore-joined runs. Compiled once instead of
# once per token.
_TOKEN_PATTERN = re.compile(
    rf"^[{_TOKEN_CHARS}]+(-[{_TOKEN_CHARS}]+)*(_[{_TOKEN_CHARS}]+)*$"
)


def standardize_quotes(text):
    """Map typographic quotes and acute accents used as quotes to ASCII quotes.

    The doubled acute accent is replaced first: if the single acute accent were
    replaced first, the doubled form would already be gone and could never be
    turned into a double quote.
    """
    return (
        text.replace("´´", '"')
        .replace("’", "'")
        .replace("‘", "'")
        .replace("´", "'")
        .replace("“", '"')
        .replace("”", '"')
    )


def tokenize(text, lang=None):
    """Split `text` into tokens for the given language and drop unwanted tokens.

    Args:
        text: Sentence to tokenize.
        lang: "eng" (contractions are expanded first, then the English
            tokenizer is used) or "spa" (Spanish tokenizer).

    Returns:
        List of token strings that consist only of the allowed characters
        (see `_TOKEN_PATTERN`); every other token is discarded.

    Raises:
        NotImplementedError: If `lang` is neither "eng" nor "spa".
    """
    if lang == "eng":
        text = contractions.fix(text)
        tokenizer = tokenizer_eng
    elif lang == "spa":
        tokenizer = tokenizer_spa
    else:
        raise NotImplementedError("Missing tokenizer for specified language ")

    tokens = tokenizer(standardize_quotes(text))
    return [token for token in tokens if _TOKEN_PATTERN.match(token)]


#
#
#


# Example usage: tokenize one English and one Spanish sentence.
text_eng = "I'm building a translator!"
text_spa = "¡Estoy construyendo un traductor!"
for sample_text, sample_lang in ((text_eng, "eng"), (text_spa, "spa")):
    print(tokenize(sample_text, lang=sample_lang))

['I', 'am', 'building', 'a', 'translator', '!']
['¡', 'Estoy', 'construyendo', 'un', 'traductor', '!']


tokenize as detailed in next chapter

In [3]:
# from torchtext.data.utils import get_tokenizer
# import spacy
# import re
# import contractions

# # Load the SpaCy multilingual model
# nlp = spacy.load("xx_ent_wiki_sm")


# def tokenize(text, lang=None):
#     if lang == "eng":
#         text = contractions.fix(text)
#     # Standardize quotation marks and other punctuations
#     standardized_text = (
#         text.replace("’", "'")
#         .replace("‘", "'")
#         .replace("´", "'")
#         .replace("“", '"')
#         .replace("”", '"')
#         .replace("´´", '"')
#     )

#     # Process text with the multilingual model
#     doc = nlp(standardized_text)

#     # Extract tokens from the doc, filtering with regex
#     tokens = [
#         token.text
#         for token in doc
#         if re.match(
#             r"^[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+(-[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+)*(_[a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?¡¿]+)*$",
#             token.text,
#         )
#     ]
#     return tokens


# # Example usage:
# text = "Hello world! ¡Hola mundo! How are you? ¿Cómo estás?"
# tokens = tokenize(text)
# print(tokens)

In [4]:
# from torchtext.data.utils import get_tokenizer

# tokenizer = get_tokenizer("basic_english")

# import re
# import contractions


# def tokenize(text):
#     standardized_text = contractions.fix(text)

#     standardized_text = (
#         standardized_text.replace("’", "'")
#         .replace("‘", "'")
#         .replace("´", "'")
#         .replace("“", '"')
#         .replace("”", '"')
#         .replace("´´", '"')
#     )

#     tokens = tokenizer(standardized_text)

#     filtered_tokens = [
#         token
#         for token in tokens
#         if re.match(
#             r"^[a-zA-Z0-9.,!?]+(-[a-zA-Z0-9.,!?]+)*(_[a-zA-Z0-9.,!?]+)*$", token
#         )
#     ]
#     return filtered_tokens

Define the vocabularies from the corpus, as detailed in the next chapter.

In [5]:
def corpus_iterator(filepath, lang, reverse):
    """Yield the token list of one language column for every line of the corpus.

    Args:
        filepath: Path of the tab-separated corpus file (read as UTF-8).
        lang: Language code passed on to `tokenize`.
        reverse: If true, read the second tab-separated field; otherwise the first.

    Yields:
        The result of `tokenize` for the selected field of each line.
    """
    column = 1 if reverse else 0
    with open(filepath, "r", encoding="utf-8") as file:
        for raw_line in file:
            fields = raw_line.strip().split("\t")
            yield tokenize(fields[column], lang)

In [6]:
from torchtext.vocab import build_vocab_from_iterator


def build_vocab(filepath, lang, reverse, specials="<unk>", min_freq=5):
    """Build a torchtext vocabulary for one language column of the corpus.

    Args:
        filepath: Path of the tab-separated corpus file.
        lang: Language code used for tokenization.
        reverse: Column selector forwarded to `corpus_iterator`.
        specials: Special token, or sequence of special tokens. The LAST one is
            used as the default (out-of-vocabulary) token.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        The vocabulary, with its default index set to the last special token.
    """
    # A bare string must be wrapped: otherwise `specials[-1]` below would be the
    # single character ">" instead of the whole token.
    if isinstance(specials, str):
        specials = [specials]
    else:
        specials = list(specials)

    vocab = build_vocab_from_iterator(
        corpus_iterator(filepath, lang, reverse),
        min_freq=min_freq,
        specials=specials,
    )
    vocab.set_default_index(vocab[specials[-1]])
    return vocab


# "<unk>" is listed last because build_vocab uses the last special token as the
# default index for out-of-vocabulary words.
special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]

# The target vocabulary reads the other column of the file, hence `not reverse`.
vocab_input = build_vocab(
    filepath, input_lang, reverse, specials=special_tokens
)
vocab_target = build_vocab(
    filepath, target_lang, not reverse, specials=special_tokens
)


#     corpus_iterator(filepath, reverse),
#     specials=special_tokens,
#     min_freq=5,
# )

# vocab_input = build_vocab_from_iterator(
#     corpus_iterator(filepath, reverse),
#     specials=special_tokens,
#     min_freq=5,
# )
# vocab_target = build_vocab_from_iterator(
#     corpus_iterator(filepath, (not reverse)),
#     specials=special_tokens,
#     min_freq=5,
# )

# vocab_input.set_default_index(vocab_input["<unk>"])
# vocab_target.set_default_index(vocab_target["<unk>"])

In [7]:
# Number of entries in the input vocabulary (special tokens included).
len(vocab_input)

6082

In [8]:
# Spot check: show which token is stored at index 120 of the input vocabulary.
vocab_input.lookup_token(120)

'some'

process data 

In [9]:
import numpy as np


def all_words_in_vocab(sentence, vocab):
    """Return True when every word of `sentence` is contained in `vocab`.

    An empty sentence yields True.
    """
    for word in sentence:
        if word not in vocab:
            return False
    return True


def pad_sentence(sequence, max_length=10):
    """Frame a token list with "<sos>"/"<eos>" and right-pad it with "<pad>".

    One "<pad>" is appended for every token that `sequence` is shorter than
    `max_length`; sequences of `max_length` tokens or more get no padding.
    The result therefore has `max_length + 2` entries for short sequences.
    """
    missing = max_length - len(sequence)
    framed = ["<sos>"] + sequence + ["<eos>"]
    return framed + ["<pad>"] * missing


def process_corpus(
    lines,
    reverse,
    input_lang,
    target_lang,
    vocab_input,
    vocab_target,
    max_length=10,
):
    """Convert corpus lines into padded index sequences for both languages.

    A sentence pair is kept only if every token of both sentences is in the
    respective vocabulary and neither sentence has more than `max_length`
    tokens. Kept sentences are framed and padded with `pad_sentence` and mapped
    to vocabulary indices.

    Args:
        lines: Iterable of tab-separated corpus lines.
        reverse: If true, the input sentence is the second field and the target
            sentence the first; otherwise the other way round.
        input_lang: Language code used to tokenize the input sentence.
        target_lang: Language code used to tokenize the target sentence.
        vocab_input: Vocabulary for the input language.
        vocab_target: Vocabulary for the target language.
        max_length: Maximum number of tokens per sentence; also the padding length.

    Returns:
        Tuple `(inputs, targets)` of NumPy arrays built from the kept pairs.
    """
    processed_inputs, processed_targets = [], []

    for line in lines:
        elems = line.strip().split("\t")
        # Blank or tab-less lines carry no sentence pair; skip them instead of
        # failing with an IndexError.
        if len(elems) < 2:
            continue

        input_seq = tokenize(elems[int(reverse)], input_lang)
        target_seq = tokenize(elems[int(not reverse)], target_lang)

        if (
            all_words_in_vocab(input_seq, vocab_input)
            and all_words_in_vocab(target_seq, vocab_target)
            and len(input_seq) <= max_length
            and len(target_seq) <= max_length
        ):
            # Pad to the same max_length used for filtering; relying on
            # pad_sentence's default would give ragged rows whenever
            # max_length differs from 10.
            padded_input = pad_sentence(input_seq, max_length)
            padded_target = pad_sentence(target_seq, max_length)

            processed_inputs.append(vocab_input(padded_input))
            processed_targets.append(vocab_target(padded_target))

    return np.array(processed_inputs), np.array(processed_targets)


# Stream the corpus file line by line through process_corpus; `inputs` and
# `targets` hold the index sequences of the kept sentence pairs.
with open(filepath, "r", encoding="utf-8") as file:
    inputs, targets = process_corpus(
        lines=file,
        reverse=reverse,
        input_lang=input_lang,
        target_lang=target_lang,
        vocab_input=vocab_input,
        vocab_target=vocab_target,
    )

datasets, dataloader

In [10]:
import deeptrack as dt
import torch

# Wrap the index arrays in a deeptrack source and split it 85% / 15% into
# training and test sources.
sources = dt.sources.Source(inputs=inputs, targets=targets)
train_sources, test_sources = dt.sources.random_split(sources, [0.85, 0.15])

# Pipelines that take the `inputs` / `targets` field of a source item and
# convert it to an integer tensor.
# NOTE(review): torch.int is 32-bit; presumably downstream code that needs
# int64 indices casts explicitly — confirm.
inputs_pl = dt.Value(sources.inputs) >> dt.pytorch.ToTensor(dtype=torch.int)
targets_pl = dt.Value(sources.targets) >> dt.pytorch.ToTensor(dtype=torch.int)


from torch.utils.data import DataLoader

# Training data: (input, target) pairs drawn from the training sources,
# reshuffled by the loader and served in batches of 256.
train_dataset = dt.pytorch.Dataset(
    inputs_pl & targets_pl,
    inputs=train_sources,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
)

# Test data: same pipelines over the test sources, served in fixed order.
test_dataset = dt.pytorch.Dataset(
    inputs_pl & targets_pl,
    inputs=test_sources,
)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [11]:
import numpy as np

# Number of sentence pairs kept by process_corpus (the rows of `inputs`).
print(f"Number of queries/responses: {len(inputs)}")

Number of queries/responses: 91183


In [12]:
# pairs = [[s, t] for (s, t) in zip(inputs, targets)]

# print([vocab_input.lookup_token(p) for p in pairs[1000][0]])
# print([vocab_target.lookup_token(p) for p in pairs[1000][1]])

embedding

deeplay modules

encoder

In [13]:
from deeplay import DeeplayModule
from typing import Optional, Union, Type, Literal
import torch


class Seq2SeqEncoder(DeeplayModule):
    """Embedding layer followed by an (optionally bidirectional) recurrent encoder.

    Args:
        vocab_size: Number of rows of the embedding table.
        in_features: Embedding dimension, i.e. the RNN input size.
        hidden_features: RNN hidden size.
        hidden_layers: Number of stacked RNN layers.
        dropout: Dropout between RNN layers; forced to 0 for a single layer.
        rnn_type: "RNN", "LSTM", "GRU", or a `torch.nn.Module` subclass to use
            as the recurrent layer. Unrecognised strings fall back to
            `torch.nn.RNN`.
        bidirectional: Whether the RNN runs in both directions.
    """

    def __init__(
        self,
        vocab_size: Optional[int],
        in_features: Optional[int] = 300,
        hidden_features: Optional[int] = 128,
        hidden_layers: Optional[int] = 1,
        dropout: Optional[float] = 0.0,
        rnn_type: Union[Literal["RNN", "LSTM", "GRU"], Type[torch.nn.Module]] = "GRU",
        bidirectional=True,
    ):
        super().__init__()
        if isinstance(rnn_type, type) and issubclass(rnn_type, torch.nn.Module):
            self.rnn_class = rnn_type
        elif rnn_type == "LSTM":
            self.rnn_class = torch.nn.LSTM
        elif rnn_type == "GRU":
            self.rnn_class = torch.nn.GRU
        else:
            self.rnn_class = torch.nn.RNN

        self.hidden_features = hidden_features
        self.hidden_layers = hidden_layers
        self.bidirectional = bidirectional
        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            self.rnn_class,
            input_size=in_features,
            hidden_size=hidden_features,
            num_layers=hidden_layers,
            dropout=(0 if hidden_layers == 1 else dropout),
            bidirectional=bidirectional,
            batch_first=True,
        )

    def forward(self, x, hidden=None):
        """Encode a batch of index sequences.

        Returns:
            Tuple `(outputs, hidden)`. For a bidirectional RNN the two
            direction halves of `outputs` are summed so the feature size is
            `hidden_features`, and `hidden` is reduced to one state per layer.
        """
        x = self.embedding(x)
        outputs, hidden = self.rnn(x, hidden)
        if self.bidirectional:
            outputs = (
                outputs[:, :, : self.hidden_features]
                + outputs[:, :, self.hidden_features :]
            )
            hidden = self._forward_direction_state(hidden)
        return outputs, hidden

    @staticmethod
    def _forward_direction_state(hidden):
        """Keep the forward-direction final state of every layer.

        PyTorch stacks bidirectional states as
        [layer0_fwd, layer0_bwd, layer1_fwd, ...], so every second entry is a
        forward state. Slicing the first `hidden_layers` entries is only
        equivalent for a single layer; with more layers it mixes directions of
        the lower layers. LSTM returns an `(h, c)` tuple, which is handled
        element-wise. `.contiguous()` is required because a strided slice may
        be rejected as an initial RNN state.
        """
        if isinstance(hidden, tuple):
            return tuple(state[0::2].contiguous() for state in hidden)
        return hidden[0::2].contiguous()

  Referenced from: <A549E5FA-1487-3474-A747-4913D621982E> /Users/841602/Documents/GitHub/Environments/deeplay_env/lib/python3.10/site-packages/torchvision/image.so
  warn(


decoder

In [15]:
# no attn


class Seq2SeqDecoder(DeeplayModule):
    """Recurrent decoder without attention.

    Pipeline: embedding -> ReLU -> unidirectional RNN -> linear layer over the
    vocabulary -> softmax on the last dimension.
    """

    def __init__(
        self,
        vocab_size: Optional[int],
        in_features: Optional[int] = 300,
        hidden_features: Optional[int] = 128,
        hidden_layers: Optional[int] = 1,
        dropout: Optional[float] = 0.0,
        rnn_type: Union[Literal["RNN", "LSTM", "GRU"], Type[torch.nn.Module]] = "GRU",
    ):
        super().__init__()
        # A Module subclass is used as given; otherwise the name selects the
        # class, with plain RNN as the fallback.
        is_module_class = isinstance(rnn_type, type) and issubclass(
            rnn_type, torch.nn.Module
        )
        if is_module_class:
            self.rnn_class = rnn_type
        elif rnn_type == "GRU":
            self.rnn_class = torch.nn.GRU
        elif rnn_type == "LSTM":
            self.rnn_class = torch.nn.LSTM
        else:
            self.rnn_class = torch.nn.RNN

        self.in_features = in_features
        self.hidden_features = hidden_features
        self.num_layers = hidden_layers

        # Inter-layer dropout is only meaningful with more than one layer.
        rnn_dropout = 0 if hidden_layers == 1 else dropout

        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            self.rnn_class,
            input_size=in_features,
            hidden_size=hidden_features,
            num_layers=hidden_layers,
            bidirectional=False,
            batch_first=True,
            dropout=rnn_dropout,
        )
        self.dense = dl.Layer(torch.nn.Linear, hidden_features, vocab_size)
        self.softmax = dl.Layer(torch.nn.Softmax, dim=-1)
        self.relu = dl.Layer(torch.nn.ReLU)

    def forward(self, x, hidden):
        """Run one decoding call and return `(token probabilities, new hidden state)`."""
        embedded = self.relu(self.embedding(x))
        rnn_out, hidden = self.rnn(embedded, hidden)
        probabilities = self.softmax(self.dense(rnn_out))
        return probabilities, hidden

seq2seq

In [18]:
# no attn


class Seq2SeqModel(DeeplayModule):
    """Encoder-decoder sequence model without attention.

    Args:
        in_vocab_size: Size of the input vocabulary.
        out_vocab_size: Size of the output vocabulary.
        teacher_prob: Probability, per decoding step after the first, of feeding
            the ground-truth token instead of the model's previous prediction.
        embedding_dim: Embedding dimension of encoder and decoder.
        hidden_features: Hidden size of both RNNs.
        hidden_layers: Number of RNN layers in encoder and decoder.
        dropout: Dropout between RNN layers.
        rnn_type: Recurrent layer type, see `Seq2SeqEncoder`.
        bidirectional: Whether the encoder is bidirectional.
    """

    def __init__(
        self,
        in_vocab_size: Optional[int] = None,
        out_vocab_size: Optional[int] = None,
        teacher_prob=1.0,
        embedding_dim: Optional[int] = 300,
        hidden_features: Optional[int] = 128,
        hidden_layers: Optional[int] = 1,
        dropout: Optional[float] = 0.0,
        rnn_type: Union[Literal["RNN", "LSTM", "GRU"], Type[torch.nn.Module]] = "GRU",
        bidirectional=True,
    ):
        super().__init__()
        self.out_vocab_size = out_vocab_size
        self.teacher_prob = teacher_prob
        self.encoder = Seq2SeqEncoder(
            in_vocab_size,
            embedding_dim,
            hidden_features,
            hidden_layers,
            dropout,
            rnn_type,
            bidirectional,
        )
        self.decoder = Seq2SeqDecoder(
            out_vocab_size,
            embedding_dim,
            hidden_features,
            hidden_layers,
            dropout,
            rnn_type,
        )

    def forward(self, x):
        """Decode with teacher forcing.

        Args:
            x: Pair `(source, target)` of index tensors.

        Returns:
            Tuple `(decoder_outputs, decoder_hidden)`; `decoder_outputs` holds
            one probability vector over the output vocabulary per target
            position.
        """
        source, target = x
        encoder_outputs, encoder_hidden = self.encoder(source)
        decoder_hidden = encoder_hidden
        decoder_outputs = torch.zeros(
            (
                target.size(0),
                target.size(1),
                self.out_vocab_size,
            )
        ).to(next(self.encoder.parameters()).device)

        # Step over the TARGET length: both `target[:, t]` and the output
        # buffer are indexed by t, so the source length is the wrong bound
        # whenever the two differ.
        for t in range(target.size(1)):
            # Step 0 always starts from the first ground-truth token.
            if t == 0 or np.random.rand() < self.teacher_prob:
                decoder_input = target[:, t].unsqueeze(-1)
            else:
                decoder_input = topi.squeeze(-1).detach()

            decoder_input = decoder_input.to(next(self.decoder.parameters()).device)

            decoder_output, decoder_hidden = self.decoder(
                decoder_input,
                decoder_hidden,
            )

            _, topi = decoder_output.topk(1)
            decoder_outputs[:, t, :] = decoder_output.squeeze(1)

        return decoder_outputs, decoder_hidden

    def evaluate(self, x):
        """Greedy decoding without gradients.

        Args:
            x: Source index tensor, or a list/tuple whose first element is it.

        Returns:
            Tensor of per-step probability vectors; as many steps are decoded
            as the source has positions.
        """
        source = x[0] if isinstance(x, (list, tuple)) else x

        with torch.no_grad():
            encoder_outputs, encoder_hidden = self.encoder(source)
        decoder_hidden = encoder_hidden
        decoder_outputs = torch.zeros(
            (
                source.size(0),
                source.size(1),
                self.out_vocab_size,
            )
        ).to(next(self.encoder.parameters()).device)
        for t in range(source.size(1)):
            if t == 0:
                # Index 1 is hard-coded as the start token.
                # NOTE(review): presumably "<sos>" given the special-token order — confirm.
                decoder_input = torch.full(
                    size=(source.size(0), 1),
                    fill_value=1,
                    device=next(self.encoder.parameters()).device,
                )
            else:
                decoder_input = topi.squeeze(-1).detach()

            decoder_input = decoder_input.to(next(self.decoder.parameters()).device)
            # This decoder has no attention, so there are no attention weights
            # to collect here.
            with torch.no_grad():
                decoder_output, decoder_hidden = self.decoder(
                    decoder_input,
                    decoder_hidden,
                )
            _, topi = decoder_output.topk(1)
            decoder_outputs[:, t, :] = decoder_output.squeeze(1)

        return decoder_outputs

In [None]:
import deeplay as dl

from typing import Optional, Union, Type, Literal, Sequence

from deeplay import DeeplayModule, Application, Optimizer

import torch
import torch.nn
import torchmetrics as tm

from torchtext import vocab


def maskedNLL(inp, target, PADtoken=0):
    """Mean negative log-likelihood over the non-padding target positions.

    Args:
        inp: Probabilities (not logits); the last dimension indexes the
            vocabulary and the leading dimensions match `target`.
        target: Integer class indices. Any integer dtype is accepted.
        PADtoken: Target value marking positions that are excluded from the mean.

    Returns:
        Scalar tensor: mean of `-log(p[target])` over positions whose target is
        not `PADtoken`.
    """
    flat_target = target.reshape(-1, 1)
    # torch.gather requires an int64 index; targets coming from the data
    # pipeline may be int32, hence the explicit cast.
    picked = torch.gather(
        inp.reshape(-1, inp.shape[-1]), 1, flat_target.long()
    )
    crossEntropy = -torch.log(picked)
    loss = crossEntropy.masked_select(flat_target != PADtoken).mean()
    return loss


class Seq2SeqEncoder(DeeplayModule):
    """Embedding layer followed by an (optionally bidirectional) recurrent encoder.

    Args:
        vocab_size: Number of rows of the embedding table.
        in_features: Embedding dimension, i.e. the RNN input size.
        hidden_features: RNN hidden size.
        hidden_layers: Number of stacked RNN layers.
        dropout: Dropout between RNN layers; forced to 0 for a single layer.
        rnn_type: "RNN", "LSTM", "GRU", or a `torch.nn.Module` subclass to use
            as the recurrent layer. Unrecognised strings fall back to
            `torch.nn.RNN`.
        bidirectional: Whether the RNN runs in both directions.
    """

    def __init__(
        self,
        vocab_size: Optional[int],
        in_features: Optional[int] = 300,
        hidden_features: Optional[int] = 128,
        hidden_layers: Optional[int] = 1,
        dropout: Optional[float] = 0.0,
        rnn_type: Union[Literal["RNN", "LSTM", "GRU"], Type[torch.nn.Module]] = "GRU",
        bidirectional=True,
    ):
        super().__init__()
        if isinstance(rnn_type, type) and issubclass(rnn_type, torch.nn.Module):
            self.rnn_class = rnn_type
        elif rnn_type == "LSTM":
            self.rnn_class = torch.nn.LSTM
        elif rnn_type == "GRU":
            self.rnn_class = torch.nn.GRU
        else:
            self.rnn_class = torch.nn.RNN

        self.hidden_features = hidden_features
        self.hidden_layers = hidden_layers
        self.bidirectional = bidirectional
        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            self.rnn_class,
            input_size=in_features,
            hidden_size=hidden_features,
            num_layers=hidden_layers,
            dropout=(0 if hidden_layers == 1 else dropout),
            bidirectional=bidirectional,
            batch_first=True,
        )

    def forward(self, x, hidden=None):
        """Encode a batch of index sequences.

        Returns:
            Tuple `(outputs, hidden)`. For a bidirectional RNN the two
            direction halves of `outputs` are summed so the feature size is
            `hidden_features`, and `hidden` is reduced to one state per layer.
        """
        x = self.embedding(x)
        outputs, hidden = self.rnn(x, hidden)
        if self.bidirectional:
            outputs = (
                outputs[:, :, : self.hidden_features]
                + outputs[:, :, self.hidden_features :]
            )
            hidden = self._forward_direction_state(hidden)
        return outputs, hidden

    @staticmethod
    def _forward_direction_state(hidden):
        """Keep the forward-direction final state of every layer.

        PyTorch stacks bidirectional states as
        [layer0_fwd, layer0_bwd, layer1_fwd, ...], so every second entry is a
        forward state. Slicing the first `hidden_layers` entries is only
        equivalent for a single layer; with more layers it mixes directions of
        the lower layers. LSTM returns an `(h, c)` tuple, which is handled
        element-wise. `.contiguous()` is required because a strided slice may
        be rejected as an initial RNN state.
        """
        if isinstance(hidden, tuple):
            return tuple(state[0::2].contiguous() for state in hidden)
        return hidden[0::2].contiguous()


class Seq2SeqDecoder(DeeplayModule):
    """Recurrent decoder with optional attention over the encoder outputs.

    Without attention: embedding -> ReLU -> RNN -> linear -> softmax.
    With attention: the context returned by `attn` is concatenated to the
    embedding (no ReLU) before the RNN, so the RNN input size grows by
    `hidden_features`.
    """

    def __init__(
        self,
        vocab_size: Optional[int],
        in_features: Optional[int] = 300,
        hidden_features: Optional[int] = 128,
        hidden_layers: Optional[int] = 1,
        dropout: Optional[float] = 0.0,
        rnn_type: Union[Literal["RNN", "LSTM", "GRU"], Type[torch.nn.Module]] = "GRU",
        attn: Optional[torch.nn.Module] = None,
    ):
        super().__init__()
        # A Module subclass is used as given; otherwise the name selects the
        # class, with plain RNN as the fallback.
        is_module_class = isinstance(rnn_type, type) and issubclass(
            rnn_type, torch.nn.Module
        )
        if is_module_class:
            self.rnn_class = rnn_type
        elif rnn_type == "GRU":
            self.rnn_class = torch.nn.GRU
        elif rnn_type == "LSTM":
            self.rnn_class = torch.nn.LSTM
        else:
            self.rnn_class = torch.nn.RNN

        self.in_features = in_features
        self.hidden_features = hidden_features
        self.num_layers = hidden_layers
        self.attn = attn

        # With attention the context vector is appended to each embedding.
        if attn is not None:
            rnn_input_size = in_features + hidden_features
        else:
            rnn_input_size = in_features
        # Inter-layer dropout is only meaningful with more than one layer.
        rnn_dropout = 0 if hidden_layers == 1 else dropout

        self.embedding = dl.Layer(torch.nn.Embedding, vocab_size, in_features)
        self.rnn = dl.Layer(
            self.rnn_class,
            input_size=rnn_input_size,
            hidden_size=hidden_features,
            num_layers=hidden_layers,
            bidirectional=False,
            batch_first=True,
            dropout=rnn_dropout,
        )
        self.dense = dl.Layer(torch.nn.Linear, hidden_features, vocab_size)
        self.softmax = dl.Layer(torch.nn.Softmax, dim=-1)
        self.relu = dl.Layer(torch.nn.ReLU)

    def forward(self, x, hidden, encoder_outputs):
        """Run one decoding call.

        Returns:
            Tuple `(probabilities, hidden, attn_weights)`; `attn_weights` is an
            empty list when no attention module is set.
        """
        embedded = self.embedding(x)
        if self.attn:
            # The top layer's hidden state, moved to batch-first, is the query.
            query = hidden[-1:, :, :].permute(1, 0, 2)
            context, attn_weights = self.attn(query, encoder_outputs)
            rnn_input = torch.cat((embedded, context), dim=2)
        else:
            rnn_input = self.relu(embedded)
            attn_weights = []
        rnn_out, hidden = self.rnn(rnn_input, hidden)
        probabilities = self.softmax(self.dense(rnn_out))
        return probabilities, hidden, attn_weights


class Seq2SeqModel(DeeplayModule):
    """Encoder-decoder sequence model with optional attention.

    Args:
        in_vocab_size: Size of the input vocabulary.
        out_vocab_size: Size of the output vocabulary.
        teacher_prob: Probability, per decoding step after the first, of feeding
            the ground-truth token instead of the model's previous prediction.
        embedding_dim: Embedding dimension of encoder and decoder.
        hidden_features: Hidden size of both RNNs.
        hidden_layers: Number of RNN layers in encoder and decoder.
        dropout: Dropout between RNN layers.
        rnn_type: Recurrent layer type, see `Seq2SeqEncoder`.
        bidirectional: Whether the encoder is bidirectional.
        attn: Optional attention module; it is configured with the hidden and
            embedding sizes and handed to the decoder.
    """

    def __init__(
        self,
        in_vocab_size: Optional[int] = None,
        out_vocab_size: Optional[int] = None,
        teacher_prob=1.0,
        embedding_dim: Optional[int] = 300,
        hidden_features: Optional[int] = 128,
        hidden_layers: Optional[int] = 1,
        dropout: Optional[float] = 0.0,
        rnn_type: Union[Literal["RNN", "LSTM", "GRU"], Type[torch.nn.Module]] = "GRU",
        bidirectional=True,
        attn: Optional[torch.nn.Module] = None,
    ):
        super().__init__()
        self.hidden_layers = hidden_layers
        self.dropout = dropout
        self.hidden_features = hidden_features
        self.embedding_dim = embedding_dim
        self.attn = attn
        if self.attn:
            # The attention module is created without sizes; fill them in here.
            self.attn.configure(
                hidden_size=hidden_features,
                embedding_size=embedding_dim,
            )

        self.out_vocab_size = out_vocab_size
        self.teacher_prob = teacher_prob
        self.encoder = Seq2SeqEncoder(
            in_vocab_size,
            embedding_dim,
            hidden_features,
            hidden_layers,
            dropout,
            rnn_type,
            bidirectional,
        )
        self.decoder = Seq2SeqDecoder(
            out_vocab_size,
            embedding_dim,
            hidden_features,
            hidden_layers,
            dropout,
            rnn_type,
            attn,
        )

    def forward(self, x):
        """Decode with teacher forcing.

        Args:
            x: Pair `(source, target)` of index tensors.

        Returns:
            Tuple `(decoder_outputs, decoder_hidden)`; `decoder_outputs` holds
            one probability vector over the output vocabulary per target
            position.
        """
        source, target = x
        encoder_outputs, encoder_hidden = self.encoder(source)
        decoder_hidden = encoder_hidden
        decoder_outputs = torch.zeros(
            (
                target.size(0),
                target.size(1),
                self.out_vocab_size,
            )
        ).to(next(self.encoder.parameters()).device)

        # Step over the TARGET length: both `target[:, t]` and the output
        # buffer are indexed by t, so the source length is the wrong bound
        # whenever the two differ.
        for t in range(target.size(1)):
            # Step 0 always starts from the first ground-truth token.
            if t == 0 or np.random.rand() < self.teacher_prob:
                decoder_input = target[:, t].unsqueeze(-1)
            else:
                decoder_input = topi.squeeze(-1).detach()

            decoder_input = decoder_input.to(next(self.decoder.parameters()).device)

            decoder_output, decoder_hidden, _ = self.decoder(
                decoder_input,
                decoder_hidden,
                encoder_outputs,
            )

            _, topi = decoder_output.topk(1)
            decoder_outputs[:, t, :] = decoder_output.squeeze(1)

        return decoder_outputs, decoder_hidden

    def evaluate(self, x):
        """Greedy decoding without gradients.

        Args:
            x: Source index tensor, or a list/tuple whose first element is it.

        Returns:
            Tuple `(decoder_outputs, attentions)`: the per-step probability
            vectors and the list of per-step attention weights returned by the
            decoder. As many steps are decoded as the source has positions.
        """
        source = x[0] if isinstance(x, (list, tuple)) else x

        with torch.no_grad():
            encoder_outputs, encoder_hidden = self.encoder(source)
        decoder_hidden = encoder_hidden
        decoder_outputs = torch.zeros(
            (
                source.size(0),
                source.size(1),
                self.out_vocab_size,
            )
        ).to(next(self.encoder.parameters()).device)
        attentions = []
        for t in range(source.size(1)):
            if t == 0:
                # Index 1 is hard-coded as the start token.
                # NOTE(review): presumably "<sos>" given the special-token order — confirm.
                decoder_input = torch.full(
                    size=(source.size(0), 1),
                    fill_value=1,
                    device=next(self.encoder.parameters()).device,
                )
            else:
                decoder_input = topi.squeeze(-1).detach()

            decoder_input = decoder_input.to(next(self.decoder.parameters()).device)
            with torch.no_grad():
                decoder_output, decoder_hidden, attn_weights = self.decoder(
                    decoder_input,
                    decoder_hidden,
                    encoder_outputs,
                )
                attentions.append(attn_weights)
            _, topi = decoder_output.topk(1)
            decoder_outputs[:, t, :] = decoder_output.squeeze(1)

        return decoder_outputs, attentions


class Seq2Seq(Application):
    """Application wrapper that trains a `Seq2SeqModel` with a masked NLL loss.

    Args:
        in_vocab: Input vocabulary; its length sizes the encoder embedding.
        out_vocab: Output vocabulary; its length sizes the decoder output.
        teacher_prob: Teacher-forcing probability handed to the default model.
        model: Custom model; must return a `(outputs, hidden)` pair when called.
            If omitted, a default `Seq2SeqModel` is built.
        optimizer: Optimizer; defaults to Adam with learning rate 1e-3.
        loss: Loss callable, `maskedNLL` by default.
        metrics: Optional metrics, forwarded to the `Application` base class.
        attn: Optional attention module for the default model.
        **kwargs: Forwarded to the `Application` base class.
    """

    in_vocab: vocab
    out_vocab: vocab
    teacher_prob: float
    model: torch.nn.Module
    loss: torch.nn.Module
    optimizer: Optimizer
    metrics: list

    def __init__(
        self,
        in_vocab,
        out_vocab,
        teacher_prob=1.0,
        model=None,
        optimizer=None,
        loss=maskedNLL,
        metrics: Optional[Sequence[tm.Metric]] = None,
        attn: Optional[torch.nn.Module] = None,
        **kwargs,
    ):
        # Forward user-supplied metrics to the base class; previously the
        # argument was accepted and then silently discarded.
        if metrics is not None:
            kwargs["metrics"] = metrics
        super().__init__(loss=loss, **kwargs)

        in_vocab_size = len(in_vocab)
        out_vocab_size = len(out_vocab)

        self.model = model or self._get_default_model(
            in_vocab_size,
            out_vocab_size,
            teacher_prob,
            attn,
        )
        self.in_vocab_size = in_vocab_size
        self.out_vocab_size = out_vocab_size
        self.teacher_prob = teacher_prob
        self.optimizer = optimizer or dl.Adam(lr=1e-3)
        # Keep both vocabularies, matching the class-level annotations.
        self.in_vocab = in_vocab
        self.out_vocab = out_vocab

        @self.optimizer.params
        def params(self):
            return self.parameters()

    def _get_default_model(self, in_vocab_size, out_vocab_size, teacher_prob, attn):
        """Build the default `Seq2SeqModel` for the given vocabulary sizes."""
        seq2seq_mod = Seq2SeqModel(
            in_vocab_size=in_vocab_size,
            out_vocab_size=out_vocab_size,
            teacher_prob=teacher_prob,
            attn=attn,
        )

        return seq2seq_mod

    def train_preprocess(self, batch):
        """Return `(x, y)` where `y` is the target shifted one step to the left.

        The model receives the whole batch (its last element is the target
        sequence) and must predict the NEXT token at each position; the last
        column is repeated to keep the length unchanged.
        """
        x = batch
        y = torch.cat((x[-1][:, 1:], x[-1][:, -1:]), dim=1)

        return x, y

    val_preprocess = train_preprocess
    test_preprocess = train_preprocess

    def forward(self, x):
        """Run the model and return only the per-step output probabilities."""
        decoder_outputs, decoder_hidden = self.model(x)

        return decoder_outputs

In [None]:
import torch

# The dimensionality of GloVe embeddings
embedding_dim = 300

num_special_tokens = len(special_tokens)

# Embeddings for the special tokens: the first one (<pad>) stays all-zero, the
# remaining ones get small random values.
special_embeddings = torch.zeros(num_special_tokens, embedding_dim)
special_embeddings[1:] = (
    torch.rand(num_special_tokens - 1, embedding_dim) * 0.01
)

from torchtext.vocab import GloVe

# Load GloVe embeddings
glove = GloVe(name="42B", dim=embedding_dim, cache="./.vector_cache")

# The vocabularies list the special tokens first. Check that, then look up
# GloVe vectors only for the remaining (regular) tokens.
assert vocab_input.get_itos()[:num_special_tokens] == special_tokens
assert vocab_target.get_itos()[:num_special_tokens] == special_tokens

# NOTE(review): the same GloVe vectors are used for the target-language
# vocabulary; presumably most target tokens are missing from GloVe and the
# table is frozen afterwards — confirm this is intended.
glove_embeddings_input = glove.get_vecs_by_tokens(
    vocab_input.get_itos()[num_special_tokens:], lower_case_backup=True
)
glove_embeddings_target = glove.get_vecs_by_tokens(
    vocab_target.get_itos()[num_special_tokens:], lower_case_backup=True
)

# Put the special-token rows in front of the GloVe rows. Looking up vectors
# for the full vocabulary AND prepending the special rows would add
# num_special_tokens extra rows and shift every token's vector by that amount.
extended_embeddings_input = torch.cat(
    [special_embeddings, glove_embeddings_input], dim=0
)
extended_embeddings_target = torch.cat(
    [special_embeddings, glove_embeddings_target], dim=0
)

# Row i of each table must belong to vocabulary index i.
assert extended_embeddings_input.shape == (len(vocab_input), embedding_dim)
assert extended_embeddings_target.shape == (len(vocab_target), embedding_dim)

attention based on concat with dense layer (Bahdanau)

In [None]:
class BahdanauAttention(DeeplayModule):
    """Additive attention: scores come from a dense layer over tanh(Wa·query + Ua·keys)."""

    def __init__(self, hidden_size=None, embedding_size=None):
        super().__init__()
        self.hidden_size = hidden_size
        # Wa projects the query, Ua the keys; Va reduces each position to one score.
        self.Wa = dl.Layer(torch.nn.Linear, hidden_size, embedding_size)
        self.Ua = dl.Layer(torch.nn.Linear, hidden_size, embedding_size)
        self.Va = dl.Layer(torch.nn.Linear, embedding_size, 1)
        self.act = dl.Layer(torch.nn.Softmax, dim=-1)

    def forward(self, query, keys):
        """Return `(context, weights)` for a query attending over `keys`.

        The weights are a softmax over the last dimension of the scores; the
        context is the batched matrix product of the weights with `keys`.
        """
        combined = torch.tanh(self.Wa(query) + self.Ua(keys))
        # Drop the trailing score dimension and add a dimension at position 1
        # so the weights can be used as the left operand of bmm.
        scores = self.Va(combined).squeeze(2).unsqueeze(1)
        weights = self.act(scores)
        context = torch.bmm(weights, keys)
        return context, weights

attention with dot product (Luong)

In [None]:
class LuongAttention(DeeplayModule):
    """Dot-product attention: scores are the inner products of query and keys."""

    def __init__(self, hidden_size=None, embedding_size=None):
        super().__init__()
        self.hidden_size = hidden_size
        self.act = dl.Layer(torch.nn.Softmax, dim=-1)

    def forward(self, query, keys):
        """Return `(context, weights)` for a query attending over `keys`.

        The element-wise product of query and keys is summed over dimension 2
        to give one score per key position; the weights are the softmax of
        those scores and the context is their batched product with `keys`.
        """
        energies = (query * keys).sum(dim=2).unsqueeze(1)
        weights = self.act(energies)
        context = torch.bmm(weights, keys)
        return context, weights

define model

In [None]:
# Build the application with a teacher-forcing probability of 0.85; uncomment
# one of the `attn=` lines to enable an attention mechanism.
seq2seq = Seq2Seq(
    in_vocab=vocab_input,
    out_vocab=vocab_target,
    teacher_prob=0.85,
    # attn=BahdanauAttention(),
    # attn=LuongAttention(),
)
# seq2seq.model.configure(hidden_layers=2, dropout=0.2)

seq2seq = seq2seq.create()

# Install the pretrained embedding tables in encoder and decoder and freeze
# them so they are not updated during training.
pretrained_tables = (
    (seq2seq.model.encoder.embedding, extended_embeddings_input),
    (seq2seq.model.decoder.embedding, extended_embeddings_target),
)
for embedding_layer, embedding_table in pretrained_tables:
    embedding_layer.weight.data = embedding_table
    embedding_layer.weight.requires_grad = False

In [None]:
# Train for 25 epochs. "auto" lets the trainer pick the available accelerator
# instead of requiring Apple's MPS backend, which fails on other machines.
trainer = dl.Trainer(max_epochs=25, accelerator="auto")

train it

In [None]:
# Fit on the training loader only; no validation loader is passed.
trainer.fit(seq2seq, train_loader)  # , val_loader)

visualize learning curves

In [None]:
# Plot the training history recorded by the trainer.
trainer.history.plot()

evaluate results

In [None]:
# Move the trained model to the CPU for evaluation and remember the device its
# decoder parameters live on, so that batches can be moved to the same device.
seq2seq.model.to("cpu")
device = next(seq2seq.model.decoder.parameters()).device

In [None]:
def unprocess(x, vocab):
    """Turn rows of vocabulary indices back into text.

    Args:
        x: Iterable of index rows; each row must support boolean-mask indexing
            (e.g. a tensor or NumPy array).
        vocab: Object with a `lookup_token(index)` method.

    Returns:
        Tuple `(sentences, token_lists)`: `sentences` has one space-joined
        string per row built only from indices greater than 2, and
        `token_lists` has one list per row with the token of EVERY index.
    """
    sentences, token_lists = [], []
    for row in x:
        token_lists.append([vocab.lookup_token(index) for index in row])
        content_indices = row[row > 2]
        sentences.append(
            " ".join(vocab.lookup_token(index) for index in content_indices)
        )
    return sentences, token_lists

calculate BLEU score

In [None]:
from torchmetrics.text import BLEUScore

bleu_score = BLEUScore()

for batch_index, batch in enumerate(test_loader):
    # Batch-local names: reusing `inputs` / `targets` would overwrite the
    # corpus-level arrays defined earlier in the notebook.
    batch_inputs = batch[0].to(device)
    batch_targets = batch[1].to(device)

    # Evaluate on the tensors that were moved to the model's device.
    y_hat, attentions = seq2seq.model.evaluate((batch_inputs, batch_targets))

    # Greedy choice: most probable token at every position.
    _, topi = y_hat.topk(1)
    x_un, input_words = unprocess(batch_inputs, vocab_input)
    y_hat_un, output_words = unprocess(topi.squeeze(-1), vocab_target)
    y, gt_words = unprocess(batch_targets, vocab_target)
    # BLEUScore expects a list of reference translations per prediction.
    y = [[yi] for yi in y]

    # NOTE(review): predictions are not cut at the first "<eos>", so tokens
    # generated after it also count towards the score — confirm intended.
    bleu_score.update(y_hat_un, y)

    if batch_index < 3:  # Only print examples from the first three batches
        print("\nExamples from batch {}:".format(batch_index + 1))
        for i in range(min(3, len(input_words))):  # Print up to 3 examples per batch
            print(f"Input Sentence: {' '.join(input_words[i])}")
            print(f"Predicted Translation: {' '.join(output_words[i])}")
            print(f"Actual Translation: {' '.join(gt_words[i])}")

# The loop runs over test_loader, so this is the test-set score.
final_bleu = bleu_score.compute()
print(f"Test BLEU Score: {final_bleu:.3f}")

Try random sentences from the test set

get attentions

In [None]:
# Join the per-step attention weights of the last evaluated batch along dimension 1.
# NOTE(review): without an attention module the decoder returns empty lists,
# so this presumably only works when the model was built with `attn=...` — confirm.
att = torch.cat(attentions, dim=1)

plot attention

In [None]:
from matplotlib import pyplot as plt
from matplotlib.ticker import MultipleLocator

# Index, within the last evaluated batch, of the sentence to visualise.
idx = 2

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(att[idx], cmap="bone")
fig.colorbar(cax)

# Fix the tick positions first and then label them, one tick per token.
# Setting labels without explicit tick positions relies on where matplotlib
# happens to put the ticks and needs a dummy leading "" label to line up.
ax.set_xticks(range(len(input_words[idx])))
ax.set_xticklabels(input_words[idx], rotation=90)
ax.set_yticks(range(len(output_words[idx])))
ax.set_yticklabels(output_words[idx])

plt.show()

try on a few new sentences

In [None]:
# Padding length for inference; same value as the default used by
# pad_sentence and process_corpus when the training data was prepared.
MAX_LENGTH = 10


def make_inference(model, source_text, max_length=MAX_LENGTH):
    """Translate one sentence with greedy decoding.

    Args:
        model: Model exposing `evaluate(source)` that returns
            `(outputs, attentions)`.
        source_text: Sentence in the input language.
        max_length: Number of tokens the sentence is padded to.

    Returns:
        List with one string: the decoded tokens with index greater than 2,
        joined by spaces.
    """
    # Tokenize the source text
    query_tokens = tokenize(source_text, input_lang)

    # Add <sos>/<eos> and padding, exactly as done for the training data
    query = pad_sentence(query_tokens, max_length)

    # Convert tokens to indices using the vocabulary
    query = np.array(vocab_input(query))

    # Convert the indices to a tensor on the model's device and add a batch dimension
    source_sequence = torch.tensor(query, dtype=torch.int)
    source_sequence = source_sequence.to(next(model.parameters()).device).unsqueeze(0)

    # Greedy decoding: keep the most probable token at every step
    y_hat, _ = model.evaluate(source_sequence)
    _, topi = y_hat.topk(1)
    y_hat_un, _ = unprocess(topi.squeeze(-1), vocab_target)

    return y_hat_un

In [None]:
# Translate a short question and print the decoded output.
source_text = "should we go?"
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Input with a contraction; the trailing comment keeps the Spanish counterpart.
source_text = "what's your name?"  # "¿como te llamas?"  #
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Negative question with a contraction and a capitalised first word.
source_text = "Don't you have a surname?"
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# NOTE(review): this input is Spanish although it is tokenized with
# `input_lang`; presumably a leftover from testing the reverse direction — confirm.
source_text = "como va?"  # "How are you doing?"
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Declarative sentence with a contraction.
source_text = "I'm doing great too."
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Simple yes/no question.
source_text = "are you still happy?"
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Question with less common words; words missing from the vocabulary map to its default index.
source_text = "who is the american president?"
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Idiomatic expression.
source_text = "Are you messing around with me?"
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Longer question with a contraction.
source_text = "why don't you stop laughing at me?"
response = make_inference(seq2seq.model, source_text)
print(response)

In [None]:
# Sentence without final punctuation.
source_text = "I would like a glass of wine"
response = make_inference(seq2seq.model, source_text)
print(response)