<a href="https://colab.research.google.com/github/AchrafAsh/awesome-pytorch-notebooks/blob/main/01_machine_translation_with_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Translation with Transformers

In [5]:
import os
import pandas as pd
import spacy  # for tokenization

spacy_eng = spacy.load("en_core_web_sm")

from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence  # padding of every batch
from torch.utils.data import Dataset, DataLoader

In [10]:
import gensim

## Vocabulary

In [7]:
def tokenizer_en(text:str) -> List[str]:
    """Split ``text`` with the spaCy English tokenizer and lowercase each token."""
    lowered_tokens = []
    for token in spacy_eng.tokenizer(text):
        lowered_tokens.append(token.text.lower())
    return lowered_tokens

In [9]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Four special tokens are always present:

    - ``<PAD>`` (0): padding, fills a short sentence up to the length of the
      longest sentence in the batch
    - ``<SOS>`` (1): start of sentence
    - ``<EOS>`` (2): end of sentence
    - ``<UNK>`` (3): unknown word (anything without its own index)
    """

    def __init__(self, freq_threshold=1, tokenizer=None):
        """Create a vocabulary that contains only the special tokens.

        Args:
            freq_threshold: number of occurrences a word needs (within one
                ``build_vocab`` call) before it receives its own index.
            tokenizer: optional callable mapping a sentence to a list of
                tokens. When omitted, the module-level ``tokenizer_en`` is
                used, which keeps the previous behaviour.
        """
        self.freq_threshold = freq_threshold
        self.tokenizer = tokenizer
        self.idx_to_str = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.str_to_idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.idx_to_str)

    def _tokenize(self, sentence: str) -> List[str]:
        """Split a sentence with the configured tokenizer."""
        # The default is resolved lazily so it is only looked up when no
        # custom tokenizer was supplied.
        tokenize = tokenizer_en if self.tokenizer is None else self.tokenizer
        return tokenize(sentence)

    def build_vocab(self, sentences: List[str]):
        """Add every sufficiently frequent word of ``sentences`` to the vocabulary.

        A word gets the next free index as soon as its count reaches
        ``freq_threshold``, so indices follow the order in which words cross
        the threshold. Counts are local to this call. Calling the method
        again extends the vocabulary; existing indices are never reassigned.
        """
        frequencies = {}  # occurrences of each word seen during this call
        # Continue after the last used index instead of restarting at 4, so a
        # second call cannot overwrite entries created by a previous one.
        next_idx = len(self.idx_to_str)

        for sentence in sentences:
            for word in self._tokenize(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

                # ">=" (instead of "==") also covers thresholds <= 0; the
                # membership test keeps each word's index unique and stable.
                if frequencies[word] >= self.freq_threshold and word not in self.str_to_idx:
                    self.str_to_idx[word] = next_idx
                    self.idx_to_str[next_idx] = word
                    next_idx += 1

    def to_int(self, word:str) -> int:
        """Return the index of ``word``, or the ``<UNK>`` index if it is unknown."""
        return self.str_to_idx.get(word, self.str_to_idx["<UNK>"])

    def __getitem__(self, idx:int) -> str:
        """Return the token stored at ``idx``, or ``"<UNK>"`` if there is none."""
        return self.idx_to_str.get(idx, "<UNK>")

    def encode(self, sentence:str) -> List[int]:
        """Tokenize ``sentence`` and map every token to its index.

        Tokens that are not in the vocabulary map to the ``<UNK>`` index.
        """
        return [self.to_int(token) for token in self._tokenize(sentence)]

## Build the model

- Embeddings → map one-hot-encoded words to continuous vectors that capture semantics (a pre-trained word2vec could be used for this)
- Stacked Transformer blocks

In [None]:
class Translator(nn.Module):
    """Sequence-to-sequence translator built on ``nn.Transformer``.

    Source and target tokens have separate embedding tables (their
    vocabularies differ), and the decoder output is projected to one logit
    per target-vocabulary entry so it can be fed to a cross-entropy loss.

    NOTE(review): no positional encoding and no padding masks are applied
    yet; only the causal decoder mask is.
    """

    def __init__(self, src_vocab_size, trgt_vocab_size, hidden_dim=128, padding_idx=0, word_vectors=None, nhead=8):
        """Build the embeddings, the transformer and the output projection.

        Args:
            src_vocab_size: number of source tokens (ignored when
                ``word_vectors`` is given; the table size then comes from it).
            trgt_vocab_size: number of target tokens.
            hidden_dim: embedding size and transformer ``d_model``. The
                default is 128 because the previous value, 124, is not
                divisible by 8 heads and could never be instantiated.
            padding_idx: index of the padding token in both vocabularies.
            word_vectors: optional 2-D ``(vocab, hidden_dim)`` matrix of
                pre-trained source embeddings.
            nhead: number of attention heads; must divide ``hidden_dim``.

        Raises:
            ValueError: if ``hidden_dim`` is not divisible by ``nhead`` or the
                pre-trained vectors do not have ``hidden_dim`` columns.
        """
        super().__init__()  # must run before sub-modules can be registered
        if hidden_dim % nhead != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by nhead ({nhead})"
            )
        self.hidden_dim = hidden_dim

        if word_vectors is not None:
            self.embedding = self._pretrained_embedding(word_vectors)
        else:
            self.embedding = nn.Embedding(num_embeddings=src_vocab_size, embedding_dim=hidden_dim, padding_idx=padding_idx)

        # Target tokens live in their own vocabulary, hence their own table.
        self.tgt_embedding = nn.Embedding(num_embeddings=trgt_vocab_size, embedding_dim=hidden_dim, padding_idx=padding_idx)

        self.transformer = nn.Transformer(d_model=hidden_dim, nhead=nhead) # docs: https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
        # Maps decoder states to unnormalised scores over the target vocabulary.
        self.generator = nn.Linear(hidden_dim, trgt_vocab_size)

    def _pretrained_embedding(self, vectors):
        """Wrap a ``(vocab, hidden_dim)`` matrix in a frozen ``nn.Embedding``."""
        weights = torch.as_tensor(vectors, dtype=torch.float32)
        # Columns (not rows) are the embedding dimension fed to the transformer.
        if weights.dim() != 2 or weights.size(1) != self.hidden_dim:
            raise ValueError(
                f"word vectors must have shape (vocab, {self.hidden_dim}), got {tuple(weights.shape)}"
            )
        return nn.Embedding.from_pretrained(weights)

    def load_embeddings(self, keyed_vectors):
        """Replace the source embedding with pre-trained vectors.

        Args:
            keyed_vectors: object exposing the embedding matrix as
                ``.vectors`` (e.g. gensim ``KeyedVectors``).
        """
        device = self.embedding.weight.device  # keep the model on one device
        self.embedding = self._pretrained_embedding(keyed_vectors.vectors).to(device)

    def forward(self, src, tgt):
        """Return target-vocabulary logits for every target position.

        Assumes sequence-first index tensors, ``src`` of shape
        ``(src_len, batch)`` and ``tgt`` of shape ``(tgt_len, batch)``, which
        is the ``nn.Transformer`` default layout (``batch_first=False``).
        The result has shape ``(tgt_len, batch, trgt_vocab_size)``.
        """
        src_emb = self.embedding(src)
        tgt_emb = self.tgt_embedding(tgt)

        # Causal mask: position i may only attend to target positions <= i.
        tgt_len = tgt.size(0)
        causal_mask = torch.triu(
            torch.full((tgt_len, tgt_len), float("-inf"), device=tgt.device),
            diagonal=1,
        )

        hidden = self.transformer(src_emb, tgt_emb, tgt_mask=causal_mask)
        return self.generator(hidden)

## Training the model

In [None]:
# TODO: download a pre-trained word2vec model (a very small one, to see whether pre-trained embeddings yield better results)
!wget 

In [None]:
# Load pre-trained word vectors stored in the word2vec format.
# NOTE(review): 'path/to/file' is a placeholder — point it at the downloaded vectors file.
model = gensim.models.KeyedVectors.load_word2vec_format('path/to/file')
# Wrap the loaded vector matrix in a float tensor
# (presumably to initialise the Translator's embedding layer — TODO confirm).
weights = torch.FloatTensor(model.vectors)

In [None]:
def run(model, dataset, epochs, lr=0.01, weight_decay=0.001):
    """Train ``model`` with Adam for ``epochs`` passes over ``dataset``.

    Args:
        model: module whose parameters are optimised.
        dataset: iterable yielding ``(src, tgt)`` pairs, one per training
            step (assumed to be batches, e.g. from a ``DataLoader`` — TODO
            confirm against the data pipeline).
        epochs: number of full passes over ``dataset``.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Prints the summed loss of each epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        # Unpack each item directly; zip(data) would wrap src and tgt in
        # 1-tuples instead of splitting the pair.
        for src, tgt in dataset:
            # float() keeps a plain number, so no autograd graph is retained
            # across steps and the printed value is readable.
            total_loss += float(train(model, src, tgt, optimizer))

        print(f"Epoch: [{epoch} / {epochs}] | Loss: {total_loss}")


def train(model, src, tgt, optimizer, pad_idx=0):
    """Run one teacher-forced optimisation step and return its loss.

    Assumes sequence-first tensors (``tgt`` of shape ``(tgt_len, batch)``),
    the default layout of ``nn.Transformer``, and that ``model(src, tgt_in)``
    returns logits of shape ``(tgt_len - 1, batch, vocab)``.

    Args:
        model: callable module mapping ``(src, decoder_input)`` to logits.
        src: source token indices.
        tgt: target token indices, including the start and end tokens.
        optimizer: optimiser updating ``model``'s parameters.
        pad_idx: target index treated as padding and excluded from the loss.

    Returns:
        The detached scalar loss tensor of this step.
    """
    model.train()

    # Teacher forcing: the decoder reads tokens [0, T-1) and has to predict
    # the tokens [1, T), i.e. the same sequence shifted by one position.
    decoder_input, labels = tgt[:-1], tgt[1:]
    output = model(src, decoder_input)

    # CrossEntropyLoss wants (N, classes) logits and (N,) class indices.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.detach()
    

def evaluate(model, dataset, pad_idx=0):
    """Compute the summed teacher-forced loss of ``model`` over ``dataset``.

    Assumes sequence-first tensors (``tgt`` of shape ``(tgt_len, batch)``),
    the default layout of ``nn.Transformer``, and that ``model(src, tgt_in)``
    returns logits of shape ``(tgt_len - 1, batch, vocab)``.

    Args:
        model: callable module mapping ``(src, decoder_input)`` to logits.
        dataset: iterable yielding ``(src, tgt)`` pairs.
        pad_idx: target index treated as padding and excluded from the loss.

    Returns:
        The summed loss as a float (``0.0`` for an empty dataset). The value
        is also printed.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    total_loss = 0.0

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        # Unpack each item directly; zip(data) would wrap src and tgt in
        # 1-tuples instead of splitting the pair.
        for src, tgt in dataset:
            # Same shift as in training: feed tokens [0, T-1), score [1, T).
            decoder_input, labels = tgt[:-1], tgt[1:]
            output = model(src, decoder_input)
            loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
            total_loss += loss.item()

    print(f"Total Loss: {total_loss}")
    return total_loss

# Building the Transformer from scratch

- Attention Is All You Need: https://arxiv.org/abs/1706.03762

[image of the architecture]

## Self-Attention

## Transformer Block

## Encoder / Decoder

## Putting everything together