In [1]:
# If on Colab, make sure you switch the runtime to use a T4 GPU. Or else each Epoch will take ~10 minutes :(
!pip3 install torchtext==0.6

!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

Collecting torchtext==0.6
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.2
    Uninstalling torchtext-0.15.2:
      Successfully uninstalled torchtext-0.15.2
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0
2023-09-19 03:37:18.476379: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available

In [None]:
# This project was created with help from different tutorials/articles and based off the paper for Seq2Seq: https://arxiv.org/abs/1409.3215

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random

In [None]:
# Code for loading/saving states of a model as well as testing the model.
# This was not developed by me, I just found it and used it.


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)

def translate_sentence(model, sentence, french, english, device, max_length=50):
    spacy_fr = spacy.load("fr_core_news_sm")
    if type(sentence) == str:
        tokens = tokenize_fr(sentence)
    else:
        tokens = [token.text for token in spacy_fr(sentence)]

    tokens.insert(0, french.init_token)
    tokens.append(french.eos_token)

    text_to_indices = [french.vocab.stoi[token] for token in tokens]

    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    return translated_sentence[1:]




In [9]:
fr = spacy.load("fr_core_news_sm")
en = spacy.load("en_core_web_sm")

def tokenize_fr(text):
    """
    Tokenizes French text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in fr.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in en.tokenizer(text)]

# Tokenize the text
# <sos> refers to "Start of Sentence", <eos> refers to "End of Sentence"
french = Field(tokenize=tokenize_fr, lower=True, init_token="<sos>",
                eos_token="<eos>")

english = Field(tokenize=tokenize_en, lower=True, init_token="<sos>",
                eos_token="<eos>")
# Here we change the root for our data since you can't create a folder called ".data" in colab
train_data, valid_data, test_data = Multi30k.splits(exts=(".fr", ".en"),
                                                    fields=(french, english),
                                                    root="data/")

# Building the vocab, we choose min_freq of 2 so that we only have words that appear more than once.
# Additionally, I chose to do a max_size of 5000 tokens to increase training speed on my own personal computer (which is forcing me to use my CPU)
french.build_vocab(train_data, max_size=5000, min_freq=2)
english.build_vocab(train_data, max_size=5000, min_freq=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, drop):
        super(Encoder, self).__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.dropout = nn.Dropout(drop)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=drop)

    def forward(self, src):
        # src = [src_len, batch_size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src_len, batch_size, emb_dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [src_len, batch_size, hid_dim * n_directions]
        # hidden = [n_layers * n_directions, batch_size, hid_dim]
        # cell = [n_layers * n_directions, batch_size, hid_dim]
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, n_layers, drop):
        super(Decoder, self).__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.dropout = nn.Dropout(drop)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=drop)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden, cell):
        # shape of input: (N) but we want (1, N)
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch_size, emb_dim]
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        prediction = self.fc_out(outputs)
        prediction = prediction.squeeze(0)

        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = len(english.vocab)

        hidden, cell = self.encoder(src)

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(device)

        input = trg[0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)

            input = trg[t] if teacher_force else top1

        return outputs



In [3]:
# Hyperparameters
num_epochs = 80
learning_rate = 0.001
BATCH_SIZE = 64

# Change to True if you already have a model (my_checkpoint.pth.tar file)
load_model = False
input_dim_encoder = len(french.vocab)
input_dim_decoder = len(english.vocab)
output_dim = len(english.vocab)

enc_emb_dim = 256
dec_emb_dim = 256
hid_dim = 512
n_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

print("Creating model...")
# Create the model
enc = Encoder(input_dim_encoder, enc_emb_dim, hid_dim, n_layers, enc_dropout)
dec = Decoder(input_dim_decoder, dec_emb_dim, hid_dim, output_dim, n_layers, dec_dropout)

model = Seq2Seq(enc, dec).to(device)

step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device
)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

# Example sentence used for seeing how the model improves translations. Sentence in English means "an average woman eats an apple"
sentence = "une femme française moyenne mange une pomme"


total_batch = len(train_iterator)


for epoch in range(num_epochs):
    completed = 0
    print(f"[Epoch {epoch} / {num_epochs}]")

    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(model, sentence, french, english, device, max_length=50)

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)

        target = batch.trg.to(device)

        output = model(inp_data, target)

        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        step += 1

        completed += 1

        percent_complete = round((completed / total_batch) * 100)


        if percent_complete % 5 == 0:
            print("Percent Complete:", percent_complete, "%", "Completed so far:", completed, "out of", total_batch, "batches")



Starting...
Tokenizing...
Building vocab...
Creating iterators...
Creating model...
=> Loading checkpoint
[Epoch 0 / 80]
=> Saving checkpoint
Translated example sentence: 
 ['a', '<unk>', 'woman', 'is', 'is', 'a', 'a', 'chip', '<eos>']
Percent Complete: 0 % Completed so far: 1 out of 454 batches
Percent Complete: 0 % Completed so far: 2 out of 454 batches
Percent Complete: 5 % Completed so far: 21 out of 454 batches
Percent Complete: 5 % Completed so far: 22 out of 454 batches
Percent Complete: 5 % Completed so far: 23 out of 454 batches
Percent Complete: 5 % Completed so far: 24 out of 454 batches
Percent Complete: 10 % Completed so far: 44 out of 454 batches
Percent Complete: 10 % Completed so far: 45 out of 454 batches
Percent Complete: 10 % Completed so far: 46 out of 454 batches
Percent Complete: 10 % Completed so far: 47 out of 454 batches
Percent Complete: 15 % Completed so far: 66 out of 454 batches
Percent Complete: 15 % Completed so far: 67 out of 454 batches
Percent Complete

KeyboardInterrupt: ignored

In [7]:
# The sentence below it got perfectly correct

sentence = "La femme répare sa maison"
model.eval()

translated_sentence = translate_sentence(model, sentence, french, english, device, max_length=50)

print(f"Translated example sentence: \n {translated_sentence}")


Translated example sentence: 
 ['the', 'lady', 'is', 'fixing', 'her', 'home', '<eos>']
