In [1]:
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
import spacy
# import gc

In [3]:
''' 
This cell performs the following actions.
1. load data via huggingface
2. convert to dataframe to split and convert to huggingface dataset format.
'''

data_files = {'validation': 'fr-en/validation-00000-of-00001.parquet'}
dataset = load_dataset(path='wmt/wmt14', trust_remote_code=True, data_files=data_files)
data = pd.DataFrame(dataset['validation'])

train, temp = train_test_split(data, test_size=0.4, random_state=0)
test, validation = train_test_split(temp, test_size=0.5, random_state=0)

def process_translations(df):
    en_texts = [item['en'] for item in df['translation']]
    fr_texts = [item['fr'] for item in df['translation']]
    
    return pd.DataFrame({
        'en': en_texts,
        'fr': fr_texts
    })

train_processed = process_translations(train)
test_processed = process_translations(test)
validation_processed = process_translations(validation)

train_dataset = Dataset.from_pandas(train_processed.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_processed.reset_index(drop=True))
validation_dataset = Dataset.from_pandas(validation_processed.reset_index(drop=True))

ds = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset
})

ds

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/475k [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['en', 'fr'],
        num_rows: 600
    })
    validation: Dataset({
        features: ['en', 'fr'],
        num_rows: 600
    })
})

In [4]:
# del dataset, data
# del train, temp, test, validation
# del process_translations, train_processed, test_processed, validation_dataset
# gc.collect()

In [5]:
'''
This cell performs the following actions.
1. load the english and french spacy tokenizer
2. set the special characters.
3. tokenizer the whole dataset and split into train, test and validation
'''

en_nlp = spacy.load('en_core_web_sm')
fr_nlp = spacy.load('fr_core_news_sm')

def tokenize_example(example, en_nlp, fr_nlp, max_length, sos_token, eos_token):
    en_tokens = [token.text.lower() for token in en_nlp.tokenizer(example['en'])][:max_length]
    fr_tokens = [token.text.lower() for token in fr_nlp.tokenizer(example['fr'])][:max_length]

    en_tokens = [sos_token] + en_tokens + [eos_token]
    fr_tokens = [sos_token] + fr_tokens + [eos_token]

    return {'en_tokens': en_tokens, 'fr_tokens': fr_tokens} 

    
max_length = 1000
sos_token = '<sos>'
eos_token = '<eos>'
pad_token = '<pad>'

fn_kwargs = {
    'en_nlp': en_nlp,
    'fr_nlp': fr_nlp,
    'max_length': max_length,
    'sos_token': sos_token,
    'eos_token': eos_token,
}

train_data, test_data, validation_data = (
    ds['train'],
    ds['test'],
    ds['validation'],
)

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)
validation_data = validation_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [6]:
''' 
Create English and French language vocabulary.
Mapping of word to number(integer)
'''

from collections import Counter

def lang_str_int(lang, nlp):
    lang_vocab = []
    special_vocab = ['<unk>', '<pad>', '<sos>', '<eos>'] 

    flattened_list = [token.text.lower() for sentence in lang for token in nlp.tokenizer(sentence)]
    lang_count = Counter(flattened_list)
    lang_words = [string for string, freq in lang_count.items() if freq >= 2]

    lang_vocab = special_vocab + lang_words
    # lang_vocab.extend(special_vocab)
    # lang_vocab.extend(lang_words)

    lang_str2int = {ch: i for i, ch in enumerate(lang_vocab)}
    lang_int2str = {i: ch for i, ch in enumerate(lang_vocab)}

    return lang_str2int, lang_int2str

en = process_translations(data)['en'].tolist()
fr = process_translations(data)['fr'].tolist()

fr_str2int, fr_int2str = lang_str_int(fr, fr_nlp)
en_str2int, en_int2str = lang_str_int(en, en_nlp)

In [7]:
''' 
create a new feature of tokens(words) to numbers(integers).
'''

import torch
import numpy as np
import torch.nn as nn

def token_to_int(example, str2int):
    return [str2int.get(token, str2int['<unk>']) for token in example]

def tokens_to_ids(example):
    example['en_ids'] = token_to_int(example['en_tokens'], en_str2int)
    example['fr_ids'] = token_to_int(example['fr_tokens'], fr_str2int)
    return example

train_data = train_data.map(tokens_to_ids)
test_data = test_data.map(tokens_to_ids)
validation_data = validation_data.map(tokens_to_ids)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [8]:
'''
Reverse the source language, eg source_lang: 'hello' -> 'olleh'
This is said to give a significant boost in the model accuracy.
example: 
    source_lang: a, b, c
    target_lang: α, β, γ

    reversing: c, b, a -> α, β, γ
    Why? because it makes it faster to establish a communication.
'''

def reverse_source_lang(example):
    '''reverse list'''
    example['en_ids'] = example['en_ids'][::-1]
    return example

train_data = train_data.map(reverse_source_lang)
test_data = test_data.map(reverse_source_lang)
validation_data = validation_data.map(reverse_source_lang)

train_data.set_format(
    type='torch',
    columns=['en_ids', 'fr_ids'],
    output_all_columns=False
)
test_data.set_format(
    type='torch',
    columns=['en_ids', 'fr_ids'],
    output_all_columns=False
)
validation_data.set_format(
    type='torch',
    columns=['en_ids', 'fr_ids'],
    output_all_columns=False
)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [9]:
# data batching
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example['en_ids'] for example in batch]
        batch_fr_ids = [example['fr_ids'] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_fr_ids = nn.utils.rnn.pad_sequence(batch_fr_ids, padding_value=pad_index)
        batch = {
            'en_ids': batch_en_ids,
            'fr_ids': batch_fr_ids
        }
        return batch
    return collate_fn


def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )
    return data_loader


# data loader
batch_size = 128
PAD_INDEX = en_str2int[pad_token]

train_data_loader = get_data_loader(train_data, batch_size, PAD_INDEX, shuffle=True)
test_data_loader = get_data_loader(test_data, batch_size, PAD_INDEX, shuffle=False)
validation_data_loader = get_data_loader(validation_data, batch_size, PAD_INDEX, shuffle=False)

result = next(iter(train_data_loader))
result['en_ids'].shape, result['fr_ids'].shape

(torch.Size([72, 128]), torch.Size([82, 128]))

## Model architecture

In [10]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
     
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input_ = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input_))
        output, (hidden, cell) = self.rnn(embedded, (hidden,cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, trg_len):
        ''' 
        src: [src_len, batch_size]
        trg: [trg_len, batch_size]
        trg_len: length o
        '''
        batch_size = src.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)

        input_ = trg[0, :]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input_, hidden, cell)
            outputs[t] = output            
            top1 = output.argmax(1)
            input_ = top1
        
        return outputs

In [11]:
'''
model hyperparameters
'''
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

input_dim = len(en_str2int)
output_dim = len(fr_str2int)

encoder_embedding_dim = 256
decoder_embedding_dim = 256

hidden_dim = 512
n_layers = 2

encoder_dropout = 0.5
decoder_dropout = 0.5
CLIP = 1

encoder = Encoder(
    input_dim, 
    encoder_embedding_dim, 
    hidden_dim, 
    n_layers,
    encoder_dropout
).to(device)

decoder = Decoder(
    output_dim, 
    decoder_embedding_dim, 
    hidden_dim, 
    n_layers,
    decoder_dropout
).to(device)


''' 
The paper suggest that initialzing the weight with a uniform distribution of -0.08 to 0.08
'''
for param in encoder.parameters():
    torch.nn.init.uniform_(param, a=-0.08, b=0.08)

for param in decoder.parameters():
    torch.nn.init.uniform_(param, a=-0.08, b=0.08)
    
model = Seq2Seq(encoder, decoder, device).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.7) 
criterion = nn.CrossEntropyLoss(ignore_index=fr_str2int['<pad>'])

In [12]:
def train(model, dataloader, optimizer, criterion, clip):
    model.train()

    epoch_loss = 0

    for batch in dataloader:
        src = batch['en_ids'].to(device)
        trg = batch['fr_ids'].to(device)

        optimizer.zero_grad()

        output = model(src, trg, trg.shape[0])

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch['en_ids'].to(device)
            trg = batch['fr_ids'].to(device)

            output = model(src, trg, trg.shape[0])            

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [14]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    mins = int(elapsed_time / 60)
    secs = int(elapsed_time - (mins * 60))
    return mins, secs

N_EPOCHS = 7.5
initial_lr = 0.7
half_epoch_counter = 0
CLIP = 1

for epoch in range(int(N_EPOCHS)):
    start_time = time.time()
    
    train_loss = train(model, train_data_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, validation_data_loader, criterion)
    

    if epoch >= 5:
        half_epoch_counter += 1
        
        if half_epoch_counter % 1 == 0:
            new_lr = optimizer.param_groups[0]['lr'] / 2
            for param_group in optimizer.param_groups:
                param_group['lr'] = new_lr

        # elif epoch % 0.5 == 0:
        #     new_lr = optimizer.param_groups[0]['lr'] / 2
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = new_lr

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    
    print(f'Epoch: {epoch+1:02} | Learning Rate: {optimizer.param_groups[0]["lr"]} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')
# torch.save(model.state_dict(), 'seq2seq_model_no_teacher_forcing.pth')

Epoch: 01 | Learning Rate: 0.7 | Time: 0m 6s
	Train Loss: 5.998
	 Val. Loss: 6.012
Epoch: 02 | Learning Rate: 0.7 | Time: 0m 6s
	Train Loss: 5.994
	 Val. Loss: 6.044
Epoch: 03 | Learning Rate: 0.7 | Time: 0m 6s
	Train Loss: 6.025
	 Val. Loss: 5.952
Epoch: 04 | Learning Rate: 0.7 | Time: 0m 6s
	Train Loss: 5.957
	 Val. Loss: 5.965
Epoch: 05 | Learning Rate: 0.7 | Time: 0m 6s
	Train Loss: 5.915
	 Val. Loss: 5.967
Epoch: 06 | Learning Rate: 0.35 | Time: 0m 6s
	Train Loss: 5.918
	 Val. Loss: 5.966
Epoch: 07 | Learning Rate: 0.175 | Time: 0m 6s
	Train Loss: 5.893
	 Val. Loss: 5.900


In [15]:
torch.save(model.state_dict(), 'seq2seq_model_no_teacher_forcing.pth')

In [16]:
def translate_sentence(sentence, encoder, decoder, src_vocab, trg_vocab, device, max_len=50):
    """
    sentence: list of token indices
    src_vocab: mapping from indices to tokens for source language
    trg_vocab: mapping from indices to tokens for target language
    """
    encoder.eval()
    decoder.eval()
    
    # Convert to tensor and add batch dimension
    src_tensor = torch.LongTensor(sentence).unsqueeze(1).to(device)  # [src_len, 1]
    
    with torch.no_grad():
        hidden, cell = encoder(src_tensor)
    
    # Start with <sos> token
    trg_indexes = [fr_str2int['<sos>']]
    
    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        
        with torch.no_grad():
            output, hidden, cell = decoder(trg_tensor, hidden, cell)
        
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        
        if pred_token == fr_str2int['<eos>']:
            break
    
    # Convert indices to tokens
    trg_tokens = [fr_int2str[token] for token in trg_indexes]
    
    return trg_tokens


In [17]:
sample_sentence = [
    en_str2int['<sos>'], 
    en_str2int['i'], 
    en_str2int['am'], 
    en_str2int['a'], 
    en_str2int['student'], 
    en_str2int['<eos>']
]

# Translate
translation = translate_sentence(sample_sentence, encoder, decoder, en_str2int, fr_str2int, device)

print('Translated French Sentence:', ' '.join(translation))

Translated French Sentence: <sos> les <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos>


In [18]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

def calculate_bleu(data, model, device, max_len=50):
    trgs = []
    pred_trgs = []
    
    for datum in data:
        src = datum['en_ids']
        trg = datum['fr_ids']
        
        src_sentence = src.tolist()[0]
        trg_sentence = trg.tolist()[0]
        
        pred_tokens = translate_sentence(src_sentence, model.encoder, model.decoder, en_str2int, fr_str2int, device, max_len)
        pred_tokens = pred_tokens[1:-1]  # Remove <sos> and <eos>
        
        trg_tokens = [int2fr[token] for token in trg_sentence if token not in [fr_str2int['<sos>'], fr_str2int['<eos>'], fr_str2int['<pad>']]]
        
        trgs.append([trg_tokens])
        pred_trgs.append(pred_tokens)
    
    return corpus_bleu(trgs, pred_trgs)


In [19]:
calculate_bleu(validation_data_loader, model, device=device)

0

In [None]:
# Saving
torch.save({
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
}, 'seq2seq_model.pth')

# Loading
checkpoint = torch.load('seq2seq_model.pth')
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])
model = Seq2Seq(encoder, decoder, device).to(device)
model.eval()
