In [117]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
from datasets import Dataset
from tokenizers import BertWordPieceTokenizer

from sklearn.model_selection import train_test_split
from transformers_from_scratch import Transformer

import os
import pandas as pd

In [111]:

file_path = "src/sample_dataset_1.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,French,Bassa
0,Il y avait dans le pays d'Uts un homme dont le...,"Mut wada a bé yééne i loñ Us, jôl jé li bé le ..."
1,Et il lui naquit sept fils et trois filles;,"A bééna bon bôlôm basaambok, ni bon bôda baa."
2,"et il possédait sept mille brebis, et trois mi...","A bééna ki 7 000 di mintômba, 3 000 di kamél, ..."
3,"Et ses fils allaient et faisaient un festin, c...",Hiki man wé nu munlôm a bééna yé ngéda i tégba...
4,"Et il arrivait que, quand les jours de festin ...","I ngéda ba bé ba mal i mangand ma, Hiôb a bé a..."


Method to train an encoder-decoder transformer model from scratch where the encoder processes French text and the decoder processes Bassa text. To do this, it is better to use separate tokenizers for each language. This way, each tokenizer can focus on the specific vocabulary and tokenization rules of its respective language, which can improve the efficiency and accuracy of the model.

### Step 1: Create Separate Tokenizers
We'll create separate tokenizers for French and Bassa.

In [112]:


# Save the French texts to a temporary file
os.makedirs("temp", exist_ok=True)
french_texts_path = "temp/french_texts.txt"
data['French'].to_csv(french_texts_path, index=False, header=False)

# Save the Bassa texts to a temporary file
bassa_texts_path = "temp/bassa_texts.txt"
data['Bassa'].to_csv(bassa_texts_path, index=False, header=False)

# Initialize and train the French tokenizer
french_tokenizer = BertWordPieceTokenizer()
french_tokenizer.train(files=[french_texts_path], vocab_size=30_000, min_frequency=2, special_tokens=[
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
])
os.makedirs("french_tokenizer", exist_ok=True)
french_tokenizer.save_model("french_tokenizer")

# Initialize and train the Bassa tokenizer
bassa_tokenizer = BertWordPieceTokenizer()
bassa_tokenizer.train(files=[bassa_texts_path], vocab_size=30_000, min_frequency=2, special_tokens=[
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
])
os.makedirs("bassa_tokenizer", exist_ok=True)
bassa_tokenizer.save_model("bassa_tokenizer")

# Load the tokenizers using the transformers library
from transformers import BertTokenizerFast

french_tokenizer = BertTokenizerFast.from_pretrained("french_tokenizer")
bassa_tokenizer = BertTokenizerFast.from_pretrained("bassa_tokenizer")









### Step 2: Prepare the Dataset

We will prepare the datasets for training using the respective tokenizers.

In [113]:
# Split the dataset into training and validation sets
train_french, val_french, train_bassa, val_bassa = train_test_split(
    data['French'], data['Bassa'], test_size=0.1, random_state=42
)

In [114]:
# Function to prepare dataset
def tokenize_and_prepare_dataset(src_texts, tgt_texts, src_tokenizer, tgt_tokenizer):
    src_encodings = src_tokenizer(src_texts.tolist(), truncation=True, padding=True, max_length=100, return_tensors="pt")
    tgt_encodings = tgt_tokenizer(tgt_texts.tolist(), truncation=True, padding=True, max_length=100, return_tensors="pt")

    dataset = TensorDataset(src_encodings['input_ids'], tgt_encodings['input_ids'])
    return dataset

In [115]:
# Tokenize and prepare training and validation datasets
train_dataset = tokenize_and_prepare_dataset(train_french, train_bassa, french_tokenizer, bassa_tokenizer)
val_dataset = tokenize_and_prepare_dataset(val_french, val_bassa, french_tokenizer, bassa_tokenizer)

# DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  0%|          | 0/15606 [09:35<?, ?it/s]


### Step 2: Train the Model
Now, we need to set up the training loop to train your custom transformer model using the prepared data.



In [118]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
src_vocab_size = french_tokenizer.vocab_size
trg_vocab_size = bassa_tokenizer.vocab_size
# Define the padding token indices for the source and target tokenizers
src_pad_idx = french_tokenizer.pad_token_id
trg_pad_idx = bassa_tokenizer.pad_token_id
embed_size = 256
num_layers = 6
forward_expansion = 4
heads = 8
dropout = 0.1
max_length = 100
learning_rate = 0.0003

# Initialize the transformer model
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx=src_pad_idx, trg_pad_idx=trg_pad_idx, embed_size=embed_size, num_layers=num_layers, forward_expansion=forward_expansion, heads=heads, dropout=dropout, device=device, max_length=max_length).to(device)


In [119]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 50


In [120]:
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (src, trg) in enumerate(train_loader):
        src, trg = src.to(device), trg.to(device)
        trg_input = trg[:, :-1]
        trg_target = trg[:, 1:]

        optimizer.zero_grad()
        output = model(src, trg_input)
        
        # Reshape output and target to calculate loss
        output = output.reshape(-1, output.shape[2])
        trg_target = trg_target.reshape(-1)
        
        loss = criterion(output, trg_target)
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, trg in val_loader:
            src, trg = src.to(device), trg.to(device)
            trg_input = trg[:, :-1]
            trg_target = trg[:, 1:]

            output = model(src, trg_input)
            output = output.reshape(-1, output.shape[2])
            trg_target = trg_target.reshape(-1)

            loss = criterion(output, trg_target)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    print(f"Validation Loss after Epoch [{epoch+1}/{num_epochs}]: {val_loss:.4f}")

Epoch [1/10], Step [0/651], Loss: 9.0890
Epoch [1/10], Step [100/651], Loss: 5.5956
Epoch [1/10], Step [200/651], Loss: 5.6855
Epoch [1/10], Step [300/651], Loss: 5.6484
Epoch [1/10], Step [400/651], Loss: 5.7071
Epoch [1/10], Step [500/651], Loss: 5.0472
Epoch [1/10], Step [600/651], Loss: 4.9304
Validation Loss after Epoch [1/10]: 4.8212
Epoch [2/10], Step [0/651], Loss: 4.8757
Epoch [2/10], Step [100/651], Loss: 4.8275
Epoch [2/10], Step [200/651], Loss: 4.7205
Epoch [2/10], Step [300/651], Loss: 4.4361
Epoch [2/10], Step [400/651], Loss: 4.5861
Epoch [2/10], Step [500/651], Loss: 4.5293
Epoch [2/10], Step [600/651], Loss: 4.4801
Validation Loss after Epoch [2/10]: 4.3943
Epoch [3/10], Step [0/651], Loss: 4.5319
Epoch [3/10], Step [100/651], Loss: 4.4030
Epoch [3/10], Step [200/651], Loss: 4.4922
Epoch [3/10], Step [300/651], Loss: 4.3558
Epoch [3/10], Step [400/651], Loss: 4.2479
Epoch [3/10], Step [500/651], Loss: 4.4492
Epoch [3/10], Step [600/651], Loss: 4.1903
Validation Loss a

In [122]:
bassa_tokenizer.pad_token_id

0

In [123]:
src_pad_idx = french_tokenizer.pad_token_id
trg_pad_idx = bassa_tokenizer.pad_token_id

In [124]:
# Save the model
model_save_path = "transformer_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Load the model
loaded_model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0.1, device=device, max_length=100).to(device)
loaded_model.load_state_dict(torch.load(model_save_path))
print(f"Model loaded from {model_save_path}")

Model saved to transformer_model.pth
Model loaded from transformer_model.pth


In [127]:
def translate_sentence(model, sentence, src_tokenizer, tgt_tokenizer, device, max_length=50):
    src_tokens = src_tokenizer(sentence, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    src_tokens = src_tokens['input_ids'].to(device)
    
    model.eval()
    with torch.no_grad():
        # Generate the source mask
        src_mask = model.make_src_mask(src_tokens)
        
        # Encode the source tokens
        enc_src = model.encoder(src_tokens, src_mask)
        
        # Prepare the initial target input token ([CLS] token)
        tgt_tokens = torch.tensor([[tgt_tokenizer.cls_token_id]], dtype=torch.long).to(device)
        
        for _ in range(max_length):
            # Generate the target mask
            trg_mask = model.make_trg_mask(tgt_tokens).to(device)
            
            # Decode the current target tokens
            output = model.decoder(tgt_tokens, enc_src, src_mask, trg_mask)
            
            # Get the last token's logits and apply softmax to get probabilities
            preds = output[:, -1, :].softmax(dim=-1)
            
            # Get the token ID with the highest probability
            next_token = preds.argmax(1).unsqueeze(0)
            
            # Concatenate the predicted token to the target tokens
            tgt_tokens = torch.cat((tgt_tokens, next_token), dim=1)
            
            # Stop if the end token is generated
            if next_token.item() == tgt_tokenizer.sep_token_id:
                break
        
    # Decode the token IDs to get the translated sentence
    translated_sentence = tgt_tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)
    return translated_sentence




In [131]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming the model and tokenizers are already initialized and trained
# sentence = "Your input sentence in French"
#sentence = "Bonjour tout le monde"
sentence = "Et l'Éternel dit à Satan"
translated_sentence = translate_sentence(loaded_model, sentence, french_tokenizer, bassa_tokenizer, device)
print(f"Translated Sentence: {translated_sentence}")

Translated Sentence: yehova a podos ki me, a kal nye le :


In [140]:
sentence = "l'Éternel"
translated_sentence = translate_sentence(loaded_model, sentence, french_tokenizer, bassa_tokenizer, device)
print(f"Translated Sentence: {translated_sentence}")

Translated Sentence: yehova a bi podos mosi, a kal nye le :


### Step-by-Step Explanation

1. **Tokenizing the Dataset**:
   - **Splitting the Dataset**:
     - We use `train_test_split` to split the dataset into training and validation sets.
     - This helps in evaluating the model on unseen data to understand its performance better.
   - **Tokenizing the Text**:
     - We use the `french_tokenizer` and `bassa_tokenizer` to convert the French and Bassa texts into token IDs.
     - Token IDs are necessary for the model to process the text.

2. **Creating the Custom Transformer Model**:
   - **Model Components**:
     - `SelfAttention`: Computes attention scores and applies them to the values.
     - `TransformerBlock`: Consists of self-attention and feed-forward network, with layer normalization and dropout.
     - `Encoder`: Stacks multiple `TransformerBlock`s and adds positional encoding to the input embeddings.
     - `DecoderBlock`: Similar to `TransformerBlock` but includes an additional attention layer for encoder-decoder attention.
     - `Decoder`: Stacks multiple `DecoderBlock`s and adds positional encoding to the input embeddings, followed by a linear layer to produce output token logits.
     - `Transformer`: Combines `Encoder` and `Decoder`, applying masks to the source and target sequences during forward passes.

3. **Training the Model**:
   - **Preparing the DataLoaders**:
     - `DataLoader` is used to batch the data and shuffle it during training.
     - This helps in efficient data loading and training stability.
   - **Training Loop**:
     - For each epoch, we iterate over batches of the training data.
     - For each batch, we perform a forward pass, compute the loss, perform backpropagation, and update the model weights.
     - We evaluate the model on the validation set at the end of each epoch to track its performance.

