In [None]:
!pip install torch==2.0.0 torchtext==0.15.1



In [None]:
!pip install torchdata==0.6.0



In [None]:
!pip install numpy==1.23.5



In [None]:
import torch

# Pick the compute device once for the whole notebook:
# the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import spacy
import math
import os
import sys

In [None]:
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m102.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m117.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation s

In [None]:
# Tokenization using spaCy
# Load the English and French pipelines installed by the `spacy download`
# cell above. Only their `.tokenizer` component is used in this notebook
# (see tokenize_en / tokenize_fr).
spacy_en = spacy.load("en_core_web_sm")
spacy_fr = spacy.load("fr_core_news_sm")



In [None]:
#Tokenization

def tokenize_en(text):
    """Split English ``text`` into a list of lower-cased token strings.

    Uses the tokenizer of the module-level ``spacy_en`` pipeline.
    """
    lowered_tokens = []
    for tok in spacy_en.tokenizer(text):
        lowered_tokens.append(tok.text.lower())
    return lowered_tokens

def tokenize_fr(text):
    """Split French ``text`` into a list of lower-cased token strings.

    Uses the tokenizer of the module-level ``spacy_fr`` pipeline.
    """
    lowered_tokens = []
    for tok in spacy_fr.tokenizer(text):
        lowered_tokens.append(tok.text.lower())
    return lowered_tokens


In [None]:
english_file = "english.txt"  # Example: "english.txt" in the Colab's root
french_file = "french.txt"   # Example: "french.txt" in the Colab's root

In [None]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` without line terminators."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().splitlines()


# The two lists are used as a parallel corpus: the dataset pairs
# english_sentences[i] with french_sentences[i].
english_sentences = _read_lines(english_file)
french_sentences = _read_lines(french_file)

In [None]:
# Build vocabulary
def yield_tokens(sentences, tokenizer):
    """Lazily yield ``tokenizer(sentence)`` for every sentence, in order.

    Args:
        sentences: iterable of raw sentence strings.
        tokenizer: callable mapping one sentence to its list of tokens.
    """
    yield from map(tokenizer, sentences)

# Special tokens shared by both vocabularies (unknown, padding, begin/end of sentence).
SPECIAL_TOKENS = ("<unk>", "<pad>", "<bos>", "<eos>")

vocab_en = build_vocab_from_iterator(
    yield_tokens(english_sentences, tokenize_en),
    specials=list(SPECIAL_TOKENS),
)
vocab_fr = build_vocab_from_iterator(
    yield_tokens(french_sentences, tokenize_fr),
    specials=list(SPECIAL_TOKENS),
)

# Tokens that are not in a vocabulary are looked up as <unk>.
vocab_en.set_default_index(vocab_en["<unk>"])
vocab_fr.set_default_index(vocab_fr["<unk>"])

In [None]:

# Convert sentences to tensor
def numericalize(sentence, vocab, tokenizer):
    """Turn ``sentence`` into a list of vocabulary indices framed by <bos>/<eos>.

    Args:
        sentence: raw sentence string.
        vocab: mapping supporting ``vocab[token] -> int``; must contain
            the "<bos>" and "<eos>" entries.
        tokenizer: callable mapping the sentence to its tokens.

    Returns:
        ``[<bos> index, token indices..., <eos> index]`` as a Python list.
    """
    indices = [vocab["<bos>"]]
    indices.extend(vocab[token] for token in tokenizer(sentence))
    indices.append(vocab["<eos>"])
    return indices


In [None]:
# Dataset class
class TranslationDataset(Dataset):
    """Parallel-sentence dataset yielding ``(source_ids, target_ids)`` tensor pairs.

    Item ``i`` pairs ``source_sentences[i]`` with ``target_sentences[i]``; each
    sentence is converted with ``numericalize`` (so it is framed by <bos>/<eos>).

    Args:
        source_sentences: sequence of raw source-language sentences.
        target_sentences: sequence of raw target-language sentences.
        src_vocab / tgt_vocab: vocabularies used to index the tokens.
        src_tokenizer / tgt_tokenizer: callables mapping a sentence to tokens.
    """

    def __init__(self, source_sentences, target_sentences, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer):
        # Local import: `warnings` is not part of the notebook's import cell.
        import warnings

        if len(source_sentences) != len(target_sentences):
            # Only the first min(len) items can be paired. Warn instead of
            # failing later with an IndexError in the middle of an epoch.
            warnings.warn(
                f"source has {len(source_sentences)} sentences but target has "
                f"{len(target_sentences)}; only the first "
                f"{min(len(source_sentences), len(target_sentences))} pairs are used"
            )
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        # Number of indices for which both a source and a target sentence exist.
        return min(len(self.source_sentences), len(self.target_sentences))

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D int64 tensors of token indices."""
        src_ids = numericalize(self.source_sentences[idx], self.src_vocab, self.src_tokenizer)
        tgt_ids = numericalize(self.target_sentences[idx], self.tgt_vocab, self.tgt_tokenizer)
        src_tensor = torch.tensor(src_ids, dtype=torch.long)
        tgt_tensor = torch.tensor(tgt_ids, dtype=torch.long)
        return src_tensor, tgt_tensor

In [None]:

# Collate function for padding
def collate_fn(batch, src_pad_idx=None, tgt_pad_idx=None, target_device=None):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Args:
        batch: list of ``(src_tensor, tgt_tensor)`` pairs of 1-D index tensors.
        src_pad_idx: padding value for the source batch. Defaults to
            ``vocab_en["<pad>"]`` (the original hard-coded behaviour).
        tgt_pad_idx: padding value for the target batch. Defaults to
            ``vocab_fr["<pad>"]`` (the original hard-coded behaviour).
        target_device: device the padded batches are moved to. Defaults to
            the module-level ``device``.

    Returns:
        ``(src_batch, tgt_batch)``, each of shape (batch, longest_sequence).

    Note:
        This function is also used for the French->English loader, where the
        defaults name the "wrong" vocabulary. That presumably still works
        because both vocabularies are built from the same specials list.
        For vocabularies built differently, bind the right indices with
        ``functools.partial(collate_fn, src_pad_idx=..., tgt_pad_idx=...)``.
    """
    if src_pad_idx is None:
        src_pad_idx = vocab_en["<pad>"]
    if tgt_pad_idx is None:
        tgt_pad_idx = vocab_fr["<pad>"]
    if target_device is None:
        target_device = device

    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=src_pad_idx, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_pad_idx, batch_first=True)
    # Batches are moved here so the training loop does not have to.
    return src_batch.to(target_device), tgt_batch.to(target_device)

In [None]:

# DataLoader
BATCH_SIZE = 32

# English -> French pairs: English is the source side, French the target side.
dataset_en_fr = TranslationDataset(
    source_sentences=english_sentences,
    target_sentences=french_sentences,
    src_vocab=vocab_en,
    tgt_vocab=vocab_fr,
    src_tokenizer=tokenize_en,
    tgt_tokenizer=tokenize_fr,
)
dataloader_en_fr = DataLoader(
    dataset_en_fr,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Transformer Model Components
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    A table of shape (1, max_len, d_model) is precomputed once. Even feature
    columns hold sines and odd columns hold cosines of position-dependent angles.

    Args:
        d_model: embedding width (may be odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions precomputed; inputs must not be longer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module through .to()/.cuda()/.half(), so the
        # table is not copied to the GPU on every forward pass.
        # persistent=False keeps it out of state_dict, as before.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])`` for ``x`` of shape (batch, seq_len, d_model)."""
        # .to() is a no-op when the module already lives on x's device.
        x = x + self.pe[:, :x.size(1)].to(x.device)
        return self.dropout(x)


In [None]:
# Token Embedding
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by sqrt(embedding_dim)."""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, tokens):
        """Embed integer ``tokens`` (cast to int64) and apply the sqrt scaling."""
        scale = math.sqrt(self.embedding.embedding_dim)
        vectors = self.embedding(tokens.long())
        return vectors * scale

In [None]:
# ---------------------------
# Modern Transformer with `nn.TransformerEncoder` and `nn.TransformerDecoder`
class TransformerModel(nn.Module):
    """Sequence-to-sequence translation model built on a batch-first ``nn.Transformer``.

    Args:
        src_lang: ``"english"`` or ``"french"``; selects which module-level
            vocabulary (``vocab_en`` / ``vocab_fr``) supplies the source <pad> index.
        tgt_lang: same choice for the target side.
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table and
            width of the output projection.
        d_model, nhead, num_encoder_layers, num_decoder_layers,
        dim_feedforward, dropout: forwarded to ``nn.Transformer``.
    """

    def __init__(self, src_lang, tgt_lang, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.init_weights()

    @property
    def device(self):
        """Device the model's parameters currently live on.

        Derived from the parameters, not frozen at construction time, so it
        stays correct after ``.to(...)`` / ``.cpu()`` / ``.cuda()``.
        """
        return self.fc_out.weight.device

    def init_weights(self):
        """Uniformly initialise both embeddings and the output projection in [-0.1, 0.1]."""
        initrange = 0.1
        self.src_embedding.weight.data.uniform_(-initrange, initrange)
        self.tgt_embedding.weight.data.uniform_(-initrange, initrange)
        self.fc_out.bias.data.zero_()
        self.fc_out.weight.data.uniform_(-initrange, initrange)

    @staticmethod
    def _pad_index(lang, role):
        """Return the <pad> index for ``lang``; raise ValueError for unknown languages.

        ``role`` ("source" / "target") is only used in the error message.
        """
        if lang == "english":
            return vocab_en["<pad>"]
        if lang == "french":
            return vocab_fr["<pad>"]
        # Raise instead of calling sys.exit(): a bad argument should be a
        # normal, catchable error and should not try to terminate the kernel.
        raise ValueError(f"Error: unknown {role} language selected: {lang!r}")

    def generate_square_subsequent_mask(self, sz, device=None):
        """Return a (sz, sz) float causal mask: 0.0 on/below the diagonal, -inf above.

        Args:
            sz: sequence length.
            device: where to create the mask; defaults to the model's device.
        """
        if device is None:
            device = self.device
        return torch.triu(torch.full((sz, sz), float('-inf'), device=device), diagonal=1)

    def create_mask(self, src, tgt):
        """Build the attention and padding masks for a ``(src, tgt)`` batch.

        Returns:
            ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
            ``src_mask`` is an all-False bool matrix (no source masking),
            ``tgt_mask`` is the causal float mask, and the padding masks are
            bool tensors that are True where the token equals <pad>.

        Raises:
            ValueError: if ``src_lang`` / ``tgt_lang`` is not "english" or "french".
        """
        src_seq_len = src.shape[1]
        tgt_seq_len = tgt.shape[1]

        # Masks are created on the inputs' device so they always match the data.
        tgt_mask = self.generate_square_subsequent_mask(tgt_seq_len, device=tgt.device)
        src_mask = torch.zeros((src_seq_len, src_seq_len), dtype=torch.bool, device=src.device)

        src_padding_mask = (src == self._pad_index(self.src_lang, "source"))
        tgt_padding_mask = (tgt == self._pad_index(self.tgt_lang, "target"))
        return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size).

        Args:
            src: (batch, src_len) source token indices.
            tgt: (batch, tgt_len) decoder-input token indices.
        """
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = self.create_mask(src, tgt)
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_emb = self.pos_encoder(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            # Without this the decoder's cross-attention also attends to the
            # encoder outputs at padded source positions.
            memory_key_padding_mask=src_padding_mask,
        )
        return self.fc_out(outs)

    def encode(self, src):
        """Run only the encoder on ``src`` (batch, src_len) and return its memory.

        No key-padding mask is applied, so this is meant for unpadded input
        (e.g. the single sentence passed by ``greedy_decode``).
        """
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        src_mask = torch.zeros((src.shape[1], src.shape[1]), dtype=torch.bool, device=src.device)
        return self.transformer.encoder(src_emb, src_mask)

    def decode(self, tgt, memory):
        """Run only the decoder on ``tgt`` (batch, tgt_len) against ``memory``.

        Applies the causal mask; no padding masks are applied. The result is
        the decoder's hidden states (``fc_out`` is not applied here).
        """
        tgt_emb = self.pos_encoder(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        tgt_mask = self.generate_square_subsequent_mask(tgt.shape[1], device=tgt.device)
        return self.transformer.decoder(tgt_emb, memory, tgt_mask)



In [None]:
# Initialize model
# Hyperparameters of the (small) translation models.
EMB_SIZE = 128            # embedding / model width (d_model)
NHEAD = 8                 # attention heads
FFN_HID_DIM = 256         # feed-forward hidden width
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2

model_en_fr = TransformerModel(
    "english",
    "french",
    len(vocab_en),
    len(vocab_fr),
    d_model=EMB_SIZE,
    nhead=NHEAD,
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    dim_feedforward=FFN_HID_DIM,
    dropout=0.1,
).to(device)


In [None]:

# Training setup
# Padding positions in the French targets do not contribute to the loss.
criterion_en_fr = nn.CrossEntropyLoss(
    ignore_index=vocab_fr["<pad>"],
)
optimizer_en_fr = optim.Adam(
    model_en_fr.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)


In [None]:
# Training function
def train_epoch(model, dataloader, criterion, optimizer):
    """Run one teacher-forced training pass and return the mean batch loss.

    Args:
        model: callable as ``model(src, tgt_input)`` returning logits of
            shape (batch, tgt_len - 1, vocab).
        dataloader: sized iterable of ``(src, tgt)`` batches; batches are
            used as delivered (the collate function already places them on
            the device).
        criterion: loss taking ``(flattened_logits, flattened_targets)``.
        optimizer: optimizer over ``model``'s parameters.
    """
    model.train()
    running_loss = 0.
    for src, tgt in dataloader:
        decoder_input = tgt[:, :-1]  # every token except the last
        expected = tgt[:, 1:]        # every token except the first

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        vocab_dim = logits.shape[-1]
        loss = criterion(logits.reshape(-1, vocab_dim), expected.reshape(-1))
        loss.backward()
        # Clip the global gradient norm to 0.5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(dataloader)


In [None]:
def train(model, dataloader, criterion, optimizer, epochs=3, val_dataloader=None):
    """Train ``model`` for ``epochs`` epochs, printing losses and timing per epoch.

    Args:
        model: model passed through to ``train_epoch`` / ``evaluate``.
        dataloader: training batches.
        criterion: loss function.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over ``dataloader``.
        val_dataloader: batches used for the reported "Val loss". When
            omitted, the training ``dataloader`` is reused (the original
            behaviour). In that case the printed value is a loss on the
            training data measured in eval mode, not a held-out loss.
    """
    # Local import: the notebook only imports `time` in a later cell, so this
    # keeps the function usable regardless of cell execution order.
    import time

    if val_dataloader is None:
        val_dataloader = dataloader

    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train_loss = train_epoch(model, dataloader, criterion, optimizer)
        val_loss = evaluate(model, val_dataloader, criterion)
        print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(time.time() - epoch_start_time):.3f}s")


In [None]:

def evaluate(model, dataloader, criterion):
    """Return the mean teacher-forced batch loss over ``dataloader``.

    The model is switched to eval mode and no gradients are recorded.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for src, tgt in dataloader:
            decoder_input = tgt[:, :-1]  # every token except the last
            expected = tgt[:, 1:]        # every token except the first
            logits = model(src, decoder_input)
            vocab_dim = logits.shape[-1]
            batch_loss = criterion(logits.reshape(-1, vocab_dim), expected.reshape(-1))
            batch_losses.append(batch_loss.item())
    return sum(batch_losses, 0.) / len(dataloader)

In [None]:
import time

# ------------------------------------
# ✅ Improved Greedy Decoding Function
def greedy_decode(model, src_tensor, src_vocab, tgt_vocab, max_len=50):
    """Translate one numericalized sentence token-by-token with greedy search.

    The source is encoded once. The decoder is then re-run on the growing
    target prefix and the highest-scoring next token is appended, until <eos>
    is produced or ``max_len`` tokens have been generated.

    Args:
        model: object exposing ``eval()``, ``encode(src)``,
            ``decode(tgt, memory)`` and ``fc_out(hidden)``.
        src_tensor: (1, src_len) tensor of source token indices.
        src_vocab: unused; kept for interface compatibility.
        tgt_vocab: target vocabulary supporting ``["<bos>"]``, ``["<eos>"]``
            and ``get_itos()``.
        max_len: maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces. The <bos> marker and
        the terminating <eos> marker are not part of the returned string.
    """
    model.eval()

    bos_idx = tgt_vocab["<bos>"]
    eos_idx = tgt_vocab["<eos>"]
    # Fetch the index->token table once instead of once per output token.
    itos = tgt_vocab.get_itos()

    tgt_indices = [bos_idx]
    with torch.no_grad():
        # Encode the source sentence a single time.
        memory = model.encode(src_tensor)

        for _ in range(max_len):
            # Build the current prefix on the same device as the source/memory.
            tgt_tensor = torch.tensor(tgt_indices, device=src_tensor.device).unsqueeze(0)  # (1, seq_len)

            output = model.decode(tgt_tensor, memory)
            logits = model.fc_out(output[:, -1, :])  # logits for the last position only
            next_token = logits.argmax(dim=1).item()

            # <eos> ends generation and is not emitted as a word.
            if next_token == eos_idx:
                break
            tgt_indices.append(next_token)

    # Skip the leading <bos> when converting indices back to words.
    return " ".join(itos[idx] for idx in tgt_indices[1:])

In [None]:
EPOCHS = 3

# Train the English -> French model.
train(
    model_en_fr,
    dataloader_en_fr,
    criterion_en_fr,
    optimizer_en_fr,
    epochs=EPOCHS,
)

# English to French Conversion
def translate_en_to_fr(sentence, model, vocab_en, vocab_fr, tokenize_en):
    """Greedy-translate ``sentence`` with ``model`` and return the decoded string.

    ``vocab_en`` / ``tokenize_en`` numericalize the input sentence and
    ``vocab_fr`` maps the generated indices back to words.
    """
    token_ids = numericalize(sentence, vocab_en, tokenize_en)
    # Add a batch dimension of 1 and place the input on the model's device.
    src_tensor = torch.tensor(token_ids).unsqueeze(0).to(model.device)
    return greedy_decode(model, src_tensor, vocab_en, vocab_fr)

# Smoke test: translate one English sentence to French.
sample_translation_fr = translate_en_to_fr(
    "I am a student.",
    model_en_fr,
    vocab_en,
    vocab_fr,
    tokenize_en,
)
print(sample_translation_fr)



  return torch._transformer_encoder_layer_fwd(
  return torch._native_multi_head_attention(


Epoch: 1, Train loss: 4.133, Val loss: 3.060, Epoch time = 149.261s
Epoch: 2, Train loss: 2.957, Val loss: 2.445, Epoch time = 148.425s
Epoch: 3, Train loss: 2.524, Val loss: 2.118, Epoch time = 149.640s
je suis un étudiant . <eos>


In [None]:
# Release cached GPU memory before building the reverse (French -> English) model.
# NOTE(review): model_en_fr is still referenced at this point, so its own
# tensors presumably stay allocated — confirm whether it should be deleted first.
torch.cuda.empty_cache()

model_fr_en = TransformerModel(
    "french",
    "english",
    len(vocab_fr),
    len(vocab_en),
    d_model=EMB_SIZE,
    nhead=NHEAD,
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    dim_feedforward=FFN_HID_DIM,
    dropout=0.1,
).to(device)

# Padding positions in the English targets do not contribute to the loss.
criterion_fr_en = nn.CrossEntropyLoss(
    ignore_index=vocab_en["<pad>"],
)
optimizer_fr_en = optim.Adam(
    model_fr_en.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

# DataLoader: French is now the source side, English the target side.
BATCH_SIZE = 32
dataset_fr_en = TranslationDataset(
    french_sentences,
    english_sentences,
    vocab_fr,
    vocab_en,
    tokenize_fr,
    tokenize_en,
)
dataloader_fr_en = DataLoader(
    dataset_fr_en,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

train(
    model_fr_en,
    dataloader_fr_en,
    criterion_fr_en,
    optimizer_fr_en,
    epochs=EPOCHS,
)

# French to English Conversion
def translate_fr_to_en(sentence, model, vocab_en, vocab_fr, tokenize_en):
    """Greedy-translate a French ``sentence`` to English with ``model``.

    ``vocab_fr`` numericalizes the input and ``vocab_en`` maps the generated
    indices back to words.

    NOTE(review): the ``tokenize_en`` argument is not used. The sentence is
    always tokenized with the module-level ``tokenize_fr``. The parameter
    looks like a copy-paste leftover; confirm before renaming or wiring it in.
    """
    token_ids = numericalize(sentence, vocab_fr, tokenize_fr)
    # Add a batch dimension of 1 and place the input on the model's device.
    src_tensor = torch.tensor(token_ids).unsqueeze(0).to(model.device)
    return greedy_decode(model, src_tensor, vocab_fr, vocab_en)

# Testing the model --> French to English
# Use the French->English helper. Previously the English->French helper was
# called with the vocabularies swapped, which left translate_fr_to_en unused.
# Arguments follow translate_fr_to_en's declared order (vocab_en, vocab_fr, tokenizer).
print(translate_fr_to_en("je suis étudiant .", model_fr_en, vocab_en, vocab_fr, tokenize_fr))

Epoch: 1, Train loss: 3.773, Val loss: 2.788, Epoch time = 149.254s
Epoch: 2, Train loss: 2.681, Val loss: 2.176, Epoch time = 148.277s
Epoch: 3, Train loss: 2.244, Val loss: 1.847, Epoch time = 147.851s
i 'm a student . <eos>
