In [None]:
import torch
import torch.nn as nn
import math


class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding dimension is split into ``heads`` chunks of ``heads_dim``
    features each. One bias-free ``heads_dim -> heads_dim`` linear map per
    role (values, keys, queries) is applied to every chunk, so those
    projection weights are shared by all heads. The per-head results are
    concatenated and passed through ``fc_out``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.heads_dim = embed_size // heads

        assert heads * self.heads_dim == embed_size, \
            "Embedding size needs to be divisible by heads"

        # Layers are created in a fixed order (values, keys, queries, fc_out).
        self.values = nn.Linear(self.heads_dim, self.heads_dim, bias=False)
        self.keys = nn.Linear(self.heads_dim, self.heads_dim, bias=False)
        self.queries = nn.Linear(self.heads_dim, self.heads_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.heads_dim, embed_size)

    def forward(self, values, Keys, query, mask):
        """Attend from ``query`` over ``Keys``/``values``.

        Inputs are reshaped to ``(N, len, heads, heads_dim)``. ``mask`` may be
        ``None``; otherwise positions where ``mask == 0`` are excluded. It
        must broadcast against the ``(N, heads, query_len, key_len)`` score
        tensor. Returns a tensor of shape ``(N, query_len, embed_size)``.
        """
        batch = query.shape[0]
        value_len = values.shape[1]
        key_len = Keys.shape[1]
        query_len = query.shape[1]

        # Split the embedding into heads, then project each head chunk.
        per_head = (self.heads, self.heads_dim)
        v = self.values(values.reshape(batch, value_len, *per_head))
        k = self.keys(Keys.reshape(batch, key_len, *per_head))
        q = self.queries(query.reshape(batch, query_len, *per_head))

        # (N, query_len, heads, d) x (N, key_len, heads, d) -> (N, heads, query_len, key_len)
        scores = torch.einsum("nqhd,nkhd->nhqk", q, k)

        if mask is not None:
            # A huge negative score becomes zero weight after the softmax.
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        scale = self.heads_dim ** (1 / 2)
        weights = torch.softmax(scores / scale, dim=3)

        # (N, heads, query_len, key_len) x (N, value_len, heads, d) -> (N, query_len, heads, d)
        context = torch.einsum("nhql,nlhd->nqhd", weights, v)

        # Flatten the (heads, d) axes back into one embedding axis.
        merged = context.reshape(batch, query_len, self.heads * self.heads_dim)
        return self.fc_out(merged)


class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer's output is added to its input, layer-normalised and then
    passed through dropout, in that order.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        # The feed-forward layer widens to forward_expansion * embed_size and back.
        hidden_size = forward_expansion * embed_size
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run attention with a residual on ``query``, then the feed-forward stage."""
        attended = self.attention(value, key, query, mask)

        # First residual connection is taken around the query input.
        hidden = self.dropout(self.norm1(attended + query))

        # Second residual connection is taken around the feed-forward network.
        expanded = self.feed_forward(hidden)
        return self.dropout(self.norm2(expanded + hidden))


class Encoder(nn.Module):
    """Stack of ``TransformerBlock`` layers over token + learned position embeddings.

    Args:
        src_vocab_size: number of rows in the token embedding table.
        embed_size: embedding width used throughout the stack.
        num_layers: number of ``TransformerBlock`` layers.
        heads: attention heads per block.
        device: stored on ``self.device`` for callers that read it; the
            forward pass follows the device of its input tensor instead.
        forward_expansion: feed-forward widening factor inside each block.
        dropout: dropout probability.
        max_length: number of rows in the position embedding table, i.e. the
            longest sequence that can be encoded.
    """

    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length
    ):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of shape ``(N, seq_length)``.
            mask: attention mask handed unchanged to every layer (may be ``None``).

        Returns:
            Tensor of shape ``(N, seq_length, embed_size)``.

        Raises:
            IndexError: if ``seq_length`` exceeds the position table size.
        """
        N, seq_length = x.shape

        # Fail early with a readable message instead of an opaque embedding
        # lookup failure when the sequence is longer than the position table.
        max_length = self.position_embedding.num_embeddings
        if seq_length > max_length:
            raise IndexError(
                f"sequence length {seq_length} exceeds max_length {max_length}"
            )

        # Build positions directly on the input's device: x must already sit
        # with the embedding weights, whereas the device string stored at
        # construction time may not match where the module lives now.
        positions = torch.arange(seq_length, device=x.device).expand(N, seq_length)
        out = self.dropout(
            self.word_embedding(x) + self.position_embedding(positions)
        )

        # Self-attention: the running output is value, key and query at once.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out


class DecoderBlock(nn.Module):
    """Decoder layer: masked self-attention, then a ``TransformerBlock``.

    The self-attention result (residual, norm, dropout) becomes the query of
    the inner ``TransformerBlock``, which attends over the supplied
    ``value``/``key`` tensors.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(
            embed_size,
            heads,
            dropout=dropout,
            forward_expansion=forward_expansion,
        )
        self.dropout = nn.Dropout(dropout)
        self.device = device

    def forward(self, x, value, key, src_mask, trg_mask):
        """Apply self-attention over ``x`` under ``trg_mask``, then attend to ``value``/``key`` under ``src_mask``."""
        self_attended = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(self_attended + x))
        return self.transformer_block(value, key, query, src_mask)


class Decoder(nn.Module):
    """Stack of ``DecoderBlock`` layers followed by a projection to vocabulary logits.

    Args:
        trg_vocab_size: number of rows in the token embedding table and width
            of the output logits.
        embed_size: embedding width used throughout the stack.
        num_layers: number of ``DecoderBlock`` layers.
        heads: attention heads per block.
        forward_expansion: feed-forward widening factor inside each block.
        dropout: dropout probability.
        device: stored on ``self.device`` and passed to each block; the
            forward pass follows the device of its input tensor instead.
        max_length: number of rows in the position embedding table, i.e. the
            longest sequence that can be decoded.
    """

    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads,
                             forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )

        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Decode target token ids against the encoder output.

        Args:
            x: integer tensor of shape ``(N, seq_length)``.
            enc_out: encoder output, used as both value and key in every layer.
            src_mask: mask passed to each layer for attention over ``enc_out``.
            trg_mask: mask passed to each layer for self-attention over ``x``.

        Returns:
            Logits of shape ``(N, seq_length, trg_vocab_size)``.

        Raises:
            IndexError: if ``seq_length`` exceeds the position table size.
        """
        N, seq_length = x.shape

        # Fail early with a readable message instead of an opaque embedding
        # lookup failure when the sequence is longer than the position table.
        max_length = self.position_embedding.num_embeddings
        if seq_length > max_length:
            raise IndexError(
                f"sequence length {seq_length} exceeds max_length {max_length}"
            )

        # Build positions on the input's device rather than the device string
        # stored at construction time, which may not match the module's
        # current placement.
        positions = torch.arange(seq_length, device=x.device).expand(N, seq_length)
        x = self.dropout(
            self.word_embedding(x) + self.position_embedding(positions)
        )

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out


class Transformer(nn.Module):
    """Encoder-decoder model that builds its own padding and causal masks.

    Args:
        src_vocab_size: source vocabulary size (encoder embedding rows).
        trg_vocab_size: target vocabulary size (decoder embedding rows and
            output logits).
        src_pad_idx: source token id treated as padding in the source mask.
        trg_pad_idx: target padding id; stored but not used by the masks here.
        embed_size: embedding width.
        num_layers: number of layers in both encoder and decoder.
        forward_expansion: feed-forward widening factor.
        heads: number of attention heads.
        dropout: dropout probability.
        device: stored and forwarded to the sub-modules; the masks follow the
            device of the input tensors.
        max_length: size of the position embedding tables.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=256,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cuda",
        max_length=100
    ):
        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a boolean mask of shape ``(N, 1, 1, src_len)``; ``True`` marks non-padding tokens."""
        # The comparison result already lives on src's device, so no transfer
        # to a separately stored device is needed.
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a lower-triangular mask of shape ``(N, 1, trg_len, trg_len)``.

        Entry ``[.., i, j]`` is 1 when ``j <= i``, so position ``i`` can only
        attend to itself and earlier positions.
        """
        N, trg_len = trg.shape
        # Allocate on trg's device directly instead of building on CPU and
        # copying to a stored device.
        causal = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        trg_mask = causal.expand(N, 1, trg_len, trg_len)
        return trg_mask

    def forward(self, src, trg):
        """Encode ``src``, decode ``trg`` against it and return the target-vocabulary logits."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out



In [None]:
!pip install datasets transformers sentencepiece



In [None]:
from datasets import load_dataset

# Load the WMT14 French-English corpus; the configuration name is "fr-en".
# Printing the returned DatasetDict lists the available splits and row counts.
dataset = load_dataset("wmt14", "fr-en")
print(dataset)


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/30 [00:00<?, ?files/s]

fr-en/train-00000-of-00030.parquet:   0%|          | 0.00/252M [00:00<?, ?B/s]

fr-en/train-00001-of-00030.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

fr-en/train-00002-of-00030.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

fr-en/train-00003-of-00030.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

fr-en/train-00004-of-00030.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

fr-en/train-00005-of-00030.parquet:   0%|          | 0.00/238M [00:00<?, ?B/s]

fr-en/train-00006-of-00030.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

fr-en/train-00007-of-00030.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

fr-en/train-00008-of-00030.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

fr-en/train-00009-of-00030.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

fr-en/train-00010-of-00030.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

fr-en/train-00011-of-00030.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

fr-en/train-00012-of-00030.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

fr-en/train-00013-of-00030.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

fr-en/train-00014-of-00030.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

fr-en/train-00015-of-00030.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

fr-en/train-00016-of-00030.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

fr-en/train-00017-of-00030.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

fr-en/train-00018-of-00030.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

fr-en/train-00019-of-00030.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

fr-en/train-00020-of-00030.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

fr-en/train-00021-of-00030.parquet:   0%|          | 0.00/264M [00:00<?, ?B/s]

fr-en/train-00022-of-00030.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

fr-en/train-00023-of-00030.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

fr-en/train-00024-of-00030.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

fr-en/train-00025-of-00030.parquet:   0%|          | 0.00/278M [00:00<?, ?B/s]

fr-en/train-00026-of-00030.parquet:   0%|          | 0.00/365M [00:00<?, ?B/s]

fr-en/train-00027-of-00030.parquet:   0%|          | 0.00/322M [00:00<?, ?B/s]

fr-en/train-00028-of-00030.parquet:   0%|          | 0.00/370M [00:00<?, ?B/s]

fr-en/train-00029-of-00030.parquet:   0%|          | 0.00/311M [00:00<?, ?B/s]

fr-en/validation-00000-of-00001.parquet:   0%|          | 0.00/475k [00:00<?, ?B/s]

fr-en/test-00000-of-00001.parquet:   0%|          | 0.00/536k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40836715 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 40836715
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


In [None]:
!pip install datasets tokenizers torch tqdm



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

In [None]:
# Number of leading training pairs kept for this run.
TRAIN_SUBSET_SIZE = 50000

small_train = dataset["train"].select(range(TRAIN_SUBSET_SIZE))
valid = dataset["validation"]

In [None]:
from tokenizers import ByteLevelBPETokenizer
import os

# Gather the English and French side of every pair (English first) so a
# single shared BPE vocabulary is learned over both languages.
texts = [
    ex["translation"][lang]
    for ex in small_train
    for lang in ("en", "fr")
]

# Train a byte-level BPE tokenizer with the three special tokens reserved.
special_tokens = ["[PAD]", "[BOS]", "[EOS]"]
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    texts,
    vocab_size=32000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Write the vocab and merges files into ./tokenizer.
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")

# Write the single-file JSON form that the HuggingFace fast tokenizer loads.
tokenizer_json_path = "tokenizer/tokenizer.json"
with open(tokenizer_json_path, "w", encoding="utf-8") as f:
    f.write(tokenizer.to_str())

# Reload through the HuggingFace wrapper and register the special tokens.
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_json_path)
hf_tokenizer.pad_token = "[PAD]"
hf_tokenizer.bos_token = "[BOS]"
hf_tokenizer.eos_token = "[EOS]"

# Source and target share one tokenizer, so padding id and vocab size match.
SRC_PAD_IDX = TRG_PAD_IDX = hf_tokenizer.pad_token_id
src_vocab_size = trg_vocab_size = len(hf_tokenizer)

print("Vocab size:", src_vocab_size)
print("SRC_PAD_IDX:", SRC_PAD_IDX)
print("TRG_PAD_IDX:", TRG_PAD_IDX)


Vocab size: 32000
SRC_PAD_IDX: 0
TRG_PAD_IDX: 0


In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class TranslationDataset(Dataset):
    """Map-style dataset yielding ``(source_ids, target_ids)`` tensor pairs.

    Each item of ``data`` must be indexable as ``item["translation"][lang]``.
    Every text is encoded without special tokens, truncated, and wrapped in
    BOS/EOS so that no sequence is longer than ``max_len``.

    Args:
        data: indexable collection of translation records.
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``,
            ``bos_token_id`` and ``eos_token_id``.
        max_len: maximum sequence length including BOS and EOS.
        src_lang: key of the source text inside ``"translation"``.
        trg_lang: key of the target text inside ``"translation"``.

    Raises:
        ValueError: if ``max_len`` is below 2, since BOS and EOS alone need
            two slots and a negative slice bound would break the length cap.
    """

    def __init__(self, data, tokenizer, max_len=50, src_lang="en", trg_lang="fr"):
        if max_len < 2:
            raise ValueError("max_len must be at least 2 to fit the BOS and EOS tokens")
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.src_lang = src_lang
        self.trg_lang = trg_lang

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Return ``[BOS] + truncated token ids + [EOS]`` as a long tensor."""
        body = self.tokenizer.encode(text, add_special_tokens=False)
        body = body[: self.max_len - 2]  # leave room for BOS and EOS
        ids = [self.tokenizer.bos_token_id, *body, self.tokenizer.eos_token_id]
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        pair = self.data[idx]["translation"]
        return self._encode(pair[self.src_lang]), self._encode(pair[self.trg_lang])



# Collate function to pad sequences in batch
def collate_fn(batch):
    """Pad a list of ``(src, trg)`` tensor pairs into two batch-first tensors.

    Source sequences are padded with ``SRC_PAD_IDX`` and target sequences
    with ``TRG_PAD_IDX``, each up to the longest sequence in the batch.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    src_seqs, trg_seqs = zip(*batch)
    padded_src = pad(src_seqs, batch_first=True, padding_value=SRC_PAD_IDX)
    padded_trg = pad(trg_seqs, batch_first=True, padding_value=TRG_PAD_IDX)
    return padded_src, padded_trg

# Wrap the training subset and batch it, padding per batch via collate_fn.
train_dataset = TranslationDataset(data=small_train, tokenizer=hf_tokenizer)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)


In [None]:
# Sanity check: pull one batch and show the padded tensor shapes.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    src_batch, trg_batch = first_batch
    print(src_batch.shape)  # (batch_size, max_src_len_in_batch)
    print(trg_batch.shape)  # (batch_size, max_trg_len_in_batch)



torch.Size([32, 50])
torch.Size([32, 50])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the model; source and target share the tokenizer's vocabulary and padding id.
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=SRC_PAD_IDX,
    trg_pad_idx=TRG_PAD_IDX,
    embed_size=256,
    num_layers=3,
    heads=8,
    device=device,
    forward_expansion=4,
    dropout=0.1,
    max_length=100
).to(device)

# Padding positions in the labels are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=3e-4)

EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    loop = tqdm(train_loader, leave=True)
    # The description is constant for the whole epoch, so set it once.
    loop.set_description(f"Epoch {epoch+1}")
    total_loss = 0.0

    for src, trg in loop:
        src, trg = src.to(device), trg.to(device)

        # Teacher forcing: the decoder sees tokens [0, T-1) and is trained to
        # predict tokens [1, T), i.e. the same sequence shifted by one.
        trg_input = trg[:, :-1]
        trg_labels = trg[:, 1:]

        # Logits of shape (batch_size, trg_len - 1, vocab_size).
        output = model(src, trg_input)

        # CrossEntropyLoss expects (num_items, num_classes) and (num_items,).
        output = output.reshape(-1, output.shape[-1])
        trg_labels = trg_labels.reshape(-1)

        loss = criterion(output, trg_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call copies the value to the host.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"\nEpoch {epoch+1} finished. Avg Loss: {avg_loss:.4f}\n")


Epoch 1: 100%|██████████| 1563/1563 [01:54<00:00, 13.69it/s, loss=5.03]



Epoch 1 finished. Avg Loss: 5.3435



Epoch 2: 100%|██████████| 1563/1563 [01:55<00:00, 13.49it/s, loss=4.64]



Epoch 2 finished. Avg Loss: 4.4975



Epoch 3: 100%|██████████| 1563/1563 [01:56<00:00, 13.45it/s, loss=4.33]


Epoch 3 finished. Avg Loss: 4.1687






In [None]:
# Persist the trained parameters (the state_dict only).
MODEL_WEIGHTS_PATH = "transformer_en_fr.pth"
torch.save(model.state_dict(), MODEL_WEIGHTS_PATH)

# Store the tokenizer alongside so the same vocabulary can be reloaded later.
TOKENIZER_DIR = "tokenizer_hf"
hf_tokenizer.save_pretrained(TOKENIZER_DIR)

('tokenizer_hf/tokenizer_config.json',
 'tokenizer_hf/special_tokens_map.json',
 'tokenizer_hf/tokenizer.json')

In [None]:
def translate_sentence(model, tokenizer, sentence, max_len=50):
    """Greedily decode a translation of ``sentence``.

    The sentence is encoded as ``[BOS] + ids + [EOS]`` and the decoder is fed
    its own previous predictions, starting from ``[BOS]``. At each step the
    highest-scoring token at the last position is taken. Decoding stops when
    EOS is predicted or after ``max_len`` generated tokens.

    Args:
        model: callable as ``model(src, trg)`` returning logits of shape
            ``(1, trg_len, vocab_size)``. It is switched to eval mode and
            left in that mode.
        tokenizer: object exposing ``encode``, ``decode``, ``bos_token_id``
            and ``eos_token_id``.
        sentence: source text to translate.
        max_len: maximum number of tokens to generate.

    Returns:
        The decoded string of generated tokens, without the leading BOS.
    """
    model.eval()

    # Run on the device that holds the model's parameters instead of relying
    # on a notebook-global `device` variable. Fall back to CPU for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    src_ids = [
        tokenizer.bos_token_id,
        *tokenizer.encode(sentence, add_special_tokens=False),
        tokenizer.eos_token_id,
    ]
    src_tensor = torch.tensor(src_ids, device=run_device).unsqueeze(0)

    trg_indices = [tokenizer.bos_token_id]
    with torch.no_grad():
        for _ in range(max_len):
            trg_tensor = torch.tensor(trg_indices, device=run_device).unsqueeze(0)
            output = model(src_tensor, trg_tensor)
            # Only the prediction at the last position is new.
            next_token = output[0, -1].argmax(-1).item()
            if next_token == tokenizer.eos_token_id:
                break
            trg_indices.append(next_token)

    return tokenizer.decode(trg_indices[1:])  # drop the leading BOS token

# Example: greedily translate one short English sentence with the trained model.
example_sentence = "I hate you"
print(translate_sentence(model, hf_tokenizer, example_sentence))


Je vous remercie de vous vous avez parlé.


In [1]:
# Set the global git author email used for commits made from this environment.
# NOTE(review): a personal address is stored in the notebook; confirm that is intended.
!git config --global user.email "devvrattiwari2005@gmail.com"

In [2]:
# Set the global git author name used for commits made from this environment.
!git config --global user.name "Devvrat12"

In [3]:
# Clone the project repository into the current working directory.
!git clone https://github.com/Devvrat12/Transformer-Translating-English-to-French-.git

Cloning into 'Transformer-Translating-English-to-French-'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [5]:
!cp Translator_transformer(eng-french).ipynb Transformer_translating/

/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `cp Translator_transformer(eng-french).ipynb Transformer_translating/'
