# Language Translation using Transformer

## Import the Requriments

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast

from datasets import load_dataset

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

from pathlib import Path
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import random
import gc

## Define the Configuration for the parms

In [2]:
n_dim = 128
n_heads = 4
attn_dropout = 0.1
mlp_dropout = 0.1,
depth = 8
max_len = 128

## Load the dataset

In [3]:
dataset1 = load_dataset('opus100', 'en-hi')
dataset2 = load_dataset('opus100', 'en-mr')

README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/65.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/247k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/534319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/128k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/27007 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
dataset1, dataset2

(DatasetDict({
     test: Dataset({
         features: ['translation'],
         num_rows: 2000
     })
     train: Dataset({
         features: ['translation'],
         num_rows: 534319
     })
     validation: Dataset({
         features: ['translation'],
         num_rows: 2000
     })
 }),
 DatasetDict({
     test: Dataset({
         features: ['translation'],
         num_rows: 2000
     })
     train: Dataset({
         features: ['translation'],
         num_rows: 27007
     })
     validation: Dataset({
         features: ['translation'],
         num_rows: 2000
     })
 }))

In [5]:
df1_train = pd.DataFrame(dataset1['train']['translation'], columns=['en', 'hi']).rename(columns={'en': 'lang1', 'hi': 'lang2'})
df1_val = pd.DataFrame(dataset1['test']['translation'], columns=['en', 'hi']).rename(columns={'en': 'lang1', 'hi': 'lang2'})
df1_test = pd.DataFrame(dataset1['validation']['translation'], columns=['en', 'hi']).rename(columns={'en': 'lang1', 'hi': 'lang2'})

df1_train['lang2_id'] = 'hi'
df1_test['lang2_id'] = 'hi'
df1_val['lang2_id'] = 'hi'

df2_train = pd.DataFrame(dataset2['train']['translation'], columns=['en', 'mr']).rename(columns={'en': 'lang1', 'mr': 'lang2'})
df2_val = pd.DataFrame(dataset2['test']['translation'], columns=['en', 'mr']).rename(columns={'en': 'lang1', 'mr': 'lang2'})
df2_test = pd.DataFrame(dataset2['validation']['translation'], columns=['en', 'mr']).rename(columns={'en': 'lang1', 'mr': 'lang2'})

df2_train['lang2_id'] = 'mr'
df2_test['lang2_id'] = 'mr'
df2_val['lang2_id'] = 'mr'

train_df = pd.concat([df1_train, df2_train]).reset_index(drop=True)
test_df = pd.concat([df1_test, df2_test]).reset_index(drop=True)
val_df = pd.concat([df1_val, df2_val]).reset_index(drop=True)

In [6]:
test_df

Unnamed: 0,lang1,lang2,lang2_id
0,"No, no, not so fast.",तुम इतनी आसानी से छूट नहीं सकते.,hi
1,", eject!",", बेदखल!",hi
2,I'm Dr. Messa.,Messa हूँ.,hi
3,So we notify the cops about big ticket sales a...,तोहमबड़ीटिकटोंकीबिक्रीकेबारे मेंपुलिस सूचित......,hi
4,"receiving what their Lord has given them, for ...","जो कुछ उनके रब ने उन्हें दिया, वे उसे ले रहे ह...",hi
...,...,...,...
3995,"He did it not once, but twice.",त्याने एकदाच नाही तर दोनदा केलं.,mr
3996,Tom turned the tap on.,टॉमने नळ चालू केला.,mr
3997,Are you listening to the radio?,रेडियो ऐकतोयस का?,mr
3998,Do you know what they're called?,त्यांना काय म्हणतात तुला माहीत आहे का?,mr


In [7]:
len(train_df), len(test_df), len(val_df)

(561326, 4000, 4000)

In [8]:
full_df = pd.concat([train_df, val_df])
lang1, lang2 = list(full_df['lang1']), list(full_df['lang2'])
full = lang1+lang2
random.shuffle(full)

## Tokenize the data

### Train the tokenizer

In [9]:
bert_tokenizer = Tokenizer(WordPiece(unk_token='<unk>'))
bert_tokenizer.normalizer = normalizers.Sequence([Lowercase()])
bert_tokenizer.pre_tokenizer = Whitespace()

trainer = WordPieceTrainer(special_tokens=['<unk>', '<pad>', '<s-en>', '<s-hi>', '<s-mr>', '</s>'])

bert_tokenizer.train_from_iterator(full, trainer)
bert_tokenizer.enable_padding(
    pad_id=bert_tokenizer.token_to_id("<pad>"),
    length=max_len,
    pad_token='<pad>'
)
bert_tokenizer.enable_truncation(max_len)

base = Path('translator/tokenizer')
base.mkdir(exist_ok=True, parents=True)
bert_tokenizer.save(str(base/'en_hi_mr.json'))

### Tokenize the dataset

In [10]:
x = bert_tokenizer.encode(f"<s-hi>{lang2[1234]}</s>")
for a, b in zip(x.ids, x.tokens):
    if b != "<pad>":
        print(f"{a} -> {b}")

print(f"\n {bert_tokenizer.decode(x.ids)} \n\n")

x = bert_tokenizer.encode(f"<s-mr>{lang2[-1234]}</s>")
for a, b in zip(x.ids, x.tokens):
    if b != "<pad>":
        print(a, "->", b)

print(f"\n {bert_tokenizer.decode(x.ids)} \n\n")

3 -> <s-hi>
3747 -> खाली
1654 -> किए
2039 -> जाने
762 -> के
4852 -> दौरान
2378 -> त्रुटि
5 -> </s>

 खाली किए जाने के दौरान त्रुटि 


4 -> <s-mr>
3647 -> ती
14477 -> जुळ
422 -> ##ी
1819 -> आहे
19 -> .
5 -> </s>

 ती जुळ ##ी आहे . 




In [11]:
print('pad', "->" ,bert_tokenizer.token_to_id('<pad>'))
print('en', "->" ,bert_tokenizer.token_to_id('<s-en>'))
print('hi', "->" ,bert_tokenizer.token_to_id('<s-hi>'))
print('mr', "->" ,bert_tokenizer.token_to_id('<s-mr>'))
print('eos', "->" ,bert_tokenizer.token_to_id('</s>'))

pad -> 1
en -> 2
hi -> 3
mr -> 4
eos -> 5


## Create the data loader

In [12]:
class Dataset:
    def __init__(self,df):
        self.df = df
    def __len__(self,):
        return len(self.df)
    def __getitem__(self,idx):
        sample = self.df.iloc[idx,:]
        en,lang2 = sample['lang1'], sample['lang2']
        start_token = "<s-hi>" if sample['lang2_id']=='hi' else "<s-mr>"
        en = bert_tokenizer.encode(f'<s-en>{en.strip()}</s>').ids
        l2 = bert_tokenizer.encode(f'{start_token}{lang2.strip()}</s>').ids
        l2_shift = l2.copy()
        l2_shift[:-1] = l2[1:]
        l2_shift[-1] = bert_tokenizer.token_to_id('<pad>')
        
        en = torch.tensor(en,dtype=torch.long)
        l2 = torch.tensor(l2,dtype=torch.long)
        l2_shift = torch.tensor(l2_shift,dtype=torch.long)
        l2_shift[l2_shift==1]=-100
        return en,l2,l2_shift

In [13]:
train_ids = Dataset(train_df)
val_ids = Dataset(val_df)
test_ids = Dataset(test_df)

In [14]:
train_ids[1]

(tensor([    2,    38, 13883,    40,     5,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

## Build the model

### Define the RMSNorm

In [16]:
class RMSNorm(nn.Module):
    def __init__(self, d, p=-1, eps=1e-8, bias=False):
        super().__init__()
        
        self.d = d
        self.p = p
        self.eps = eps
        self.bias = bias

        self.scale = nn.Parameter(torch.ones(d))
        self.register_parameter("scale", self.scale)

        if self.bias:
            self.offset = nn.Parameter(torch.zeros(d))
            self.register_parameter("offset", self.offset)

    def forward(self, x):
        if self.p < 0. or self.p >= 1.:
            norm_x = x.norm(2, dim=-1, keepdim=True)
            d_x = self.d
        else:
            partial_size = int(self.d * self.p)
            partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1)
            norm_x = partial_x.norm(2, dim=-1, keepdim=True)
            d_x = partial_size

        rms_x = norm_x * d_x **(-1./2)
        x_normed = x / (rms_x + self.eps)

        if self.bias:
            return self.scale * x_normed + self.offset

        return self.scal * x_normed
            

### Multihead Attention

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, dim, n_heads, dropout=0.):
        super().__init__()  # Fixed syntax
        self.dim = dim
        self.n_heads = n_heads
        assert dim % n_heads == 0, 'dim should be divisible by n_heads'
        self.head_dim = self.dim // self.n_heads
        
        # Linear layers for query, key, and value transformations
        self.q = nn.Linear(dim, dim, bias=False)
        self.k = nn.Linear(dim, dim, bias=False)
        self.v = nn.Linear(dim, dim, bias=False)

        # Dropout for attention weights
        self.attn_dropout = nn.Dropout(dropout)
        # Scaling factor for dot-product attention
        self.scale = self.head_dim ** -0.5
        # Linear layer for output projection
        self.out_proj = nn.Linear(dim, dim, bias=False)

    def forward(self, q, k, v, mask=None):
        B, T, C = q.shape  # [batch_size, seq_len, dim]

        # Linear projections for Q, K, V
        q = self.q(q)  # [B, T, C]
        k = self.k(k)  # [B, T, C]
        v = self.v(v)  # [B, T, C]

        # Reshape and permute for multi-head attention
        q = q.view(B, T, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(B, T, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(B, T, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Compute scaled dot-product attention
        wei = torch.matmul(q, k.transpose(-1, -2)) * self.scale  # [B, n_heads, T, T]

        # Apply mask before softmax
        if mask is not None:
            mask = mask.to(dtype=wei.dtype, device=wei.device)  # Fixed variable name
            a, b = wei.size(-2), wei.size(-1)
            wei = wei.masked_fill(mask[:, :, :a, :b] == 0, float('-inf'))

        # Apply softmax and dropout
        wei = F.softmax(wei, dim=-1)  # [B, n_heads, T, T]
        wei = self.attn_dropout(wei)

        # Compute attention output
        attn = torch.matmul(wei, v)  # [B, n_heads, T, head_dim]

        # Rearrange dimensions and merge heads
        attn = attn.permute(0, 2, 1, 3).contiguous().view(B, T, C)  # [B, T, C]

        # Final output projection
        out = self.out_proj(attn)  # [B, T, C]
        return out


### FeedForward

In [17]:
class FeedForward(nn.Module):
    def __init__(self, dim, dropout=0.):
        super().__init__()
        self.ffw = nn.Sequential(
            nn.Linear(dim, dim*4, bias=False),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(dim*4, dim, bias=False)
        )

    def forward(self, x):
        out = self.ffw(x)
        return out