# Transformer

# https://github.com/shreydan/multilingual-translation/blob/main/en-hi-te-translation.ipynb

In [2]:
#!pip install torchtext

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Wed_Jul_14_19:47:52_Pacific_Daylight_Time_2021
Cuda compilation tools, release 11.4, V11.4.100
Build cuda_11.4.r11.4/compiler.30188945_0


In [4]:
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [5]:
#!pip uninstall torchtext
#!pip install datasets

In [6]:
from pathlib import Path

In [7]:
from datasets import load_dataset
import pandas as pd
import random
from tokenizers import Tokenizer
from tokenizers.trainers import WordPieceTrainer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import decoders
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F
from torch.amp import autocast,GradScaler
import gc
import torch

torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


False

In [8]:
dataset1 = load_dataset("opus100", "en-hi")
df1_train = pd.DataFrame(dataset1['train']['translation'],columns=['en','hi'])
df1_train.rename(columns={'en':'lang1','hi':'lang2'},inplace=True)
df1_val = pd.DataFrame(dataset1['validation']['translation'],columns=['en','hi'])
df1_val.rename(columns={'en':'lang1','hi':'lang2'},inplace=True)
df1_test = pd.DataFrame(dataset1['test']['translation'],columns=['en','hi'])
df1_test.rename(columns={'en':'lang1','hi':'lang2'},inplace=True)
df1_train['lang2_id'] = 'hi'
df1_val['lang2_id'] = 'hi'
df1_test['lang2_id'] = 'hi'

dataset2 = load_dataset("opus100", "en-te")
df2_train = pd.DataFrame(dataset2['train']['translation'],columns=['en','te'])
df2_train.rename(columns={'en':'lang1','te':'lang2'},inplace=True)
df2_val = pd.DataFrame(dataset2['validation']['translation'],columns=['en','te'])
df2_val.rename(columns={'en':'lang1','te':'lang2'},inplace=True)
df2_test = pd.DataFrame(dataset2['test']['translation'],columns=['en','te'])
df2_test.rename(columns={'en':'lang1','te':'lang2'},inplace=True)
df2_train['lang2_id'] = 'te'
df2_val['lang2_id'] = 'te'
df2_test['lang2_id'] = 'te'
train_df = pd.concat([df1_train,df2_train]).reset_index(drop=True)
val_df = pd.concat([df1_val,df2_val]).reset_index(drop=True)
test_df = pd.concat([df1_test,df2_test]).reset_index(drop=True)

In [9]:
train_df.shape

(598671, 3)

In [10]:
train_df.tail(10)

Unnamed: 0,lang1,lang2,lang2_id
598661,"Right, put your hands in front of you like this.","రైట్, ఈ వంటి మీరు ముందు మీ చేతులు చాలు.",te
598662,Damn right.,డామన్ కుడి.,te
598663,"We got kids, we can't afford to do this.","మేము పిల్లలు వచ్చింది , మేము దీన్ని పొందలేని.",te
598664,Hmm.,అయ్యో.,te
598665,He's lost the will to live,అతనుకోల్పోతేజీవించాలనే,te
598666,Sad life.,సాడ్ జీవితం.,te
598667,Are you taking your driver's test drunk?,మీరు మీ డ్రైవర్ యొక్క పరీక్ష తాగిన తీసుకున్నట...,te
598668,Initiate recall.,రీకాల్ ప్రారంభించు.,te
598669,No. You need to come see me right now.,నంఇప్పుడేచూడండిమీరు రావాలి.,te
598670,All right.,అన్ని కుడి.,te


In [11]:
full_df = pd.concat([train_df,val_df])
lang1,lang2 = list(full_df['lang1']), list(full_df['lang2'])
full = lang1+lang2
random.shuffle(full)

bert_tokenizer = Tokenizer(WordPiece(unk_token="<unk>"))
bert_tokenizer.normalizer = normalizers.Sequence([Lowercase()])
bert_tokenizer.pre_tokenizer = Whitespace()
bert_tokenizer.decoder = decoders.WordPiece()
trainer = WordPieceTrainer(special_tokens=["<unk>","<pad>","<s-en>","<s-hi>","<s-te>","</s>"])
bert_tokenizer.train_from_iterator(full,trainer)
bert_tokenizer.enable_padding(
    pad_id=bert_tokenizer.token_to_id('<pad>'),
    length=128,
    pad_token='<pad>'
)
bert_tokenizer.enable_truncation(128)

base = Path('translator/tokenizer',)
base.mkdir(exist_ok=True,parents=True)
bert_tokenizer.save(str(base / 'en_hi_te.json'))

In [12]:
x = bert_tokenizer.encode(f"<s-hi>{lang2[12345]}</s>")
for a,b in zip(x.ids, x.tokens):
    if b!= '<pad>':
        print(f'{a} : {b}')
    
print('\n',bert_tokenizer.decode(x.ids),'\n\n')

x = bert_tokenizer.encode(f"<s-te>{lang2[-111]}</s>")
for a,b in zip(x.ids, x.tokens):
    if b!= '<pad>':
        print(f'{a} : {b}')
    
print('\n',bert_tokenizer.decode(x.ids))

3 : <s-hi>
5058 : आपने
8860 : चैनल
3695 : मालिक
969 : का
23556 : विशेषाधिकार
10 : %
22 : 1
933 : से
2837 : वापस
1172 : ले
1616 : लिया
897 : है
19 : .
5 : </s>

 आपने चैनल मालिक का विशेषाधिकार % 1 से वापस ले लिया है. 


4 : <s-te>
55 : l
10142 : అతనికి
17550 : వ్యవ
5374 : ##సా
534 : ##య
2076 : నా
4086 : వా
4885 : ##టా
2526 : వి
10364 : ##క్ర
13918 : ##యిం
4463 : ##చి
1814 : ##ంది
19 : .
5 : </s>

 l అతనికి వ్యవసాయ నా వాటా విక్రయించింది.


In [13]:
print('en',bert_tokenizer.token_to_id('<s-en>'))
print('hi',bert_tokenizer.token_to_id('<s-hi>'))
print('te',bert_tokenizer.token_to_id('<s-te>'))
print('eos',bert_tokenizer.token_to_id('</s>'))
print('pad',bert_tokenizer.token_to_id('<pad>'))

en 2
hi 3
te 4
eos 5
pad 1


In [14]:
class Dataset:
    def __init__(self,df):
        self.df = df
    def __len__(self,):
        return len(self.df)
    def __getitem__(self,idx):
        sample = self.df.iloc[idx,:]
        en,lang2 = sample['lang1'], sample['lang2']
        start_token = "<s-hi>" if sample['lang2_id']=='hi' else "<s-te>"
        en = bert_tokenizer.encode(f'<s-en>{en.strip()}</s>').ids
        l2 = bert_tokenizer.encode(f'{start_token}{lang2.strip()}</s>').ids
        l2_shift = l2.copy()
        l2_shift[:-1] = l2[1:]
        l2_shift[-1] = bert_tokenizer.token_to_id('<pad>')
        
        en = torch.tensor(en,dtype=torch.long)
        l2 = torch.tensor(l2,dtype=torch.long)
        l2_shift = torch.tensor(l2_shift,dtype=torch.long)
        l2_shift[l2_shift==1]=-100
        return en,l2,l2_shift

In [15]:
train_ds = Dataset(train_df)
val_ds = Dataset(val_df)

In [16]:
print('english tokens\n',train_ds[0][0])
print('lang2 tokens\n',train_ds[0][1])
print('right-shifted lang2 tokens\n',train_ds[0][2])

english tokens
 tensor([   2, 1716,   17, 7303, 2171,    5,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1])
lang2 tokens
 tensor([   3, 3372,   17, 1048, 3302, 2934,    5,    1,    1,    1,    1,    1,
           1,    1,    1,    1,  

In [17]:
class Embedding(nn.Module):
    def __init__(self,vocab_size,max_len,dim):
        super().__init__()
        self.max_len = max_len
        self.class_embedding = nn.Embedding(vocab_size,dim)
        self.positional_embedding = nn.Embedding(max_len,dim)
    def forward(self,x):
        x = self.class_embedding(x)
        pos = torch.arange(0,x.size(1),device = x.device)
        x = x+ self.positional_embedding(pos)
        return x

In [18]:
config = {
    'dim': 128,
    'n_heads': 4,
    'attn_dropout': 0.1,
    'mlp_dropout': 0.1,
    'depth': 8,
    'vocab_size': bert_tokenizer.get_vocab_size(),
    'max_len': 128,
    'pad_token_id': bert_tokenizer.token_to_id('<pad>')
}
config

{'dim': 128,
 'n_heads': 4,
 'attn_dropout': 0.1,
 'mlp_dropout': 0.1,
 'depth': 8,
 'vocab_size': 30000,
 'max_len': 128,
 'pad_token_id': 1}

In [19]:
class RMSNorm(nn.Module):
    

    def __init__(self,d,p = -1.,eps = 1e8,bias = False):
        """Root mean sruare layer normalization.
        
        :param d: model size
        :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled)
        :param eps:  epsilon value, default 1e-8
        :param bias: whether use bias term for RMSNorm, disabled by
            default because RMSNorm doesn't enforce re-centering invariance.
        """
        super().__init__()
        self.p = p #disabled
        self.d = d
        self.eps = eps
        self.bias = bias
        self.scale = nn.Parameter(torch.ones(d))
        self.register_parameter("scale", self.scale)
        if self.bias:
            self.offset = nn.Parameter(torch.zeros(d))
            self.register_parameter("offset", self.offset)
        

    def forward(self,x):
        if self.p <0 or self.p >1:
            norm_x = x.norm(2,dim = 1,keepdim = True)
            d_x = self.d
        else:
            partial_size  = int(self.d * self.p)
            partial_x,_= torch.split(x,[partial_size,self.d - partial_size],dim = -1)
            norm_x = partial_x.norm(2,dim = -1,keepdim=True)
            d_x = partial_size

        rms_x = norm_x * d_x **(-1./2)
        x_normed = x /(rms_x + self.eps)

        if self.bias:
            return self.scale * x_normed + self.offset
        return self.scale * x_normed
        
        

In [20]:
class MultiheadAttention(nn.Module):
    def __init__(self, dim, n_heads, dropout=0.):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        assert dim % n_heads == 0, 'dim should be div by n_heads'
        self.head_dim = self.dim // self.n_heads
        self.q = nn.Linear(dim,dim,bias=False)
        self.k = nn.Linear(dim,dim,bias=False)
        self.v = nn.Linear(dim,dim,bias=False)
        self.attn_dropout = nn.Dropout(dropout)
        self.scale = self.head_dim ** -0.5
        self.out_proj = nn.Linear(dim,dim,bias=False)
        
    def forward(self,q,k,v,mask=None):
        batch,t,c = q.shape
        q = self.q(q)
        k = self.k(k)
        v = self.v(v)
        q = q.view(batch,q.size(1),self.n_heads,self.head_dim).permute(0,2,1,3)
        k = k.view(batch,k.size(1),self.n_heads,self.head_dim).permute(0,2,1,3)
        v = v.view(batch,v.size(1),self.n_heads,self.head_dim).permute(0,2,1,3)
        
        qkT = torch.matmul(q,k.transpose(-1,-2)) * self.scale
        qkT = self.attn_dropout(qkT)
        
        if mask is not None:
            mask = mask.to(dtype=qkT.dtype,device=qkT.device)
            a,b = qkT.size(-2), qkT.size(-1)
            qkT = qkT.masked_fill(mask[:,:,:a,:b]==0,float('-inf'))
            
        qkT = nn.functional.softmax(qkT,dim=-1)
            
        attn = torch.matmul(qkT,v)
        attn = attn.permute(0,2,1,3).contiguous().view(batch,t,c)
        out = self.out_proj(attn)
        return out

In [21]:
class FeedForward(nn.Module):
    def __init__(self,dim,dropout=0.):
        super().__init__()
        self.feed_forward = nn.Sequential(
            nn.Linear(dim,dim*4,bias=False),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(dim*4,dim,bias=False)
        )
        
    def forward(self, x):
        return self.feed_forward(x)

In [22]:
class EncoderBlock(nn.Module):
    def __init__(self, dim, n_heads, attn_dropout=0., mlp_dropout=0.):
        super().__init__()
        self.attn = MultiheadAttention(dim,n_heads,attn_dropout)
        self.ffd = FeedForward(dim,mlp_dropout)
        self.ln_1 = RMSNorm(dim)
        self.ln_2 = RMSNorm(dim)
        
    def forward(self,x,mask=None):
        x = self.ln_1(x)
        x = x + self.attn(x,x,x,mask)
        x = self.ln_2(x)
        x = x + self.ffd(x)
        return x

In [23]:
class DecoderBlock(nn.Module):
    def __init__(self, dim, n_heads, attn_dropout=0., mlp_dropout=0.):
        super().__init__()
        self.self_attn = MultiheadAttention(dim,n_heads,attn_dropout)
        self.cross_attn = MultiheadAttention(dim,n_heads,attn_dropout)
        self.ln_1 = RMSNorm(dim)
        self.ln_2 = RMSNorm(dim)
        self.ln_3 = RMSNorm(dim)
        self.ffd = FeedForward(dim,mlp_dropout)
        
    def forward(self, x, enc_out, src_mask, tgt_mask):
        x = self.ln_1(x)
        x = x + self.self_attn(x,x,x,tgt_mask)
        x = self.ln_2(x)
        x = x + self.cross_attn(x,enc_out,enc_out,src_mask) # decoder: q, encoder: k,v
        x = self.ln_3(x)
        x = x + self.ffd(x)
        
        return x

In [24]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self, config):
        
        super().__init__()
        
        self.embedding = Embedding(config['vocab_size'],config['max_len'],config['dim'])
        
        self.depth = config['depth']
        self.encoders = nn.ModuleList([
            EncoderBlock(
                dim=config['dim'],
                n_heads=config['n_heads'],
                attn_dropout=config['attn_dropout'],
                mlp_dropout=config['mlp_dropout']
            ) for _ in range(self.depth)
        ])
        self.decoders = nn.ModuleList([
            DecoderBlock(
                dim=config['dim'],
                n_heads=config['n_heads'],
                attn_dropout=config['attn_dropout'],
                mlp_dropout=config['mlp_dropout']
            ) for _ in range(self.depth)
        ])
        
        self.ln_f = RMSNorm(config['dim'])
        self.lm_head = nn.Linear(config['dim'],config['vocab_size'],bias=False)
        
        self.embedding.class_embedding.weight = self.lm_head.weight
        
        self.pad_token_id = config['pad_token_id']
        self.register_buffer('tgt_mask',torch.tril(torch.ones(1,1,config['max_len'],config['max_len'])))
    
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
    def create_src_mask(self,src):
        return (src != self.pad_token_id).unsqueeze(1).unsqueeze(2) # N, 1, 1, src_len
    
    def forward(self, src, tgt, labels=None):
        
        src_mask = self.create_src_mask(src)
        
        enc_out = self.embedding(src)
        dec_out = self.embedding(tgt)
        
        for i in range(self.depth):
            enc_out = self.encoders[i](enc_out,mask=src_mask)
            dec_out = self.decoders[i](dec_out,enc_out,src_mask=src_mask,tgt_mask=self.tgt_mask)

        dec_out = self.ln_f(dec_out)
        
        if labels is not None:
            lm_logits = self.lm_head(dec_out)
            loss = F.cross_entropy(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))
            return loss
        
        lm_logits = self.lm_head(dec_out[:,[-1],:])
        return lm_logits

In [25]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32,shuffle=True,pin_memory=True,num_workers=0)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=32,shuffle=False,pin_memory=True,num_workers=0)
print(len(train_dl), len(val_dl))

18709 125


In [26]:
model = Seq2SeqTransformer(config).to('cuda')
print(sum([p.numel() for p in model.parameters() if p.requires_grad]))

RuntimeError: No CUDA GPUs are available

In [None]:
epochs = 1
train_losses = []
valid_losses = []
best_val_loss = 1e9

all_tl = []
all_lr = []

optim = torch.optim.Adam(model.parameters(),lr=1e-4)
sched = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optim,
    T_0=250,
    eta_min=1e-8
)

scaler = GradScaler('cuda')

In [None]:
for i,(a,b,c) in enumerate(train_dl):
    src, tgt, labels = a.to('cuda'),b.to('cuda'),c.to('cuda')
    print(src)
    loss = model(src,tgt,labels)
    print('loss')
    if i ==2:
        break

NameError: name 'train_dl' is not defined

In [None]:
e

In [None]:
for ep in tqdm(range(epochs)):
    model.train()
    trl = 0.
    #tprog = tqdm(enumerate(train_dl),total=len(train_dl))
    for i, (a,b,c) in tqdm(enumerate(train_dl)):
        with autocast('cuda'):
            src, tgt, labels = a.to('cuda'),b.to('cuda'),c.to('cuda')
            loss = model(src,tgt,labels)
            print(loss)
            scaler.scale(loss).backward()
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
            scaler.step(optim)
            scaler.update()
            optim.zero_grad()
            sched.step(ep + i / len(train_dl))
            all_lr.append(sched.get_last_lr())
            trl += loss.item()
            all_tl.append(loss.item())
            #tprog.set_description(f'train step loss: {loss.item():.4f}')
    train_losses.append(trl/len(train_dl))
    
    gc.collect()
    torch.cuda.empty_cache()
        
    model.eval()

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
!nvidia-smi


In [None]:
end

In [None]:
mask = (a != config['pad_token_id']).unsqueeze(1).unsqueeze(2)

In [None]:
res = model(a,b,c)

In [None]:
embedding = EMbedding(config['vocab_size'],config['max_len'],config['dim'])

In [None]:
tmp_emb = embedding(a)

In [None]:
tmp = nn.Linear(config['dim'],config['dim'],bias = False)

In [None]:
res = tmp(tmp_emb)

In [None]:
res.shape

In [None]:
config['n_heads'],128/4 #

In [None]:
res.view(2,128,4,32).shape#.permute(0,2,1,3)

In [None]:
q = res.view(2,128,4,32).permute(0,2,1,3)

In [None]:
k = torch.rand(2,4,128,32)
v = torch.rand(2,4,128,32)

In [None]:
q.shape,k.shape

In [None]:
scale = 32 ** -0.5
qkT = torch.matmul(q,k.transpose(-1,-2)) * scale

In [None]:
32

In [None]:
qkT.shape

In [None]:
k.transpose(-1,-2).shape,q.shape

In [None]:
mask = mask.to(dtype=qkT.dtype,device=qkT.device)

In [None]:
qa,qb = qkT.size(-2), qkT.size(-1)

In [None]:
qa,qb

In [None]:
qkt = qkT.masked_fill(mask[:,:,:qa,:qb]==0,float('-inf'))

In [None]:
attn = torch.matmul(qkT,v)

In [None]:
attn[0][0][0]