<a href="https://colab.research.google.com/github/Arun-nexus/deep_learning/blob/main/language_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Load the corpus CSV from a hard-coded local Windows path; adjust the path
# when running on another machine.
df=pd.read_csv(r"C:\Users\Arun\Downloads\1000K IITM.csv\1000K IITM.csv")
# Bare expression: shows the frame when this is executed as a notebook cell.
df

In [None]:
# Keep the first 50,000 rows and replace missing cells with empty strings.
# fillna() is chained and returns a new frame, so nothing is modified in place
# on a slice of the original DataFrame.
df=df[:50000].fillna('')

In [None]:
# Pull the "hindi" and "english" columns out as plain Python lists.
hindi=df["hindi"].to_list()
eng=df["english"].to_list()

In [None]:
from nltk import word_tokenize
def preprocess(text):
    alltokens=[word_tokenize(word) for word in text]
    return alltokens
# Tokenize both column lists; each result holds one token list per row.
hindi_tokens=preprocess(hindi)
eng_tokens=preprocess(eng)

In [None]:
# Flatten the per-row token lists into one long token list per language,
# keeping row order and the order of tokens within each row.
hindi_tokens_list=[token for sentence in hindi_tokens for token in sentence]
eng_tokens_list=[token for sentence in eng_tokens for token in sentence]

In [None]:
from collections import Counter
def counter(text):
    """Return a ``Counter`` mapping each item of ``text`` to its frequency."""
    return Counter(text)
# Token frequency tables for each language, built from the flattened lists.
hindi_word_counts=counter(hindi_tokens_list)
english_word_counts=counter(eng_tokens_list)

In [None]:
# Both vocabularies reserve id 0 for padding and id 1 for unknown tokens.
hindi_vocab={"<pad>":0,"<unk>":1}
english_vocab={"<pad>":0,"<unk>":1}
# Give every counted token the next free id, in the order the counts list
# them; tokens already present (e.g. the special tokens) keep their id.
for vocab,counts in ((hindi_vocab,hindi_word_counts),(english_vocab,english_word_counts)):
    for token in counts:
        vocab.setdefault(token,len(vocab))

In [None]:
# Number of entries in the Hindi vocabulary (special tokens included).
vocab_size=len(hindi_vocab)

In [None]:
def process(text,dict,max_len=None):
    """Encode tokenized rows as equal-length lists of vocabulary ids.

    Args:
        text: Iterable of rows, each row an iterable of tokens.
        dict: Token -> id mapping. (The name shadows the builtin; it is kept
            so existing calls keep working.)
        max_len: Length every encoded row is truncated/padded to. When
            ``None`` the longest row in ``text`` sets the length.

    Returns:
        One list of ids per input row, all of the same length, so the result
        can be converted directly into a rectangular tensor. Tokens missing
        from ``dict`` map to the ``"<unk>"`` id and short rows are
        right-padded with the ``"<pad>"`` id (0 if the mapping has none).
    """
    unk_id=dict.get("<unk>")
    pad_id=dict.get("<pad>",0)
    # Materialize rows once so generators can be measured and then encoded.
    rows=[list(row) for row in text]
    if max_len is None:
        max_len=max((len(row) for row in rows),default=0)

    sequence=[]
    for row in rows:
        ids=[dict.get(word,unk_id) for word in row[:max_len]]
        ids.extend([pad_id]*(max_len-len(ids)))
        sequence.append(ids)
    return sequence
# Encode the *tokenized* rows: the raw strings would be iterated character by
# character, and characters are not the keys stored in the vocabularies.
x_sequence=process(hindi_tokens,hindi_vocab)
y_sequence=process(eng_tokens,english_vocab)

In [None]:
import torch
import torch.nn as nn
import math

In [None]:
class positional_encoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        max_len: Number of positions the encoding table is built for.
        d_model: Width of the embeddings (size of the input's last dimension).
    """

    def __init__(self,max_len:int=5000,d_model:int=512):
        super().__init__()

        pe=torch.zeros(max_len,d_model)
        # Column vector of positions so it broadcasts against the row of
        # frequencies, giving a (max_len, ceil(d_model/2)) table of angles.
        position=torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
        div_term=torch.exp(torch.arange(0,d_model,2,dtype=torch.float)*(-math.log(10000.0)/d_model))
        angles=position*div_term
        pe[:,0::2]=torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine column.
        pe[:,1::2]=torch.cos(angles[:,:d_model//2])
        pe=pe.unsqueeze(0)
        # Registered as a buffer: follows the module across devices and is
        # saved in the state dict, but is not a trainable parameter.
        self.register_buffer("pe",pe)


    def forward(self,x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, d_model); ``x.size(1)`` must not
        exceed ``max_len``.
        """
        return x+self.pe[:,:x.size(1)]

In [None]:
class attention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Width of the query/key/value inputs and of the output.
        num_heads: Number of heads; must divide ``d_model`` evenly.
    """

    def __init__(self,d_model:int=512,num_heads:int=8):
        super().__init__()
        assert d_model%num_heads==0,"d_model must be divisible by num_heads"
        self.d_k=d_model//num_heads
        self.num_heads=num_heads

        self.q_linear=nn.Linear(d_model,d_model)
        self.k_linear=nn.Linear(d_model,d_model)
        self.v_linear=nn.Linear(d_model,d_model)
        self.out=nn.Linear(d_model,d_model)

    def forward(self,q,k,v,mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose last
                dimension is ``d_model``; ``k`` and ``v`` share a length.
            mask: Optional tensor broadcastable to the
                (batch, heads, query, key) score tensor; positions where it
                equals 0 are excluded from attention.

        Returns:
            Tensor with the batch and query dimensions of ``q`` and a last
            dimension of ``d_model``.
        """
        batch=q.size(0)
        # Each input gets its own projection, then is split into heads:
        # (batch, len, d_model) -> (batch, heads, len, d_k).
        Q=self.q_linear(q).view(batch,-1,self.num_heads,self.d_k).transpose(1,2)
        K=self.k_linear(k).view(batch,-1,self.num_heads,self.d_k).transpose(1,2)
        V=self.v_linear(v).view(batch,-1,self.num_heads,self.d_k).transpose(1,2)

        scores=torch.matmul(Q,K.transpose(-2,-1))/math.sqrt(self.d_k)
        if mask is not None:
            # -inf scores receive zero weight after the softmax.
            scores=scores.masked_fill(mask==0,float("-inf"))
        attn=torch.softmax(scores,dim=-1)
        output=torch.matmul(attn,V)
        # Merge the heads back: (batch, heads, len, d_k) -> (batch, len, d_model).
        output=output.transpose(1,2).contiguous().view(batch,-1,self.num_heads*self.d_k)
        return self.out(output)


In [None]:
class feedforward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output width.
        neurons: Width of the hidden layer.
        drop: Dropout probability applied after the activation.
    """

    def __init__(self,d_model,neurons=2048,drop:float=0.1):
        super().__init__()
        self.features=nn.Sequential(
            nn.Linear(d_model,neurons),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(neurons,d_model)
            )

    def forward(self,x):
        """Apply the block to the last dimension of ``x``; the shape is kept."""
        return self.features(x)

In [None]:
class encoder(nn.Module):
    """One Transformer encoder layer: self-attention, then feed-forward.

    Each sub-layer is applied as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Width of the layer's input and output.
        num_heads: Number of attention heads.
        neurons: Hidden width of the feed-forward sub-layer.
        drop: Dropout probability used in both sub-layers.
    """

    def __init__(self,d_model:int=512,num_heads:int=8,neurons=2048,drop:float=0.1):
        super().__init__()
        self.attn=attention(d_model,num_heads)
        # feedforward's positional order is (d_model, neurons, drop); passing
        # only (d_model, drop) would put the dropout rate in the neurons slot.
        self.ff=feedforward(d_model,neurons,drop)
        self.norm=nn.LayerNorm(d_model)
        self.norm2=nn.LayerNorm(d_model)
        self.dropout=nn.Dropout(drop)

    def forward(self,x,mask=None):
        """Return the encoded version of ``x``; ``mask`` is passed to attention."""
        attn=self.attn(x,x,x,mask)
        x=self.norm(x+self.dropout(attn))
        ff_out=self.ff(x)
        x=self.norm2(x+self.dropout(ff_out))
        return x

In [None]:
class decoder(nn.Module):
    """One Transformer decoder layer.

    Sub-layers, each applied as ``LayerNorm(x + Dropout(sublayer(x)))``:
    self-attention over the target, cross-attention over the encoder output,
    then a feed-forward block.

    Args:
        d_model: Width of the layer's input and output.
        num_heads: Number of attention heads.
        neurons: Hidden width of the feed-forward sub-layer.
        drop: Dropout probability used in every sub-layer.
    """

    def __init__(self,d_model:int=512,num_heads:int=8,neurons=2048,drop:float=0.1):
        super().__init__()
        self.attn=attention(d_model,num_heads)
        self.cross_attn=attention(d_model,num_heads)
        # feedforward's positional order is (d_model, neurons, drop); passing
        # only (d_model, drop) would put the dropout rate in the neurons slot.
        self.ff=feedforward(d_model,neurons,drop)
        self.norm1=nn.LayerNorm(d_model)
        self.norm2=nn.LayerNorm(d_model)
        self.norm3=nn.LayerNorm(d_model)
        self.dropout=nn.Dropout(drop)

    def forward(self,x,enc_out,src_mask=None,tgt_mask=None):
        """Decode ``x`` against ``enc_out``.

        ``tgt_mask`` is applied to the self-attention over ``x`` and
        ``src_mask`` to the cross-attention over ``enc_out``.
        """
        attn=self.attn(x,x,x,tgt_mask)
        x=self.norm1(x+self.dropout(attn))
        # Queries come from the decoder state, keys/values from the encoder.
        cross_attn=self.cross_attn(x,enc_out,enc_out,src_mask)
        x=self.norm2(x+self.dropout(cross_attn))
        ff_out=self.ff(x)
        x=self.norm3(x+self.dropout(ff_out))
        return x

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source ids to target-vocabulary logits.

    Args:
        src_vocab: Number of entries in the source vocabulary.
        tgt_vocab: Number of entries in the target vocabulary.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        neurons: Hidden width of each feed-forward sub-layer.
        dropout: Dropout probability used inside every layer.
        max_len: Longest sequence the positional encoding table supports.
    """

    def __init__(self, src_vocab, tgt_vocab, d_model=512, num_heads=8, num_layers=6, neurons=2048, dropout=0.1, max_len=5000):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab, d_model)
        # positional_encoding takes (max_len, d_model) in that order.
        self.pos_enc = positional_encoding(max_len, d_model)

        self.encoder_layers = nn.ModuleList([encoder(d_model, num_heads, neurons, dropout) for _ in range(num_layers)])
        # The decoder stack needs decoder layers: forward() calls them with the
        # encoder output and both masks, which encoder.forward does not accept.
        self.decoder_layers = nn.ModuleList([decoder(d_model, num_heads, neurons, dropout) for _ in range(num_layers)])

        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Return logits over the target vocabulary for every position of ``tgt``.

        Args:
            src: Integer tensor of source token ids.
            tgt: Integer tensor of target token ids fed to the decoder.
            src_mask: Optional mask for attention over the source.
            tgt_mask: Optional mask for decoder self-attention (e.g. causal).
        """
        src = self.pos_enc(self.src_embed(src))
        tgt = self.pos_enc(self.tgt_embed(tgt))

        for layer in self.encoder_layers:
            src = layer(src, src_mask)

        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src_mask, tgt_mask)

        return self.fc_out(tgt)

In [None]:
import torch.nn as nn
class early_stopping(nn.Module):
    """Signal when the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss, then read
    ``early_stop``.

    Args:
        min_delta: Smallest decrease of the loss that counts as an improvement.
        patience: Number of consecutive non-improving calls after which
            ``early_stop`` becomes ``True``.
    """

    def __init__(self,min_delta,patience):
        super().__init__()
        self.min_delta=min_delta
        # Attribute name (sic) is kept so existing readers of it keep working.
        self.partience=patience
        self.early_stop=False
        self.best_loss=float("inf")
        self.counter=0

    def __call__(self,val_loss):
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        # Only a drop of more than min_delta below the best loss resets the
        # counter; an equal, worse, or marginally better loss must not replace
        # best_loss, otherwise a slowly rising loss would never trigger a stop.
        if self.best_loss-val_loss>self.min_delta:
            self.counter=0
            self.best_loss=val_loss
        else:
            self.counter+=1
            if self.counter>=self.partience:
                self.early_stop=True

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device=("cuda" if torch.cuda.is_available() else "cpu")
# Keyword names must match Transformer.__init__: the source side uses the
# Hindi vocabulary size, the target side the English vocabulary size.
model=Transformer(src_vocab=vocab_size,tgt_vocab=len(english_vocab),d_model=512,num_heads=8,num_layers=6,neurons=2048).to(device)
# Padding positions carry no information, so they are excluded from the loss.
model_loss=nn.CrossEntropyLoss(ignore_index=english_vocab["<pad>"])
optimizer=torch.optim.Adam(model.parameters(),lr=0.0001)
# Halve the learning rate when the monitored loss stops decreasing for 2 steps.
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,mode="min",factor=0.5,patience=2)

In [None]:
def tensor(tensor):
    """Return the given data as a new ``torch.long`` tensor."""
    return torch.tensor(tensor,dtype=torch.long)
# Replace the encoded index lists with long tensors under the same names.
x_sequence=tensor(x_sequence)
y_sequence=tensor(y_sequence)

In [None]:
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
# Pair the source tensor with the target tensor along their first dimension.
dataset=TensorDataset(x_sequence,y_sequence)
# Hold out 20% of the samples for validation; the seed fixes the split.
training_dataset,validation_data=train_test_split(dataset,test_size=0.2,random_state=7)
# Batches of 256; no shuffle argument is given, so batch order is the same
# every epoch.
training_dataset_loader=DataLoader(training_dataset,batch_size=256)
validation_dataset_loader=DataLoader(validation_data,batch_size=256)

In [None]:
from torch.amp import GradScaler,autocast
# Training configuration and per-epoch metric history.
epochs=50
stopper=early_stopping(1e-4,4)
training_loss_store=[]
validation_loss_store=[]
training_accuracy_store=[]
validation_accuracy_store=[]
# Mixed precision and loss scaling are only used when running on CUDA.
use_amp=(device=="cuda")
scaler=GradScaler(device,enabled=use_amp)


def _run_epoch(loader,train):
    """Run one pass over ``loader``; return (mean batch loss, token accuracy).

    With ``train`` true the model is put in training mode and updated;
    otherwise it is put in eval mode and no gradients are tracked.
    """
    model.train(train)
    running,total,correct=0.0,0,0
    with torch.set_grad_enabled(train):
        for x,y in loader:
            x,y=x.to(device),y.to(device)
            # Teacher forcing: the decoder sees the target up to position t
            # and is scored on the token at position t+1.
            tgt_in,tgt_out=y[:,:-1],y[:,1:]
            steps=tgt_in.size(1)
            # Lower-triangular mask: a position may not attend to later ones.
            tgt_mask=torch.tril(torch.ones(steps,steps,device=device))

            if train:
                optimizer.zero_grad()
            with autocast(device,enabled=use_amp):
                output=model(x,tgt_in,tgt_mask=tgt_mask)
                # CrossEntropyLoss wants (N, classes) logits and (N,) targets.
                loss=model_loss(output.reshape(-1,output.size(-1)),tgt_out.reshape(-1))
            if train:
                scaler.scale(loss).backward()
                # Clip the real gradients, not the loss-scaled ones.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()

            running+=loss.item()
            pred=output.argmax(dim=-1)
            # Score only the targets the loss itself does not ignore.
            counted=tgt_out!=model_loss.ignore_index
            correct+=((pred==tgt_out)&counted).sum().item()
            total+=counted.sum().item()
    # Average over batches; guard against an empty loader / no counted tokens.
    return running/max(len(loader),1),correct/max(total,1)


for epoch in range(epochs):
    training_loss,training_acc=_run_epoch(training_dataset_loader,train=True)
    training_loss_store.append(training_loss)
    training_accuracy_store.append(training_acc)

    val_loss,val_acc=_run_epoch(validation_dataset_loader,train=False)
    validation_loss_store.append(val_loss)
    validation_accuracy_store.append(val_acc)

    # ReduceLROnPlateau only acts when it is given the monitored value.
    scheduler.step(val_loss)

    print(f"epoch: {epoch+1} training_loss: {training_loss:.4f} training acc:{training_acc:.2f} validation_loss: {val_loss:.4f} validation_acc: {val_acc:.2f} ")
    stopper(val_loss)
    if stopper.early_stop:
        print("early_stopping triggered")
        break