In [1]:
!pip3 install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Installing collected packages: morfessor, sphinx-argparse, indic-nlp-library
Successfully installed indic-nlp-library-0.92 morfessor-2.0.6 sphinx-argparse-0.5.2


In [2]:
import pandas as pd  #for dataset

import numpy as np   # for matrices and array

from transformers import AutoTokenizer # trnasformer is a library and AutoTokenizer is a part of it.
# Tokenizer => To break a sentence into words is called tokenization.It is done so that the data can be easily understood by the model.
# Ex : I am with 5th Sem students of KIET. => AFter tokenization : [I,am,with,5th,Sem,students,of,KIET]
# Natural language data is the hardest to train
# Autotokenizer is a program which as great functions to tokenize such type of hard data
# Tokenizers are also trained ml models to do tokenization

import torch 
from indicnlp.tokenize import indic_tokenize
import torch.nn.functional as F



In [3]:
data = pd.read_csv("/kaggle/input/english-hindi/Sentence pairs in English-Hindi - 2025-02-11.tsv",
                  sep="\t", header=None, names=["SrcSentID","SrcSent","DstSentID","DstSent"])

In [4]:
data.drop(labels=[data.columns[0], data.columns[2]], axis=1, inplace=True)
# This line removes the first and third columns of the dataframe, permanently updating data.

In [5]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,Muiriel is 20 now.,म्यूरियल अब बीस साल की हो गई है।
1,Muiriel is 20 now.,म्यूरियल अब बीस साल की है।
2,Education in this world disappoints me.,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,That won't happen.,वैसा नहीं होगा।
4,I miss you.,मुझें तुम्हारी याद आ रही है।


In [6]:
# to tokenize english sentence we will use AutoTokenizer 
# for hindi => indic_tokenize
src_sent_tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base") # T5 is an LLM based on tenasformers made by google
# we are using base model of T5
# "google-T5/T5-base" => It is like a path to a repository on internet => hugging face ki ek repository hogi uske andr ggoogle-T5 ek repository hogi jiske andr ek folder hoga T5-base
# hr LLM ka Apna ek tokenizer hota hai jo ki ek alg trh se train kiya jata hai


config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [7]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,Muiriel is 20 now.,म्यूरियल अब बीस साल की हो गई है।
1,Muiriel is 20 now.,म्यूरियल अब बीस साल की है।
2,Education in this world disappoints me.,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,That won't happen.,वैसा नहीं होगा।
4,I miss you.,मुझें तुम्हारी याद आ रही है।


In [8]:
data["SrcSent"] = data["SrcSent"].apply(lambda x : src_sent_tokenizer.tokenize(x))
# hr cell pr kch operation krna hai to applymap use krenge
# kisi ek row ya column  pr opr krna hai to yse apply
# agr hashing krni hai to use map
# panda has => apply, map, applymap



In [9]:
# this type of tokenization is called sub-word tokenization as it also breaks a single word into parts and it also treart space as a token represnted by unserscore(_)
# BPE(Byte-Pair encoding) => Algorithm is used behind this to do this type of tokenization
# WPT(Word Piece Tokenizer) => it is also an algo for tokenization

In [10]:
data["DstSent"] = data["DstSent"].apply(lambda x : indic_tokenize.trivial_tokenize(x,lang="hi"))

In [11]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[▁That, ▁won, ', t, ▁happen, .]","[वैसा, नहीं, होगा, ।]"
4,"[▁I, ▁miss, ▁you, .]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [12]:
data["SrcSent"] = data["SrcSent"].apply(src_sent_tokenizer.convert_tokens_to_ids)

In [13]:
Vs = src_sent_tokenizer.get_vocab() # it is the mapping of tokens with ids in T5 tokenizer

In [14]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[466, 751, 31, 17, 1837, 5]","[वैसा, नहीं, होगा, ।]"
4,"[27, 3041, 25, 5]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [15]:
hindi_vocab = set()

for tokenized_hindi_sent in data["DstSent"] :
    hindi_vocab.update(tokenized_hindi_sent) # this is a set of unique hindi words from our dst column

In [16]:
Vd = dict()
for idx, token in enumerate(hindi_vocab) : 
    Vd[token] = idx + 3
Vd["<PAD>"] = 0
Vd["<SOS>"] = 1
Vd["<EOS>"] = 2

In [17]:
hindi_idx2vocab = dict(zip(Vd.values(),Vd.keys()))
print(hindi_idx2vocab)

{3: 'होंठ', 4: 'पाठशालाओं', 5: 'प्रक्षेपित', 6: 'काँच', 7: 'जलने', 8: 'मिली', 9: 'गीता', 10: 'कॉनरी', 11: 'हँस', 12: 'धुंध', 13: 'नान', 14: 'चोट', 15: 'इसका', 16: 'लड़ो', 17: 'दाँत', 18: 'सफ़ेद', 19: 'बत्ती', 20: 'समृद्ध', 21: 'जापानियों', 22: 'लेटकर', 23: 'ओर', 24: 'ग्रामीण', 25: 'चिंतित', 26: 'बोर', 27: 'तोड़ा', 28: 'बीमारी', 29: 'एस्केलेटर', 30: 'सहयोग', 31: 'भागे', 32: 'रखेंगे', 33: 'आयेगा', 34: 'पहचानतीं', 35: 'करनाटक', 36: 'सीखें', 37: 'बोतलें', 38: 'पकड़ीं', 39: 'ग्यारह', 40: 'मश्हूर', 41: 'टैड्डी', 42: 'उद्योग', 43: 'सवा', 44: 'नाक', 45: 'गंवाने', 46: 'चलते', 47: 'निकलते', 48: 'अनार', 49: 'सुना', 50: 'बचाओ', 51: 'मोरोपंत', 52: 'खोदा', 53: 'एओस्तेराइख़', 54: 'मुशकिल', 55: 'कार्बाइड', 56: 'मौज', 57: 'जुटाने', 58: 'छात्रों', 59: 'ब्रिटिशकाल', 60: 'तोड़ोगे', 61: 'ज़बरदस्त', 62: 'सम्मेलन', 63: 'नक़्शा', 64: 'बांटती', 65: 'नहायेंगी', 66: 'दिलाता', 67: 'देखते', 68: 'तारे', 69: 'सैंडविच', 70: 'सूज़न', 71: 'डाला', 72: 'करोगी', 73: 'गिरजाघर', 74: 'बनातीं', 75: 'झेल', 76: 'कोयले', 77: 'आए

Tokens in neural network: 
"<SOS>" (1) => Start of Sentence => USed to denote or tell the neural network that a sentece is going to start.
"<PAD>" (0) => Padding => Used for doing padding to make the length of all the sentence equal, so that a matrix can be made"
"<EOS>" (2) => End of Service => USed to denote the end of the sentence.

In [18]:
def convert_hindi_tokens_to_ids(hindi_sent) : 
    return [Vd[token] for token in hindi_sent]  # function that takes hindi tokenized sentence as 
                                                # input and returns ids of those tokens

In [19]:
data["DstSent"] = data["DstSent"].apply(lambda x : convert_hindi_tokens_to_ids(x))

In [20]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[3668, 1248, 832, 5569, 1139, 4014, 5019, 3215..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[3668, 1248, 832, 5569, 1139, 3215, 1696]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[5946, 4593, 3384, 5161, 6409, 4032, 665, 703,..."
3,"[466, 751, 31, 17, 1837, 5]","[6193, 5005, 3146, 1696]"
4,"[27, 3041, 25, 5]","[3923, 484, 6423, 1236, 3340, 3215, 1696]"


In [21]:
# adding SOS(1) in start
def insert_sos_token_id(hindi_sent_token_ids):
    return [1] + hindi_sent_token_ids # Concatenation hora h

In [22]:
data["DstSentInput"] = data["DstSent"].apply(lambda x : insert_sos_token_id(x))

In [23]:
 # adding EOS (2) at end
def insert_eos_token_id(hindi_sent_token_ids): 
    return hindi_sent_token_ids + [2]

In [24]:
data["DstSentLabel"] = data["DstSent"].apply(lambda x : insert_eos_token_id(x))

In [25]:
data.drop(labels=[data.columns[1]],axis=1,inplace=True)

In [26]:
X = list(data["SrcSent"])
Y_input = list(data["DstSentInput"])
Y_label = list(data["DstSentLabel"])

In [27]:
X_tensor = [torch.tensor(tokenized_eng_sent_ids) for tokenized_eng_sent_ids in X]
Y_input_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_input]
Y_label_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_label]

In [28]:
X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_padded_input = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_padded_label = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [29]:
Ns = X_padded.shape[1]
Nd = Y_padded_label.shape[1]

In [30]:
# Making a class Encoder and inheriting a parent class torch.nn.module
# We have made the layers of encoders but not connected them yet
# forward is function from the parent class that is Module and we are inherriting and overwriting that function so it can not be named anything else.
# forward function me hmne connection kiya hai
class Encoder(torch.nn.Module):

    def __init__(self,src_lang_vocab_size,word_embedding_dim):
        super(Encoder,self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=src_lang_vocab_size,embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,hidden_size=word_embedding_dim,batch_first=True)

    def forward(self,X_padded_mini_batch):
        first_embedding_layer_out = self.first_embedding_layer(X_padded_mini_batch)
        encoder_output, (final_encoder_output,final_cell_state) = self.second_lstm_layer(first_embedding_layer_out)

        return encoder_output, (final_encoder_output,final_cell_state)

        

In [31]:
# word_embedding_size in encoder and decoder should be same
class Decoder(torch.nn.Module):

    def __init__(self,dst_lang_vocab_size,word_embedding_dim):
        super(Decoder,self).__init__()

        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=dst_lang_vocab_size,
                                                       embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,
                                               hidden_size=word_embedding_dim,
                                              batch_first=True)
        self.prediction_layer = torch.nn.Linear(in_features=word_embedding_dim,out_features=dst_lang_vocab_size)
        # self.prediction_layer_activation = torch.nn.Softmax(dim=2)

    def forward(self,Y_padded_input_mini_batch,final_encoder_output,final_cell_state):

        first_embedding_layer_out = self.first_embedding_layer(Y_padded_input_mini_batch)
        decoder_lstm_layer_out, (final_decoder_lstm_layer_out, final_cell_state) = self.second_lstm_layer(first_embedding_layer_out,
                                                                                                         (final_encoder_output,
                                                                                                          final_cell_state))
        prediction = self.prediction_layer(decoder_lstm_layer_out)
        
        return prediction, (final_decoder_lstm_layer_out, final_cell_state)

In [32]:
class Seq2SeqEncDec(torch.nn.Module):
    def __init__(self,src_lang_vocab_size, dst_lang_vocab_size, word_embedding_dim):
        super(Seq2SeqEncDec, self).__init__()

        self.encoder = Encoder(src_lang_vocab_size,word_embedding_dim)
        self.decoder = Decoder(dst_lang_vocab_size,word_embedding_dim)

    def forward(self,X_padded_mini_batch,Y_padded_input_mini_batch):
        encoder_output, (final_encoder_output,final_cell_state) = self.encoder(X_padded_mini_batch)
        y_hat_mini_batch = self.decoder(Y_padded_input_mini_batch, final_encoder_output,final_cell_state)

        return y_hat_mini_batch

Gradient Descent is a covex optimization algo and it is unconstrained

In [33]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [34]:
X_padded_train = X_padded[0:13000]
Y_padded_input_train = Y_padded_input[0:13000]
Y_padded_label_train = Y_padded_label[0:13000]

X_padded_test = X_padded[13000:]
Y_padded_input_test = Y_padded_input[13000:]
Y_padded_label_test = Y_padded_label[13000:]

In [36]:
network = Seq2SeqEncDec(len(Vs),len(Vd),128).to(device)

In [None]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(network.parameters())
num_epochs = 400
mb_size = 65

for epoch in range(num_epochs):
    for i in range(X_padded_train.shape[0]//mb_size):

        X_train_mb = X_padded_train[i*mb_size:(i+1)*mb_size]
        Y_input_mb = Y_padded_input_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_padded_label_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_label_mb.reshape(Y_label_mb.shape[0]*Y_label_mb.shape[1],)
        
        X_train_mb, Y_input_mb, Y_label_mb = X_train_mb.to(device), Y_input_mb.to(device), Y_label_mb.to(device)

        y_hat_train_mb = network(X_train_mb,Y_input_mb)
        y_hat_train_mb = y_hat_train_mb[0]
        y_hat_train_mb = y_hat_train_mb.reshape(y_hat_train_mb.shape[0]*y_hat_train_mb.shape[1],
                                                y_hat_train_mb.shape[2])

        loss_fn_value = loss_fn(y_hat_train_mb,Y_label_mb)

        loss_fn_value.backward()
        #torch.nn.utils.clip_grad_norm_(network.parameters(),max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

        print("Epoch # {}, Time Step # {}, Loss Value = {}".format(epoch,i,loss_fn_value))

Epoch # 0, Time Step # 0, Loss Value = 8.870214462280273
Epoch # 0, Time Step # 1, Loss Value = 8.837637901306152
Epoch # 0, Time Step # 2, Loss Value = 8.80357551574707
Epoch # 0, Time Step # 3, Loss Value = 8.78576946258545
Epoch # 0, Time Step # 4, Loss Value = 8.761303901672363
Epoch # 0, Time Step # 5, Loss Value = 8.721358299255371
Epoch # 0, Time Step # 6, Loss Value = 8.70339298248291
Epoch # 0, Time Step # 7, Loss Value = 8.643969535827637
Epoch # 0, Time Step # 8, Loss Value = 8.634782791137695
Epoch # 0, Time Step # 9, Loss Value = 8.567230224609375
Epoch # 0, Time Step # 10, Loss Value = 8.504158020019531
Epoch # 0, Time Step # 11, Loss Value = 8.463910102844238
Epoch # 0, Time Step # 12, Loss Value = 8.321535110473633
Epoch # 0, Time Step # 13, Loss Value = 8.323922157287598
Epoch # 0, Time Step # 14, Loss Value = 8.144098281860352
Epoch # 0, Time Step # 15, Loss Value = 8.022953987121582
Epoch # 0, Time Step # 16, Loss Value = 7.879510879516602
Epoch # 0, Time Step # 17, 

In [41]:
torch.save(network.state_dict(),"model.pth") # Saving the weights of model because all the data is stored in weights

In [42]:
# This is the code of inference

def generate_translation(eng_sentence): # this function will accept a string like : "My name is Arjun" , whichj will be given by the user from frontend which is a streamlet page
    
    tokenized_eng_sentence = src_sent_tokenizer.tokenize(eng_sentence) # Tokenize the seng sentence and return tokenized list => output : ["My", "name", "is", "Jas"]
    token_ids = src_sent_tokenizer.convert_tokens_to_ids(tokenized_eng_sentence) # converts the words to numeric ids => output : [4,11,48,39]
    token_ids_tensor = torch.tensor(token_ids) # Convert list to tensor -> similar to numpy array => output : tensor[4,11,48,39]
    token_ids_tensor = torch.unsqueeze(token_ids_tensor,0) # Increases the dimension of the array (converts to 2D tensor) => output : tensor[[4,11,48,39]]

    # gpu hoga to gpu pr chla jyga sb kch
    if torch.cuda.is_available():
        device = torch.device("cuda")
        token_ids_tensor = token_ids_tensor.to(device)

    encoder_outputs,(final_encoder_output,final_candidate_cell_state) = network.encoder(token_ids_tensor) # encoder_output is a matrix(4,128) jo ki hr word ka meaning store krta h, final_enocder_output(long term memory), final_candidate_cell_state(short term memory) are vectors of 128 dimension which will go in decoder and they contain complete sentence meaning
    decoder_first_time_step_input = torch.tensor([[1]])

    if torch.cuda.is_available():
        encoder_outputs = encoder_outputs.to(device)
        final_encoder_output = final_encoder_output.to(device)
        final_candidate_cell_state = final_candidate_cell_state.to(device)
        decoder_first_time_step_input = decoder_first_time_step_input.to(device)

    decoder_first_time_step_output, (hidden_decoder_output, hidden_decoder_cell_state) = network.decoder(decoder_first_time_step_input,
                                                                                                         final_encoder_output,
                                                                                                         final_candidate_cell_state,) # hindi ka first word yha se niklega
    
    generated_token_id = torch.argmax(F.softmax(decoder_first_time_step_output[:,0,:],dim=1),1)
    generated_token_id = torch.unsqueeze(generated_token_id,1)
    
    hindi_translated_sentence = str()
    hindi_translated_sentence += " " + hindi_idx2vocab[generated_token_id.item()]
    if torch.cuda.is_available():
        generated_token_id = generated_token_id.to(device)
        hidden_decoder_output = hidden_decoder_output.to(device)
        hidden_decoder_cell_state = hidden_decoder_cell_state.to(device)
    
    for i in range(Nd-1): # Nd is the number of words in the longest hindi sentence ,ye ek demerit bhi h ki hm maximum itna bda hi sentence bna skte h, this thing in NLP is called context window

        # it is a vector of 7072 dimension , isis se hmara tranlation ayga
        decoder_first_time_step_output, (hidden_decoder_output, hidden_decoder_cell_state) = network.decoder(generated_token_id,
                                                                                                             hidden_decoder_output,
                                                                                                             hidden_decoder_cell_state) # hindi ka first word yha se niklega
    
        generated_token_id = torch.argmax(F.softmax(decoder_first_time_step_output[:,0,:],dim=1),1)
        generated_token_id = torch.unsqueeze(generated_token_id,1)
    
        if torch.cuda.is_available():
            generated_token_id = generated_token_id.to(device)
            hidden_decoder_output = hidden_decoder_output.to(device)
            hidden_decoder_cell_state = hidden_decoder_cell_state.to(device)
    
        if generated_token_id.item() == Vd["<EOS>"]:
            break

        hindi_translated_sentence += " " + hindi_idx2vocab[generated_token_id.item()]

    return hindi_translated_sentence

In [43]:
generate_translation("Muriel")

' किताब पेंट में मैं तैरने गति काम सह सीड़ियों पेन्सिल दुल्हन पेंट लेखक बंदूकें सीख उछलो कौनसा मारो तुमने अम्रीकी अजय रूसी शरीर क्रांति वादी शिक्षक धरती साम्राज्य पाल इनका में फेंक स्वच्छ इसको गोश्त ब्रिज काग़ज़ बस्ता ब्रह्मांड यॉर्गेन इसलिए चालीस पत्नी अनुपस्थित हां सुशी बंदूकें संरक्षण बाज़ार प्रस्ताव स्वीकार नहीं घटना सहारा अनुपस्थित महिलाओं जरूरी आपातकालीन न्यू चाईनिज पिंजरा कार्लोस ज़ाहिर बास्टन प्रार्थना भाईसाहब निकम्मे विभिन्न'