In [1]:
!pip3 install indic-nlp-library



In [3]:
import pandas as pd  #for dataset

import numpy as np   # for matrices and array

from transformers import AutoTokenizer # trnasformer is a library and AutoTokenizer is a part of it.
# Tokenizer => To break a sentence into words is called tokenization.It is done so that the data can be easily understood by the model.
# Ex : I am with 5th Sem students of KIET. => AFter tokenization : [I,am,with,5th,Sem,students,of,KIET]
# Natural language data is the hardest to train
# Autotokenizer is a program which as great functions to tokenize such type of hard data
# Tokenizers are also trained ml models to do tokenization

import torch 
from indicnlp.tokenize import indic_tokenize



In [4]:
data = pd.read_csv("/kaggle/input/english-hindi/Sentence pairs in English-Hindi - 2025-02-11.tsv",
                  sep="\t", header=None, names=["SrcSentID","SrcSent","DstSentID","DstSent"])

In [7]:
data.drop(labels=[data.columns[0], data.columns[2]], axis=1, inplace=True)
# This line removes the first and third columns of the dataframe, permanently updating data.

In [8]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,Muiriel is 20 now.,म्यूरियल अब बीस साल की हो गई है।
1,Muiriel is 20 now.,म्यूरियल अब बीस साल की है।
2,Education in this world disappoints me.,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,That won't happen.,वैसा नहीं होगा।
4,I miss you.,मुझें तुम्हारी याद आ रही है।


In [9]:
# to tokenize english sentence we will use AutoTokenizer 
# for hindi => indic_tokenize
src_sent_tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base") # T5 is an LLM based on tenasformers made by google
# we are using base model of T5
# "google-T5/T5-base" => It is like a path to a repository on internet => hugging face ki ek repository hogi uske andr ggoogle-T5 ek repository hogi jiske andr ek folder hoga T5-base
# hr LLM ka Apna ek tokenizer hota hai jo ki ek alg trh se train kiya jata hai


In [10]:
data["SrcSent"] = data["SrcSent"].apply(lambda x : src_sent_tokenizer.tokenize(x))
# hr cell pr kch operation krna hai to applymap use krenge
# kisi ek row ya column  pr opr krna hai to yse apply
# agr hashing krni hai to t=use map
# panda has => apply, map, applymap



In [11]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की हो गई है।
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की है।
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,"[▁That, ▁won, ', t, ▁happen, .]",वैसा नहीं होगा।
4,"[▁I, ▁miss, ▁you, .]",मुझें तुम्हारी याद आ रही है।


In [33]:
# this type of tokenization is called sub-word tokenization as it also breaks a single word into parts and it also treart space as a token represnted by unserscore(_)
# BPE(Byte-Pair encoding) => Algorithm is used behind this to do this type of tokenization
# WPT(Word Piece Tokenizer) => it is also an algo for tokenization

In [12]:
data["DstSent"] = data["DstSent"].apply(lambda x : indic_tokenize.trivial_tokenize(x,lang="hi"))

In [13]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[▁That, ▁won, ', t, ▁happen, .]","[वैसा, नहीं, होगा, ।]"
4,"[▁I, ▁miss, ▁you, .]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [14]:
data["SrcSent"] = data["SrcSent"].apply(src_sent_tokenizer.convert_tokens_to_ids)

In [38]:
Vs = src_sent_tokenizer.get_vocab() # hr src word ko ek number assign kr dega

In [16]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[466, 751, 31, 17, 1837, 5]","[वैसा, नहीं, होगा, ।]"
4,"[27, 3041, 25, 5]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [17]:
hindi_vocab = set()

for tokenized_hindi_sent in data["DstSent"] :
    hindi_vocab.update(tokenized_hindi_sent) # this is a set of unique hindi words from our dst column

In [18]:
Vd = dict()
for idx, token in enumerate(hindi_vocab) : 
    Vd[token] = idx + 3
Vd["<PAD>"] = 0
Vd["<SOS>"] = 1
Vd["<EOS>"] = 2

Tokens in neural network: 
"<SOS>" (1) => Start of Sentence => USed to denote or tell the neural network that a sentece is going to start.
"<PAD>" (0) => Padding => Used for doing padding to make the length of all the sentence equal, so that a matrix can be made"
"<EOS>" (2) => End of Service => USed to denote the end of the sentence.

In [19]:
def convert_hindi_tokens_to_ids(hindi_sent) : 
    return [Vd[token] for token in hindi_sent]

In [20]:
data["DstSent"] = data["DstSent"].apply(lambda x : convert_hindi_tokens_to_ids(x))

In [21]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[6601, 338, 5511, 656, 4176, 921, 4002, 4627, ..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[6601, 338, 5511, 656, 4176, 4627, 1179]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[3062, 3520, 279, 758, 6082, 5797, 5501, 1054,..."
3,"[466, 751, 31, 17, 1837, 5]","[380, 1922, 568, 1179]"
4,"[27, 3041, 25, 5]","[1323, 5957, 2601, 2048, 5702, 4627, 1179]"


In [22]:
# adding SOS(1) in start
def insert_sos_token_id(hindi_sent_token_ids):
    return [1] + hindi_sent_token_ids # Concatenation hora h

In [23]:
data["DstSentInput"] = data["DstSent"].apply(lambda x : insert_sos_token_id(x))

In [24]:
# adding EOS (2) at end
def insert_eos_token_id(hindi_sent_token_ids): 
    return hindi_sent_token_ids + [2]

In [25]:
data["DstSentLabel"] = data["DstSent"].apply(lambda x : insert_eos_token_id(x))

In [26]:
data.drop(labels=[data.columns[1]],axis=1,inplace=True)

In [28]:
X = list(data["SrcSent"])
Y_input = list(data["DstSentInput"])
Y_label = list(data["DstSentLabel"])

In [31]:
X_tensor = [torch.tensor(tokenized_eng_sent_ids) for tokenized_eng_sent_ids in X]
Y_input_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_input]
Y_label_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_label]

In [32]:
X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_padded_input = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_padded_label = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [33]:
Ns = X_padded.shape[1]
Nd = Y_padded_label.shape[1]