In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/english-hindi-dataset/Dataset_English_Hindi.csv


In [None]:
!pip install indic-nlp-library
!pip install contractions
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import numpy as np
import re
from indicnlp.tokenize import indic_tokenize
import spacy
import random
import contractions

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
filePath="/kaggle/input/english-hindi-dataset/Dataset_English_Hindi.csv"

In [6]:
corpus=pd.read_csv(filePath)

In [7]:
# pd.options.display.max_colwidth=100
pd.set_option('display.max_colwidth', None)

# Text Preprocesssing

In [8]:
corpus.head(10)

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।
5,Hello!,नमस्कार।
6,Cheers!,वाह-वाह!
7,Cheers!,चियर्स!
8,Got it?,समझे कि नहीं?
9,I'm OK.,मैं ठीक हूँ।


In [9]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130476 entries, 0 to 130475
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   English  130474 non-null  object
 1   Hindi    130164 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [10]:
corpus['English'] = corpus['English'].str.lower().str.strip()
corpus['Hindi'] = corpus['Hindi'].str.lower().str.strip()

In [11]:
corpus.isnull().sum()

English      2
Hindi      312
dtype: int64

In [12]:
corpus.dropna(inplace=True)
corpus=corpus.reset_index(drop=True)

In [13]:
corpus.duplicated().sum()

3342

In [15]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

corpus['English'] = corpus['English'].apply(remove_url)
corpus['Hindi'] = corpus['Hindi'].apply(remove_url)

In [None]:
def remove_html_tags(text):
    pattern =r'<.*?>'
    text = re.sub(pattern,'',text)
    return text
corpus['English']=corpus['English'].apply(remove_html_tags)
# remove_html_tags("hello are> , <your there?")

In [17]:
def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

corpus['English'] = corpus['English'].apply(expand_contractions)


In [None]:
def preprocess_text(text, lang='eng'):
    if not isinstance(text, str):
        return text

    if lang == 'eng':
        pattern = re.compile(r'[^a-zA-Z0-9\s]')
    elif lang == 'hin':
        pattern = re.compile(r'[^\u0900-\u097F0-9\s]')
    else:
        raise ValueError("Unsupported Language, Supported languages are 'eng' and 'hin'")
    text = pattern.sub('', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

corpus['English'] = corpus['English'].apply(lambda x: preprocess_text(x, lang='eng'))
corpus['Hindi'] = corpus['Hindi'].apply(lambda x: preprocess_text(x, lang='hin'))

In [19]:
# duplicates = corpus[corpus.duplicated()]
# print(duplicates.head(10))
corpus.duplicated().sum()

3854

In [20]:
# corpus.to_csv("cleanedCorpus.csv",index=False)
corpus.drop_duplicates(inplace=True)
corpus=corpus.reset_index(drop=True)

In [21]:
#checking for spaces
rows_with_spaces = corpus.apply(lambda x: x.str.strip() == '').any(axis=1)
rows_with_spaces = corpus[rows_with_spaces].index
rows_with_spaces

Index([  3179,   4315,   4404,   4943,   5031,   5606,   6085,   6295,   6416,
         6615,
       ...
       124467, 124743, 124844, 124955, 124996, 125000, 125351, 125805, 126027,
       126148],
      dtype='int64', length=558)

In [22]:
# removing rows
corpus = corpus[~corpus.apply(lambda x: x.str.strip() == '').any(axis=1)].reset_index(drop=True)

# Tokenization

In [23]:
def tokenizerEN(text):
  tokens= [tok.text.lower() for tok in nlp.tokenizer(text)]
  return tokens

In [None]:
corpus.shape

(125750, 2)

In [None]:
# Removing Sentences whose Length of tokens is greater than 25

MAX_SEQ_LEN = 25

def filter_long_rows(df, max_len):
    def valid(row):
        return len(tokenizerEN(row['English'])) < max_len and \
               len(indic_tokenize.trivial_tokenize(row['Hindi'])) < max_len
    return df[df.apply(valid, axis=1)].reset_index(drop=True)

corpus = filter_long_rows(corpus, MAX_SEQ_LEN)

In [26]:
corpus.shape

(97180, 2)

In [27]:
corpus.to_csv("final.csv")

# Vocab Gen

In [28]:
PAD_TOKEN = 0
UNK_TOKEN = 1
SOS_TOKEN = 2
EOS_TOKEN = 3

In [29]:
eng_vocab={'<PAD>':PAD_TOKEN, '<UNK>':UNK_TOKEN,"<SOS>": SOS_TOKEN, "<EOS>": EOS_TOKEN,}
hin_vocab={'<PAD>':PAD_TOKEN ,'<UNK>':UNK_TOKEN,"<SOS>": SOS_TOKEN, "<EOS>": EOS_TOKEN,}

In [None]:
def vocabGen(row):
  eng_token=tokenizerEN(row["English"])
  hin_token=indic_tokenize.trivial_tokenize_indic(row["Hindi"])

  for token in eng_token:
    if token not in eng_vocab:
      eng_vocab[token]=len(eng_vocab)
  for token in hin_token:
    if token not in hin_vocab:
      hin_vocab[token]=len(hin_vocab)


corpus.apply(vocabGen,axis=1)

In [32]:
len(eng_vocab) ,len(hin_vocab)

(50494, 51555)

In [None]:
def text2idx(sentence, lang="eng",vocab=eng_vocab):
  if lang=="eng":
    sentence_token=tokenizerEN(sentence)
  elif lang=="hin":
    sentence_token=indic_tokenize.trivial_tokenize(sentence)
  else:
    raise ValueError("Unsupported Language, Supported languages are 'eng' and 'hin'")
  
  indexed_text = []
  for token in sentence_token:
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(UNK_TOKEN)
  indexed_text.append(EOS_TOKEN)

  return indexed_text


In [34]:
text2idx(corpus.iloc[1234]["English"],"eng",eng_vocab)
# corpus.iloc[1234]

[74, 424, 53, 1101, 353, 3]

## Dataset and DataLoader preparation

In [35]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [37]:
class MTDataset(Dataset):

  def __init__(self, df, eng_vocab,hin_vocab):
    self.df = df
    self.eng_vocab = eng_vocab
    self.hin_vocab= hin_vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):

    numerical_eng = text2idx(self.df.iloc[index]['English'],"eng", self.eng_vocab)
    numerical_hin = text2idx(self.df.iloc[index]['Hindi'], "hin", self.hin_vocab)
    encoder_inputs =torch.tensor(numerical_eng[:-1],dtype=torch.long)
    numerical_hin=torch.tensor(numerical_hin,dtype=torch.long)
    decoder_targets=numerical_hin

    return encoder_inputs.to(device), decoder_targets.to(device)

In [38]:
def collate_fn(batch):
    encoder_inputs, decoder_targets = zip(*batch)
    input_lengths=torch.tensor([len(seq) for seq in encoder_inputs])
    encoder_inputs = pad_sequence(encoder_inputs, batch_first=True, padding_value=0)
    decoder_targets = pad_sequence(decoder_targets, batch_first=True, padding_value=0)

    return encoder_inputs, decoder_targets,input_lengths


In [39]:
dataset = MTDataset(corpus, eng_vocab, hin_vocab)

In [None]:
seed=42
g = torch.Generator()
g.manual_seed(seed)
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size],generator=g)

BATCH_SIZE=128
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn,drop_last=True,generator=g)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn ,drop_last=True,generator=g)

# Index TO Word 

In [42]:
# index2word
hin_idx2word={v:k for k,v in hin_vocab.items()}
eng_idx2word={v:k for k,v in eng_vocab.items()}

# Model

In [None]:
class EncoderRNN(nn.Module):
  def __init__(self,eng_vocab_size,hidden_size,embedding_size):
    super().__init__()
    self.embedding=nn.Embedding(eng_vocab_size,embedding_dim=embedding_size)
    self.dropout=nn.Dropout(0.4)
    self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True, bidirectional=True)
    self.reduce=nn.Linear(hidden_size*2,hidden_size)

  def forward(self, encoder_inputs,input_lengths):
    embedded_input = self.dropout(self.embedding(encoder_inputs))
    packed_embedded = pack_padded_sequence(embedded_input, input_lengths, batch_first=True, enforce_sorted=False)
    output, (hidden_state,cell_state) = self.lstm(packed_embedded)
    output, _ = pad_packed_sequence(output, batch_first=True)
    output= self.reduce(output)
    hidden_state = torch.cat((hidden_state[0,:,:], hidden_state[1,:,:]), dim=1).unsqueeze(0)
    cell_state = torch.cat((cell_state[0,:,:], cell_state[1,:,:]), dim=1).unsqueeze(0)
    hidden_state= self.reduce(hidden_state)
    cell_state= self.reduce(cell_state)
    return output, hidden_state,cell_state


In [None]:
class Attention(nn.Module):
  def __init__(self,hidden_size):
    super().__init__()
    self.l = nn.Linear(hidden_size*2, hidden_size * 2, bias=False)
    self.o = nn.Linear(hidden_size * 2, 1, bias=False)


  def forward(self,encoder_output,decoder_hidden,mask=None):
    decoder_hidden=decoder_hidden.squeeze(0)
    decoder_hidden=decoder_hidden.unsqueeze(1).repeat(1,encoder_output.size(1),1)
    combined=torch.cat((encoder_output,decoder_hidden),dim=2)
    x = torch.tanh(self.l(combined))
    attention = self.o(x).squeeze(2)
    if mask is not None:
        attention = attention.masked_fill(mask == 0, -1e9)
    attention_weights = F.softmax(attention, dim=1)
    context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_output).squeeze(1)
    return context_vector, attention_weights



In [None]:
class AttentionDecoder(nn.Module):
  def __init__(self,hin_vocab_size,hidden_size,embedding_size):
    super().__init__()
    self.attention = Attention(hidden_size)
    self.embedding=nn.Embedding(hin_vocab_size,embedding_dim=embedding_size)
    self.lstm = nn.LSTM(embedding_size+hidden_size, hidden_size, batch_first=True)
    self.dropout = nn.Dropout(0.4)
    self.out = nn.Linear(hidden_size, hin_vocab_size)
    self.attention_matrix = []
  def forward(self,input, encoder_hidden,encoder_cell,encoder_output,mask=None ,store_attention=False):

      output = self.dropout(self.embedding(input))
      context_vector,attention_weights=self.attention(encoder_output,encoder_hidden,mask=mask)
      if store_attention:
        self.attention_matrix.append(attention_weights.detach().cpu().numpy())
      lstm_input = torch.cat((output, context_vector.unsqueeze(1)), dim=2)
      output, (hidden, cell) = self.lstm(lstm_input, (encoder_hidden, encoder_cell))
      output = self.out(output)
      return output, hidden, cell

# Translation

In [56]:
HIDDEN_SIZE=1024
EMBEDDING_SIZE=256
eng_vocab_size = len(eng_vocab)
hin_vocab_size = len(hin_vocab)

encoder = EncoderRNN(eng_vocab_size,HIDDEN_SIZE,EMBEDDING_SIZE).to(device)
decoder = AttentionDecoder(hin_vocab_size,HIDDEN_SIZE,EMBEDDING_SIZE).to(device)

In [47]:
def translate(encoder,decoder,sentence,eng_vocab=eng_vocab,hin_idx2word=hin_idx2word,maxLength=25):
  encoder.eval()
  decoder.eval()
  with torch.inference_mode():
    tokenized_sent=text2idx(sentence,"eng",eng_vocab)[:-1]
    input_tensor= torch.tensor(tokenized_sent,dtype=torch.long)
    input_tensor = input_tensor.view(1, -1).to(device)
    input_lengths=torch.tensor([len(seq) for seq in input_tensor])
    encoder_output , encoder_hidden,encoder_cell=encoder(input_tensor,input_lengths)

    decoder_input=torch.tensor([[SOS_TOKEN]]).to(device)
    decoder_hidden=encoder_hidden
    decoder_cell=encoder_cell
    decoded_words=[]
    for di in range(maxLength):
      logits,decoder_hidden,decoder_cell=decoder(decoder_input,decoder_hidden,decoder_cell,encoder_output,store_attention=True)
      logits=logits.reshape(logits.size(0),-1)
      next_token=logits.argmax(axis=1)
      decoder_input=torch.tensor([[next_token]]).to(device)
      if next_token.item()==EOS_TOKEN:
        break
      else:
        decoded_words.append(hin_idx2word[next_token.item()])
    return ' '.join(decoded_words)


In [None]:
sentence="what is your name,".strip()
translated_sent=translate(encoder,decoder,sentence,eng_vocab,hin_idx2word)
print("Hindi translation: ",translated_sent)

Hindi translation:  तुम्हारा नाम क्या है


# Training

In [50]:
len(train_loader),len(test_loader)

(683, 75)

In [None]:
def get_teacher_forcing_ratio(epoch, start=1.0, end=0.6):
    return max(end, start * (0.95 ** epoch)) 


In [None]:
learning_rate=3e-4
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
encoder_optimizer=optim.AdamW(encoder.parameters(),lr=learning_rate)
decoder_optimizer=optim.AdamW(decoder.parameters(),lr=learning_rate*5)

In [None]:

epochs=30


total_loss=[]
for epoch in range(epochs):
  encoder.train()
  decoder.train()
  epoch_loss =0
  teacher_forcing_ratio = get_teacher_forcing_ratio(epoch, epochs)
  for i,(encoder_input, decoder_target,input_lengths) in enumerate(train_loader):
    input_tensor, target_tensor = encoder_input, decoder_target

    mask = (input_tensor != PAD_TOKEN).to(device)  
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(1)

    encoder_output, encoder_hidden ,encoder_cell= encoder(input_tensor,input_lengths)

    decoder_input = torch.full((BATCH_SIZE, 1),SOS_TOKEN, dtype=torch.long).to(device)
    decoder_hidden = encoder_hidden
    decoder_cell = encoder_cell


    batch_loss = 0
    for di in range(target_length):
      logits, decoder_hidden,decoder_cell  = decoder(decoder_input, decoder_hidden,decoder_cell,encoder_output,mask=mask)
      logits=logits.reshape(logits.size(0),-1)
      batch_loss += loss_fn(logits, target_tensor[:,di])

      use_teacher_forcing = random.random() < teacher_forcing_ratio 
      if use_teacher_forcing:
          decoder_input = target_tensor[:, di].unsqueeze(1)
      else:
          top1 = logits.argmax(1).unsqueeze(1)
          decoder_input = top1.detach()


    batch_loss.backward()
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)
    encoder_optimizer.step()
    decoder_optimizer.step()
    epoch_loss +=batch_loss.item()
    if i % 10 == 0:
        print(f'Epoch {epoch}, Batch {i}, Loss: {batch_loss.item() / target_length:.4f}')
  total_loss.append(epoch_loss /(len(train_loader)))
  print(f'--- Epoch {epoch} Completed, Avg Loss: {total_loss[-1]:.4f} ---')


# Plotting Losses and Saving Model

In [None]:
import pandas as pd

loss_df = pd.DataFrame({
    "epoch": list(range(len(total_loss))),
    "loss": total_loss
})
loss_df.to_csv("root/training_lossM63.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
plt.plot(total_loss, marker='o')
plt.title("Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.tight_layout()
plt.savefig("root/training_lossM6.png")
plt.show()

In [None]:

torch.save({
    'encoder_state': encoder.state_dict(),
    'decoder_state': decoder.state_dict(),
    'encoder_optimizer': encoder_optimizer.state_dict(),
    'decoder_optimizer': decoder_optimizer.state_dict(),
    'loss': total_loss,
}, "checkpoint3.pt")

In [None]:
print("done")

In [None]:
# from IPython.display import FileLink
# (FileLink('checkpoint.pt')) # your model name.

# Testing Model

In [73]:
def decode_indices(indices, idx2word):
    words = []
    for idx in indices:
        word = idx2word.get(idx, "<UNK>")
        if word == "<EOS>":
            break
        if word not in ["<PAD>", "<SOS>"]:
            words.append(word.strip())
    return words


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method4

bleu_scores = []

with torch.no_grad():
    for i,batch in enumerate(test_loader):
        input_tensor, target_tensor,input_lengths = batch
        for i in range(input_tensor.size(0)):
            input=decode_indices(input_tensor[i].tolist(),eng_idx2word)
            input=" ".join(input)
            if(input=="" or input==" "):
              continue
            predicted_sent = translate(encoder,decoder,sentence=input).split()
            target = decode_indices(target_tensor[i].tolist(), hin_idx2word)

            if len(predicted_sent) == 0 or len(target) == 0:
                continue
            # print("Predicted: "," ".join(predicted_sent))
            # print("Actual: "," ".join(target))
            bleu = sentence_bleu([target], predicted_sent, smoothing_function=smoothie)
            bleu_scores.append(bleu)
        
        

average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU score on test set: {average_bleu*100:.4f}")


Average BLEU score on test set: 10.0045
