## Import Libraries

In [None]:
import string
import re
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
from datasets import load_dataset

pd.set_option('display.max_colwidth', 200)
#pd.set_option('display.expand_frame_repr', False)


# Download the IIT Bombay English-Hindi parallel corpus (`cfilt/iitb-english-hindi`) with the `datasets` library

In [None]:
# Fetch the corpus; `dataset` is kept under its original name for later cells.
dataset = load_dataset("cfilt/iitb-english-hindi")

# Slice the train split BEFORE touching the "translation" column. Reading the
# whole column first materialises every row of the split only for all but the
# first 10000 to be thrown away afterwards.
train_head = dataset["train"][:10000]

# Each translation entry is a dict keyed by language code; store it as a
# [hindi, english] pair so the order matches the DataFrame columns below.
pairs = [[entry["hi"], entry["en"]] for entry in train_head["translation"]]

# Built directly from the truncated pairs, so `lines` is a fresh frame with a
# 0..n-1 index rather than a slice of a larger one (later cells assign into it).
lines = pd.DataFrame(columns=["hindi", "eng"], data=pairs)
lines.tail()

In [None]:
######### Remove punctuation and convert to lowercase
# The translation table does not depend on the sentence, so it is built once
# here instead of once per row.
# NOTE(review): string.punctuation lists ASCII punctuation only, so Devanagari
# marks such as the danda stay in the Hindi text -- confirm this is intended.
punctuation_table = str.maketrans('', '', string.punctuation)

# One pass per column: strip punctuation first, then lowercase the result.
for column in ('hindi', 'eng'):
    lines[column] = [
        sent.translate(punctuation_table).lower() for sent in lines[column]
    ]
######### Remove punctuation and convert to lowercase


In [None]:
##################  Populate the lists with sentence lengths

# Length of a sentence = number of whitespace-separated tokens it splits into.
hindi_sent_length_list = [len(words) for words in map(str.split, lines['hindi'])]
eng_sent_length_list = [len(words) for words in map(str.split, lines['eng'])]

# One histogram per language, 30 bins each.
length_df = pd.DataFrame(
    {
        'eng': eng_sent_length_list,
        'hindi': hindi_sent_length_list,
    }
)
length_df.hist(bins=30)
plt.show()

## (Tokenize) Convert each sentence to a list of integers

In [None]:

############ Function to build a tokenizer
def tokenization(lines):
    """Return a new ``Tokenizer`` fitted on ``lines``.

    ``lines`` is handed unchanged to ``Tokenizer.fit_on_texts``; the fitted
    tokenizer object is returned so callers can read its ``word_index`` or
    encode text with it.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
############ Function to build a tokenizer    

In [None]:
# Fixed per-language lengths.
# NOTE(review): presumably the padded length later passed to
# encode_sequences -- confirm against the cells that use them.
eng_length = 8
hindi_length = 8

################ English tokenizer
eng_tokenizer = tokenization(lines['eng'])
# Vocabulary size is the number of indexed words plus one
# (presumably to leave index 0 free for padding -- TODO confirm).
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)

################ Hindi tokenizer
hindi_tokenizer = tokenization(lines['hindi'])
hindi_vocab_size = 1 + len(hindi_tokenizer.word_index)
print('Hindi Vocabulary Size: %d' % hindi_vocab_size)


## To print eng dictionary
# eng_tokenizer.word_index

## To print hindi dictionary
# hindi_tokenizer.word_index

### Encode and Pad Sentences

In [None]:

def encode_sequences(tokenizer, length, lines):
    """Encode ``lines`` as integer sequences of uniform length.

    ``tokenizer.texts_to_sequences`` maps each text to a list of integers,
    and ``pad_sequences`` then brings every list to ``length`` entries,
    adding zeros at the end (``padding='post'``) of shorter ones.

    NOTE(review): ``truncating`` is left at the library default, so check
    the Keras documentation for which end of an over-long sentence is cut.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

## Create a 2D array where each row holds the Hindi sentence followed by the English sentence

In [None]:
# Two-column array: column 0 is the Hindi sentence, column 1 the English one.
hindi_eng = lines.loc[:, ['hindi', 'eng']].to_numpy()
hindi_eng[:5]  # preview the first five pairs

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs as the test set; the fixed seed keeps
# the split reproducible between runs.
train, test = train_test_split(
    hindi_eng,
    random_state=12,
    test_size=0.2,
)