In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import nltk
from nltk.translate.bleu_score import sentence_bleu
import re
import string
from string import digits



# Load CSV file
data = pd.read_csv('data/Hindi_English_Truncated_Corpus.csv')
data = data.sample(n=20000, random_state=42)
data = data.reset_index(drop=True)



# Replace NA values with empty strings
data['hindi_sentence'] = data['hindi_sentence'].fillna('')
data['english_sentence'] = data['english_sentence'].fillna('')

# Add 'startseq' and 'endseq' tokens to English sentences
data['english_sentence'] = data['english_sentence'].apply(lambda x: 'startseq ' + x + ' endseq')

# Preprocess the data
def preprocess_text(text):
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Lowercase the text
    text = text.lower()
    return text


# Preprocess the hindi data
def preprocess_hindi_text(text):
    # Remove English words
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
   
    text = text.lower()
    return text


data['hindi_sentence'] = data['hindi_sentence'].apply(preprocess_hindi_text)
data['english_sentence'] = data['english_sentence'].apply(preprocess_text)


# Lowercase all characters
data['english_sentence']=data['english_sentence'].apply(lambda x: x.lower())
data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: x.lower())

#remove qoutes
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub("'", '', x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("'", '', x))



exclude = set(string.punctuation)  # Set of all special characters

# Remove all the special characters
data['english_sentence'] = data['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))



# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.translate(remove_digits))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.strip())
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.strip())
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))


#adding 2 new coloumns to see sentence length
data['length_eng_sentence'] = data['english_sentence'].apply(lambda x: len(x.split(" ")))
data['length_hin_sentence'] = data['hindi_sentence'].apply(lambda x: len(x.split(" ")))



In [4]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,tides,startseq he declares the result and reports it...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...,21,20
1,ted,startseq was a little uncomfortable for them e...,थोडा कठिन था।,8,3
2,indic2012,startseq but mulla assamudin was proved to be ...,मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए।,11,6
3,ted,startseq i would never have to make a book and...,मुझे कभी भी किताब बना कर किसी प्रदर्शनस्थल को ...,17,14
4,indic2012,startseq headind kaun banega crorepati endseq,शीर्षक कौन बनेगा करोड़पति,6,4


In [5]:
data.shape

(20000, 5)

In [6]:
#keeping sentences with lenght upto max_sentence_length

max_sentence_length=20

data=data[data['length_eng_sentence']<= max_sentence_length]
data=data[data['length_hin_sentence']<= max_sentence_length]

data = data.reset_index(drop=True)

In [7]:
data.shape

(13683, 5)

In [8]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,ted,startseq was a little uncomfortable for them e...,थोडा कठिन था।,8,3
1,indic2012,startseq but mulla assamudin was proved to be ...,मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए।,11,6
2,ted,startseq i would never have to make a book and...,मुझे कभी भी किताब बना कर किसी प्रदर्शनस्थल को ...,17,14
3,indic2012,startseq headind kaun banega crorepati endseq,शीर्षक कौन बनेगा करोड़पति,6,4
4,tides,startseq no other national leader except nehru...,नेहरू को छोड़कर और किसी भी राष्ट्र नेता के मन ...,11,15


In [9]:
max_num_words = 20000

# Tokenizer for Hindi sentences
hindi_tokenizer = Tokenizer(num_words=max_num_words)
hindi_tokenizer.fit_on_texts(data['hindi_sentence'])
hindi_vocab_size = len(hindi_tokenizer.word_index) + 1

# Tokenizer for English sentences
english_tokenizer = Tokenizer(num_words=max_num_words)
english_tokenizer.fit_on_texts(data['english_sentence'])
english_vocab_size = len(english_tokenizer.word_index) + 1


In [10]:
hindi_tokenizer.index_word

{1: 'के',
 2: 'में',
 3: 'है',
 4: 'और',
 5: 'की',
 6: 'से',
 7: 'का',
 8: 'हैं',
 9: 'को',
 10: 'एक',
 11: 'कि',
 12: 'पर',
 13: 'यह',
 14: 'भी',
 15: 'है।',
 16: 'नहीं',
 17: 'इस',
 18: 'ही',
 19: 'लिए',
 20: 'जो',
 21: 'ने',
 22: 'कर',
 23: 'तो',
 24: 'आप',
 25: 'था',
 26: 'हो',
 27: 'ये',
 28: 'मैं',
 29: 'हम',
 30: 'हैं।',
 31: 'कुछ',
 32: 'किया',
 33: 'करने',
 34: 'बहुत',
 35: 'अपने',
 36: 'गया',
 37: 'थे',
 38: 'या',
 39: 'वे',
 40: 'होता',
 41: '।',
 42: 'तक',
 43: 'क्या',
 44: 'वो',
 45: 'साथ',
 46: 'भारत',
 47: 'जाता',
 48: 'रहे',
 49: 'थी',
 50: 'वह',
 51: 'करते',
 52: 'सकते',
 53: 'कोई',
 54: 'तरह',
 55: 'जब',
 56: 'लेकिन',
 57: 'तथा',
 58: 'किसी',
 59: 'रहा',
 60: 'दिया',
 61: 'मुझे',
 62: 'बारे',
 63: 'बाद',
 64: 'समय',
 65: 'अब',
 66: 'उन्हें',
 67: 'उनके',
 68: 'बात',
 69: 'था।',
 70: 'रूप',
 71: 'काम',
 72: 'सबसे',
 73: 'जा',
 74: 'हुआ',
 75: 'करना',
 76: 'द्वारा',
 77: 'इसके',
 78: 'इसे',
 79: 'सकता',
 80: 'होती',
 81: 'मे',
 82: 'कहा',
 83: 'हूँ',
 84: 'लिये',
 85: '

In [11]:
hindi_tokenizer.word_counts

OrderedDict([('थोडा', 8),
             ('कठिन', 32),
             ('था।', 197),
             ('मगर', 74),
             ('मुल्ला', 5),
             ('असमुद्दीन', 3),
             ('अक्षम', 4),
             ('सिद्ध', 9),
             ('हुए।', 14),
             ('मुझे', 220),
             ('कभी', 83),
             ('भी', 1016),
             ('किताब', 12),
             ('बना', 87),
             ('कर', 637),
             ('किसी', 230),
             ('प्रदर्शनस्थल', 1),
             ('को', 1765),
             ('देने', 61),
             ('की', 2589),
             ('ज़रुरत', 6),
             ('नहीं', 962),
             ('पड़ी', 8),
             ('शीर्षक', 6),
             ('कौन', 25),
             ('बनेगा', 2),
             ('करोड़पति', 2),
             ('नेहरू', 23),
             ('छोड़कर', 12),
             ('और', 2746),
             ('राष्ट्र', 18),
             ('नेता', 23),
             ('के', 4436),
             ('मन', 20),
             ('में', 3644),
             ('यह', 1084),
         

In [12]:
english_tokenizer.word_index

{'startseq': 1,
 'endseq': 2,
 'the': 3,
 'of': 4,
 'and': 5,
 'to': 6,
 'in': 7,
 'is': 8,
 'a': 9,
 'that': 10,
 'this': 11,
 'it': 12,
 'was': 13,
 'are': 14,
 'you': 15,
 'for': 16,
 'i': 17,
 'on': 18,
 'we': 19,
 'with': 20,
 'have': 21,
 'be': 22,
 'as': 23,
 'he': 24,
 'not': 25,
 'they': 26,
 'from': 27,
 'by': 28,
 'but': 29,
 'so': 30,
 'there': 31,
 'at': 32,
 'its': 33,
 'were': 34,
 'has': 35,
 'one': 36,
 'his': 37,
 'can': 38,
 'what': 39,
 'all': 40,
 'an': 41,
 'about': 42,
 'which': 43,
 'or': 44,
 'also': 45,
 'these': 46,
 'do': 47,
 'their': 48,
 'people': 49,
 'india': 50,
 'will': 51,
 'had': 52,
 'if': 53,
 'like': 54,
 'them': 55,
 'other': 56,
 'when': 57,
 'my': 58,
 'more': 59,
 'our': 60,
 'world': 61,
 'very': 62,
 'out': 63,
 'who': 64,
 'time': 65,
 'some': 66,
 'your': 67,
 'now': 68,
 's': 69,
 'no': 70,
 'only': 71,
 'many': 72,
 'up': 73,
 'first': 74,
 'hindi': 75,
 'how': 76,
 'because': 77,
 'after': 78,
 'been': 79,
 'indian': 80,
 'here': 81,
 

In [13]:
# Convert sentences to sequences
hindi_sequences = hindi_tokenizer.texts_to_sequences(data['hindi_sentence'])
english_sequences = english_tokenizer.texts_to_sequences(data['english_sentence'])

In [14]:
hindi_sequences

[[1737, 452, 69],
 [191, 2621, 3789, 3103, 1566, 1039],
 [61, 169, 14, 1207, 162, 22, 58, 7728, 9, 235, 5, 2227, 16, 1738],
 [2228, 591, 5042, 5043],
 [647, 9, 1208, 4, 58, 14, 824, 648, 1, 739, 2, 13, 1040, 16, 37],
 [615,
  21,
  1739,
  649,
  1,
  19,
  203,
  740,
  314,
  547,
  278,
  363,
  650,
  1209,
  1307,
  914,
  1210,
  75,
  396,
  69],
 [1954,
  2622,
  21,
  7729,
  100,
  5,
  741,
  482,
  66,
  7730,
  3790,
  2,
  651,
  32,
  4,
  7731,
  3104],
 [98, 1955, 4, 3791, 21, 525, 32, 3],
 [397, 696, 1740, 3792, 1956, 2, 3],
 [104, 61, 3105, 3793, 1, 2623, 33, 1, 62, 2, 592, 742, 114, 23, 43, 1957],
 [191, 101, 204, 14, 876, 199, 52, 8, 41],
 [94, 38, 85, 5044, 7, 1567, 1568],
 [315, 21, 262, 825, 1431, 1432, 647, 9, 5045, 269],
 [107, 7732, 2, 219, 12, 430, 1569, 7733, 5, 3106, 469, 219, 2624],
 [5046,
  3107,
  1041,
  70,
  6,
  3108,
  42,
  18,
  1308,
  37,
  4,
  483,
  343,
  1041,
  125,
  9,
  18,
  60,
  47,
  25],
 [3109, 526, 3109, 1570, 172, 111, 3],
 [6

In [15]:
english_sequences

[[1, 13, 9, 199, 4418, 16, 55, 2],
 [1, 29, 2731, 4419, 13, 2330, 6, 22, 25, 4420, 2],
 [1, 17, 82, 245, 21, 6, 114, 9, 192, 5, 87, 358, 12, 6, 9, 4421, 2],
 [1, 6513, 4422, 4423, 4424, 2],
 [1, 70, 56, 254, 561, 1742, 591, 1743, 10, 2732, 2],
 [1,
  1552,
  718,
  6514,
  48,
  495,
  6,
  3,
  4425,
  369,
  43,
  875,
  273,
  4426,
  5,
  6515,
  2],
 [1, 2331, 49, 34, 1983, 2332, 4427, 5, 4428, 28, 3, 299, 2733, 2],
 [1, 89, 23, 3, 1000, 5, 1744, 811, 2],
 [1, 3, 1001, 4429, 4, 562, 520, 8, 166, 3357, 2],
 [1, 39, 53, 17, 135, 592, 228, 42, 6516, 16, 9, 108, 3358, 2],
 [1, 29, 344, 115, 22, 1267, 23, 96, 2],
 [1, 83, 44, 59, 1553, 4, 9, 1745, 876, 2],
 [1, 2333, 183, 31, 5, 255, 37, 6517, 6, 1984, 6518, 591, 2],
 [1, 6519, 18, 653, 1554, 7, 80, 3359, 256, 18, 193, 2],
 [1,
  6520,
  1746,
  34,
  2734,
  1747,
  6,
  2735,
  5,
  84,
  4,
  3,
  1400,
  13,
  2736,
  6,
  3,
  4430,
  2],
 [1, 8, 6521, 1002, 27, 1002, 2],
 [1, 68, 24, 3360, 1175, 215, 5, 10, 521, 108, 2],
 [1, 9, 

In [16]:
# Determine the maximum sequence length for padding
max_hindi_seq_length = max([len(seq) for seq in hindi_sequences])
max_english_seq_length = max([len(seq) for seq in english_sequences])

print("Maximum Hindi Sequence Length: ", max_hindi_seq_length)
print("Maximum English Sequence Length: ", max_english_seq_length)

Maximum Hindi Sequence Length:  20
Maximum English Sequence Length:  20


In [17]:
# Pad the sequences
hindi_sequences = pad_sequences(hindi_sequences, maxlen=max_hindi_seq_length, padding='post')
english_sequences = pad_sequences(english_sequences, maxlen=max_english_seq_length, padding='post')


In [18]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(hindi_sequences, english_sequences, test_size=0.2, random_state=42)


In [19]:
from keras.utils import Sequence
class DataGenerator(Sequence):
    def __init__(self, X, y, batch_size, english_vocab_size):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.english_vocab_size = english_vocab_size
        self.indices = np.arange(len(X))

    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        X_batch = self.X[batch_indices]
        y_batch = self.y[batch_indices]

        # Decoder input sequences (excluding 'endseq')
        y_batch_input = np.zeros((len(y_batch), max_english_seq_length), dtype='int32')
        # Decoder output sequences (one-hot encoded, including 'endseq')
        y_batch_output = np.zeros((len(y_batch), max_english_seq_length, self.english_vocab_size), dtype='float32')

        for i, seq in enumerate(y_batch):
            for t, word_index in enumerate(seq):
                if t > 0:  # Shifted sequence for decoder input
                    y_batch_input[i, t-1] = word_index
                if word_index > 0:
                    y_batch_output[i, t, word_index] = 1.0

        return [X_batch, y_batch_input], y_batch_output

In [20]:
# Parameters
batch_size = 128

# Create data generators
train_gen = DataGenerator(X_train, y_train, batch_size, english_vocab_size)
test_gen = DataGenerator(X_test, y_test, batch_size, english_vocab_size)

In [21]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Building the seq2seq model
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(hindi_vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(english_vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [20]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            4383488   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            3903744   ['input_2[0][0]']             
                                                                                              

In [21]:
# Training the model
epochs = 50
model.fit(train_gen, epochs=epochs, validation_data=test_gen)

# Save the model
model.save('models/hindi_to_english_translation_model.h5')

Epoch 1/50


2024-05-29 01:31:15.770586: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2024-05-29 01:31:15.854287: I external/local_xla/xla/service/service.cc:168] XLA service 0x79d9e5d5a8d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-29 01:31:15.854321: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-05-29 01:31:15.859448: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1716926475.914042  376294 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50

2024-05-29 01:40:01.860735: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 156170240 bytes after encountering the first element of size 156170240 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


In [1]:
# Function to translate a Hindi sentence
def translate_sentence(hindi_sentence):
    hindi_seq = hindi_tokenizer.texts_to_sequences([hindi_sentence])
    hindi_seq = pad_sequences(hindi_seq, maxlen=max_hindi_seq_length, padding='post')
    states_value = encoder_model.predict(hindi_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = english_tokenizer.word_index['startseq']  # Start the decoding with 'startseq'
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = english_tokenizer.index_word.get(sampled_token_index, '')
        
        if sampled_word == 'endseq' or len(decoded_sentence) > max_english_seq_length:
            stop_condition = True
        else:
            if sampled_word != 'startseq':  # Ignore 'startseq'
                decoded_sentence += ' ' + sampled_word
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        states_value = [h, c]
    
    return decoded_sentence.strip()

In [22]:
# Loading the model for translation
from keras.models import load_model
model = load_model('models/hindi_to_english_translation_model.h5')

# Inference setup
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
# Example of translating a sentence
import random
random_index = random.randint(0,20000)

hindi_sentence = data['hindi_sentence'][random_index] # Replace with your Hindi sentence
print("Hindi Sentence : " , hindi_sentence)
print("Orignal English Sentence: ", data['english_sentence'][random_index])

translated_sentence = translate_sentence(hindi_sentence)
print(f'Translated sentence: {translated_sentence}')

In [None]:
# Calculate BLEU score for the test set
references = [english_tokenizer.sequences_to_texts([seq]) for seq in X_test]
candidates = [translate_sentence(hindi_tokenizer.sequences_to_texts([seq])[0]) for seq in X_test]

bleu_scores = [sentence_bleu([ref], cand) for ref, cand in zip(references, candidates)]
average_bleu_score = np.mean(bleu_scores)
print(f'Average BLEU score: {average_bleu_score}')



Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x782c65da6290>
Traceback (most recent call last):
  File "/usr/lib/python3.10/weakref.py", line 370, in remove
    def remove(k, selfref=ref(self)):
KeyboardInterrupt: 




In [None]:
print(f'Average BLEU score: {average_bleu_score}')