In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import nltk
from nltk.translate.bleu_score import sentence_bleu
import re
import string
from string import digits



# Load CSV file
data = pd.read_csv('data/Hindi_English_Truncated_Corpus.csv')
data = data.sample(n=30000, random_state=42)
data = data.reset_index(drop=True)



# Replace NA values with empty strings
data['hindi_sentence'] = data['hindi_sentence'].fillna('')
data['english_sentence'] = data['english_sentence'].fillna('')

# Add 'startseq' and 'endseq' tokens to English sentences
data['english_sentence'] = data['english_sentence'].apply(lambda x: 'startseq ' + x + ' endseq')

# Preprocess the data
def preprocess_text(text):
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Lowercase the text
    text = text.lower()
    return text


# Preprocess the hindi data
def preprocess_hindi_text(text):
    # Remove English words
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
   
    text = text.lower()
    return text


data['hindi_sentence'] = data['hindi_sentence'].apply(preprocess_hindi_text)
data['english_sentence'] = data['english_sentence'].apply(preprocess_text)


# Lowercase all characters
data['english_sentence']=data['english_sentence'].apply(lambda x: x.lower())
data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: x.lower())

#remove qoutes
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub("'", '', x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("'", '', x))



exclude = set(string.punctuation)  # Set of all special characters

# Remove all the special characters
data['english_sentence'] = data['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))



# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.translate(remove_digits))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.strip())
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.strip())
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))


#adding 2 new coloumns to see sentence length
data['length_eng_sentence'] = data['english_sentence'].apply(lambda x: len(x.split(" ")))
data['length_hin_sentence'] = data['hindi_sentence'].apply(lambda x: len(x.split(" ")))



2024-05-29 00:28:42.187659: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 00:28:42.187714: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 00:28:42.189366: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-29 00:28:42.197216: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,tides,startseq he declares the result and reports it...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...,21,20
1,ted,startseq was a little uncomfortable for them e...,थोडा कठिन था।,8,3
2,indic2012,startseq but mulla assamudin was proved to be ...,मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए।,11,6
3,ted,startseq i would never have to make a book and...,मुझे कभी भी किताब बना कर किसी प्रदर्शनस्थल को ...,17,14
4,indic2012,startseq headind kaun banega crorepati endseq,शीर्षक कौन बनेगा करोड़पति,6,4


In [3]:
data.shape

(30000, 5)

In [4]:
#keeping sentences with lenght upto max_sentence_length

max_sentence_length=20

data=data[data['length_eng_sentence']<= max_sentence_length]
data=data[data['length_hin_sentence']<= max_sentence_length]

data = data.reset_index(drop=True)

In [5]:
data.shape

(20564, 5)

In [6]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,ted,startseq was a little uncomfortable for them e...,थोडा कठिन था।,8,3
1,indic2012,startseq but mulla assamudin was proved to be ...,मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए।,11,6
2,ted,startseq i would never have to make a book and...,मुझे कभी भी किताब बना कर किसी प्रदर्शनस्थल को ...,17,14
3,indic2012,startseq headind kaun banega crorepati endseq,शीर्षक कौन बनेगा करोड़पति,6,4
4,tides,startseq no other national leader except nehru...,नेहरू को छोड़कर और किसी भी राष्ट्र नेता के मन ...,11,15


In [7]:
max_num_words = 20000

# Tokenizer for Hindi sentences
hindi_tokenizer = Tokenizer(num_words=max_num_words)
hindi_tokenizer.fit_on_texts(data['hindi_sentence'])
hindi_vocab_size = len(hindi_tokenizer.word_index) + 1

# Tokenizer for English sentences
english_tokenizer = Tokenizer(num_words=max_num_words)
english_tokenizer.fit_on_texts(data['english_sentence'])
english_vocab_size = len(english_tokenizer.word_index) + 1


In [8]:
hindi_tokenizer.index_word

{1: 'के',
 2: 'में',
 3: 'है',
 4: 'और',
 5: 'की',
 6: 'से',
 7: 'का',
 8: 'हैं',
 9: 'को',
 10: 'एक',
 11: 'कि',
 12: 'पर',
 13: 'यह',
 14: 'भी',
 15: 'है।',
 16: 'नहीं',
 17: 'इस',
 18: 'ही',
 19: 'ने',
 20: 'लिए',
 21: 'कर',
 22: 'जो',
 23: 'तो',
 24: 'हो',
 25: 'आप',
 26: 'था',
 27: 'हम',
 28: 'ये',
 29: 'मैं',
 30: 'हैं।',
 31: 'कुछ',
 32: 'किया',
 33: 'करने',
 34: 'बहुत',
 35: 'गया',
 36: 'अपने',
 37: 'या',
 38: 'थे',
 39: 'वे',
 40: '।',
 41: 'होता',
 42: 'क्या',
 43: 'वो',
 44: 'साथ',
 45: 'भारत',
 46: 'तक',
 47: 'थी',
 48: 'सकते',
 49: 'वह',
 50: 'रहे',
 51: 'जाता',
 52: 'करते',
 53: 'कोई',
 54: 'जब',
 55: 'तरह',
 56: 'तथा',
 57: 'लेकिन',
 58: 'मुझे',
 59: 'किसी',
 60: 'समय',
 61: 'बाद',
 62: 'रहा',
 63: 'दिया',
 64: 'अब',
 65: 'उन्हें',
 66: 'बारे',
 67: 'हुआ',
 68: 'उनके',
 69: 'इसके',
 70: 'करना',
 71: 'मे',
 72: 'बात',
 73: 'रूप',
 74: 'जा',
 75: 'सकता',
 76: 'इन',
 77: 'सबसे',
 78: 'होती',
 79: 'काम',
 80: 'कहा',
 81: 'था।',
 82: 'हूँ',
 83: 'पहले',
 84: 'लोग',
 85: 'द्वा

In [9]:
hindi_tokenizer.word_counts

OrderedDict([('थोडा', 11),
             ('कठिन', 41),
             ('था।', 271),
             ('मगर', 109),
             ('मुल्ला', 7),
             ('असमुद्दीन', 4),
             ('अक्षम', 4),
             ('सिद्ध', 12),
             ('हुए।', 21),
             ('मुझे', 351),
             ('कभी', 139),
             ('भी', 1558),
             ('किताब', 18),
             ('बना', 130),
             ('कर', 997),
             ('किसी', 344),
             ('प्रदर्शनस्थल', 1),
             ('को', 2674),
             ('देने', 88),
             ('की', 3989),
             ('ज़रुरत', 11),
             ('नहीं', 1455),
             ('पड़ी', 11),
             ('शीर्षक', 10),
             ('कौन', 41),
             ('बनेगा', 2),
             ('करोड़पति', 3),
             ('नेहरू', 45),
             ('छोड़कर', 14),
             ('और', 4142),
             ('राष्ट्र', 30),
             ('नेता', 37),
             ('के', 6655),
             ('मन', 28),
             ('में', 5515),
             ('यह', 1628),


In [10]:
english_tokenizer.word_index

{'startseq': 1,
 'endseq': 2,
 'the': 3,
 'of': 4,
 'and': 5,
 'to': 6,
 'in': 7,
 'a': 8,
 'is': 9,
 'that': 10,
 'this': 11,
 'it': 12,
 'was': 13,
 'are': 14,
 'you': 15,
 'for': 16,
 'i': 17,
 'on': 18,
 'we': 19,
 'with': 20,
 'as': 21,
 'be': 22,
 'have': 23,
 'from': 24,
 'he': 25,
 'not': 26,
 'they': 27,
 'by': 28,
 'but': 29,
 'so': 30,
 'there': 31,
 'at': 32,
 'were': 33,
 'one': 34,
 'his': 35,
 'its': 36,
 'has': 37,
 'can': 38,
 'what': 39,
 'about': 40,
 'all': 41,
 'an': 42,
 'or': 43,
 'which': 44,
 'also': 45,
 'these': 46,
 'do': 47,
 'people': 48,
 'their': 49,
 'india': 50,
 'had': 51,
 'if': 52,
 'will': 53,
 'when': 54,
 'very': 55,
 'like': 56,
 'them': 57,
 'my': 58,
 'more': 59,
 'our': 60,
 'other': 61,
 'world': 62,
 's': 63,
 'out': 64,
 'now': 65,
 'who': 66,
 'time': 67,
 'some': 68,
 'up': 69,
 'because': 70,
 'no': 71,
 'after': 72,
 'your': 73,
 'two': 74,
 'hindi': 75,
 'only': 76,
 'indian': 77,
 'first': 78,
 'see': 79,
 'been': 80,
 'here': 81,
 '

In [11]:
# Convert sentences to sequences
hindi_sequences = hindi_tokenizer.texts_to_sequences(data['hindi_sentence'])
english_sequences = english_tokenizer.texts_to_sequences(data['english_sentence'])

In [12]:
hindi_sequences

[[1910, 541, 81],
 [202, 2767, 4196, 4197, 1784, 1047],
 [58, 154, 14, 1225, 165, 21, 59, 10242, 9, 247, 5, 1911, 16, 1912],
 [2070, 542, 6801, 5142],
 [488, 9, 1541, 4, 59, 14, 732, 606, 1, 789, 2, 13, 1296, 16, 38],
 [499,
  19,
  1449,
  565,
  1,
  20,
  211,
  708,
  328,
  519,
  315,
  360,
  623,
  1172,
  1542,
  1004,
  1450,
  70,
  321,
  81],
 [2768,
  3600,
  19,
  10243,
  97,
  5,
  832,
  543,
  65,
  10244,
  5143,
  2,
  544,
  32,
  4,
  10245,
  3601],
 [101, 1913, 4, 3133, 19, 555, 32, 3],
 [456, 923, 1543, 4198, 1785, 2, 3],
 [106, 58, 2769, 3134, 1, 3135, 33, 1, 66, 2, 566, 755, 102, 23, 42, 2487],
 [202, 103, 214, 14, 1105, 193, 48, 8, 40],
 [88, 37, 89, 5144, 7, 1786, 1787],
 [295, 19, 239, 1005, 1451, 1297, 488, 9, 6802, 269],
 [107, 5145, 2, 252, 12, 520, 1544, 10246, 5, 3602, 465, 252, 2770],
 [6803,
  2488,
  1298,
  73,
  6,
  2771,
  46,
  18,
  1173,
  38,
  4,
  500,
  352,
  1298,
  129,
  9,
  18,
  63,
  51,
  26],
 [4199, 567, 4199, 2071, 179, 115,

In [13]:
english_sequences

[[1, 13, 8, 183, 4523, 16, 57, 2],
 [1, 29, 3189, 5816, 13, 1575, 6, 22, 26, 5817, 2],
 [1, 17, 90, 248, 23, 6, 112, 8, 214, 5, 84, 384, 12, 6, 8, 3190, 2],
 [1, 8568, 5818, 5819, 5820, 2],
 [1, 71, 61, 256, 564, 1138, 469, 2232, 10, 2790, 2],
 [1,
  1139,
  857,
  8569,
  49,
  446,
  6,
  3,
  5821,
  340,
  44,
  707,
  278,
  4524,
  5,
  8570,
  2],
 [1, 2791, 48, 33, 2020, 1866, 4525, 5, 4526, 28, 3, 341, 3191, 2],
 [1, 87, 21, 3, 1073, 5, 1576, 737, 2],
 [1, 3, 1023, 4527, 4, 592, 529, 9, 168, 2792, 2],
 [1, 39, 52, 17, 138, 470, 242, 40, 8571, 16, 8, 113, 2477, 2],
 [1, 29, 308, 121, 22, 1867, 21, 95, 2],
 [1, 74, 43, 59, 1868, 4, 8, 2233, 1024, 2],
 [1, 2021, 191, 31, 5, 296, 35, 8572, 6, 1869, 8573, 469, 2],
 [1, 8574, 18, 616, 1870, 7, 77, 3192, 336, 18, 220, 2],
 [1,
  8575,
  1391,
  33,
  3193,
  1871,
  6,
  2478,
  5,
  85,
  4,
  3,
  1074,
  13,
  3729,
  6,
  3,
  5822,
  2],
 [1, 9, 8576, 890, 24, 890, 2],
 [1, 65, 25, 4528, 1075, 204, 5, 10, 447, 113, 2],
 [1, 8, 4

In [14]:
# Determine the maximum sequence length for padding
max_hindi_seq_length = max([len(seq) for seq in hindi_sequences])
max_english_seq_length = max([len(seq) for seq in english_sequences])

print("Maximum Hindi Sequence Length: ", max_hindi_seq_length)
print("Maximum English Sequence Length: ", max_english_seq_length)

Maximum Hindi Sequence Length:  20
Maximum English Sequence Length:  20


In [15]:
# Pad the sequences
hindi_sequences = pad_sequences(hindi_sequences, maxlen=max_hindi_seq_length, padding='post')
english_sequences = pad_sequences(english_sequences, maxlen=max_english_seq_length, padding='post')


In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(hindi_sequences, english_sequences, test_size=0.2, random_state=42)


In [17]:
from keras.utils import Sequence
class DataGenerator(Sequence):
    def __init__(self, X, y, batch_size, english_vocab_size):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.english_vocab_size = english_vocab_size
        self.indices = np.arange(len(X))

    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        X_batch = self.X[batch_indices]
        y_batch = self.y[batch_indices]

        # Decoder input sequences (excluding 'endseq')
        y_batch_input = np.zeros((len(y_batch), max_english_seq_length), dtype='int32')
        # Decoder output sequences (one-hot encoded, including 'endseq')
        y_batch_output = np.zeros((len(y_batch), max_english_seq_length, self.english_vocab_size), dtype='float32')

        for i, seq in enumerate(y_batch):
            for t, word_index in enumerate(seq):
                if t > 0:  # Shifted sequence for decoder input
                    y_batch_input[i, t-1] = word_index
                if word_index > 0:
                    y_batch_output[i, t, word_index] = 1.0

        return [X_batch, y_batch_input], y_batch_output

In [18]:
# Parameters
batch_size = 128

# Create data generators
train_gen = DataGenerator(X_train, y_train, batch_size, english_vocab_size)
test_gen = DataGenerator(X_test, y_test, batch_size, english_vocab_size)

In [19]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Building the seq2seq model
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(hindi_vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(english_vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


2024-05-29 00:28:46.188080: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-29 00:28:46.222437: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-29 00:28:46.222673: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [20]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            5460480   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            4901120   ['input_2[0][0]']             
                                                                                              

In [21]:
# Training the model
epochs = 100
model.fit(train_gen, epochs=epochs, validation_data=test_gen)

# Save the model
model.save('models/hindi_to_english_translation_model.h5')

Epoch 1/5


2024-05-29 00:28:48.963364: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2024-05-29 00:28:49.059441: I external/local_xla/xla/service/service.cc:168] XLA service 0x720d4def1fb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-29 00:28:49.059485: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-05-29 00:28:49.087134: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1716922729.149377  313618 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


In [22]:
# Function to translate a Hindi sentence
def translate_sentence(hindi_sentence):
    hindi_seq = hindi_tokenizer.texts_to_sequences([hindi_sentence])
    hindi_seq = pad_sequences(hindi_seq, maxlen=max_hindi_seq_length, padding='post')
    states_value = encoder_model.predict(hindi_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = english_tokenizer.word_index['startseq']  # Start the decoding with 'startseq'
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = english_tokenizer.index_word.get(sampled_token_index, '')
        
        if sampled_word == 'endseq' or len(decoded_sentence) > max_english_seq_length:
            stop_condition = True
        else:
            if sampled_word != 'startseq':  # Ignore 'startseq'
                decoded_sentence += ' ' + sampled_word
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        states_value = [h, c]
    
    return decoded_sentence.strip()

In [23]:
# Loading the model for translation
from keras.models import load_model
model = load_model('models/hindi_to_english_translation_model.h5')

# Inference setup
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [24]:
# Example of translating a sentence
hindi_sentence = 'आप कैसे हैं?'  # Replace with your Hindi sentence
translated_sentence = translate_sentence(hindi_sentence)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: he


In [25]:
# Calculate BLEU score for the test set
references = [english_tokenizer.sequences_to_texts([seq]) for seq in X_test]
candidates = [translate_sentence(hindi_tokenizer.sequences_to_texts([seq])[0]) for seq in X_test]

bleu_scores = [sentence_bleu([ref], cand) for ref, cand in zip(references, candidates)]
average_bleu_score = np.mean(bleu_scores)
print(f'Average BLEU score: {average_bleu_score}')



KeyboardInterrupt: 