In [72]:
import numpy as np 
import pandas as pd 
import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model



In [2]:
lines=pd.read_csv("../data/Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [5]:
lines.head(20)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what needs to be done.,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ."
1,ted,"I'd like to tell you about one such child,","मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,"
2,indic2012,This percentage is even greater than the percentage in India.,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not paying attention.,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called Upanishad.,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
5,tides,"The then Governor of Kashmir resisted transfer , but was finally reduced to subjection with the aid of British .","कश्मीर के तत्कालीन गवर्नर ने इस हस्तांतरण का विरोध किया था , लेकिन अंग्रेजों की सहायता से उनकी आवाज दबा दी गयी ."
6,indic2012,In this lies the circumstances of people before you.,इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं।
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
8,indic2012,“”Global Warming“” refer to warming caused in recent decades and probability of its continual presence and its indirect effect on human being.,ग्लोबल वॉर्मिंग से आशय हाल ही के दशकों में हुई वार्मिंग और इसके निरंतर बने रहने के अनुमान और इसके अप्रत्यक्ष रूप से मानव पर पड़ने वाले प्रभाव से है।
9,tides,You may want your child to go to a school that is not run by the LEA - a non-maintained special school or an independent school that can meet your child 's needs .,"हो सकता है कि आप चाहते हों कि आप का नऋर्नमेनटेन्ड ह्यबिना किसी समर्थन के हृ विशेष स्कूल , या किसी स्वतंत्र स्कूल में जाए , इजसके पास विशेष शैक्षणिक जऋऋरतों वाले बच्चों के प्रति सहूलियत हों . ."


In [6]:
pd.isnull(lines).sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

In [7]:
lines=lines[~pd.isnull(lines['english_sentence'])]

In [8]:
lines.drop_duplicates(inplace=True)

In [9]:
lines=lines.sample(n=50000,random_state=42)
lines.shape

(50000, 3)

In [10]:
# Lowercase all characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.lower())

In [11]:
# Remove quotes
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [12]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [13]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))


In [14]:
# Add start and end tokens to target sequences
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [16]:
all_eng_words=set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [17]:
len(all_eng_words)

45291

In [18]:
all_eng_words

{'laddaki',
 'multiplying',
 'madina',
 'fowlpox',
 'shower',
 'hamida',
 'hatmakers',
 'tie',
 'hoarse',
 'offending',
 'fernandes',
 'severalfolds',
 'diphtheria',
 'cares',
 'bowlsecond',
 'fright',
 'shooters',
 'altruism',
 'angry',
 'resettlement',
 'bandits',
 'mediators',
 'injury',
 'marked',
 'tantric',
 'udf',
 'bowledthe',
 'mindonly',
 'indira',
 'necessitated',
 '“ah',
 'thakurbanglaravindra',
 'namangani',
 'dhavanmarg',
 'retirement',
 'ishwaray',
 'melts',
 'interspaces',
 'bot',
 'occasions',
 'groundfloor',
 'adversely',
 'jiyauddeen',
 'nausea',
 'trumpets',
 'preshyat',
 'seventytwo',
 'fearful',
 'detection',
 'clings',
 'disproportionate',
 'racialism',
 'itselffrom',
 'especially',
 'distributions',
 'raid',
 'nagarjun',
 'cramped',
 'distinguished',
 'friedrich',
 'celibacy',
 'illustrations',
 'corbelled',
 'hal',
 'hadapa',
 'introspective',
 'oy',
 'prosodic',
 'amounted',
 'powergenerating',
 'doubling',
 'clearwater',
 'tapas',
 'panjam',
 'thrown',
 'gdr'

In [19]:
len(all_hindi_words)

52937

In [20]:
all_hindi_words

{'आघ्ढ्',
 'चेट्टियार',
 'सुवेदनशीलता',
 'आयात्lत',
 'टेक्सटाइल',
 'ऑनलाइन',
 'कैंटनों',
 'सुंदरवन',
 'गईकंपनी',
 'विचार”',
 'पायेंगे',
 'बौडा',
 'ईएससी',
 'दे',
 'पदों',
 'हैंअंतराल',
 'परिवादों',
 'अब्बा',
 'कार्यकुशल',
 'समसऋऊण्श्छ्ष्त',
 'चालpassive',
 '\u200eबातें',
 'फ्लोरेंस',
 'फोड़ते',
 'संजोये',
 'नर्सरी',
 'मॉड़र्न',
 'प्रतिपिंड',
 'चिंतनीय',
 'हैंमैलारा',
 'दर्जनों',
 'चऋदह',
 'अग़्न्याशय',
 'रिवर',
 'बच्चों',
 'दियावे',
 'गाँवघर',
 'डालें',
 'सोया',
 'प्रयतऋऊण्श्छ्ष्न',
 'अंतर्मुखी',
 'वार्डन',
 'हैवलाक',
 'गुस्सा',
 'ओपीनियन',
 'मुहैया',
 'प्रेसीड़ेंसी',
 'चिले',
 'ईश्वर।',
 'तदनन्तर',
 'फूटता',
 'बलोचिस्तान',
 'हूंमैं',
 'ह्यन्याय',
 'अनुसमर्थन',
 'प्रदर्शनियां',
 'मेसोपोटामिया',
 'कॉडलिवर',
 'idkae',
 'समय।',
 'चारती',
 'षण्मत',
 'मेर',
 'गलफाड़े',
 'बीच।',
 'करनाल',
 'गुर्दों',
 'बाली',
 'ईरानियों',
 'बांधी',
 'haiking',
 'हठधर्मी',
 'सब्सक्राईब्ड',
 'ग्रीष्मकालीन',
 'भगोड़ें',
 'बीस',
 'इलाजो',
 'gate',
 'zबल्कि',
 'रखेगा',
 '“',
 'पिछड़े',
 'ºघूम',
 'नदीजीवन',
 'चैम्

In [21]:
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hin_sentence']=lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [22]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
25520,indic2012,islam is word from arabic and it full word is salamaits definition peace surrender,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका मूल शब्द सल्लमा है जिस की दो परिभाषाएं हैं शान्ति आत्मसमर्पण। _END,14,21
118633,ted,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END,7,9
113495,tides,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END,6,9
29783,tides,race equality new laws,START_ नये कानून नस्ली समानता _END,4,6
111804,tides,the provision would not affect the power of parliament to make laws in respect of income from professions etc lrb article rrb,START_ व्यवसायों आदि से होने वाली आय के बारे में विधि बनाने की संसद की शक्ति पर उपबंध का प्रभाव नहीं पड़ेगा अनुच्छेद _END,22,24


In [23]:
lines[lines['length_eng_sentence']>30].shape

(4905, 5)

In [24]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_hin_sentence']<=20]

In [26]:
lines.shape

(32971, 5)

In [27]:
print("maximum length of Hindi Sentence ",max(lines['length_hin_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [28]:
max_length_src=max(lines['length_hin_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [29]:
print(max_length_src)

20


In [30]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(45291, 52937)

In [31]:
num_decoder_tokens += 1 #for zero padding


In [32]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [33]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [34]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
99850,ted,i dont want to make this as a corporate entity,START_ मैं अपने इस प्रयास को कोई कॉरपोरेट शक्ल नहीं देना चाहता _END,10,13
63048,indic2012,should be defined,START_ परिभाषित हो _END,3,4
80981,ted,photographing and blogging,START_ फोटोphoto लेना और लिखना _END,3,6
34223,indic2012,with this they also keep an eye on the election,START_ इसके साथ ही वह शहर में होने वाले चुनावों पर भी नज़र रखता है। _END,10,16
56625,ted,the brain scans showed activation in a part of the brain,START_ मस्तिष्क स्कैन से मस्तिष्क का एक हिस्सा सक्रियण दिखा _END,11,11
42318,ted,if my work was nice enough to show it to people,START_ कि मेरा काम लोगों को दिखाने लायक था कि नहीं _END,11,12
38775,indic2012,the chinese civilization is older than sixth th century,START_ चीन की सभ्यता एवम् संस्कृति छठी शताब्दी से भी पुरानी है। _END,9,13
44769,ted,and the separation from your loved ones,START_ छटपटाहट है अपनों से दूर होने की _END,7,9
122490,indic2012,simple machines forum ashtank software part of the advertising campaign to promote hindi,START_ सरल मशीन मंच simple machine forum अष्टांक सॉफ़्टवेयर के हिन्दी प्रचार प्रसार अभियान का एक अंग है । _END,13,20
72874,ted,but what perhaps some of you dont realize,START_ लेकिन आप में से कुछ लोगो यह एहसास नहीं होगा _END,8,12


### Split the data into train and test

In [35]:
X, y = lines['english_sentence'], lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((26376,), (6595,))

In [36]:
X_train

49713     “what are you doing here                                                                            
22541     category main jamboodweep                                                                           
473       these messages were started from aadam                                                              
120500    north west                                                                                          
4722      and guess what                                                                                      
               ...                                                                                            
117495    public relations office                                                                             
63620     viii cases of sexual intercourse with immoral women                                                 
123273    howeverit was projected in such a way that to most buddhist it is not acceptable and very unpleasant
1

In [37]:
y_train

49713     START_ “तब यहाँ क्या कर रहे हो _END                                                                        
22541     START_ श्रेणीनैऋत्य जंबुद्वीप _END                                                                         
473       START_ इन संदेशों का शुभारम्भ आदम से हुआ था। _END                                                          
120500    START_ उत्तर पश्चिम सीमांत प्रान्त _END                                                                    
4722      START_ और इसका नतीजा _END                                                                                  
                    ...                                                                                              
117495    START_ जनसम्पर्क कार्यालय ग्वालियर _END                                                                    
63620     START_ पतित स्त्रियों के साथ मैथुन के मामले _END                                                           
123273    START_ लेकिन इसे इस तरीके से पेश किया गया है ज

In [39]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [40]:
latent_dim=300

In [41]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

2024-05-30 18:28:38.054359: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-30 18:28:38.086522: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-30 18:28:38.086771: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [42]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [43]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [44]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            1358730   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            1588140   ['input_2[0][0]']         

In [45]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 64
epochs = 80

In [46]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)



  model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),


Epoch 1/80


2024-05-30 18:28:43.681897: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	for Tuple type infernce function 0
	while inferring type of node 'cond_36/output/_23'
2024-05-30 18:28:44.442648: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-05-30 18:28:44.878738: I external/local_xla/xla/service/service.cc:168] XLA service 0x758c99118d60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-30 18:28

Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80

2024-05-30 19:51:47.104286: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size
2024-05-30 19:51:47.104353: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 34/80

2024-05-30 19:54:25.312589: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 35/80

2024-05-30 19:57:09.914995: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size
2024-05-30 19:57:09.915089: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 36/80

2024-05-30 19:59:46.802266: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size
2024-05-30 19:59:46.802422: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 37/80

2024-05-30 20:02:25.684060: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 38/80

2024-05-30 20:05:00.181413: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 39/80

2024-05-30 20:07:40.342961: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80

2024-05-30 20:36:51.070782: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 51/80

2024-05-30 20:39:32.210491: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 52/80

2024-05-30 20:42:19.134620: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 53/80

2024-05-30 20:44:56.383351: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 54/80
Epoch 55/80

2024-05-30 20:50:25.056279: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 56/80

2024-05-30 20:53:11.892189: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 57/80
Epoch 58/80

2024-05-30 20:58:33.596151: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 59/80

2024-05-30 21:01:17.047214: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 60/80

2024-05-30 21:03:52.978174: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 61/80

2024-05-30 21:06:27.424959: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 62/80
Epoch 63/80
Epoch 64/80

2024-05-30 21:14:37.197100: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 65/80

2024-05-30 21:17:20.427366: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 66/80

2024-05-30 21:20:05.626253: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size
2024-05-30 21:20:05.626416: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 67/80
Epoch 68/80
Epoch 69/80

2024-05-30 21:28:21.164438: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 70/80
Epoch 71/80

2024-05-30 21:33:45.508924: W tensorflow/core/kernels/data/prefetch_autotuner.cc:52] Prefetch autotuner tried to allocate 271052800 bytes after encountering the first element of size 271052800 bytes.This already causes the autotune ram budget to be exceeded. To stay within the ram budget, either increase the ram budget or reduce element size


Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.src.callbacks.History at 0x758d73c91ff0>

In [47]:
#model.save_weights('nmt_weights.h5')


In [48]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [49]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [50]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1


In [51]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: “what are you doing here
Actual Hindi Translation:  “तब यहाँ क्या कर रहे हो 
Predicted Hindi Translation:  क्या यहाँ कर रहे हैं 


In [2]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

NameError: name 'k' is not defined

In [69]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: even now during dark nights malay boats visit these shores with smuggled goods
Actual Hindi Translation:  अब भी मलाया की अनेक नावें यहां के समुद्रतट पर तस्करी का माल लेकर आती हैं 
Predicted Hindi Translation:  अब इन में से विभिन्न भागों के साथ विभिन्न उद्यो


In [70]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: it is build by mugal king shahjaha memory of his wife mumtaj mahal
Actual Hindi Translation:  इसका निर्माण मुगल सम्राट शाहजहाँ ने अपनी पत्नी मुमताज महल की याद में करवाया था। 
Predicted Hindi Translation:  इसका निर्माण निर्माण निर्माण कर जो मुगल पत्नी मु


In [55]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: and guess what
Actual Hindi Translation:  और इसका नतीजा 
Predicted Hindi Translation:  और उदाहरण के लिए 


In [57]:

encoder_model.save('../models/english_to_hindi_translator/encoder_model_e2h.h5')
decoder_model.save('../models/english_to_hindi_translator/decoder_model_e2h.h5')




  saving_api.save_model(




In [58]:
import pickle

# Save the tokenizers
with open('../models/english_to_hindi_translator/english_tokenizer_e2h.pkl', 'wb') as f:
    pickle.dump(input_token_index, f)

with open('../models/english_to_hindi_translator/hindi_tokenizer_e2h.pkl', 'wb') as f:
    pickle.dump(target_token_index, f)

# Save the reverse tokenizers
with open('../models/english_to_hindi_translator/reverse_english_tokenizer_e2h.pkl', 'wb') as f:
    pickle.dump(reverse_input_char_index, f)

with open('../models/english_to_hindi_translator/reverse_hindi_tokenizer_e2h.pkl', 'wb') as f:
    pickle.dump(reverse_target_char_index, f)


In [1]:
import numpy as np
import tensorflow as tf
from keras.models import load_model
import pickle

# Load the models
encoder_model = load_model('../models/english_to_hindi_translator/encoder_model_e2h.h5')
decoder_model = load_model('../models/english_to_hindi_translator/decoder_model_e2h.h5')


with open('../models/english_to_hindi_translator/english_tokenizer_e2h.pkl', 'rb') as f:
    input_token_index = pickle.load(f)

with open('../models/english_to_hindi_translator/hindi_tokenizer_e2h.pkl', 'rb') as f:
    target_token_index = pickle.load(f)

with open('../models/english_to_hindi_translator/reverse_english_tokenizer_e2h.pkl', 'rb') as f:
    reverse_input_char_index = pickle.load(f)

with open('../models/english_to_hindi_translator/reverse_hindi_tokenizer_e2h.pkl', 'rb') as f:
    reverse_target_char_index = pickle.load(f)

max_length_src = 20  
latent_dim = 300
num_decoder_tokens = len(target_token_index) + 1

def translate_sentence(input_sentence):
    input_seq = [input_token_index.get(word, 0) for word in input_sentence.split()]
    input_seq = tf.keras.preprocessing.sequence.pad_sequences([input_seq], maxlen=max_length_src, padding='post')

    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index.get(sampled_token_index, '')

        # Exit condition: either hit max length or find stop character
        if sampled_char == '_END' or len(decoded_sentence) > max_length_src:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_char)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return ' '.join(decoded_sentence)

# Example usage
import random 
random_index = randint
print(translate_sentence(" "))


2024-05-31 01:35:44.061545: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-31 01:35:44.061602: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-31 01:35:44.062866: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-31 01:35:44.072814: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-31 01:35:47.851716: I external/local_xla/xla/




2024-05-31 01:35:55.273083: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
