In [30]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, LSTM, Embedding, Input, Masking
from keras.models import load_model, save_model, Model
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
import string
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import STOPWORDS, WordCloud
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\aashutosh
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\aashutosh
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
dataset = pd.read_csv(r"C:\Users\aashutosh kumar\Music\Hindi_English_Truncated_Corpus.csv")
dataset.head(5)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
dataset.drop(columns = ["source"], axis = 1, inplace = True)

In [5]:
print(dataset.isnull().sum())
dataset.dropna(inplace = True)

english_sentence    2
hindi_sentence      0
dtype: int64


In [6]:
print(dataset.duplicated().sum())
dataset.drop_duplicates(inplace = True)
dataset.shape

2780


(124825, 2)

In [7]:
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    stopwords_set = set(stopwords.words("english"))
    lematizer = WordNetLemmatizer()

    text = " ".join(lematizer.lemmatize(word) for word in text.split() if word not in stopwords_set)
    return text

In [8]:
dataset["english_sentence"] = dataset["english_sentence"].apply(preprocess)
dataset.head(4)

Unnamed: 0,english_sentence,hindi_sentence
0,politician permission need done,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,id like tell one child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,percentage even greater percentage india,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,really mean theyre bad paying attention,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते


In [9]:
def add_tokens(text):
    START_TOKEN = "<start>"
    END_TOKEN = "<end>"

    return START_TOKEN + text + END_TOKEN


In [10]:
dataset["hindi_sentence"] = dataset["hindi_sentence"].apply(add_tokens)
dataset.head(4)

Unnamed: 0,english_sentence,hindi_sentence
0,politician permission need done,<start>राजनीतिज्ञों के पास जो कार्य करना चाहिए...
1,id like tell one child,<start>मई आपको ऐसे ही एक बच्चे के बारे में बता...
2,percentage even greater percentage india,<start>यह प्रतिशत भारत में हिन्दुओं प्रतिशत से...
3,really mean theyre bad paying attention,<start>हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...


In [11]:
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(dataset["english_sentence"])
english_sequences = english_tokenizer.texts_to_sequences(dataset["english_sentence"])

In [12]:
hindi_tokenizer = Tokenizer()
hindi_tokenizer.fit_on_texts(dataset["hindi_sentence"])
hindi_sequences = hindi_tokenizer.texts_to_sequences(dataset["hindi_sentence"])

In [13]:
max_len_eng = max(len(seq) for seq in english_sequences)
max_len_hindi = max(len(seq) for seq in hindi_sequences)
max_len_eng, max_len_hindi

(219, 419)

In [14]:
english_padded = pad_sequences(english_sequences, maxlen= max_len_eng, padding = "post")
hindi_padded = pad_sequences(hindi_sequences, maxlen= max_len_hindi, padding = "post")

In [20]:
vocab_size = len(english_tokenizer.word_index)+1
vocab_size

70848

**MAKING ENCODER-DECODER ARCHITECTURE**

In [25]:
encoder_input = Input(shape = (None,))
encoder_embeddings = Embedding(input_dim = vocab_size, output_dim = 256)(encoder_input)
encoder_masking = Masking(mask_value=0.0)(encoder_embeddings)
encoder_LSTM = LSTM(units = 256, return_state = True)
_, state_h, state_c = encoder_LSTM(encoder_masking)

decoder_input = Input(shape = (None,))
decoder_embeddings = Embedding(input_dim = len(hindi_tokenizer.word_index)+1, output_dim = 256)(decoder_input)
decoder_masking = Masking(mask_value = 0.0)(decoder_embeddings)
decoder_LSTM = LSTM(units = 256, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_LSTM(decoder_masking, initial_state = [state_h, state_c])

decoder_Dense = Dense(units = len(hindi_tokenizer.word_index)+1, activation = "softmax")
decoder_output = decoder_Dense(decoder_outputs)

model = Model(inputs = [encoder_input, decoder_input], outputs = decoder_output)

In [26]:
model.compile(optimizer = "Adam", metrics = ["Accuracy"], loss = "sparse_categorical_crossentropy")

In [27]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, None, 256)    18137088    ['input_6[0][0]']                
                                                                                                  
 embedding_6 (Embedding)        (None, None, 256)    19719680    ['input_7[0][0]']                
                                                                                            

In [32]:
callback = EarlyStopping(monitor = "val_loss", patience = 3, restore_best_weights = True)
h = model.fit([english_padded, hindi_padded[:, :-1]], hindi_padded[:, :-1], batch_size = 13, callbacks = [callback], epochs = 50, validation_split = 0.2)

Epoch 1/50


ResourceExhaustedError: Graph execution error:

Detected at node 'model_1/dense_1/Tensordot/MatMul' defined at (most recent call last):
    File "c:\a\envs\main\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\a\envs\main\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\a\envs\main\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\a\envs\main\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\a\envs\main\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "c:\a\envs\main\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\a\envs\main\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\a\envs\main\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\a\envs\main\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "c:\a\envs\main\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "c:\a\envs\main\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3048, in run_cell
      result = self._run_cell(
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3103, in _run_cell
      result = runner(coro)
    File "c:\a\envs\main\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3308, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3490, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\aashutosh kumar\AppData\Local\Temp\ipykernel_3608\1480791754.py", line 2, in <module>
      h = model.fit([english_padded, hindi_padded[:, :-1]], hindi_padded[:, :-1], batch_size = 64, callbacks = [callback], epochs = 50, validation_split = 0.2)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\a\envs\main\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\layers\core\dense.py", line 244, in call
      outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
Node: 'model_1/dense_1/Tensordot/MatMul'
Detected at node 'model_1/dense_1/Tensordot/MatMul' defined at (most recent call last):
    File "c:\a\envs\main\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\a\envs\main\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\a\envs\main\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\a\envs\main\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\a\envs\main\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "c:\a\envs\main\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\a\envs\main\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\a\envs\main\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\a\envs\main\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\a\envs\main\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "c:\a\envs\main\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "c:\a\envs\main\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3048, in run_cell
      result = self._run_cell(
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3103, in _run_cell
      result = runner(coro)
    File "c:\a\envs\main\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3308, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3490, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\a\envs\main\lib\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\aashutosh kumar\AppData\Local\Temp\ipykernel_3608\1480791754.py", line 2, in <module>
      h = model.fit([english_padded, hindi_padded[:, :-1]], hindi_padded[:, :-1], batch_size = 64, callbacks = [callback], epochs = 50, validation_split = 0.2)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\a\envs\main\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\a\envs\main\lib\site-packages\keras\layers\core\dense.py", line 244, in call
      outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
Node: 'model_1/dense_1/Tensordot/MatMul'
2 root error(s) found.
  (0) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[5434,77030] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model_1/dense_1/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[broadcast_weights_1/assert_broadcastable/AssertGuard/pivot_f/_15/_61]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[5434,77030] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model_1/dense_1/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_22946]