In [None]:
import numpy as np

import typing
from typing import Any, Tuple
import pandas as pd

import einops
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import tensorflow as tf

import string
from string import digits

import matplotlib.pyplot as plt
%matplotlib inline
import re
# from matplotlib.font_manager import FontProperties
# font_prop = FontProperties(fname='Mangal.ttf', size=18)

In [None]:
import tensorflow_text as tf_text

In [None]:
# NOTE(review): absolute, machine-specific Windows path; change DATASET_CSV
# when running the notebook on another machine.
DATASET_CSV = "D:\\Final_Year\\NLP Translator\\hindi_to_english_dataset.csv"
lines = pd.read_csv(DATASET_CSV, encoding='utf-8')

In [None]:
# Number of missing values in each column of the raw frame.
lines.isnull().sum()

In [None]:
# Keep only rows where both the 'english' and the 'hindi' column are present.
lines = lines.dropna(subset=['english', 'hindi'])

In [None]:
# Remove rows that are exact duplicates of an earlier row.
lines = lines.drop_duplicates()

In [None]:
# (rows, columns) of the frame as it stands when this cell runs.
lines.shape

In [None]:
# Use only the rows from position 110564 onward (positional slice, not label).
# NOTE(review): the cut-off is a magic number; presumably it limits the corpus
# size. Confirm and move it to a named constant.
hindi = np.array(lines['hindi'].iloc[110564:])
english = np.array(lines['english'].iloc[110564:])

In [None]:
def clean_english_data(sentence):
    """Normalize an English sentence for vocabulary building.

    The text is lowercased, every ASCII punctuation character and ASCII digit
    is deleted (not replaced by a space), the ends are trimmed, and runs of
    spaces are collapsed to a single space.

    Args:
        sentence: Raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a ``str``.
    """
    # One translation table that deletes punctuation and digits together.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = sentence.lower().translate(strip_table).strip()
    # Deleting characters can leave several spaces in a row.
    return re.sub(" +", " ", cleaned)

In [None]:
def clean_hindi_data(sentence):
    """Reduce a sentence to space-separated Devanagari words.

    Only characters from the Devanagari block (U+0900-U+097F) are kept, and
    from those the Devanagari digits, the danda and the double danda are
    removed. Latin letters, ASCII digits and all punctuation fall outside
    the block and are dropped by the same filter.

    Any whitespace character (tab, newline, no-break space, ...) is treated
    as a word separator instead of being deleted, so the words on either
    side of it are not fused together.

    Args:
        sentence: Raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a ``str`` with single spaces between words
        and no leading or trailing space; empty if nothing survives.
    """
    # Devanagari digits plus danda (U+0964) and double danda (U+0965): these
    # lie inside the Devanagari block, so the range test alone would keep them.
    dropped = frozenset('०१२३४५६७८९।॥')

    kept = []
    for ch in sentence:
        if ch.isspace():
            kept.append(' ')
        elif '\u0900' <= ch <= '\u097F' and ch not in dropped:
            kept.append(ch)

    # split()/join trims both ends and collapses runs of separators.
    return ' '.join(''.join(kept).split())

In [None]:
# Preview the selected Hindi sentences before cleaning.
hindi

In [None]:
# Preview the selected English sentences before cleaning.
english

In [None]:
# English is the model input ("context"); Hindi is the translation target.
context_raw = np.array(list(map(clean_english_data, english)))
target_raw = np.array(list(map(clean_hindi_data, hindi)))

In [None]:
# Sanity check: number of cleaned Hindi sentences and the last one.
print(target_raw.shape)
print(target_raw[-1])

In [None]:
# Last cleaned English sentence, to compare with the Hindi one printed above.
print(context_raw[-1])

Creating TensorFlow Dataset

In [None]:

# Seed for the train/validation split. A fixed seed gives the same split on
# every Restart & Run All, and therefore the same vocabularies, because the
# vectorizers are adapted on the training split only.
SPLIT_SEED = 42

# Shuffle buffer covers the full corpus length.
BUFFER_SIZE = len(context_raw)
BATCH_SIZE = 64

# Boolean mask: each sentence pair goes to training with probability 0.8,
# the rest go to validation.
is_train = np.random.default_rng(SPLIT_SEED).uniform(size=(len(target_raw),)) < 0.8

# Batches of (context strings, target strings); tokenization happens later.
train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

In [None]:
# Show the first five (context, target) string pairs of one training batch.
# take(1) yields a single batch; the loop variables stay bound afterwards and
# example_context_strings is reused by the tokenization demo further down.
for example_context_strings, example_target_strings in train_raw.take(1):
  print(example_context_strings[:5])
  print()
  print(example_target_strings[:5])

In [None]:
# Sample Hindi sentence used to show the effect of Unicode normalization.
example_text = tf.constant('सिंगापुर के फरेर पार्क में रासबिहारी बोस ने भारतीय स्वतंत्रता परिषद का नेतृत्व सुभाषबाबू को सौंप दिया।')

# Raw UTF-8 bytes first, then the bytes after NFD normalization.
print(example_text.numpy())
print(tf_text.normalize_utf8(example_text, 'NFD').numpy())

In [None]:
def tf_lower_and_split_punct(text):
  """Standardize text for the TextVectorization layers.

  Despite its name, this function does no lowercasing and no punctuation
  splitting. It strips surrounding whitespace, wraps the text in '[START]'
  and '[END]' marker tokens separated by spaces, and applies Unicode NFD
  normalization to the result.

  Args:
    text: String tensor holding one or more sentences.

  Returns:
    String tensor with the marked, NFD-normalized text.
  """
  trimmed = tf.strings.strip(text)
  marked = tf.strings.join(['[START]', trimmed, '[END]'], separator=' ')
  return tf_text.normalize_utf8(marked, 'NFD')

In [None]:
# The sample sentence before and after standardization.
print(example_text.numpy().decode())
print(tf_lower_and_split_punct(example_text).numpy().decode())

Text Vectorization

In [None]:
# Upper bound on the vocabulary size given to each TextVectorization layer.
max_vocab_size = 5000

# Vectorizer for the context side. ragged=True keeps variable-length rows;
# callers convert with .to_tensor() where a dense tensor is needed.
context_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True)

In [None]:
# Build the context vocabulary from the training split only (the lambda keeps
# the context half of each pair and discards the target).
context_text_processor.adapt(train_raw.map(lambda context, target: context))

# Here are the first 10 words from the vocabulary:
context_text_processor.get_vocabulary()[:10]

In [None]:
# Vectorizer for the target side, configured the same way as the context one.
target_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True)

In [None]:

# Build the target vocabulary from the training split only (the lambda keeps
# the target half of each pair and discards the context).
target_text_processor.adapt(train_raw.map(lambda context, target: target))

In [None]:
# First 10 entries of the Hindi (target) vocabulary.
target_text_processor.get_vocabulary()[:10]

Tokenization

In [None]:
# Convert the example batch of context strings to token IDs and show the
# first three rows.
example_tokens = context_text_processor(example_context_strings)
example_tokens[:3, :]

In [None]:
# Map the token IDs of the first example back to vocabulary strings to check
# the round trip.
context_vocab = np.array(context_text_processor.get_vocabulary())
tokens = context_vocab[example_tokens[0].numpy()]
' '.join(tokens)

In [None]:
# Left panel: token IDs of the example batch after conversion to a dense
# tensor.
plt.subplot(1, 2, 1)
plt.pcolormesh(example_tokens.to_tensor())
plt.title('Token IDs')

# Right panel: True where the ID is non-zero.
# NOTE(review): presumably 0 is the padding value that to_tensor() fills in,
# so this is the padding mask. Confirm against the RaggedTensor docs.
plt.subplot(1, 2, 2)
plt.pcolormesh(example_tokens.to_tensor() != 0)
plt.title('Mask')

In [None]:
def process_text(context, target):
  """Turn a batch of string pairs into model inputs and labels.

  The context strings are tokenized and converted to a dense tensor. The
  target strings are tokenized once and then split into two views shifted by
  one position: the input view drops the last token of every row, the label
  view drops the first.

  Args:
    context: Batch of context strings.
    target: Batch of target strings.

  Returns:
    ``((context_ids, target_in), target_out)``, all dense tensors.
  """
  context_ids = context_text_processor(context).to_tensor()
  target_ids = target_text_processor(target)
  target_in = target_ids[:, :-1].to_tensor()
  target_out = target_ids[:, 1:].to_tensor()
  return (context_ids, target_in), target_out


# Tokenize both splits; AUTOTUNE lets tf.data choose the map parallelism.
train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

In [None]:
# Print up to 20 token IDs of the first example in one tokenized batch:
# context, then target input, then target labels. ex_context_tok stays bound
# and is fed to the encoder in a later cell.
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
  print(ex_context_tok[0, :20].numpy()) 
  print()
  print(ex_tar_in[0, :20].numpy()) 
  print(ex_tar_out[0, :20].numpy()) 

Encoder

In [None]:
#@title
class ShapeChecker():
  """Checks that named tensor axes keep a consistent length.

  The first time an axis name is seen its length is recorded; every later
  call that uses the same name must report the same length.
  """

  def __init__(self):
    # Axis name -> length recorded the first time that name was seen.
    self.shapes = {}

  def __call__(self, tensor, names, broadcast=False):
    """Validate `tensor` against the axis names in `names`.

    Args:
      tensor: Tensor whose shape is checked.
      names: Space-separated axis names, one per dimension, in the form
        accepted by `einops.parse_shape`.
      broadcast: If True, axes of length 1 are neither checked nor recorded.

    Raises:
      ValueError: If an axis length differs from the recorded one.
    """
    # The check only runs in eager mode; otherwise this call does nothing.
    if not tf.executing_eagerly():
      return

    for name, new_dim in einops.parse_shape(tensor, names).items():
      if broadcast and new_dim == 1:
        continue

      old_dim = self.shapes.get(name)
      if old_dim is None:
        # New axis name: remember its length for later calls.
        self.shapes[name] = new_dim
      elif new_dim != old_dim:
        raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                         f"    found: {new_dim}\n"
                         f"    expected: {old_dim}\n")

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Embeds token IDs and runs them through a bidirectional GRU.

  Args:
    text_processor: Adapted vectorization layer; supplies the vocabulary size
      and tokenizes raw strings in `convert_input`.
    units: Size of both the embedding vectors and the GRU state.
  """

  def __init__(self, text_processor, units):
    super().__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()
    self.units = units

    # Token ID -> vector lookup; mask_zero=True makes ID 0 a masked position.
    self.embedding = tf.keras.layers.Embedding(
        self.vocab_size, units, mask_zero=True)

    # The GRU emits one output per time step. Forward and backward outputs
    # are summed, so the last dimension stays `units`.
    gru = tf.keras.layers.GRU(
        units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform')
    self.rnn = tf.keras.layers.Bidirectional(layer=gru, merge_mode='sum')

  def call(self, x):
    """Encode a batch of token IDs.

    Args:
      x: Token IDs, shape (batch, s).

    Returns:
      Encoded sequence, shape (batch, s, units).
    """
    check = ShapeChecker()
    check(x, 'batch s')

    # Look up one embedding vector per token.
    embedded = self.embedding(x)
    check(embedded, 'batch s units')

    # Process the embedded sequence in both directions.
    encoded = self.rnn(embedded)
    check(encoded, 'batch s units')

    return encoded

  def convert_input(self, texts):
    """Tokenize raw text and encode it.

    Args:
      texts: A single string or a batch of strings.

    Returns:
      Encoder output for the tokenized text.
    """
    texts = tf.convert_to_tensor(texts)
    if len(texts.shape) == 0:
      # A lone string becomes a batch of one.
      texts = texts[tf.newaxis]
    token_ids = self.text_processor(texts).to_tensor()
    return self(token_ids)

In [None]:
# Layer width passed to the Encoder (embedding size and GRU units).
UNITS = 256

In [None]:
# Encode the input sequence.
# Smoke test on the example batch: the output should add a trailing `units`
# axis to the (batch, s) token tensor.
encoder = Encoder(context_text_processor, UNITS)
ex_context = encoder(ex_context_tok)

print(f'Context tokens, shape (batch, s): {ex_context_tok.shape}')
print(f'Encoder output, shape (batch, s, units): {ex_context.shape}')