In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
import re
from tqdm import tqdm
import string
from sklearn.model_selection import train_test_split
import os
warnings.filterwarnings('ignore')

In [None]:
# `tf.test.is_gpu_available` is deprecated; listing the physical GPU devices is
# the supported replacement. `bool(...)` keeps the cell output a plain True/False.
bool(tf.config.list_physical_devices('GPU'))

True

In [None]:
# Absolute path of the parallel-corpus CSV (presumably a Colab-mounted Google
# Drive folder — adjust when running elsewhere).
PATH = "/content/drive/MyDrive/Hindi_English_Truncated_Corpus.csv"
df = pd.read_csv(PATH)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [None]:
import unicodedata
def unicode_to_ascii(s):
    """Return `s` with all combining marks removed.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is 'Mn' (non-spacing mark) is dropped; all other
    characters are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)


In [None]:

def preprocess_hindi(sent):
    """Clean a Hindi sentence and wrap it in start/end markers.

    Steps: convert to `str`, pad `? . ! , ¿` with spaces so words joined by
    punctuation get separated, turn runs of double quotes/spaces into one
    space, delete ASCII punctuation and the danda (।), delete every digit,
    and finally collapse whitespace.

    Args:
        sent: The raw sentence. Non-string values are converted with `str()`.

    Returns:
        The cleaned sentence as '<s> ... <e>' with single spaces between words.
    """
    sent = str(sent).strip()
    sent = re.sub(r"([?.!,¿])", r" \1 ", sent)
    sent = re.sub(r'[" "]+', " ", sent)
    sent = sent.translate(str.maketrans('', '', string.punctuation)).replace('।', '')
    # str.isdigit() is also True for Devanagari digits, so a separate
    # character class for them is not needed.
    sent = ''.join(ch for ch in sent if not ch.isdigit())
    # Deleting punctuation/digits leaves runs of spaces behind; collapse them
    # so that splitting on ' ' never produces empty tokens.
    sent = re.sub(r"\s+", " ", sent).strip()
    return '<s> ' + sent + ' <e>'

def preprocess_eng(sent):
    """Clean an English sentence and wrap it in start/end markers.

    Steps: convert to `str`, lowercase, drop apostrophes, pad `? . ! , ¿` with
    spaces, replace everything that is not a letter or one of those marks by a
    space, delete ASCII punctuation, and finally collapse whitespace.

    Args:
        sent: The raw sentence. Non-string values are converted with `str()`.

    Returns:
        The cleaned sentence as '<s> ... <e>' with single spaces between words.
    """
    sent = str(sent).lower()
    sent = re.sub("'", '', sent)
    sent = re.sub(r"([?.!,¿])", r" \1 ", sent)
    # This already turns quotes, digits and repeated spaces into single spaces,
    # so no separate quote/space or digit pass is required.
    sent = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sent)
    sent = sent.translate(str.maketrans('', '', string.punctuation))
    # Deleting punctuation leaves runs of spaces behind; collapse them so that
    # splitting on ' ' never produces empty tokens.
    sent = re.sub(r"\s+", " ", sent).strip()
    return '<s> ' + sent + ' <e>'

In [None]:
# Word counts per sentence (split on single spaces), stored as new columns.
df["eng_sent_len"] = df["english_sentence"].map(lambda text: len(str(text).split(' ')))
df["hindi_sent_len"] = df["hindi_sentence"].map(lambda text: len(str(text).split(' ')))

In [None]:
# Keep only rows whose Hindi sentence has fewer than 30 space-separated words.
df = df[df['hindi_sent_len'] < 30].copy()

In [None]:
# Replace the raw sentences with their cleaned, '<s> ... <e>'-wrapped versions.
df['english_sentence'] = df['english_sentence'].map(preprocess_eng)
df['hindi_sentence'] = df['hindi_sentence'].map(preprocess_hindi)

In [None]:

def tokenize(lang):
  """Fit a Keras tokenizer on `lang` and return the padded id sequences.

  The tokenizer is created with `filters=''`, so no characters are stripped
  from the texts. Sequences are padded at the end ('post').

  Returns:
    (tensor, lang_tokenizer): the padded sequences and the fitted tokenizer.
  """
  fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  fitted_tokenizer.fit_on_texts(lang)

  padded_ids = tf.keras.preprocessing.sequence.pad_sequences(
      fitted_tokenizer.texts_to_sequences(lang),
      padding='post')

  return padded_ids, fitted_tokenizer

In [None]:
# Only the first 3500 sentence pairs are tokenised.
hindi = df['hindi_sentence'].tolist()[:3500]
english = df['english_sentence'].tolist()[:3500]
(hin, tok_hin), (eng, tok_eng) = tokenize(hindi), tokenize(english)

In [None]:
# Shapes of the padded Hindi and English id arrays.
hin.shape, eng.shape

((3500, 31), (3500, 46))

In [None]:
# Vocabulary sizes passed to the embedding/output layers: number of known
# words plus one extra slot for id 0, which the loss function treats as padding.
vocab_eng_size = len(tok_eng.word_index)+1
vocab_hin_size = len(tok_hin.word_index)+1

In [None]:
# Longest row length in each padded id array.
maxlen_output_hin = max(map(len, hin))
maxlen_input_eng = max(map(len, eng))

In [None]:
# 80/20 train/test split. A fixed `random_state` makes the split identical on
# every run, so results are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(eng, hin, test_size=0.2, random_state=42)

In [None]:
# Sanity check: shapes and lengths of the four split arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape,len(X_train), len(X_test), len(y_train), len(y_test)

((2800, 46), (700, 46), (2800, 31), (700, 31), 2800, 700, 2800, 700)

In [None]:
# Output dimension of the embedding layers in the encoder and decoder.
embedding_dim = 256
# Number of passes of the training loop over the dataset.
epochs = 20
# Batch size used when batching the dataset and when building the decoder's
# initial '<s>' input in train_step.
bs = 64
# Number of GRU units in the encoder and decoder.
units = 1024
# Number of full batches per epoch (integer division drops the remainder).
steps_per_epoch = len(X_train)//bs

In [None]:
class Encoder(tf.keras.Model):
  """Embedding layer followed by a GRU that returns all outputs and its final state."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x, hidden):
    """Embed the token ids `x` and run the GRU starting from `hidden`.

    Returns:
      (output, state): the per-step GRU outputs and the final GRU state.
    """
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [None]:
# Encoder over the English vocabulary, sized by the hyperparameters above.
encoder = Encoder(vocab_eng_size, embedding_dim, units, bs)

In [None]:
class BahdanauAttention(tf.keras.Model):
  """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Compute the attention context for `query` over `values`.

    `query` gets an extra axis at position 1 so it can be added to the
    projected `values`; the scores are soft-maxed over axis 1 and used to form
    a weighted sum of `values` along that axis.
    (Presumably query is (batch, units) and values is (batch, steps, units) —
    TODO confirm against callers.)

    Returns:
      (context_vector, attention_weights)
    """
    query_with_step_axis = tf.expand_dims(query, 1)

    projected = self.W1(values) + self.W2(query_with_step_axis)
    score = self.V(tf.nn.tanh(projected))

    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [None]:
# NOTE(review): stand-alone attention instance with 10 units. It is not
# referenced by any other cell visible in this notebook (the Decoder builds
# its own BahdanauAttention) — looks like a leftover; confirm and remove.
attention_layer = BahdanauAttention(10)

In [None]:
class Decoder(tf.keras.Model):
  """Single-step decoder: attention over the encoder output, embedding, GRU and a dense output layer."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    """Create the embedding, GRU, output projection (vocab_size units) and attention layers."""
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)


    # Attention layer sized with the decoder's unit count.
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: token ids fed to the embedding layer.
      hidden: state used as the attention query.
      enc_output: encoder outputs the attention attends over.

    Returns:
      (x, state, attention_weights): output of the dense layer, the GRU state
      and the attention weights.
    """

    context_vector, attention_weights = self.attention(hidden, enc_output)

    x = self.embedding(x)


    # Prepend the context vector (with an added axis 1) to the embedded input
    # along the last axis.
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # NOTE(review): the GRU is called without `initial_state`, so `hidden`
    # influences this step only through the attention context — confirm this
    # is intended before changing it (a change would alter the trained model).
    output, state = self.gru(x)


    # Flatten all leading axes, keeping the last (feature) axis.
    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_weights

In [None]:
# Decoder over the Hindi vocabulary, sized by the hyperparameters above.
decoder = Decoder(vocab_hin_size, embedding_dim, units, bs)

In [None]:
optimizer = tf.keras.optimizers.Adam()
# Per-example loss on raw logits; reduction is done manually in loss_function.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)

def loss_function(real, pred):
  """Mean cross-entropy over the batch, with positions where `real` is 0 zeroed out.

  The masked positions still count in the denominator of the mean.
  """
  per_example_loss = loss_object(real, pred)
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example_loss.dtype)
  return tf.reduce_mean(per_example_loss * keep)

In [None]:
# Checkpoints are written under this directory; the leading './' makes it
# relative to the current working directory.
checkpoint_dir = './content/checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer and both models so they are saved together.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimisation step on a batch and return the per-timestep average loss.

  Args:
    inp: batch of input (English) token ids.
    targ: batch of target (Hindi) token ids.
    enc_hidden: initial encoder state.

  Returns:
    The summed step losses divided by the number of target timesteps.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input is '<s>' for every example; this uses the global
    # batch size `bs`, so batches must contain exactly `bs` examples.
    dec_input = tf.expand_dims([tok_hin.word_index['<s>']] * bs, 1)

    # Position 0 of the target is skipped; each step predicts targ[:, t].
    for t in range(1, targ.shape[1]):
      
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # Teacher forcing: the true target token is the next decoder input.
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken of the summed loss, not of the averaged batch_loss.
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
# Shuffle buffer covers the whole training set; incomplete final batches are
# dropped so every batch has exactly `bs` examples.
BUFFER_SIZE = len(X_train)
dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(BUFFER_SIZE)
    .batch(bs, drop_remainder=True)
)

In [None]:

# Training loop: one pass over `steps_per_epoch` batches per epoch.
for epoch in range(epochs):

  # The encoder starts every epoch from an all-zero state, which is passed
  # unchanged to every train_step call of that epoch.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress line every 100 batches (including batch 0).
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))

  # Save a checkpoint after every second epoch.
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  # Average batch loss for the epoch.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))

Epoch 1 Batch 0 Loss 3.5553
Epoch 1 Loss 3.0902
Epoch 2 Batch 0 Loss 3.0595
Epoch 2 Loss 2.8063
Epoch 3 Batch 0 Loss 2.8666
Epoch 3 Loss 2.7363
Epoch 4 Batch 0 Loss 2.5263
Epoch 4 Loss 2.6617
Epoch 5 Batch 0 Loss 2.6781
Epoch 5 Loss 2.5771
Epoch 6 Batch 0 Loss 2.6455
Epoch 6 Loss 2.4946
Epoch 7 Batch 0 Loss 2.0426
Epoch 7 Loss 2.3910
Epoch 8 Batch 0 Loss 2.3386
Epoch 8 Loss 2.2945
Epoch 9 Batch 0 Loss 2.1237
Epoch 9 Loss 2.1762
Epoch 10 Batch 0 Loss 2.0695
Epoch 10 Loss 2.0803
Epoch 11 Batch 0 Loss 1.7854
Epoch 11 Loss 1.9735
Epoch 12 Batch 0 Loss 1.7748
Epoch 12 Loss 1.8625
Epoch 13 Batch 0 Loss 1.7118
Epoch 13 Loss 1.7564
Epoch 14 Batch 0 Loss 1.6709
Epoch 14 Loss 1.6509
Epoch 15 Batch 0 Loss 1.4446
Epoch 15 Loss 1.5446
Epoch 16 Batch 0 Loss 1.4760
Epoch 16 Loss 1.4302
Epoch 17 Batch 0 Loss 1.2580
Epoch 17 Loss 1.3170
Epoch 18 Batch 0 Loss 0.9330
Epoch 18 Loss 1.1992
Epoch 19 Batch 0 Loss 0.8853
Epoch 19 Loss 1.0847
Epoch 20 Batch 0 Loss 0.9143
Epoch 20 Loss 0.9758


In [None]:
# Longest row length in each padded id array; used to size the inference
# input and the attention plot.
max_hin_len = max(map(len, hin))
max_eng_len = max(map(len, eng))

In [None]:
def evaluate(sentence):
    """Translate one English sentence with greedy decoding.

    The sentence is cleaned with `preprocess_eng`, converted to token ids with
    `tok_eng` and padded to `max_eng_len`. Decoding starts from '<s>' and feeds
    each predicted id back into the decoder until '<e>' is produced or
    `max_hin_len` steps have run.

    Words that are not in the English vocabulary are skipped instead of
    raising `KeyError`, and decoding stops if the decoder predicts an id that
    has no word (e.g. the padding id 0).

    Args:
        sentence: Raw English sentence.

    Returns:
        (result, sentence, attention_plot): the predicted words joined by
        spaces (with a trailing space), the preprocessed input sentence, and a
        (max_hin_len, max_eng_len) array holding the attention weights of each
        decoding step.
    """
    attention_plot = np.zeros((max_hin_len, max_eng_len))

    sentence = preprocess_eng(sentence)

    # texts_to_sequences splits on spaces, ignores empty tokens and drops
    # out-of-vocabulary words (the tokenizer has no OOV token), so unseen
    # words no longer crash the lookup.
    inputs = tok_eng.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                           maxlen=max_eng_len,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Batch of one, all-zero initial encoder state.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([tok_hin.word_index['<s>']], 0)

    for t in range(max_hin_len):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # Store this step's attention weights as one row of the plot.
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = tok_hin.index_word.get(predicted_id)

        if predicted_word is None:
            # No vocabulary entry for this id (e.g. padding id 0): there is
            # nothing meaningful to emit or feed back, so stop decoding.
            break

        result += predicted_word + ' '

        if predicted_word == '<e>':
            break

        # Greedy decoding: the prediction becomes the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
def translate(sentence):
    """Run `evaluate` on `sentence` and print the cleaned input and the prediction."""
    predicted_text, cleaned_input, _ = evaluate(sentence)

    print('Input: %s' % (cleaned_input))
    print('Predicted translation: {}'.format(predicted_text))
translate('politicians do not have')

Input: <s> politicians do not have <e>
Predicted translation: तो आप ये कल्पना कर सकते हैं <e> 


In [None]:
# NOTE(review): this command fails ("Scheme missing", see the captured output)
# because wget is given a local path rather than a URL. The path also differs
# from `checkpoint_dir` used when saving. Intent is unclear — confirm what was
# meant (e.g. archiving/downloading the checkpoints) and replace or remove.
!wget '/content/training_checkpoints'

/content/training_checkpoints: Scheme missing.
