In [None]:
import numpy as np
import pandas as pd

In [None]:
import re
import os
import time
import string
import pickle
import unicodedata
from string import digits

In [None]:
import tensorflow as tf

In [None]:
from sklearn.model_selection import train_test_split

# 1. Load dataset

In [None]:
def load_dataset(filename):
    """Read the CSV at `filename` into a DataFrame using pandas' default parsing options."""
    return pd.read_csv(filename)

In [None]:
filename = './english_hindi.csv'
lines_raw = load_dataset(filename)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
lines_raw.info()
lines_raw.sample(5)

In [None]:
# Rename the language columns to the generic 'source'/'target' names used by the
# rest of the notebook and force both to str.
# Written as a chain (no inplace mutation) so the cell can be re-run safely:
# rename() ignores keys that are no longer present, and astype(str) on an
# already-str column is a no-op.
# NOTE(review): astype(str) turns missing values into the literal string 'nan';
# confirm whether such rows should be dropped instead.
lines_raw = (
    lines_raw
    .rename(columns={'English': 'source', 'Hindi': 'target'})
    .astype({'source': str, 'target': str})
)
lines_raw.sample(5)

# 2. Preprocess Dataset

In [None]:
# Characters deleted from every sentence: ASCII punctuation, ASCII digits and
# Devanagari digits. Built once instead of on every call.
_REMOVED_CHARS = str.maketrans('', '', string.punctuation + digits + '०१२३४५६७८९')


def preprocess_sentence(sentence):
    """Normalise one sentence and wrap it in the 'start_' / '_end' markers.

    Steps: lower-case, delete ASCII punctuation and ASCII/Devanagari digits,
    put spaces around any remaining '¿', then collapse all whitespace runs.

    Whitespace is normalised *last*: deleting a digit or punctuation mark that
    sits between two spaces (e.g. "I have 2 cats", "a - b") would otherwise
    leave a double space, which produces empty tokens for callers that split
    on a single space.

    Args:
        sentence: raw sentence as a str.

    Returns:
        'start_ <tokens separated by single spaces> _end'. An input with no
        remaining tokens yields 'start_ _end'.
    """
    sentence = sentence.lower().translate(_REMOVED_CHARS)
    # '?', '.', '!' and ',' are already gone at this point (they are part of
    # string.punctuation); in practice this only pads '¿', which is not.
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    # NOTE(review): the Devanagari danda '।' is not in string.punctuation and
    # therefore stays attached to the preceding word — confirm this is intended.
    tokens = sentence.split()  # splits on any whitespace run, drops empty tokens
    return ' '.join(['start_'] + tokens + ['_end'])

In [None]:
# Show the preprocessing result for a short English/Hindi sentence pair.
eng_sentence = u"With this information."
hin_sentence = u"इस जानकारी के साथ."
eng_preprocessed = preprocess_sentence(eng_sentence)
hin_preprocessed = preprocess_sentence(hin_sentence)
print("English sentence: {}".format(eng_preprocessed))
print("Hindi sentence: {}".format(hin_preprocessed))

In [None]:
# Second example pair; the Hindi sentence ends with a danda ('।').
eng_sentence = u"I felt down the stairs in my haist."
hin_sentence = u"अपनी ही जल्दबाज़ी मे मैं सीढ़ियों से नीचे गिर गया।"
for template, raw_sentence in (
    ("English sentence: {}", eng_sentence),
    ("Hindi sentence: {}", hin_sentence),
):
    print(template.format(preprocess_sentence(raw_sentence)))

### Create Dataset after preprocessing

In [None]:
def create_dataset(lines, num_examples):
    """Preprocess the first `num_examples` sentence pairs of `lines`.

    Rows are selected and iterated by position, so the function also works
    when `lines` does not carry a default 0..n-1 index (e.g. after filtering,
    shuffling or dropping rows). Label-based `lines.loc[i, ...]` lookups would
    raise or pick the wrong rows in that case.

    Args:
        lines: DataFrame with string columns 'source' and 'target'.
        num_examples: number of leading rows to use (None uses all rows).

    Returns:
        Tuple `(eng_sentence, hin_sentence)` of two equally long lists holding
        the preprocessed source and target sentences.
    """
    subset = lines.iloc[:num_examples]
    eng_sentence = [preprocess_sentence(text) for text in subset['source']]
    hin_sentence = [preprocess_sentence(text) for text in subset['target']]
    return (eng_sentence, hin_sentence)

In [None]:
# Preprocess the first `sample_size` rows and freeze both sides as tuples.
sample_size = 20000
source, target = (tuple(sentences) for sentences in create_dataset(lines_raw, sample_size))
print(source[-1])
print(target[-1])
print("Length of source: {}".format(len(source)))
print("Length of target: {}".format(len(target)))
type(target)

In [None]:
def max_length(tensor):
    """Return the length of the longest element of `tensor`."""
    return max(map(len, tensor))

# 3. Generate Tokens

In [None]:
# filters='' keeps every character, so the 'start_' / '_end' markers survive tokenization.
source_sentence_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
source_sentence_tokenizer.fit_on_texts(source)

# Map sentences to id sequences and right-pad them to a common length.
source_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    source_sentence_tokenizer.texts_to_sequences(source),
    padding='post',
)

print(len(source_tensor[0]))

In [None]:
# Same procedure as for the source side: no character filtering, post-padding.
target_sentence_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(target)

target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    target_sentence_tokenizer.texts_to_sequences(target),
    padding='post',
)

print(len(target_tensor[0]))

### Save Tokenizers

In [None]:
# Persist both fitted tokenizers, source first, so they can be reloaded without refitting.
for pickle_path, fitted_tokenizer in (
    ('source_tokenizer.pickle', source_sentence_tokenizer),
    ('target_tokenizer.pickle', target_sentence_tokenizer),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Load Tokenizers

In [None]:
# Reload the tokenizers from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were written by this notebook.
with open('source_tokenizer.pickle', 'rb') as source_handle:
    source_sentence_tokenizer = pickle.load(source_handle)

with open('target_tokenizer.pickle', 'rb') as target_handle:
    target_sentence_tokenizer = pickle.load(target_handle)

In [None]:
# Longest (padded) sequence on each side; used later to bound decoding and to pad inputs.
max_target_length = max_length(target_tensor)
print(max_target_length)

max_source_length = max_length(source_tensor)
print(max_source_length)

In [None]:
# Fixed seed so the 80/20 split (and everything trained on it) is reproducible
# across kernel restarts; without random_state every run gets a different split.
SPLIT_SEED = 42

source_train_tensor, source_test_tensor, target_train_tensor, target_test_tensor = train_test_split(
    source_tensor,
    target_tensor,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

print("Source train tensor: {}".format(len(source_train_tensor)))
print("Source test tensor: {}".format(len(source_test_tensor)))
print("Target train tensor: {}".format(len(target_train_tensor)))
print("Target test tensor: {}".format(len(target_test_tensor)))

In [None]:
# Inspect the container type returned by train_test_split for the source split.
type(source_train_tensor)

In [None]:
def convert(lang, tensor):
    """Print 'id --> word' for every non-zero id in `tensor`.

    `lang` must expose an `index_word` mapping from id to word; ids equal to 0
    (the padding value) are skipped.
    """
    non_padding_ids = (token_id for token_id in tensor if token_id != 0)
    for token_id in non_padding_ids:
        print("{} --> {}".format(token_id, lang.index_word[token_id]))

In [None]:
# Decode the first training pair back to words as a sanity check of the tokenizers.
print("Input Language, index to word mapping...")
convert(source_sentence_tokenizer, source_train_tensor[0])

# The leading empty argument reproduces the blank separator line.
print("", "Target Language, index to word mapping...", sep="\n")
convert(target_sentence_tokenizer, target_train_tensor[0])

In [None]:
# Model / training hyper-parameters.
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

# The shuffle buffer covers the whole training set.
BUFFER_SIZE = len(source_train_tensor)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# +1 because index 0 is used for padding and is not part of word_index.
vocab_inp_size = len(source_sentence_tokenizer.word_index) + 1
vocab_tar_size = len(target_sentence_tokenizer.word_index) + 1

# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((source_train_tensor, target_train_tensor))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
type(dataset)

In [None]:
# Pull one batch to use as sample input for the shape checks below.
example_input_batch, example_target_batch = next(iter(dataset))
tuple(sample_batch.shape for sample_batch in (example_input_batch, example_target_batch))

# 4. Encoder-Decoder Model
### Encoder

In [None]:
class Encoder(tf.keras.Model):
    """Embedding followed by a single GRU that returns all outputs and the final state."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding and GRU layers.

        Args:
            vocab_size: size of the embedding table.
            embedding_dim: embedding width.
            enc_units: number of GRU units.
            batch_sz: batch size used by `initialize_hidden_state`.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    @tf.function
    def call(self, inputs):
        """Encode a batch.

        Args:
            inputs: pair `(token_ids, initial_state)`.

        Returns:
            `(sequence_output, final_state)` as produced by the GRU.
        """
        token_ids, initial_state = inputs
        embedded = self.embedding(token_ids)
        sequence_output, final_state = self.gru(embedded, initial_state=initial_state)
        return sequence_output, final_state

    @tf.function
    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_sz, enc_units)."""
        return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
encoder = Encoder(
    vocab_size=vocab_inp_size,
    embedding_dim=embedding_dim,
    enc_units=units,
    batch_sz=BATCH_SIZE,
)

In [None]:
# Run the example batch through the encoder, starting from the zero state.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder([example_input_batch, sample_hidden])

encoder_output_shape = sample_output.shape
encoder_state_shape = sample_hidden.shape
print('Encoder output shape: (batch size, sequence length, units) {}'.format(encoder_output_shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(encoder_state_shape))

In [None]:
# One shape per line: the example input batch, then the encoder state.
print(example_input_batch.shape, sample_hidden.shape, sep="\n")

### Bahdanau Attention

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, units):
        """Create the two projection layers (width `units`) and the scalar scoring layer."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Attend over `values` using `query`.

        Returns:
            `(context_vector, attention_weights)`: the weighted sum of `values`
            over axis 1, and the softmax weights (normalised over axis 1).
        """
        # Insert an axis at position 1 so the query broadcasts against `values`.
        query_with_time_axis = tf.expand_dims(query, 1)

        projected_values = self.W1(values)
        projected_query = self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected_values + projected_query))

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [None]:
# Small stand-alone attention layer used only for the shape check below.
attention_layer = BahdanauAttention(units=10)

In [None]:
# Query with the encoder state, attend over the encoder outputs.
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

context_shape = attention_result.shape
weights_shape = attention_weights.shape
print("Attention result shape: (batch size, units) {}".format(context_shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(weights_shape))

### Decoder

In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with Bahdanau attention over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the embedding, GRU, output projection and attention layers.

        Args:
            vocab_size: size of the embedding table and of the output logits.
            embedding_dim: embedding width.
            dec_units: number of GRU units (also the attention projection width).
            batch_sz: batch size (stored only).
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences = True,
                                       return_state = True,
                                       recurrent_initializer = 'glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

    @tf.function
    def call(self, inputs):
        """Run one decoding step.

        Args:
            inputs: triple `(x, hidden, enc_output)` — the current input token
                ids, the previous decoder state (the encoder's final state on
                the first step) and the encoder outputs to attend over.

        Returns:
            `(logits, state, attention_weights)`; `logits` is flattened to
            (-1, vocab_size) and `state` is the new GRU state to feed back in
            on the next step.
        """
        x, hidden, enc_output = inputs
        context_vector, attention_weights = self.attention(hidden, enc_output)

        x = self.embedding(x)

        # Prepend the context vector to the embedded input along the feature axis.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis = -1)

        # Carry the recurrent state across steps. Without initial_state the GRU
        # restarted from zeros on every call, so `hidden` only influenced the
        # attention query and the decoder had no memory of earlier outputs.
        output, state = self.gru(x, initial_state = hidden)

        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

In [None]:
decoder = Decoder(
    vocab_size=vocab_tar_size,
    embedding_dim=embedding_dim,
    dec_units=units,
    batch_sz=BATCH_SIZE,
)

In [None]:
# Feed a random (BATCH_SIZE, 1) input through one decoder step to check the output shape.
sample_decoder_input = tf.random.uniform((BATCH_SIZE, 1))
sample_decoder_output, _, _ = decoder([sample_decoder_input, sample_hidden, sample_output])

print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

In [None]:
# Shapes of the three decoder inputs, one per line.
print(
    tf.random.uniform((BATCH_SIZE, 1)).shape,
    sample_hidden.shape,
    sample_output.shape,
    sep="\n",
)

# 5. Define the Optimizer and the Loss Function

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so padding can be masked out below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    """Cross-entropy between `real` ids and `pred` logits, ignoring padding.

    Positions where `real` equals 0 contribute a loss of zero; the mean is
    taken over all positions, masked ones included.
    """
    per_example_loss = loss_object(real, pred)
    not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_example_loss.dtype)
    return tf.reduce_mean(per_example_loss * not_padding)

# 6. Checkpoints (Object-based Saving)

In [None]:
# Object-based checkpoint tracking the optimizer and both models.
checkpoint_dir = 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one optimisation step on a single batch using teacher forcing.

    Relies on the notebook-level `encoder`, `decoder`, `optimizer`,
    `loss_function`, `target_sentence_tokenizer` and `BATCH_SIZE`.

    Args:
        inp: batch of source id sequences.
        targ: batch of target id sequences; column 0 is expected to hold the
            'start_' id that preprocessing prepends to every sentence.
        enc_hidden: initial encoder state.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0

    # Everything that must be differentiated happens inside the tape.
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder([inp, enc_hidden])
        # The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden
        # First decoder input: a (BATCH_SIZE, 1) column filled with the 'start_' id.
        dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        # Starts at t = 1 because position 0 is the 'start_' token fed in above.
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder([dec_input, dec_hidden, enc_output])
            loss += loss_function(targ[:, t], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported value is normalised by sequence length; gradients use the un-normalised sum.
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

# 7. Training the Model

In [None]:
# Display the number of batches per epoch computed in the configuration cell.
steps_per_epoch

In [None]:
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()
    epoch_number = epoch + 1  # 1-based index used for logging and the checkpoint cadence

    # Every epoch starts from a fresh all-zero encoder state.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        # Log every 100th batch.
        if not batch % 100:
            print('Epoch {} Batch {} loss {}'.format(epoch_number, batch, batch_loss.numpy()))

    # Write a checkpoint after every second epoch.
    if epoch_number % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    mean_epoch_loss = total_loss / steps_per_epoch
    print('Epoch {} Loss {:.4f}'.format(epoch_number, mean_epoch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

### Restore Checkpoint

In [None]:
# Look up the most recent checkpoint in `checkpoint_dir` and restore from it.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
checkpoint.restore(latest_checkpoint_path)

# 8. Save Encoder-Decoder models and weights

In [None]:
# Export the encoder as a whole model (save_format='tf') to the 'encoder' path.
encoder.save('encoder', save_format = 'tf')

In [None]:
# Export the decoder as a whole model (save_format='tf') to the 'decoder' path.
decoder.save('decoder', save_format = 'tf')

In [None]:
# Additionally store only the weights of each model, encoder first.
for trained_model, weights_prefix in (
    (encoder, 'encoder_weights/encoder'),
    (decoder, 'decoder_weights/decoder'),
):
    trained_model.save_weights(weights_prefix)

# 9. Evaluate the Model

In [None]:
def evaluate(sentence):
    """Greedily translate one source sentence with the trained encoder/decoder.

    Relies on the notebook-level tokenizers, `encoder`, `decoder`, `units`,
    `max_source_length` and `max_target_length`.

    Args:
        sentence: raw source-language sentence.

    Returns:
        `(result, flag)`. On success `flag` is True and `result` is the
        predicted words, each followed by a single space, ending with '_end '
        when the decoder emitted the end marker before `max_target_length`
        steps. If a word is missing from the source vocabulary, or the decoder
        predicts an id that has no word (e.g. the padding id 0), `flag` is
        False and `result` is a fixed apology message.
    """
    failure = ("Sorry we didn't find any expected translation for your entered word/sentence", False)

    sentence = preprocess_sentence(sentence)
    try:
        # split() without an argument drops empty tokens, so repeated spaces in
        # the preprocessed text no longer cause a spurious '' vocabulary miss.
        inputs = [source_sentence_tokenizer.word_index[token] for token in sentence.split()]
        start_id = target_sentence_tokenizer.word_index['start_']
    except KeyError:
        return failure

    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen = max_source_length, padding = 'post')
    inputs = tf.convert_to_tensor(inputs)
    hidden = [tf.zeros((1, units))]

    enc_out, enc_hidden = encoder([inputs, hidden])
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([start_id], 0)

    result = ''
    for _step in range(max_target_length):
        # The attention weights returned by the decoder are not needed here.
        predictions, dec_hidden, _attention = decoder([dec_input, dec_hidden, enc_out])
        predicted_id = tf.argmax(predictions[0]).numpy()

        predicted_word = target_sentence_tokenizer.index_word.get(predicted_id)
        if predicted_word is None:
            # Same outcome as before (a KeyError used to be caught for this
            # case), now handled explicitly.
            return failure

        result += predicted_word + ' '
        if predicted_word == '_end':
            break

        # Greedy decoding: the prediction becomes the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, True

# 10. Translate Input to Generate Output

In [None]:
def translate(sentence):
    """Translate `sentence` with `evaluate` and print the outcome.

    On success the trailing '_end ' marker is removed before printing, but only
    when it is actually present. Decoding can also stop at the maximum target
    length without emitting the marker, and an unconditional `result[:-5]`
    would then cut off the last five characters of the real translation.
    On failure the message returned by `evaluate` is printed unchanged.
    """
    end_marker = '_end '
    result, flag = evaluate(sentence)
    if flag:
        if result.endswith(end_marker):
            result = result[:-len(end_marker)]
        print('Predicted translation: {}'.format(result))
    else:
        print(result)

In [None]:
# Try the model on a sample sentence; translate() prints the prediction or the fallback message.
translate(u'I am going to work.')

In [None]:
# Another sample; any word missing from the source vocabulary yields the fallback message.
translate(u'You need to work smart.')

In [None]:
# Single-word input.
translate(u'hello')

In [None]:
# Input without terminal punctuation (preprocess_sentence strips ASCII punctuation anyway).
translate(u'I should take rest')

In [None]:
# Input containing a place name.
translate(u'I live in Bangalore')