In [1]:
import tensorflow as tf
import unicodedata
import re
import numpy as np
import os
import io
import time

# Import Encoder and Decoder

In [2]:
import tensorflow as tf
import numpy as np

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.LSTM = tf.keras.layers.LSTM(self.enc_units,
                                   return_sequences=True,
                                   return_state=True)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state_h, state_c = self.LSTM(x, initial_state = hidden)
        return output, state_h, state_c

    def initialize_hidden_state(self):
        return (tf.zeros([self.batch_sz, self.enc_units]),
                tf.zeros([self.batch_sz, self.enc_units]))





class LuongAttention(tf.keras.Model):
    def __init__(self, rnn_size, attention_func):
        super(LuongAttention, self).__init__()
        self.attention_func = attention_func

        if attention_func not in ['dot', 'general', 'concat']:
            raise ValueError(
                'Unknown attention score function! Must be either dot, general or concat.')

        if attention_func == 'general':
            # General score function
            self.wa = tf.keras.layers.Dense(rnn_size)
        elif attention_func == 'concat':
            # Concat score function
            self.wa = tf.keras.layers.Dense(rnn_size, activation='tanh')
            self.va = tf.keras.layers.Dense(1)

    def call(self, decoder_output, encoder_output):
        if self.attention_func == 'dot':
            # decoder_output has shape: (batch_size, 1, rnn_size)
            # encoder_output has shape: (batch_size, max_len, rnn_size)
            # score has shape: (batch_size, 1, max_len)
            score = tf.matmul(decoder_output, encoder_output, transpose_b=True)
        elif self.attention_func == 'general':

            # score has shape: (batch_size, 1, max_len)
            score = tf.matmul(decoder_output, self.wa(
                encoder_output), transpose_b=True)
        elif self.attention_func == 'concat':

            decoder_output = tf.tile(
                decoder_output, [1, encoder_output.shape[1], 1])

            score = self.va(
                self.wa(tf.concat((decoder_output, encoder_output), axis=-1)))

            # (batch_size, max_len, 1) => (batch_size, 1, max_len)
            score = tf.transpose(score, [0, 2, 1])

        alignment = tf.nn.softmax(score, axis=2)

        # context vector c_t is the weighted average sum of encoder output
        context = tf.matmul(alignment, encoder_output)

        return context, alignment


class LoungDecoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, rnn_size, attention_func):
        super(LoungDecoder, self).__init__()
        self.attention = LuongAttention(rnn_size, attention_func)
        self.rnn_size = rnn_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.lstm = tf.keras.layers.LSTM(
            rnn_size, return_sequences=True, return_state=True)
        self.wc = tf.keras.layers.Dense(rnn_size, activation='tanh')
        self.ws = tf.keras.layers.Dense(vocab_size)

    def call(self, sequence, state, encoder_output):
        # shape of sequence is (batch_size, 1)
        embed = self.embedding(sequence)
        # shape of embed becomes (batch_size , 1 , embedding_size)

        # the lstm_out has shape (batch_size, 1, rnn_size)
        lstm_out, state_h, state_c = self.lstm(embed, initial_state=state)

        # Use self.attention to compute the context and alignment vectors
        # context vector's shape: (batch_size, 1, rnn_size)
        # alignment vector's shape: (batch_size, 1, source_length)
        context, alignment = self.attention(lstm_out, encoder_output)

        # Combine the context vector and the LSTM output
        # Before combined, both have shape of (batch_size, 1, rnn_size),
        # so let's squeeze the axis 1 first
        # After combined, it will have shape of (batch_size, 2 * rnn_size)
        lstm_out = tf.concat(
            [tf.squeeze(context, 1), tf.squeeze(lstm_out, 1)], 1)

        # lstm_out now has shape (batch_size, rnn_size)
        lstm_out = self.wc(lstm_out)

        # Finally, it is converted back to vocabulary space: (batch_size, vocab_size)
        logits = self.ws(lstm_out)

        return logits, state_h, state_c, alignment



# Load Dataset

In [3]:
lines = io.open('../input/hindienglish/hin.txt', encoding='UTF-8').read().strip().split('\n')

word_pairs = [[a for a in l.split('\t')]  for l in lines]
for i in range(len(word_pairs)):
    word_pairs[i] = word_pairs[i][:2]

In [4]:
word_pairs[1000:1010]

[['Let me know your address.', 'मुझे अपना पता बतादेना।'],
 ['My father died of cancer.', 'मेरे पिताजी कैंसर से चल बसे।'],
 ["Our team isn't very good.", 'हमारी टीम बहुत अच्छी नहीं है।'],
 ['Please wait five minutes.', 'कृपया पाँच मिनट ठहरिए।'],
 ['She asked us to be quiet.', 'उसने हमें चुप रहने के लिए कहा।'],
 ['She is an obstinate girl.', 'वह एक ज़िद्दी लड़की है।'],
 ['She left the baby crying.', 'उसने बच्चे को रोते हुए छोड़ दिया।'],
 ['She refused to notice me.', 'उसने मुझे ध्यान में लेने से इनकार करदिआ।'],
 ['Ten years is a long time.', 'दस साल बहुत लम्बा समय होता है।'],
 ['The doctor felt my pulse.', 'डॉक्टर ने मेरी नब्ज़ ली।']]

# Preprocess Data

In [5]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
    if unicodedata.category(c) != 'Mn')


def preprocess_sentence_en(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([?.!|,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = s.strip()
    s = '<start> ' + s + ' <end>'
    return s

def preprocess_sentence_hn(s):
    s = s.lower().strip()
    s = re.sub(r"([?.!|,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = s.strip()
    s = '<start> ' + s + ' <end>'
    return s  

def tokenize(lang):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
    tokenizer.fit_on_texts(lang)

    tensor = tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, tokenizer

In [6]:
inp_lang = [preprocess_sentence_en(p[0]) for p in word_pairs]
target_lang = [preprocess_sentence_hn(p[1]) for p in word_pairs]

input_tensor, input_lang_tokenizer = tokenize(inp_lang)
target_tensor, target_lang_tokenizer = tokenize(target_lang)

In [7]:
print(input_tensor.shape)
print(target_tensor.shape)

(2923, 27)
(2923, 29)


In [8]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]


In [9]:
BUFFER_SIZE = len(input_tensor)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(input_lang_tokenizer.word_index)+1
vocab_tar_size = len(target_lang_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Initialize Encoder and Decoder models

In [10]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

attention_decoder = LoungDecoder(vocab_tar_size, embedding_dim, units, 'concat')


In [11]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

# Training

## Define single train step

In [12]:

def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp, enc_hidden)

        dec_hidden_h = enc_hidden_h
        dec_hidden_c = enc_hidden_c


        for t in range(1, targ.shape[1]):
            dec_input = tf.expand_dims(targ[:, t-1], 1)
      
            predictions, dec_hidden_h, dec_hidden_c, _ = attention_decoder(dec_input, (dec_hidden_h , dec_hidden_c), enc_output)

            loss += loss_function(targ[:, t], predictions)


    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + attention_decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [13]:
def translate(sentence):
    sentence = preprocess_sentence_en(sentence)

    inputs = [input_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = (tf.zeros([1, units]), tf.zeros([1, units]))

    enc_out, dec_hidden_h, dec_hidden_c = encoder(inputs, hidden)

    dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden_h, dec_hidden_c, _ = attention_decoder(dec_input, (dec_hidden_h, dec_hidden_c), enc_out)



        predicted_id = tf.argmax(predictions[0]).numpy()
    # print(predicted_id)
        result += target_lang_tokenizer.index_word[predicted_id] + ' '

        if target_lang_tokenizer.index_word[predicted_id] == '<end>':
            break

    # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

## Start Training

In [14]:
EPOCHS = 40

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))


    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.2512
Epoch 1 Loss 1.8286
Time taken for 1 epoch 446.14481830596924 sec

Epoch 2 Batch 0 Loss 1.5934
Epoch 2 Loss 1.5179
Time taken for 1 epoch 444.03094816207886 sec

Epoch 3 Batch 0 Loss 1.3246
Epoch 3 Loss 1.3990
Time taken for 1 epoch 443.4099473953247 sec

Epoch 4 Batch 0 Loss 1.3630
Epoch 4 Loss 1.3086
Time taken for 1 epoch 447.27841424942017 sec

Epoch 5 Batch 0 Loss 1.2071
Epoch 5 Loss 1.2272
Time taken for 1 epoch 443.7384581565857 sec

Epoch 6 Batch 0 Loss 1.1014
Epoch 6 Loss 1.1434
Time taken for 1 epoch 442.9027154445648 sec

Epoch 7 Batch 0 Loss 1.0438
Epoch 7 Loss 1.0629
Time taken for 1 epoch 445.6185941696167 sec

Epoch 8 Batch 0 Loss 1.0521
Epoch 8 Loss 0.9796
Time taken for 1 epoch 442.891535282135 sec

Epoch 9 Batch 0 Loss 0.8965
Epoch 9 Loss 0.8935
Time taken for 1 epoch 440.72787976264954 sec

Epoch 10 Batch 0 Loss 0.7244
Epoch 10 Loss 0.8068
Time taken for 1 epoch 444.0262999534607 sec

Epoch 11 Batch 0 Loss 0.6670
Epoch 11 Loss 0.7256
Time 

# Translate

In [15]:
translate(u'this is book.')

Input: <start> this is book . <end>
Predicted translation: यह किताब है। <end> 


In [16]:
translate(u'are you at home ?')

Input: <start> are you at home ? <end>
Predicted translation: तुम घर पे हो क्या ? <end> 


In [17]:
translate(u'what is this ?')

Input: <start> what is this ? <end>
Predicted translation: यह क्या है ? <end> 
