## Setup

In [1]:
!pip install tensorflow-addons==0.11.2

In [2]:
import tensorflow as tf
import tensorflow_addons as tfa
from IPython.display import HTML as html_print
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
from tensorflow.keras.layers import Embedding, SimpleRNNCell, GRUCell, Dense, LSTMCell

## Data Cleaning and Data Preparation 


In [3]:
# def download_nmt():
#     path_to_zip = tf.keras.utils.get_file(
#     'dakshina_dataset_v1.0.tar', origin='https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar',
#     extract=True, untar = True)

#     path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"
#     return path_to_file
# download_nmt()

### Define a DakshinaDataset class with necessary functions to follow Step 1 to Step 4. 
The ```call()``` will return:
1. ```train_dataset```, ```val_dataset``` and ```test_dataset``` : ```tf.data.Dataset``` objects
2. ```inp_lang_tokenizer``` and ```targ_lang_tokenizer``` : ```tf.keras.preprocessing.text.Tokenizer``` objects 

In [7]:
# Download the Dakshina archive and unpack it into the working directory.
!wget  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf 'dakshina_dataset_v1.0.tar'
# Paths to the Hindi ("hi") sampled transliteration lexicon splits inside the extracted archive.
train_file_path = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
val_file_path= "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
test_file_path  = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"

In [8]:
class DakshinaDataset:
    """Reads the Dakshina lexicon TSV files and builds character-level datasets.

    Each line of a lexicon file is split on tabs; the first column is used as
    the target text and the second column as the input text (see
    ``load_dataset``). Character tokenizers are fit on the training split only
    and reused for the validation and test splits.
    """

    # Padded lengths used for validation/test data when no training split has
    # been loaded by this instance yet.
    DEFAULT_MAX_LENGTH_INPUT = 22
    DEFAULT_MAX_LENGTH_TARGET = 21

    def __init__(self, problem_type='en-spa'):
        self.problem_type = problem_type
        self.inp_lang_tokenizer = None
        self.targ_lang_tokenizer = None
        self.num_of_train = 0
        self.num_of_test = 0
        self.num_of_val = 0
        # Set from the padded training tensors so that validation/test tensors
        # get the same width as the training tensors.
        self.max_length_input = None
        self.max_length_target = None

    def unicode_to_ascii(self, s):
        """Return ``s`` NFD-normalised with all combining marks (category Mn) removed."""
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    ## Step 1 and Step 2
    def preprocess_sentence(self, w):
        """Wrap ``w`` in the start (tab) and end (newline) marker characters.

        The markers let the model know when to start and stop predicting.
        """
        return '\t' + w + '\n'

    def create_dataset(self, path, data_name):
        """Read a tab-separated lexicon file and return its columns.

        Args:
            path: path to the UTF-8 encoded TSV file.
            data_name: ``"train"``, ``"val"`` or anything else (counted as
                test); selects which ``num_of_*`` counter is updated.

        Returns:
            An iterator over the columns of the file, each column being a
            tuple of strings already wrapped by ``preprocess_sentence``.
        """
        # Close the file deterministically and ignore empty lines, so the last
        # record is kept whether or not the file ends with a newline.
        with io.open(path, encoding='UTF-8') as handle:
            lines = [line for line in handle.read().split('\n') if line]

        if data_name == "train":
            self.num_of_train = len(lines)
        elif data_name == "val":
            self.num_of_val = len(lines)
        else:
            self.num_of_test = len(lines)

        word_pairs = [[self.preprocess_sentence(w) for w in line.split('\t')] for line in lines]
        return zip(*word_pairs)

    # Step 3 and Step 4
    def tokenize(self, lang):
        """Fit a character-level tokenizer on ``lang`` and encode it.

        Args:
            lang: sequence of strings in one language.

        Returns:
            ``(tensor, tokenizer)`` where ``tensor`` holds the integer ids of
            every string, post-padded to the longest sequence.
        """
        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level = True)
        lang_tokenizer.fit_on_texts(lang)

        # Convert every string into the list of integer ids of its characters.
        tensor = lang_tokenizer.texts_to_sequences(lang)

        # Pad all id sequences to the length of the longest one.
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

        return tensor, lang_tokenizer

    def load_dataset(self, path, data_name = None, ):
        """Load one split and return its padded id tensors.

        For ``data_name == "train"`` new tokenizers are fit and returned as
        ``(input_tensor, target_tensor, inp_tokenizer, targ_tokenizer)``.
        For any other split the tokenizers stored on the instance are reused
        and ``(input_tensor, target_tensor)`` is returned; the tokenizers must
        therefore have been assigned beforehand (``call`` does this).
        """
        # Column 0 is the target text, column 1 the input text; the third
        # column is discarded.
        targ_lang, inp_lang, _ = self.create_dataset(path, data_name)
        if data_name == "train":
            input_tensor, inp_lang_tokenizer = self.tokenize(inp_lang)
            target_tensor, targ_lang_tokenizer = self.tokenize(targ_lang)
            # Remember the padded widths so the other splits can match them.
            self.max_length_input = input_tensor.shape[1]
            self.max_length_target = target_tensor.shape[1]
            return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

        input_maxlen = self.max_length_input or self.DEFAULT_MAX_LENGTH_INPUT
        target_maxlen = self.max_length_target or self.DEFAULT_MAX_LENGTH_TARGET

        input_tensor = self.inp_lang_tokenizer.texts_to_sequences(inp_lang)
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding='post', maxlen = input_maxlen)
        target_tensor = self.targ_lang_tokenizer.texts_to_sequences(targ_lang)
        target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post', maxlen = target_maxlen)
        return input_tensor, target_tensor

    def _to_batched_dataset(self, input_tensor, target_tensor, BUFFER_SIZE, BATCH_SIZE):
        """Build a shuffled dataset of full-size batches (the remainder is dropped)."""
        dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
        return dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

    def call(self,  BUFFER_SIZE, BATCH_SIZE):
        """Build the train, validation and test datasets.

        Reads the module-level ``train_file_path``, ``val_file_path`` and
        ``test_file_path``.

        Returns:
            ``(train_dataset, val_dataset, test_dataset, inp_lang_tokenizer,
            targ_lang_tokenizer)``.
        """
        input_tensor_train, target_tensor_train, self.inp_lang_tokenizer, self.targ_lang_tokenizer = self.load_dataset(train_file_path, data_name ="train" )
        train_dataset = self._to_batched_dataset(input_tensor_train, target_tensor_train, BUFFER_SIZE, BATCH_SIZE)
        print("val")
        input_tensor_val, target_tensor_val = self.load_dataset(val_file_path, "val")
        val_dataset = self._to_batched_dataset(input_tensor_val, target_tensor_val, BUFFER_SIZE, BATCH_SIZE)
        print("test")
        input_tensor_test, target_tensor_test = self.load_dataset(test_file_path,  "test")
        test_dataset = self._to_batched_dataset(input_tensor_test, target_tensor_test, BUFFER_SIZE, BATCH_SIZE)

        return train_dataset, val_dataset, test_dataset, self.inp_lang_tokenizer, self.targ_lang_tokenizer

In [9]:
# Shuffle buffer size and batch size passed to DakshinaDataset.call below.
BUFFER_SIZE = 32000
BATCH_SIZE = 512
# Let's limit the #training examples for faster training
# NOTE(review): num_examples is not referenced by any cell visible here — confirm whether it is still needed.
num_examples = 500

# Build the batched train/val/test datasets and the two character tokenizers.
dataset_creator = DakshinaDataset('en-hi')
train_dataset, val_dataset, test_dataset, inp_lang, targ_lang = dataset_creator.call( BUFFER_SIZE, BATCH_SIZE)

In [10]:
# Pull one batch from the training dataset and display the shapes of its input and target tensors.
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

### Some important parameters

In [11]:
# Vocabulary sizes: number of tokenizer entries plus one (id 0 is used for padding by pad_sequences).
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
# Padded sequence lengths, taken from the second dimension of the example batch.
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

# Embedding width and recurrent unit count used by the test Encoder/Decoder below.
embedding_dim = 256
units = 1024



In [12]:
# Label the tuple displayed below with the names of the variables it shows
# (the data here is the Hindi transliteration lexicon, not English/Spanish).
print("max_length_input, max_length_output, vocab_inp_size, vocab_tar_size")
max_length_input, max_length_output, vocab_inp_size, vocab_tar_size

In [13]:
##### 

class Encoder(tf.keras.Model):
  """Embedding followed by a stack of recurrent cells wrapped in one RNN layer.

  Args:
    vocab_size: size of the input vocabulary (embedding rows).
    embedding_dim: embedding width.
    enc_units: number of units in every recurrent cell.
    batch_sz: batch size used when building the zero initial state.
    num_of_layers: number of stacked recurrent cells.
    enc_unit_type: ``"lstm"``, ``"rnn"`` or ``"gru"``.
    dropout: input dropout rate passed to every cell.
    recurrent_dropout: recurrent dropout rate passed to every cell.
  """

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, num_of_layers, enc_unit_type, dropout, recurrent_dropout):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.num_of_layers = num_of_layers
    self.enc_unit_type = enc_unit_type
    self.dropout = dropout
    self.recurrent_dropout = recurrent_dropout
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # Recurrent stack of the encoder (cell type chosen by enc_unit_type).
    self.encoder_layer = self.get_encoder_layer(self.enc_units,
                                                self.num_of_layers, self.enc_unit_type)

  def get_encoder_layer(self, enc_units, num_of_layers, enc_unit_type):
    """Return an RNN layer over ``num_of_layers`` stacked cells that yields sequences and states."""
    cells = [self.get_cell(enc_unit_type, enc_units) for _ in range(num_of_layers)]
    return tf.keras.layers.RNN(tf.keras.layers.StackedRNNCells(cells),
                               return_sequences=True, return_state=True, name = "Encoder")

  def get_cell(self, cell_type = "lstm", num_of_cell = 1, name = None):
    """Create one recurrent cell of the requested type.

    Args:
      cell_type: ``"lstm"``, ``"rnn"`` or ``"gru"``.
      num_of_cell: number of units in the cell.
      name: unused; kept for interface compatibility.

    Raises:
      ValueError: if ``cell_type`` is not one of the supported names.
    """
    if cell_type == "lstm":
      return LSTMCell(num_of_cell, dropout = self.dropout, recurrent_dropout = self.recurrent_dropout)
    if cell_type == "rnn":
      return SimpleRNNCell(num_of_cell, dropout = self.dropout, recurrent_dropout = self.recurrent_dropout)
    if cell_type == "gru":
      return GRUCell(num_of_cell, dropout = self.dropout, recurrent_dropout = self.recurrent_dropout)
    # Fail here rather than handing None to StackedRNNCells, which would
    # surface later as a much less readable error.
    raise ValueError(f"Invalid cell type: {cell_type}")

  def call(self, x, hidden):
    """Embed ``x`` and run the recurrent stack from ``hidden``.

    Returns:
      ``(sequence_output, states)`` where ``states`` is everything the RNN
      layer returned after its first output.
    """
    x = self.embedding(x)
    output = self.encoder_layer(x, initial_state = hidden)
    return output[0], output[1:]

  def initialize_hidden_state(self):
    """Return an all-zero initial state with one entry per stacked layer.

    For ``"rnn"`` and ``"gru"`` every entry is a single tensor; otherwise
    (LSTM) every entry is a two-element list of tensors.
    """
    def zeros():
      return tf.zeros((self.batch_sz, self.enc_units))

    if self.enc_unit_type == 'rnn' or self.enc_unit_type == "gru":
      return [zeros() for _ in range(self.num_of_layers)]
    # Build a fresh pair per layer instead of repeating one list object.
    return [[zeros(), zeros()] for _ in range(self.num_of_layers)]

In [14]:
## Test Encoder Stack

# Single-layer LSTM encoder with dropout and recurrent dropout of 0.2.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE, 1, "lstm", 0.2,0.2)


# sample input
# Run the example batch through the encoder starting from the zero state.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_state= encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
# Number of state entries returned by the encoder's RNN layer.
print(len(sample_state))
# print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_state[0].shape))
# print ('Encoder c vector shape: (batch size, units) {}'.format(sample_state[1].shape))

In [15]:
class Decoder(tf.keras.Model):
  """Attention decoder assembled from tensorflow-addons seq2seq components.

  The stacked recurrent cells are wrapped in ``tfa.seq2seq.AttentionWrapper``
  and driven by a ``tfa.seq2seq.BasicDecoder`` with a ``TrainingSampler``.
  The class reads the module-level ``max_length_input`` and
  ``max_length_output`` for the sequence lengths it passes to tfa.

  Args:
    vocab_size: size of the target vocabulary (embedding rows, output logits).
    embedding_dim: embedding width.
    dec_units: number of units per recurrent cell and attention size.
    batch_sz: batch size used for the sequence-length lists.
    num_of_layers: number of stacked recurrent cells.
    dec_unit_type: ``"lstm"``, ``"rnn"`` or ``"gru"``.
    dropout: input dropout rate passed to every cell.
    recurrent_dropout: recurrent dropout rate passed to every cell.
    attention_type: ``'bahdanau'`` selects Bahdanau attention; any other
      value selects Luong attention.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, num_of_layers, dec_unit_type, dropout, recurrent_dropout, attention_type='luong',):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.attention_type = attention_type
    self.num_of_layers = num_of_layers
    self.dec_unit_type = dec_unit_type
    self.dropout = dropout
    self.recurrent_dropout = recurrent_dropout
    # Embedding Layer
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # Final Dense layer on which softmax will be applied
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Define the fundamental cell for decoder recurrent structure
    self.decoder_rnn_cell = self.get_stacked_rnn_cell()

    # Sampler
    self.sampler = tfa.seq2seq.sampler.TrainingSampler()

    # Create attention mechanism with memory = None; the memory is supplied
    # later through attention_mechanism.setup_memory(...).
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units,
                                                              None, self.batch_sz*[max_length_input], self.attention_type)

    # Wrap attention mechanism with the fundamental rnn cell of decoder
    self.rnn_cell = self.build_rnn_cell(batch_sz)

    # Define the decoder with respect to fundamental rnn cell
    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

  def build_rnn_cell(self, batch_sz):
    """Wrap the stacked cell with attention; ``batch_sz`` is accepted but not used."""
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell,
                                  self.attention_mechanism, attention_layer_size=self.dec_units, alignment_history = True)
    return rnn_cell

  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
    """Create the attention mechanism.

    Args:
      dec_units: final dimension of attention outputs.
      memory: encoder outputs to attend over, or ``None`` to set them later.
      memory_sequence_length: per-example lengths used for masking.
      attention_type: ``'bahdanau'`` for Bahdanau attention, otherwise Luong.
    """
    if(attention_type=='bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

  def build_initial_state(self, batch_sz, encoder_state, Dtype):
    """Return the wrapper's zero state with its cell state replaced by ``encoder_state``."""
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return decoder_initial_state

  def call(self, inputs, initial_state):
    """Embed ``inputs`` and decode them from ``initial_state``; returns the BasicDecoder outputs."""
    x = self.embedding(inputs)
    outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
    return outputs

  def get_cell(self, cell_type = "lstm", num_of_cell = 1, name = None):
    """Create one recurrent cell of the requested type.

    Args:
      cell_type: ``"lstm"``, ``"rnn"`` or ``"gru"``.
      num_of_cell: number of units in the cell.
      name: unused; kept for interface compatibility.

    Raises:
      ValueError: if ``cell_type`` is not one of the supported names.
    """
    if cell_type == "lstm":
      return LSTMCell(num_of_cell, dropout = self.dropout, recurrent_dropout = self.recurrent_dropout)
    if cell_type == "rnn":
      return SimpleRNNCell(num_of_cell, dropout = self.dropout, recurrent_dropout = self.recurrent_dropout)
    if cell_type == "gru":
      return GRUCell(num_of_cell, dropout = self.dropout, recurrent_dropout = self.recurrent_dropout)
    # Fail here rather than handing None to StackedRNNCells.
    raise ValueError(f"Invalid cell type: {cell_type}")

  def get_stacked_rnn_cell(self,):
    """Return ``num_of_layers`` cells of ``dec_unit_type`` stacked into one cell."""
    return tf.keras.layers.StackedRNNCells( [self.get_cell(self.dec_unit_type, self.dec_units,) for _ in range(self.num_of_layers)])


In [16]:
# Test decoder stack

# Single-layer LSTM decoder with Luong attention.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE,1, "lstm", 0.2,0.2,  'luong')
# Random stand-in for a batch of decoder inputs.
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
# Give the attention mechanism the encoder outputs before building the initial state.
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE,tuple(sample_state), tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)


## Define the optimizer and the loss function

In [17]:
# RMSprop with default hyperparameters; used by train_step and stored in the checkpoint.
optimizer = tf.keras.optimizers.RMSprop()


def loss_function(real, pred):
  """Sparse categorical cross-entropy on logits with padded positions zeroed.

  Args:
    real: integer target ids; positions equal to 0 are treated as padding.
    pred: logits with one score per target vocabulary entry.

  Returns:
    The mean over all positions of the per-position loss, where the loss at
    padded positions has been multiplied by zero.
  """
  per_position = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true=real, y_pred=pred)
  # 1 where the target is a real token, 0 where it is padding.
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_position.dtype)
  return tf.reduce_mean(keep * per_position)

## Checkpoints (Object-based saving)

In [18]:
# Checkpoints are written under ./training_checkpoints with the file prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with the encoder and decoder created above.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## One validation step and one training step

In [19]:
@tf.function
def val_step(inp, targ, enc_hidden):
  """Run one forward pass on a validation batch without updating weights.

  Uses the module-level ``encoder``, ``decoder``, ``metric`` and
  ``BATCH_SIZE``.

  Args:
    inp: batch of input id sequences.
    targ: batch of target id sequences.
    enc_hidden: initial encoder state.

  Returns:
    ``(loss, accuracy)`` where ``accuracy`` is the running value of ``metric``
    since its last reset, converted with ``.numpy()`` (this requires eager
    execution of the function).
  """
  # No gradient tape here: validation never computes gradients.
  enc_output, enc_state = encoder(inp, enc_hidden)

  dec_input = targ[:, :-1]  # decoder input: every position except the last
  real = targ[:, 1:]        # expected output: every position except the first

  # Set the AttentionMechanism object with encoder_outputs
  decoder.attention_mechanism.setup_memory(enc_output)

  # Create AttentionWrapperState as initial_state for decoder
  decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, tuple(enc_state), tf.float32)
  pred = decoder(dec_input, decoder_initial_state)
  logits = pred.rnn_output
  loss = loss_function(real, logits)
  metric.update_state(real, logits)
  return loss, metric.result().numpy()

In [20]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimisation step on a training batch.

  Uses the module-level ``encoder``, ``decoder``, ``optimizer``, ``metric``
  and ``BATCH_SIZE``.

  Args:
    inp: batch of input id sequences.
    targ: batch of target id sequences.
    enc_hidden: initial encoder state.

  Returns:
    ``(loss, accuracy)`` where ``accuracy`` is the running value of ``metric``
    since its last reset, converted with ``.numpy()`` (this requires eager
    execution of the function).
  """
  # NOTE(review): encoder and decoder are called without an explicit training
  # flag — confirm that dropout is active here as intended.
  with tf.GradientTape() as tape:
    enc_output, enc_state = encoder(inp, enc_hidden)

    dec_input = targ[:, :-1]  # decoder input: every position except the last
    real = targ[:, 1:]        # expected output: every position except the first

    # Set the AttentionMechanism object with encoder_outputs
    decoder.attention_mechanism.setup_memory(enc_output)

    # Create AttentionWrapperState as initial_state for decoder
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, tuple(enc_state), tf.float32)
    pred = decoder(dec_input, decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real, logits)

  # The metric is bookkeeping only, so it is updated outside the tape.
  metric.update_state(real, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss, metric.result().numpy()

## Train the model

In [21]:
EPOCHS = 20
# Accuracy over every target position; no sample_weight is passed in the step
# functions, so padded positions are counted as well.
metric = tf.keras.metrics.SparseCategoricalAccuracy()
# train_step / val_step call `.numpy()` on the metric result, which needs eager execution.
tf.config.run_functions_eagerly(True)

step_per_val_epoch = dataset_creator.num_of_val//BATCH_SIZE
steps_per_epoch = dataset_creator.num_of_train//BATCH_SIZE

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  metric.reset_state()
  print("="*80)
  print("TRAINING")
  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss, batch_acc = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 10 == 0:
      print('\t Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy(), batch_acc*100 ))
  # The step functions return the metric's *running* value, so the value after
  # the last batch already is the accuracy of the whole epoch; averaging the
  # per-batch running values would weight early batches too heavily.
  train_accuracy = float(metric.result().numpy())

  metric.reset_state()
  total_val_loss = 0
  print("="*80)
  print("VALIDATING")
  for (batch, (inp, targ)) in enumerate(val_dataset.take(step_per_val_epoch)):
    val_batch_loss, _ = val_step(inp, targ, enc_hidden)
    total_val_loss += val_batch_loss
  val_accuracy = float(metric.result().numpy())

  # max(..., 1) avoids a division by zero when a split is smaller than one batch.
  print(f"Validatiion loss:  {float(total_val_loss) / max(step_per_val_epoch, 1)}")
  print((f"Validatiion Acc:  {val_accuracy*100}"))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)
  print("Accuracy ", train_accuracy*100)
  print('Epoch {} Loss {:.4f} Acc {:.4f}'.format(epoch + 1,
                                      float(total_loss) / max(steps_per_epoch, 1),
                                      train_accuracy*100
                                      ))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

## Hyperparameter sweep


In [None]:
# pprint is used below but is not imported in the notebook's import cell.
import pprint

# Sweep definition: Bayesian search maximising the metric named 'Val Accuracy'.
# Every parameter currently lists a single candidate value.
sweep_config = {

    'method':'bayes',
    'metric': {
        'name':'Val Accuracy',
        'goal':'maximize'
    },
    'parameters':{

    "num_of_layer" : {'values': [1]},
    "unit_size": {"values":[256]},
    "unit_type": {"values":["lstm"]},
    "dropout": {"values": [0.3]},
    'recurrent_dropout':{'values':[0.3]},
    "epochs":{"value":17},
    "encoder_embedding_dim":{"values": [1024]},
    "decoder_embedding_dim":{"values": [256]},
    "optimizer":{"values": ["rmsprop"]}
                   }
}
pprint.pprint(sweep_config)

In [None]:
# Register the sweep defined by sweep_config under the given project name.
# NOTE(review): `wandb` is not imported in any cell visible here — confirm it is imported before this runs.
sweep_id = wandb.sweep(sweep_config, project="Sweep_without_Attention2")

In [None]:
# def train(config = None):
  
#     encoder = Encoder(vocab_inp_size, 1024, 256, BATCH_SIZE, 1, "lstm", 0.3,0.3)
#     decoder = Decoder(vocab_tar_size, 256, 256,BATCH_SIZE,1, "lstm", 0.3,0.3, 'luong')
#     EPOCHS = config.epochs
#     metric = tf.keras.metrics.SparseCategoricalAccuracy()
#     tf.config.run_functions_eagerly(True)

#     step_per_val_epoch = dataset_creator.num_of_val//BATCH_SIZE
#     steps_per_epoch = dataset_creator.num_of_train//BATCH_SIZE

#     # step_per_val_epoch = 500//BATCH_SIZE
#     # steps_per_epoch = 500//BATCH_SIZE

#     for epoch in range(EPOCHS):
#       start = time.time()
      
#       enc_hidden = encoder.initialize_hidden_state()
#       total_loss = 0
#       total_accuracy = 0
#       # print(enc_hidden[0].shape, enc_hidden[1].shape)
#       metric.reset_state()
#       print("="*80)
#       print("TRAINING")
#       for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
#         batch_loss, batch_acc= train_step(inp, targ, enc_hidden)
#         total_loss += batch_loss
#         total_accuracy+=batch_acc

#         if batch % 100 == 0:
#           print('\t Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
#                                                       batch,
#                                                       batch_loss.numpy(), batch_acc*100 ))
#       # saving (checkpoint) the model every 2 epochs
#       metric.reset_state()
#       total_val_loss = 0
#       total_val_accuracy = 0
#       print("="*80)
#       print("VALIDATING")
#       for (batch, (inp, targ)) in enumerate(val_dataset.take(steps_per_epoch)):
#         val_batch_loss, val_batch_acc= val_step(inp, targ, enc_hidden)
#         total_val_loss += val_batch_loss
#         total_val_accuracy += val_batch_acc
      
#       print(f"Validatiion loss:  {total_val_loss.numpy()/  step_per_val_epoch}")
#       print((f"Validatiion Acc:  {(total_val_accuracy.numpy()/  step_per_val_epoch)*100}"))
#       if (epoch + 1) % 2 == 0:
#         checkpoint.save(file_prefix = checkpoint_prefix)
#       print("Accuracy ",(total_accuracy.numpy()/steps_per_epoch) *100)
#       print('Epoch {} Loss {:.4f} Acc {:.4f}'.format(epoch + 1,
#                                           total_loss / steps_per_epoch,
#                                           (total_accuracy/ steps_per_epoch)*100
#                                           ))
#       print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

## Use tf-addons BasicDecoder for decoding


In [None]:
# Display the first batch of the validation dataset.
next(iter(val_dataset))

In [24]:
def evaluate_sentence(sentence, attention_weights = None):
  """Greedy-decode ``sentence`` with the module-level encoder and decoder.

  The sentence is replicated to fill a whole batch because the encoder's zero
  state and the decoder's attention mechanism are built for a fixed batch
  size.

  Args:
    sentence: raw input word (without start/end markers).
    attention_weights: when truthy, return the decoder's full result so the
      caller can inspect the final attention state.

  Returns:
    If ``attention_weights`` is falsy: ``(sample_ids, outputs)`` where
    ``sample_ids`` is ``outputs.sample_id`` as a NumPy array.
    Otherwise: the ``(outputs, final_state, sequence_lengths)`` triple
    returned by the ``BasicDecoder``.

  Raises:
    KeyError: if the sentence contains a character that is not in
      ``inp_lang.word_index``.
  """
  sentence = dataset_creator.preprocess_sentence(sentence)

  # Must equal the batch size the encoder's zero state is created with.
  inference_batch_size = encoder.batch_sz

  # The tokenizer is created with its default lower-casing, so look characters
  # up in lower case as well.
  token_ids = [inp_lang.word_index[ch] for ch in sentence.lower()]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([token_ids] * inference_batch_size,
                                                          maxlen=max_length_input,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)

  enc_start_state = encoder.initialize_hidden_state()
  enc_out, enc_state  = encoder(inputs, enc_start_state)

  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['\t'])
  end_token = targ_lang.word_index['\n']

  # The sampler embeds the previously predicted id with the decoder's
  # embedding layer before feeding it back into the cell.
  greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler(decoder.embedding)

  # Instantiate BasicDecoder object
  decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc, maximum_iterations=25)
  # Setup Memory in decoder stack
  decoder.attention_mechanism.setup_memory(enc_out)

  # set decoder_initial_state
  decoder_initial_state = decoder.build_initial_state(inference_batch_size,tuple(enc_state), tf.float32)

  decoded = decoder_instance(None, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
  if attention_weights:
    # Return the whole triple; its second element is the final decoder state,
    # which is what the inspection cells further down unpack.
    return decoded

  outputs, _, _ = decoded
  return outputs.sample_id.numpy(), outputs

def translate(sentence):
  """Decode ``sentence`` greedily, print the prediction and return it.

  Only the first row of the decoded batch is used. The returned string is the
  printed prediction with its last character removed.
  """
  sample_ids, _ = evaluate_sentence(sentence)
  print("-"*80)
  # sequences_to_texts separates tokens with spaces; remove them again.
  decoded = "".join(targ_lang.sequences_to_texts(sample_ids[:1]))
  result = decoded.replace(" ", "")
  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))
  return result[:-1]
translate('aaditya')

In [None]:
# Hand-picked input words to translate and compare against the test lexicon.
word_list  = ["bichhua", "flynn", "dikhai", "amit","immune","bhay","samadhan","corolla","brigadier", "dhndhe","yatna","chaudhawan", "vash", "hakikat", "kahlaya", "sheling", "talvon","beesavaan","salary","jaghaega"]
test_df = pd.read_csv(test_file_path, delimiter = "\t", header = None)
temp_df = pd.DataFrame(columns=["Input", "Predicted","True"])
for word in word_list:
  # Find the first test row whose second column equals the word, translate it
  # and record the prediction next to the first column of that row.
  for index, row in test_df.iterrows():
    if word == row[1]:
      #print(word)
      pred = translate(word)
      temp_df.loc[len(temp_df.index)] = [word, pred,row[0]]
      break
temp_df.head(30)


In [None]:
# Translate every word of the test lexicon, save the predictions and report
# the exact-match accuracy.
# na_filter=False keeps every cell as written, so words that happen to spell a
# pandas NA marker (e.g. "nan", "null") are not turned into float NaN.
test_df = pd.read_csv(test_file_path, delimiter = "\t", header = None, na_filter = False)
records = []
count = 0
for target_word, input_word, *_ in test_df.itertuples(index=False):
    pred = translate(input_word)
    if pred == target_word:
        count += 1
    records.append([input_word, pred, target_word])
# Build the frame once instead of appending one row at a time.
temp_df = pd.DataFrame(records, columns=["Input", "Predicted","True"])
temp_df.to_csv("predictions_attention.csv")
accuracy = (count/len(temp_df))*100 if len(temp_df) else 0.0
print("Test accuracy :", accuracy)

In [None]:
# Display the second column of the test lexicon (the column passed to translate above).
test_df[1]

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
  """Draw an attention matrix as a heat map.

  Args:
    attention: 2-D array-like; it is cropped to
      ``len(predicted_sentence)`` rows and ``len(sentence)`` columns.
    sentence: list of labels for the x axis (input text).
    predicted_sentence: list of labels for the y axis (output text).
  """
  #sentence = tf_lower_and_split_punct(sentence).numpy().decode().split()
  #predicted_sentence = predicted_sentence.numpy().decode().split() + ['[END]']
  fig = plt.figure(figsize=(10, 10))
  ax = fig.add_subplot(1, 1, 1)

  attention = attention[:len(predicted_sentence), :len(sentence)]

  ax.matshow(attention, cmap='viridis', vmin=0.0)

  fontdict = {'fontsize': 14}

  # An empty label is prepended to both label lists before they are applied.
  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  # One major tick per unit on both axes.
  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  ax.set_xlabel('Input text')
  ax.set_ylabel('Output text')
  plt.suptitle('Attention weights')

In [None]:
# Call evaluate_sentence with a truthy attention_weights flag and keep whatever it returns.
out =  evaluate_sentence("aditya", True)


In [None]:
# Unpack `out` as three elements and keep only the middle one.
_, attention, _ = out

In [None]:
# Unpack the five fields of the value kept above (note that `attention` is rebound here).
cell_state, attention, alignments, alignment_history, attention_state = attention

In [None]:
# Display the `attention` value unpacked above.
attention

In [None]:
# Call .stack() on the alignment history and keep the result.
stack_align = alignment_history.stack()

In [None]:
# Display the slice at index 0 of the second axis of the stacked alignments.
stack_align[:,0,:]

In [None]:
# Display the decoder object.
decoder

In [None]:
# Display the alignment history (the dangling attribute dot was a syntax error).
alignment_history

In [None]:
# Display the `attention` field of the second element of `out`.
out[1].attention

In [None]:
# Plot the first attention_state row of the second element of `out`, repeated on two rows.
plt.matshow([out[1].attention_state[0].numpy(),out[1].attention_state[0].numpy()])

In [None]:
# Plot the alignment slice shown above with twelve placeholder "a" labels on each axis.
plot_attention(stack_align[:,0,:],["a"]*12,["a"]*12)

In [None]:
# Display the `attention` field of the second element of `out`.
out[1].attention

## Restore the latest checkpoint and test

In [None]:
# Restore the most recent checkpoint found in checkpoint_dir into the
# optimizer, encoder and decoder tracked by `checkpoint`.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
# NOTE(review): Spanish sample sentence; evaluate_sentence looks every character up in
# inp_lang.word_index, so characters missing from that vocabulary raise KeyError — confirm this cell is still wanted.
translate(u'esta es mi vida.')

In [None]:
# NOTE(review): Spanish sample sentence; characters missing from inp_lang.word_index raise KeyError
# in evaluate_sentence — confirm this cell is still wanted.
translate(u'¿todavia estan en casa?')

In [None]:
# wrong translation
# NOTE(review): Spanish sample sentence; characters missing from inp_lang.word_index raise KeyError
# in evaluate_sentence — confirm this cell is still wanted.
translate(u'trata de averiguarlo.')

## Use tf-addons BeamSearchDecoder 


In [22]:
def beam_evaluate_sentence(sentence, beam_width=3):
  """Decode ``sentence`` with beam search using the module-level encoder and decoder.

  The sentence is replicated 64 times to form the inference batch, and the
  encoder start state is built for a single two-tensor (LSTM-style) layer of
  ``units`` units.

  Args:
    sentence: raw input word (without start/end markers).
    beam_width: number of beams kept per batch entry.

  Returns:
    ``(final_outputs, beam_scores)`` as NumPy arrays, both transposed with
    ``perm=(0, 2, 1)`` from the decoder's ``predicted_ids`` and ``scores``.
  """
  sentence = dataset_creator.preprocess_sentence(sentence)

  # Character ids of the sentence; a character missing from the vocabulary raises KeyError.
  inputs = [inp_lang.word_index[i] for i in sentence]
  inputs = [inputs for _ in range(64)]
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_length_input,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  print(inputs)
  inference_batch_size = inputs.shape[0]
  # NOTE(review): `result` is never read in this function.
  result = ''

  enc_start_state = [[tf.zeros((inference_batch_size, units)),tf.zeros((inference_batch_size, units))]]*1
  enc_out, enc_state = encoder(inputs, enc_start_state)

  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['\t'])
  end_token = targ_lang.word_index['\n']

  # From official documentation
  # NOTE If you are using the BeamSearchDecoder with a cell wrapped in AttentionWrapper, then you must ensure that:
  # The encoder output has been tiled to beam_width via tfa.seq2seq.tile_batch (NOT tf.tile).
  # The batch_size argument passed to the get_initial_state method of this wrapper is equal to true_batch_size * beam_width.
  # The initial state created with get_initial_state above contains a cell_state value containing properly tiled final state from the encoder.

  enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
  decoder.attention_mechanism.setup_memory(enc_out)
  print("beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] :", enc_out.shape)

  # set decoder_inital_state which is an AttentionWrapperState considering beam_width
  hidden_state = tfa.seq2seq.tile_batch(tuple(enc_state), multiplier=beam_width)
  decoder_initial_state = decoder.rnn_cell.get_initial_state(batch_size=beam_width*inference_batch_size, dtype=tf.float32)
  decoder_initial_state = decoder_initial_state.clone(cell_state=hidden_state)

  # Instantiate BeamSearchDecoder
  decoder_instance = tfa.seq2seq.BeamSearchDecoder(decoder.rnn_cell,beam_width=beam_width, output_layer=decoder.fc, embedding_fn = decoder.embedding)
  # NOTE(review): `decoder_embedding_matrix` is never read in this function.
  decoder_embedding_matrix = decoder.embedding.variables[:]

  # The BeamSearchDecoder object's call() function takes care of everything.
  outputs, final_state, sequence_lengths = decoder_instance(None, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)
  # outputs is tfa.seq2seq.FinalBeamSearchDecoderOutput object. 
  # The final beam predictions are stored in outputs.predicted_id
  # outputs.beam_search_decoder_output is a tfa.seq2seq.BeamSearchDecoderOutput object which keep tracks of beam_scores and parent_ids while performing a beam decoding step
  # final_state = tfa.seq2seq.BeamSearchDecoderState object.
  # Sequence Length = [inference_batch_size, beam_width] details the maximum length of the beams that are generated

  
  # outputs.predicted_id.shape = (inference_batch_size, time_step_outputs, beam_width)
  # outputs.beam_search_decoder_output.scores.shape = (inference_batch_size, time_step_outputs, beam_width)
  # Convert the shape of outputs and beam_scores to (inference_batch_size, beam_width, time_step_outputs)
  final_outputs = tf.transpose(outputs.predicted_ids, perm=(0,2,1))
  beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0,2,1))
  
  return final_outputs.numpy(), beam_scores.numpy()
def beam_translate(sentence):
  """Beam-decode ``sentence`` and print every beam with its summed score.

  For each entry of the inference batch the decoded beams are converted to
  text, cut at the first end marker (newline) and printed together with the
  sum of their per-step scores.
  """
  result, beam_scores = beam_evaluate_sentence(sentence)
  print(result.shape, beam_scores.shape)
  for beam, score in zip(result, beam_scores):
    print(beam.shape, score.shape)
    output = targ_lang.sequences_to_texts(beam)
    # Keep the text before the first end marker; a beam that never produced
    # the marker is kept whole instead of raising ValueError.
    output = [a.split('\n', 1)[0] for a in output]
    beam_score = [a.sum() for a in score]
    print('Input: %s' % (sentence))
    for i in range(len(output)):
      print('{} Predicted translation: {}  {}'.format(i+1, output[i], beam_score[i]))
beam_translate("aande")

In [None]:
# Display the configuration of the decoder's output Dense layer.
decoder.fc.get_config()

In [None]:
# Scratch forward pass over up to 64 training batches; nothing is stored except the last `pred`.
# NOTE(review): the states here are built for a batch of 64, while train_dataset is
# batched with BATCH_SIZE in the cells above — confirm the sizes agree.
for (_, (inp, targ) )  in enumerate(train_dataset.take(64)):

  enc_start_state = [[tf.zeros((64, units)),tf.zeros((64, units))]]*1

  enc_output, enc_state= encoder(inp , enc_start_state)


  dec_input = targ[ : , :-1 ] # Ignore <end> token
  real = targ[ : , 1: ]         # ignore <start> token

      # Set the AttentionMechanism object with encoder_outputs
  decoder.attention_mechanism.setup_memory(enc_output)

  # Create AttentionWrapperState as initial_state for decoder
  decoder_initial_state = decoder.build_initial_state(64, tuple(enc_state) ,tf.float32)
  pred = decoder(dec_input, decoder_initial_state)


In [25]:
# get html element
def cstr(s, color='black'):
	"""Wrap ``s`` in a ``<text>`` HTML element with ``color`` as background.

	A lone space is emitted with extra left padding instead of the
	"text followed by a space" layout used for every other value.
	"""
	if s != ' ':
		return f"<text style=color:#000;background-color:{color}>{s} </text>"
	return f"<text style=color:#000;padding-left:10px;background-color:{color}> </text>"
	
# print html
def print_color(t):
	"""Render an iterable of ``(text, colour)`` pairs as one HTML line in the notebook."""
	fragments = [cstr(token, color=shade) for token, shade in t]
	markup = ''.join(fragments)
	display(html_print(markup))

# get appropriate color for value
def get_clr(value):
	"""Map an importance value to a background colour on a blue-to-red ramp.

	Args:
		value: Number expected to lie in [0, 1]; 0 selects the first (blue)
			palette entry and 1 the last (red) one. Values outside that
			range are clamped to the nearest end.

	Returns:
		A ``#rrggbb`` hex colour string.
	"""
	# Each entry needs its own trailing comma: without one, two adjacent string
	# literals are concatenated into a single, invalid colour such as
	# '#a1d0e8#b2d9ec'.
	colors = [
		'#85c2e1', '#89c4e2', '#95cae5', '#99cce6', '#a1d0e8',
		'#b2d9ec', '#baddee', '#c2e1f0', '#eff7fb', '#f9e8e8',
		'#f9e8e8', '#f9d4d4', '#f9bdbd', '#f8a8a8', '#f68f8f',
		'#f47676', '#f45f5f', '#f34343', '#f33b3b', '#f42e2e',
	]
	last = len(colors) - 1
	# Scale by the last valid index so value == 1 still selects the final
	# entry, then clamp: a negative index would silently wrap to the red end
	# and an index past the end would raise IndexError.
	index = int(value * last)
	index = min(max(index, 0), last)
	return colors[index]

# sigmoid function
def sigmoid(x):
	"""Return the logistic function 1 / (1 + e^(-x)), evaluated with NumPy."""
	denominator = 1 + np.exp(-x)
	return 1 / denominator


In [26]:
def visualize(output_values, result_list, cell_no, predicted_char):
    """Print ``predicted_char`` and show ``result_list`` coloured by importance.

    Entry ``k`` of ``result_list`` is coloured with
    ``get_clr(output_values[k][cell_no])``; the coloured entries are then
    rendered together through ``print_color``.
    """
    print("\nPredicted Char : ", predicted_char)
    print(f"Importance of {predicted_char}")
    coloured_items = [
        (item, get_clr(output_values[position][cell_no]))
        for position, item in enumerate(result_list)
    ]
    print_color(coloured_items)

In [None]:
# Sanity check of the colouring helpers: one character 'a', coloured using the
# value at column 2 (0.9) of a hand-written row.
visualize([[0.1,0.9,0.9]],['a'],2,'q')

In [35]:
# Placeholder value; `tx` is reassigned to translate()'s return value at the
# end of this cell.
tx = 0
def translate(sentence):
  """Decode ``sentence`` and visualise per-step scores of the predicted ids.

  Calls ``evaluate_sentence``, prints the decoded text, then for every decoding
  step takes the sigmoid of ``output.rnn_output[0]`` at each predicted id,
  min-max scales the resulting table and hands growing prefixes of it to
  ``visualize``.

  Returns:
    ``output.rnn_output`` as returned by ``evaluate_sentence``.
  """
  print(sentence)
  result, output = evaluate_sentence(sentence)
  print("-"*80)
  print(result[0])
  decoded_text = "".join(targ_lang.sequences_to_texts(result[:1]))
  word_list = decoded_text.split(" ")
  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(word_list))
  print("word_list", word_list)
  print("result ", result[0])

  # Rows: decoding steps; columns: ids of the first predicted sequence.
  predicted_ids = list(result)[0]
  step_scores = np.array([
      [sigmoid(time_step[char_index]) for char_index in predicted_ids]
      for time_step in output.rnn_output[0]
  ])

  # After the transpose rows are predicted ids and columns are decoding steps,
  # so the scaler normalises each decoding step's column independently.
  per_char_scores = step_scores.transpose()
  scaler = MinMaxScaler()
  scaler.fit(per_char_scores)
  per_char_scores = scaler.transform(per_char_scores)

  # The last entry of word_list is skipped.
  for i, char in enumerate(word_list[:-1]):
    visualize(per_char_scores[:i+1], word_list[:i+1], i, char)
  return output.rnn_output

# Run the visualisation on a sample word and keep the returned rnn_output.
tx =translate('youtube')

In [None]:
@tf.function
def connectivity_step(inp, targ, enc_hidden):
  """Run one encoder/decoder forward pass and take d(rnn_output)/d(inp).

  Args:
    inp: Encoder input batch; it is explicitly watched by the gradient tape.
    targ: Target batch; all but the last column are fed to the decoder.
    enc_hidden: Initial encoder state passed straight to ``encoder``.

  Returns:
    ``(loss, metric.result().numpy())``. ``loss`` is never updated here, so it
    is always the initial 0.
  """
  loss = 0

  with tf.GradientTape() as tape:
    # `inp` is not a variable, so it must be watched explicitly.
    # NOTE(review): if `encoder` uses `inp` as integer indices into an
    # Embedding lookup, the gradient w.r.t. `inp` will presumably be None --
    # confirm against the Encoder class.
    tape.watch(inp)
    enc_output, enc_state= encoder(inp, enc_hidden)


    dec_input = targ[ : , :-1 ] # Ignore <end> token
    # NOTE(review): `real` is only referenced by the commented-out loss code.
    real = targ[ : , 1: ]         # ignore <start> token

    # Set the AttentionMechanism object with encoder_outputs
    decoder.attention_mechanism.setup_memory(enc_output)

    # Create AttentionWrapperState as initial_state for decoder
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, tuple(enc_state) ,tf.float32)
    pred = decoder(dec_input, decoder_initial_state)
    #logits = pred.rnn_output
    #loss = loss_function(real, logits)
    #metric.update_state(real, logits)

  #variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(pred.rnn_output, inp)
  # NOTE(review): a Python print inside @tf.function presumably runs only while
  # the function is traced and shows a symbolic tensor (or None), not values;
  # tf.print would be needed for runtime values -- confirm.
  print(gradients)
  #optimizer.apply_gradients(zip(gradients, variables))

  # NOTE(review): `.numpy()` is presumably unavailable on the symbolic tensor
  # that metric.result() yields inside @tf.function -- confirm; also the
  # computed `gradients` are not returned.
  return loss, metric.result().numpy()

# Probe the gradient computation on a single test batch. take(1) yields at most
# one element, so the loop body runs at most once (with i == 0) and no explicit
# break is needed.
for (i, (inp, targ)) in enumerate(test_dataset.take(1)):
  # Token ids are cast to float32 before being handed to connectivity_step,
  # which watches its `inp` argument on a gradient tape.
  x, y = connectivity_step(tf.cast(inp, dtype = tf.float32), targ,  encoder.initialize_hidden_state())

In [None]:
# NOTE(review): this cell cannot run as written. It contains `return`
# statements at module level (SyntaxError: 'return' outside function) and uses
# names that are not defined in this cell (`embedded_in`, `enc_state`,
# `gradient_list`, `result`, `model`, `get_output_from_embedding`,
# `get_lstm_output`). It is presumably the body of a function whose `def` line
# and local initialisation were lost -- restore the header or delete the cell.
#
# What the fragment does: greedy character-by-character decoding under a
# persistent gradient tape, appending at each step the gradient of the step's
# `lstm_out` with respect to `embedded_in`.
with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
        # Only `embedded_in` is tracked (automatic variable watching is off).
        tape.watch(embedded_in)

        enc_out, enc_state = get_output_from_embedding(model.encoder, embedded_in, enc_state)

        dec_state = enc_state
        # Decoding starts from the "\t" token, shaped (1, 1).
        dec_input = tf.expand_dims([model.targ_tokenizer.word_index["\t"]]*1, 1)

        for t in range(1, model.max_target_len):

            lstm_out, dec_state, _ = get_lstm_output(model.decoder, dec_input, dec_state, enc_out)

            preds = model.decoder.dense(model.decoder.flatten(lstm_out))
            # Gradient of this step's output w.r.t. the watched input; [0] takes the first entry.
            gradient_list.append(tape.gradient(lstm_out, embedded_in)[0])
            
            # Greedy choice: highest-scoring index becomes the next character.
            preds = tf.argmax(preds, 1)
            next_char = model.targ_tokenizer.index_word[preds.numpy().item()]
            result += next_char

            dec_input = tf.expand_dims(preds, 1)

            # "\n" ends decoding; the last character and its gradient are dropped.
            if next_char == "\n":
                return result[:-1], gradient_list[:-1]

        # NOTE(review): this path also drops the last character and gradient
        # even though no "\n" was produced -- confirm that is intended.
        return result[:-1], gradient_list[:-1]

In [None]:
from sklearn.preprocessing import MinMaxScaler

def get_connectivity(word):
  """Attempt to visualise, per predicted character, the importance of input characters.

  NOTE(review): this function does not look runnable as written. It reads
  `inp` and `targ` (neither parameters nor locals, so they would resolve to
  notebook globals left over from earlier loops) and references `s2s`,
  `enc_inp`, `argmax`, `index_list` and `idx_to_word`, none of which are
  defined in this function or in the visible part of the notebook. Confirm
  where they come from before relying on this cell.

  Args:
    word: Input string; every character must be a key of
      ``inp_lang.word_index`` (a missing key raises KeyError).

  Returns:
    None. Results are printed / displayed.
  """
  print("Input word : ", word)
  inputs = [inp_lang.word_index[i] for i in word]
  # Repeat the same id sequence 64 times to build a batch of 64 rows.
  inputs = [inputs for _ in range(64)]
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_length_input,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  print(inputs)
  #print(index_list)

  enc_start_state = [[tf.zeros((64, units)),tf.zeros((64, units))]]*1

  # NOTE(review): the encoder is fed `inp`, not the `inputs` tensor built
  # above -- presumably a bug; confirm.
  enc_output, enc_state= encoder(inp , enc_start_state)


  dec_input = targ[ : , :-1 ] # Ignore <end> token
  # NOTE(review): `real` is never used below.
  real = targ[ : , 1: ]         # ignore <start> token

  # Set the AttentionMechanism object with encoder_outputs
  decoder.attention_mechanism.setup_memory(enc_output)

  # Create AttentionWrapperState as initial_state for decoder
  decoder_initial_state = decoder.build_initial_state(64, tuple(enc_state) ,tf.float32)
  # NOTE(review): `pred` is never used below; the values visualised come from
  # `s2s.call(...)` instead.
  pred = decoder(dec_input, decoder_initial_state)
  
  output = s2s.call(enc_inp, dec_input)
  temp_list = []
  #for i in range(len(index_list)):
  input_char_list = list(word)
  # Decoder outputs of the first batch element.
  first_prediction = output[0].rnn_output[0]
  # NOTE(review): bare `argmax` is not imported anywhere visible (np.argmax /
  # tf.argmax?) -- confirm.
  pred_char_index = (argmax(output[0].rnn_output[0], axis =1))
  #print("pred_char_index",pred_char_index)
  scaler = MinMaxScaler()
  for i,  pred_char in enumerate(index_list):
    
    # NOTE(review): this table depends on neither `i` nor `pred_char`, so it is
    # rebuilt identically on every iteration.
    output_values = []  
    for time_step in first_prediction:
        #print(time_step.shape)
        
        prob = []
        for index in pred_char_index:
          #print(index)
          prob.append(time_step[index].numpy())
        #print(prob)
        output_values.append(prob)
    scaler.fit(output_values)
    output_values  = scaler.transform(output_values)
    #print(np.array(output_values).shape)
    #print(len(input_char_list))
    #print("pred_char_index", pred_char_index)
    out_char_list = list(idx_to_word(pred_char_index))

    temp_list.append(idx_to_word(pred_char_index))

    # On the first iteration input_char_list[:0] is empty, so nothing is coloured.
    visualize(output_values, input_char_list[:i],i, out_char_list[i])
  # NOTE(review): `out_char_list` and `i` are only bound inside the loop; an
  # empty `index_list` would make the two lines below fail.
  pred_word = "".join(out_char_list)
  print(f"\nTransliterate word of {word[:-1]} is {pred_word[:i]}")
# Run the connectivity visualisation on a sample word.
get_connectivity("ande")

In [None]:
# NOTE(review): this Spanish sentence looks like a leftover from an
# English-Spanish translation example, while this notebook loads the Hindi
# Dakshina transliteration lexicon -- presumably the call should be removed or
# given an input from that dataset; confirm.
beam_translate(u'¿todavia estan en casa?')