In [None]:
# Download the Dakshina dataset (v1.0) archive with wget.
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
# Unpack the archive into the current directory (-x extract, -v verbose listing, -f archive file).
!tar -xvf ./dakshina_dataset_v1.0.tar

In [2]:
#importing required Libraries
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten,Embedding,Dense
from keras.utils.vis_utils import plot_model

In [None]:
# Install the Weights & Biases client into the current runtime.
!pip install wandb
# Import wandb and authenticate this session; metrics are later sent with wandb.log.
import wandb
wandb.login()

In [3]:
# Locations of the Hindi ("hi") transliteration lexicon splits inside the extracted archive.
train_path = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
val_path =   "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
test_path = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"
# The training split is parsed to find the number of unique characters in the input language and
# in the output language (these values are used when creating the embedding layers).
# readData takes a path and returns the rows of that TSV file as a list of lists
# (the file is read into a pandas DataFrame and converted with .values.tolist()).
def readData(path):
    """Load a tab-separated, header-less file and return its rows as nested lists.

    Args:
        path: Location of the TSV file, handed straight to pandas.read_csv.

    Returns:
        A list with one inner list per row, holding the column values in file order.
        Malformed lines are skipped (on_bad_lines='skip').

    Note:
        pandas applies its default NA parsing here, so fields it treats as missing
        come back as float NaN rather than str; callers in this notebook filter
        rows with ``isinstance(..., str)`` for that reason.
    """
    frame = pd.read_csv(path, header=None, sep='\t', on_bad_lines='skip')
    rows = frame.values.tolist()
    return rows

In [4]:
# Analyse the training split: collect the word pairs, build both character
# vocabularies and record the sizes needed by the model.
trainingData = readData(train_path)

input_texts = []
target_texts = []
for row in trainingData:
    latin_word, native_word = row[1], row[0]
    # Column 1 is the model input, column 0 the target; rows whose input was not
    # parsed as a string are dropped.
    if isinstance(latin_word, str):
        input_texts.append(latin_word)
        # The target is wrapped in spaces, which serve as start / end markers.
        target_texts.append(" " + native_word + " ")

# Unique characters of each side; ' ' is always part of both vocabularies.
# NOTE(review): getData pads with index 0, which presumably coincides with ' '
# because it sorts first -- confirm against the printed token index.
input_characters = sorted(set("".join(input_texts)) | {' '})
target_characters = sorted(set("".join(target_texts)) | {' '})

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

# Character -> integer id lookups and their inverses.
input_token_index = {char: position for position, char in enumerate(input_characters)}
target_token_index = {char: position for position, char in enumerate(target_characters)}

reverse_input_char_index = {position: char for char, position in input_token_index.items()}
reverse_target_char_index = {position: char for char, position in target_token_index.items()}


print(input_token_index)
print(target_token_index)

num_samples = len(input_texts)
print("Number of samples:", num_samples)
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

In [5]:
# Character-level integer encoding of a lexicon split (fed to the Embedding layers).
def getData(path):
    """Read one lexicon split and encode it for teacher-forced training.

    Uses the module-level vocabulary built from the training split:
    ``input_token_index``, ``target_token_index``, ``num_decoder_tokens``,
    ``max_encoder_seq_length`` and ``max_decoder_seq_length``.

    Args:
        path: Path of the TSV file to load (passed to ``readData``).

    Returns:
        A 5-tuple ``(encoder_input, decoder_input, decoder_target, input_texts, target_texts)``:
        * encoder_input: int array ``(n, max_encoder_seq_length)`` of input character ids,
          post-padded with 0.
        * decoder_input: int array ``(n, max_decoder_seq_length)`` of target character ids
          (target wrapped in spaces), post-padded with 0.
        * decoder_target: float32 one-hot array
          ``(n, max_decoder_seq_length, num_decoder_tokens)``; the decoder input shifted
          left by one step, with the remaining positions marked as ' '.
        * input_texts / target_texts: the raw input words and the space-wrapped target words.

    Raises:
        KeyError: If the split contains a character missing from the training vocabulary.
    """
    print(path)
    input_texts = []
    target_texts = []
    for line in readData(path):
        input_text, target_text = line[1], line[0]
        # Fields that were not parsed as strings (e.g. NaN) cannot be encoded; skip the row
        # instead of failing on the string concatenation below.
        if not isinstance(input_text, str) or not isinstance(target_text, str):
            continue
        input_texts.append(input_text)
        # Leading space = start-of-word marker, trailing space = end-of-word marker.
        target_texts.append(" " + target_text + " ")

    EncoderInputEncodedWords = pad_sequences(
        [[input_token_index[eachChar] for eachChar in eachText] for eachText in input_texts],
        maxlen=max_encoder_seq_length, padding='post', value=0.0)
    print('EncoderInputEncodedWords.shape',EncoderInputEncodedWords.shape)
    print(EncoderInputEncodedWords[:10])

    # truncating='post' keeps the start marker when a word is longer than anything seen in
    # training, so the decoder input stays aligned with the (clipped) target built below.
    DecoderInputEncodedWords = pad_sequences(
        [[target_token_index[eachChar] for eachChar in eachText] for eachText in target_texts],
        maxlen=max_decoder_seq_length, padding='post', truncating='post', value=0.0)
    print('DecoderInputEncodedWords.shape',DecoderInputEncodedWords.shape)
    print(DecoderInputEncodedWords[:10])

    decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32")
    space_index = target_token_index[" "]
    for i, target_text in enumerate(target_texts):
        # Target step t is decoder-input step t + 1; clip so that over-long words cannot
        # index past the time axis.
        for t, char in enumerate(target_text[1:max_decoder_seq_length + 1]):
            decoder_target_data[i, t, target_token_index[char]] = 1.0
        # Every step after the end marker is labelled ' ' (an empty slice for clipped words).
        decoder_target_data[i, len(target_text) - 1:, space_index] = 1.0

    # Show the first encoded target in full as a sanity check (nothing to show for an empty split).
    if len(decoder_target_data) > 0:
        with np.printoptions(threshold=np.inf):
            print(decoder_target_data[0])

    return EncoderInputEncodedWords,DecoderInputEncodedWords,decoder_target_data,input_texts,target_texts

In [7]:
# Encode each split with getData. Every call returns, in order: padded encoder inputs,
# padded decoder inputs, one-hot decoder targets, the raw input words and the
# space-wrapped target words.
encoder_input_train_data, decoder_input_train_data, decoder_target_train_data,train_eng,train_hin = getData(train_path)
encoder_input_val_data, decoder_input_val_data, decoder_target_val_data,val_eng,val_hin = getData(val_path)
encoder_input_test_data, decoder_input_test_data, decoder_target_test_data,test_eng,test_hin = getData(test_path)

In [None]:
# Build and compile the training-time encoder-decoder model.
def buildModel(latent_dims,EmbeddingOutputDimensions,layer_type,dropout, lr, optimiser):
    """Create the teacher-forced seq2seq model and return it with its layers.

    Besides its arguments the function reads these module-level names:
    ``max_encoder_seq_length``, ``max_decoder_seq_length``, ``num_encoder_tokens``,
    ``num_decoder_tokens`` and the ``attention`` flag.

    Args:
        latent_dims: Sequence of hidden sizes, one per stacked recurrent layer. The layers
            are created in reverse order of this sequence, for encoder and decoder alike.
        EmbeddingOutputDimensions: ``[encoder_embedding_dim, decoder_embedding_dim]``.
        layer_type: ``'lstm'``, ``'gru'`` or ``'rnn'``.
        dropout: Dropout rate passed to every recurrent layer.
        lr: Initial learning rate of the cosine-decay-with-restarts schedule.
        optimiser: ``'adam'``, ``'rmsprop'`` or ``'sgd'``.

    Returns:
        ``(model, encoder_layers, decoder_layers)``. ``encoder_layers`` is
        ``[input, embedding, rnn_1, ...]``; ``decoder_layers`` is
        ``[input, embedding, rnn_1, ..., (attention, concat,) dense]``. The inference
        builders index into these lists by position.

    Raises:
        ValueError: If ``layer_type`` or ``optimiser`` is not one of the supported names.
    """
    recurrent_classes = {
        'lstm': keras.layers.LSTM,
        'gru': keras.layers.GRU,
        'rnn': keras.layers.SimpleRNN,
    }
    optimiser_classes = {
        'adam': keras.optimizers.Adam,
        'rmsprop': keras.optimizers.RMSprop,
        'sgd': keras.optimizers.SGD,
    }
    # Fail early with a clear message instead of building a model without recurrent
    # layers or hitting an unbound optimiser variable at compile time.
    if layer_type not in recurrent_classes:
        raise ValueError("layer_type must be one of {0}, got {1!r}".format(sorted(recurrent_classes), layer_type))
    if optimiser not in optimiser_classes:
        raise ValueError("optimiser must be one of {0}, got {1!r}".format(sorted(optimiser_classes), optimiser))
    recurrent_class = recurrent_classes[layer_type]
    layer_units = list(latent_dims)[::-1]

    # ----- Encoder -----
    encoder_inputs = keras.Input(shape=(max_encoder_seq_length,))
    embedding_encoder_layer = Embedding(input_dim = num_encoder_tokens, output_dim = EmbeddingOutputDimensions[0], input_length = max_encoder_seq_length,trainable=True)
    encoder_outputs = embedding_encoder_layer(encoder_inputs)

    encoder_layers = [encoder_inputs, embedding_encoder_layer]
    encoder_states = []  # one list of state tensors per layer ([h, c] for LSTM, [h] otherwise)
    for units in layer_units:
        layer = recurrent_class(units, return_state=True, return_sequences=True, dropout=dropout)
        encoder_layers.append(layer)
        encoder_outputs, *layer_states = layer(encoder_outputs)
        encoder_states.append(layer_states)

    # ----- Decoder -----
    decoder_inputs = keras.Input(shape=(max_decoder_seq_length,))
    embedding_decoder_layer = Embedding(input_dim = num_decoder_tokens, output_dim = EmbeddingOutputDimensions[1], input_length = max_decoder_seq_length,trainable=True)
    decoder_outputs_temp = embedding_decoder_layer(decoder_inputs)

    decoder_layers = [decoder_inputs, embedding_decoder_layer]
    # Decoder layer k starts from the final state(s) of encoder layer k.
    for units, initial_state in zip(layer_units, encoder_states):
        layer = recurrent_class(units, return_sequences=True, return_state=True, dropout=dropout)
        decoder_outputs_temp = layer(decoder_outputs_temp, initial_state=initial_state)[0]
        decoder_layers.append(layer)

    if attention == True:
        # Additive attention of the decoder outputs (query) over the encoder outputs (value).
        attention_layer = keras.layers.AdditiveAttention()
        decoder_layers.append(attention_layer)
        attn_out, _ = attention_layer([ decoder_outputs_temp , encoder_outputs] , return_attention_scores = True)

        # Referenced through keras.layers: the bare name Concatenate is not imported in this notebook.
        concatenate_layer = keras.layers.Concatenate(axis=-1, name='concat_layer')
        decoder_layers.append(concatenate_layer)
        decoder_outputs_temp = concatenate_layer([decoder_outputs_temp, attn_out])

    dense_layer = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = dense_layer(decoder_outputs_temp)
    decoder_layers.append(dense_layer)

    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    #model.summary()
    plot_model(model, to_file='model12.png', show_shapes=True)

    # Cosine decay with warm restarts; the first decay period is 1000 steps.
    lr_schedule = tf.keras.experimental.CosineDecayRestarts(lr , 1000)
    optim = optimiser_classes[optimiser](learning_rate=lr_schedule)
    model.compile(optimizer =optim, loss="categorical_crossentropy", metrics=["accuracy"])

    return model,encoder_layers,decoder_layers

In [None]:
def beam_selector(logit , beam_width , rows ):
    """Extend every partial hypothesis with the ``beam_width`` most probable next tokens.

    Args:
        logit: 1-D array-like of token probabilities for the current time step.
        beam_width: Number of top tokens used to extend each hypothesis.
        rows: List of ``[sequence, score]`` pairs, where ``sequence`` is a list of token
            ids and ``score`` the accumulated log-probability.

    Returns:
        A new list of ``[sequence + [token], score + log(p_token)]`` pairs, containing one
        candidate per (hypothesis, top token) combination. ``rows`` itself is not modified;
        an empty ``rows`` yields an empty list.
    """
    probabilities = np.asarray(logit)
    # The best tokens depend only on `logit`, so rank them once instead of once per row.
    desired_top = np.argsort(probabilities)[-beam_width:]
    # log(0) = -inf is a valid (worst possible) score; silence NumPy's divide warning for it.
    with np.errstate(divide='ignore'):
        top_log_probabilities = np.log(probabilities[desired_top])

    holder = list()
    # Every hypothesis must be expanded, not just the last one in `rows`.
    for cur_sequence, cur_score in rows:
        for desired, log_probability in zip(desired_top, top_log_probabilities):
            holder.append([cur_sequence + [desired], cur_score + log_probability])
    return holder

def BeamDecoder(logits, beam_width):
    """Run beam search over a sequence of per-step token scores.

    Args:
        logits: 2-D array-like ``(time_steps, vocabulary)``; a softmax is applied over the
            last axis before decoding.
        beam_width: Number of hypotheses kept after each time step.

    Returns:
        An object array of shape ``(n_hypotheses, 1)``; element ``[k, 0]`` is the list of
        token ids of the k-th best hypothesis (best first).
    """
    rows = [[list(), 0.0]]
    # NOTE(review): the callers in this notebook pass outputs of a softmax Dense layer, so this
    # looks like a second softmax over probabilities -- confirm that this is intended.
    logits  = tf.nn.softmax(logits)
    for logit in logits:
        holder = beam_selector(logit , beam_width , rows )
        ranked = sorted(holder, key=lambda candidate: candidate[1], reverse=True)
        rows = ranked[:beam_width]

    # Fill the object array explicitly: np.array(rows) would have to infer a shape from ragged
    # [list, score] pairs, which newer NumPy versions reject with a ValueError.
    sequences = np.empty((len(rows), 1), dtype=object)
    for position, (sequence, _score) in enumerate(rows):
        sequences[position, 0] = sequence
    return sequences

In [None]:
class AccuracyCallback(tf.keras.callbacks.Callback):
    """Report word- and character-level accuracy after every epoch and log it to wandb.

    Training accuracy is computed from the greedy (argmax) prediction over the full padded
    sequence. Validation accuracy uses beam search: a word counts as correct when any of the
    returned hypotheses equals the reference, and the character score of a word is the best
    per-position match among the hypotheses.

    Args:
        trainingData: ``([encoder_input, decoder_input], one_hot_targets)`` for the training set.
        validationData: Same structure for the validation set.
        beam_width: Beam width used for the validation decoding.

    Note:
        The validation pass also reads the module-level ``val_hin`` (space-wrapped target
        words) to obtain each word's length, so ``validationData`` must be in the same order.
    """

    def __init__(self, trainingData, validationData, beam_width):
        # Let the Keras base class initialise its own attributes first.
        super().__init__()
        self.trainingData = trainingData
        self.validationData = validationData
        self.beamWidth = beam_width

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when there is nothing to divide by."""
        return numerator / denominator if denominator else 0.0

    def _greedy_training_accuracy(self):
        """Return (correct_words, correct_chars, n_words, n_steps) for the training set."""
        train_preds = self.model.predict(self.trainingData[0])
        train_truth = np.asarray(self.trainingData[1])
        train_pred_one_hot = tf.one_hot(tf.argmax(train_preds, axis=2), train_preds.shape[2]).numpy()
        # A character is correct when its one-hot row matches exactly; a word is correct
        # when every one of its rows matches.
        char_hits = np.all(train_truth == train_pred_one_hot, axis=-1)
        correct_word_count = int(np.all(char_hits, axis=1).sum())
        correct_character_count = int(char_hits.sum())
        return correct_word_count, correct_character_count, char_hits.shape[0], char_hits.shape[1]

    def _beam_validation_accuracy(self):
        """Return (correct_words, correct_chars, n_words, n_compared_chars) for the validation set."""
        val_preds = self.model.predict(self.validationData[0])
        # Second model input = padded decoder input ids, i.e. the reference token sequence.
        val_decoder_inputs = self.validationData[0][1]
        correct_word_count = 0
        correct_character_count = 0
        compared_character_count = 0
        for i in range(len(val_preds)):
            req_length = len(val_hin[i].strip())
            # Decode only the real characters plus the end marker, not the padding.
            beam_result = BeamDecoder(np.array(val_preds[i][:req_length+1]) , self.beamWidth)
            # Skip the start marker; keep the word and its end marker.
            ground_label = val_decoder_inputs[i ,1:req_length+2]
            compared_character_count += len(ground_label)

            maxCharMatch = 0
            for beam in beam_result:
                charMatch = sum(1 for predicted, expected in zip(beam[0], ground_label) if predicted == expected)
                maxCharMatch = max(maxCharMatch, charMatch)
                if(np.array_equal(beam[0] , ground_label)):
                    correct_word_count += 1
                    break
            correct_character_count +=  maxCharMatch
        return correct_word_count, correct_character_count, len(val_preds), compared_character_count

    def on_epoch_end(self, epoch , logs=None):
        """Compute, print and log training / validation accuracy for the finished epoch."""
        correct_word_count, correct_character_count, word_total, steps_per_word = self._greedy_training_accuracy()
        trainingAccuracy_word = self._ratio(correct_word_count, word_total)
        trainingAccuracy_character = self._ratio(correct_character_count, word_total*steps_per_word)
        print("")
        print(correct_character_count,word_total,steps_per_word)
        print(str(correct_word_count)," words correctly predicted among ",word_total," words")
        print("Training Accuracy (word): "+ str(trainingAccuracy_word))
        print("Training Accuracy (character): "+ str(trainingAccuracy_character))

        correct_word_count, correct_character_count, word_total, compared_characters = self._beam_validation_accuracy()
        validationAccuracy_word = self._ratio(correct_word_count, word_total)
        # Normalise by the characters that were actually compared; dividing by the padded
        # sequence length would cap this metric below 1.0.
        validationAccuracy_char = self._ratio(correct_character_count, compared_characters)
        print(correct_character_count,word_total,compared_characters)
        print(str(correct_word_count)," words correctly predicted among ",word_total," words")
        print("Validation Accuracy (word) : "+ str(validationAccuracy_word))
        print("Validation Accuracy (character): "+ str(validationAccuracy_char))

        wandb.log({ "epoch": epoch,"accuracy": trainingAccuracy_word,"val_accuracy": validationAccuracy_word,"accuracy (character)": trainingAccuracy_character, "val_accuracy (character)": validationAccuracy_char})


In [None]:
# Hyper-parameters of the run.
# Hidden size of each stacked recurrent layer (one entry per layer).
latent_dims = [128,128,128]
# Embedding output sizes: [encoder, decoder].
embed_dims = [128,256]
# Whether buildModel adds the attention + concatenation layers before the output layer.
attention = False
# Recurrent cell: 'lstm', 'gru' or 'rnn'.
cell_type = 'gru'
# Dropout rate passed to every recurrent layer.
dropout = 0.1
batch_size = 64  # Batch size for training.
epochs = 20  # Number of epochs to train for.
# Initial learning rate of the cosine-decay-with-restarts schedule.
learning_rate = 0.002
# Optimiser name: 'adam', 'rmsprop' or 'sgd'.
optimiser = 'rmsprop'
# Number of hypotheses kept by the beam decoder.
beam_width = 3

In [None]:
# Build the training model; the returned layer lists are reused later to assemble the inference models.
model, encoder_layers, decoder_layers = buildModel(latent_dims=latent_dims,EmbeddingOutputDimensions=embed_dims,layer_type=cell_type,dropout=dropout, lr=learning_rate, optimiser=optimiser)

# Callback that prints and logs word / character accuracy on both splits after each epoch.
acc_callback = AccuracyCallback(trainingData = ([encoder_input_train_data, decoder_input_train_data], decoder_target_train_data),validationData = ([encoder_input_val_data, decoder_input_val_data], decoder_target_val_data), beam_width=beam_width)

# Teacher-forced training: the decoder receives the target sequence as input and predicts it shifted by one step.
model.fit([encoder_input_train_data, decoder_input_train_data],decoder_target_train_data,batch_size=batch_size,
          epochs=epochs,shuffle=True,callbacks=[acc_callback])

In [None]:
def encoder_inference_model(model,latent_dims,cell_type,encoder_layers):
    """Assemble the inference-time encoder from the layers used during training.

    Args:
        model: The trained model (not used by this function).
        latent_dims: Hidden sizes; only its length is used, as the number of recurrent layers.
        cell_type: ``'lstm'`` for layers returning (h, c); any other value is treated as a
            single-state cell.
        encoder_layers: ``[input, embedding, rnn_1, ...]`` as returned by ``buildModel``.

    Returns:
        A ``keras.Model`` mapping the encoder input to the final state(s) of every
        recurrent layer, in layer order, followed by the output sequence of the last layer.
    """
    source_input = encoder_layers[0]
    sequence = encoder_layers[1](source_input)

    collected_states = []
    # Positions 0 and 1 hold the input and embedding layers; the recurrent layers start at 2.
    for depth in range(len(latent_dims)):
        recurrent = encoder_layers[depth + 2]
        if cell_type == 'lstm':
            sequence, hidden, cell = recurrent(sequence)
            collected_states.extend([hidden, cell])
        else:
            sequence, hidden = recurrent(sequence)
            collected_states.append(hidden)

    return keras.Model(inputs = source_input, outputs = collected_states + [sequence])

In [None]:
def decoder_inference_model(model, latent_dims , embed_dims, attention ,cell_type, decoder_layers):
    """Assemble the step-wise inference decoder from the layers used during training.

    Args:
        model: The trained model (not used by this function).
        latent_dims: Hidden sizes of the recurrent layers; the decoder layers are stored in
            reverse order of this sequence.
        embed_dims: Embedding sizes (not used by this function).
        attention: When ``True`` the attention and concatenation layers are applied and an
            additional attention model is returned.
        cell_type: ``'lstm'`` (two states per layer) or anything else (one state per layer).
        decoder_layers: ``[input, embedding, rnn_1, ..., (attention, concat,) dense]`` as
            returned by ``buildModel``.

    Returns:
        ``decoder_model`` when attention is off, otherwise ``(decoder_model, attention_model)``.
        ``decoder_model`` takes ``[decoder_input] + state_inputs + [encoder_sequence]`` and
        yields ``[token_probabilities] + new_states``; ``attention_model`` takes
        ``[decoder_input, encoder_sequence] + state_inputs`` and yields the attention scores.
    """
    target_input = decoder_layers[0]
    sequence = decoder_layers[1](target_input)

    state_inputs = []
    state_outputs = []
    states_per_layer = 2 if cell_type == 'lstm' else 1

    # Recurrent layers sit at positions 2, 3, ...; layer k has latent_dims[-1 - k] units.
    for position, units in enumerate(reversed(latent_dims)):
        recurrent = decoder_layers[position + 2]
        # Placeholders that receive the previous state(s) of this layer at every decoding step.
        initial = [keras.Input(shape=(units,)) for _ in range(states_per_layer)]
        if cell_type == 'lstm':
            sequence, hidden, cell = recurrent(sequence, initial_state=initial)
            state_outputs += [hidden, cell]
        else:
            sequence, hidden = recurrent(sequence, initial_state=initial)
            state_outputs += [hidden]
        state_inputs += initial

    # Output sequence of the last encoder layer (needed by attention; always a model input).
    encoder_sequence_input = keras.Input(shape=(max_encoder_seq_length , latent_dims[0]))

    with_attention = attention == True
    attention_model = None
    if with_attention:
        attention_layer = decoder_layers[-3]
        context, scores = attention_layer([ sequence , encoder_sequence_input], return_attention_scores = True)
        sequence = decoder_layers[-2]([sequence, context])
        attention_model = keras.Model(
            [target_input] + [encoder_sequence_input] + state_inputs,
            [scores])

    probabilities = decoder_layers[-1](sequence)
    decoder_model = keras.Model(
        [target_input] + state_inputs + [encoder_sequence_input],
        [probabilities] + state_outputs)

    if with_attention:
        return decoder_model, attention_model
    return decoder_model

In [None]:
def Finding_test_accuracy():
    """Print the beam-search word accuracy (in percent) on the test split.

    Works on module-level state: the encoded test data (``encoder_input_test_data``,
    ``decoder_input_test_data``, ``test_eng``, ``test_hin``), the ``attention`` flag,
    ``beam_width`` and ``translate_word``. A word counts as correct when any hypothesis
    returned by ``BeamDecoder`` equals the reference token sequence (word + end marker).
    Progress is printed every 200 words.
    """
    correct_count = 0
    total_words = len(test_eng)
    if total_words == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        print("Test accuracy : ", 0.0)
        return
    attention_label = "with" if attention else "without"
    for i in range(0,total_words):
        encoder_input = encoder_input_test_data[i]
        # translate_word leaves the per-step output distributions in the module-level
        # `beamlogits`; the decoded string it returns is not needed here.
        translate_word(encoder_input.reshape(1,max_encoder_seq_length))
        correct_word_length = len(test_hin[i].strip())
        beam_result = BeamDecoder(np.array(beamlogits) , beam_width)
        # Reference ids without the start marker: the word followed by its end marker.
        real_output = decoder_input_test_data[i][1:correct_word_length+2]
        if any(np.array_equal(pred[0], real_output) for pred in beam_result):
            correct_count += 1
        if i%200==0:
            print("Test accuracy {0} Attention: {1}, Completion = {2}".format(attention_label, correct_count*100/(i+1), i/total_words))
    print("Test accuracy : ", correct_count*100/total_words)

In [None]:
def translate_word(input_word):
    """Greedily decode one encoded input word, one character per step.

    Uses the module-level ``encoder_model`` and ``decoder_model`` (plus ``attention_model``
    when ``attention`` is truthy). As a side effect the module-level ``beamlogits`` is
    replaced by the list of per-step decoder output distributions.

    Args:
        input_word: Encoded input of shape ``(1, max_encoder_seq_length)``.

    Returns:
        The decoded string, or ``(decoded string, attention array)`` when ``attention``
        is truthy (one flattened row of attention scores per decoding step).
    """
    global beamlogits
    beamlogits = []
    attention_rows = []

    encoder_results = encoder_model.predict(input_word)
    encoder_sequence = encoder_results[-1]   # output sequence of the last encoder layer
    decoder_states = encoder_results[0:-1]   # final states of all encoder layers

    # Decoding starts from the space character, which marks the start of a word.
    next_input = np.zeros((1,1))
    next_input[0, 0] = target_token_index[' ']

    decoded_word = ''
    finished = False
    while not finished:
        step_result = decoder_model.predict([next_input] + decoder_states + [encoder_sequence])
        beamlogits.append(step_result[0][0][0])
        if attention:
            scores = attention_model([next_input] + [encoder_sequence] + decoder_states)
            attention_rows.append(scores.numpy().flatten())

        best_index = np.argmax(step_result[0][0, -1, :])  # most probable character
        best_char = reverse_target_char_index[best_index]
        # Stop at the end marker or once the output has grown past the longest known target.
        if best_char == ' ' or len(decoded_word) > max_decoder_seq_length:
            finished = True
        else:
            decoded_word += best_char

        # Feed the predicted character and the updated states back into the decoder.
        next_input = np.zeros((1,1))
        next_input[0, 0] = best_index
        decoder_states = step_result[1:]

    if attention:
        return decoded_word , np.asarray(attention_rows)
    return decoded_word

In [None]:
# Assemble the inference-time encoder and decoder from the trained layers.
encoder_model = encoder_inference_model(model,latent_dims,cell_type,encoder_layers)
# With attention enabled the builder also returns a model that exposes the attention scores.
if attention == True:
    decoder_model, attention_model = decoder_inference_model(model, latent_dims , embed_dims, attention ,cell_type, decoder_layers)
else:
    decoder_model = decoder_inference_model(model, latent_dims , embed_dims, attention ,cell_type, decoder_layers)
# Save diagrams of both inference graphs.
plot_model(encoder_model, to_file='emodel.png', show_shapes=True)
plot_model(decoder_model, to_file='dmodel.png', show_shapes=True)

# Evaluate on the test split (prints progress and the final word accuracy).
Finding_test_accuracy()