<a href="https://colab.research.google.com/github/CS20M038/CS6910-assignment_3/blob/main/DL_Assignment_3_Arjun_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import all the libraries that would be used in this notebook
import csv
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import LSTM,Dense
from keras.models import Model
from keras.utils.vis_utils import plot_model

In [2]:
# Mount Google Drive at /content/drive so the dataset and the saved model
# (both referenced below via /content/drive/MyDrive/...) are reachable.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Report whether TensorFlow sees the first GPU of the runtime.
device_name = tf.test.gpu_device_name()
gpu_found = device_name == '/device:GPU:0'
print('Found GPU at: {}'.format(device_name) if gpu_found else 'GPU device not found')

Found GPU at: /device:GPU:0


In [4]:
# Folder holding the Hindi lexicon files of the Dakshina dataset.
# Upload the dataset to your Google Drive in a folder named `Dakshina_Dataset`
# before running this cell, otherwise the paths built from it will not exist.
dakshina_dataset_hindi = '/content/drive/MyDrive/Dakshina_Dataset/hi/lexicons'

In [5]:
# Full paths of the dev / train / test lexicon files inside the dataset folder.
dev_dataset_path, train_dataset_path, test_dataset_path = (
    os.path.join(dakshina_dataset_hindi, file_name)
    for file_name in (
        "hi.translit.sampled.dev.tsv",
        "hi.translit.sampled.train.tsv",
        "hi.translit.sampled.test.tsv",
    )
)

In [6]:
def load_dataset(path,test_dataset = False):
    """Read a tab-separated lexicon file into parallel arrays of words.

    Column 1 of every row is collected as the input word and column 0 as the
    target word. Rows with fewer than two columns (e.g. blank lines) are
    skipped.

    Args:
        path: Path of the TSV file to read.
        test_dataset: When False, each target word is wrapped with a leading
            tab and a trailing newline character. When True, the target word
            is returned unchanged.

    Returns:
        A tuple ``(x, y)`` of NumPy string arrays: the input words and the
        target words, in file order.
    """
    x = []
    y = []
    # The file contains non-ASCII text, so decode it explicitly as UTF-8 instead
    # of relying on the platform default; `newline=""` is what the csv module
    # expects, and the `with` block guarantees the handle is closed.
    with open(path, encoding="utf-8", newline="") as dataset:
        for row in csv.reader(dataset, delimiter='\t'):
            if len(row) < 2:
                continue
            x.append(row[1])
            if test_dataset:
                y.append(row[0])
            else:
                y.append("\t"+row[0]+'\n')

    return np.array(x),np.array(y)


In [62]:
# Training pairs; targets are wrapped with the tab / newline markers.
x_raw_train, y_raw_train = load_dataset(train_dataset_path, test_dataset=False)

In [74]:
# Evaluation pairs with unwrapped targets.
# NOTE: despite the `*_test` names, this reads the *dev* split (dev_dataset_path).
x_raw_test, y_raw_test = load_dataset(dev_dataset_path, test_dataset=True)

In [64]:
# Character vocabularies. Index 0 is reserved for the "PAD" entry in every table.
english_alphabets = 'abcdefghijklmnopqrstuvwxyz'

# Code points 2304..2431, followed by the tab and newline characters that
# load_dataset wraps around the training targets.
hindi_alphabets = list(map(chr, range(2304, 2432))) + ['\t', '\n']

# character -> index (characters are numbered from 1 in alphabet order)
english_alpha2index = {"PAD": 0}
english_alpha2index.update(
    (alpha, position) for position, alpha in enumerate(english_alphabets, start=1)
)
hindi_alpha2index = {"PAD": 0}
hindi_alpha2index.update(
    (alpha, position) for position, alpha in enumerate(hindi_alphabets, start=1)
)

# index -> character (exact inverses of the tables above)
english_index2alpha = {position: alpha for alpha, position in english_alpha2index.items()}
hindi_index2alpha = {position: alpha for alpha, position in hindi_alpha2index.items()}



In [65]:
def get_integer_encode(word,alpha2index,max_length = 25):
    """Convert a word into a fixed-length vector of character indices.

    Args:
        word: Sequence of characters to encode.
        alpha2index: Mapping from character to integer index.
        max_length: Length of the returned vector. Shorter words are
            right-padded with 0; longer words are truncated to this length.

    Returns:
        A 1-D integer NumPy array of shape ``(max_length,)``.

    Raises:
        KeyError: If a character of ``word`` is missing from ``alpha2index``.
    """
    integer_encode = np.zeros((max_length,),dtype='int')
    # Slice first so that an over-long word cannot index past the buffer.
    for index,alpha in enumerate(word[:max_length]):
        integer_encode[index] = alpha2index[alpha]
    return integer_encode

def encode_docs(docs,alpha2index,max_length = 25):
    """Encode every word of ``docs`` with ``get_integer_encode``.

    Args:
        docs: Sized iterable of words (NumPy array or list).
        alpha2index: Mapping from character to integer index.
        max_length: Length every word is padded or truncated to.

    Returns:
        A 2-D integer NumPy array of shape ``(len(docs), max_length)``.
    """
    encoded_docs = np.zeros((len(docs),max_length),dtype='int')
    for index,word in enumerate(docs):
        # Forward max_length so the row width matches the buffer allocated above.
        encoded_docs[index] = get_integer_encode(word,alpha2index,max_length)
    return encoded_docs



In [66]:
# Zero-padded integer encodings of the training inputs and targets
# (default width of encode_docs, i.e. 25 positions per word).
integer_encoded_x_train = encode_docs(docs=x_raw_train, alpha2index=english_alpha2index)
integer_encoded_y_train = encode_docs(docs=y_raw_train, alpha2index=hindi_alpha2index)

In [67]:
# One-hot tensors for teacher forcing, shaped (num_words, 25, target vocabulary size).
target_vocab_size = len(hindi_alpha2index)
one_hot_rows = np.eye(target_vocab_size, dtype="float32")

# decoder_input_data[i, t] is the one-hot vector of the t-th encoded symbol of word i.
decoder_input_data = one_hot_rows[integer_encoded_y_train]

# decoder_output_data is the same sequence shifted one step to the left;
# its last timestep is left as an all-zero vector.
decoder_output_data = np.zeros_like(decoder_input_data)
decoder_output_data[:, :-1, :] = decoder_input_data[:, 1:, :]

In [16]:
# Training configuration of the baseline LSTM model below.
# batch_size and epochs are forwarded to model.fit.
batch_size = 64  
epochs = 25
# Number of units of the encoder and decoder LSTM layers (also the size of
# the state inputs created for the inference decoder).
latent_dim = 256

In [183]:
# Encoder: embed the integer-encoded input characters and summarise them with an LSTM.
# `shape` must be a tuple: `(25)` is just the integer 25, so it is spelled `(25,)`.
embedding_inputs = keras.Input(shape=(25,))
embedding_layer = Embedding(len(english_alpha2index), 125 , input_length=25)
encoder_inputs = embedding_layer(embedding_inputs)
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
# The decoder consumes one-hot target characters of any sequence length.
decoder_inputs = keras.Input(shape=(None, len(hindi_alpha2index)))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(len(hindi_alpha2index), activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that maps the integer-encoded inputs plus the one-hot
# decoder inputs to a per-timestep distribution over target characters.
model = keras.Model([embedding_inputs, decoder_inputs], decoder_outputs)

In [184]:
# Train the baseline model with teacher forcing; 10% of the samples are
# held out by Keras for validation.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(
    x=[integer_encoded_x_train, decoder_input_data],
    y=decoder_output_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f4f51e80c10>

In [185]:
# Persist the trained baseline model to Google Drive so it can be reloaded
# without retraining.
model.save("/content/drive/MyDrive/s2s")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/s2s/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/s2s/assets


In [14]:
# Reload the baseline model from the path it was saved to above.
model = keras.models.load_model("/content/drive/MyDrive/s2s")

In [18]:
# Rebuild separate encoder / decoder inference models from the layers of the
# loaded training model.
# NOTE(review): the layer indices below assume the order
# [input, embedding, input, encoder LSTM, decoder LSTM, dense];
# run model.summary() to confirm before relying on them.

# Encoder model: integer-encoded word -> final LSTM states [h, c].
embedding_inputs = model.input[0]  # input_1
embedding_l1 = model.layers[1]
encoder_inputs = embedding_l1(embedding_inputs)
encoder_lstm = model.layers[3]
encoder_outputs, state_h_enc, state_c_enc = encoder_lstm(encoder_inputs)
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(embedding_inputs, encoder_states)
encoder_model.summary()

# Decoder model: (one-hot character(s), previous h, previous c)
# -> (character distribution, new h, new c), so that decoding can be run
# one step at a time while feeding the states back in.
decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_5")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[4]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[5]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)
decoder_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, 25)]              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 25, 125)           3375      
_________________________________________________________________
lstm_10 (LSTM)               [(None, 256), (None, 256) 391168    
Total params: 394,543
Trainable params: 394,543
Non-trainable params: 0
_________________________________________________________________
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, None, 131)]  0                                            
_____________________________________________________________________

In [22]:
def decode_sequence(word, max_decoded_length=25):
    """Greedily transliterate one input word with the baseline inference models.

    The word is integer-encoded and passed through ``encoder_model``; the
    resulting states seed ``decoder_model``, which is then run one character
    at a time, always feeding back the most probable character.

    Args:
        word: Input word made of characters present in ``english_alpha2index``.
        max_decoded_length: Decoding stops once the output holds more than
            this many characters.

    Returns:
        The decoded string, without the start / stop marker characters.
    """
    input_seq = get_integer_encode(word,english_alpha2index)
    input_seq = input_seq.reshape(1,25)

    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1, len(hindi_alpha2index)))
    # Populate the first character of target sequence with the start character.
    target_seq[0,0, hindi_alpha2index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_word = ""
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token (greedy: take the most probable one).
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is the padding entry; it is not a character, so treat it as
        # end-of-word instead of appending the literal text "PAD".
        if sampled_token_index == hindi_alpha2index["PAD"]:
            break

        sampled_char = hindi_index2alpha[sampled_token_index]

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_word) > max_decoded_length:
            break

        decoded_word += sampled_char

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1,len(hindi_alpha2index)))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_word

In [23]:
def test():
    """Decode every word of ``x_raw_test`` and return the exact-match accuracy.

    Each (input, reference, prediction) triple is printed. The returned value
    is the percentage of predictions equal to the matching ``y_raw_test`` entry.
    """
    matches = 0
    for source_word, reference in zip(x_raw_test, y_raw_test):
        prediction = decode_sequence(source_word)
        print(source_word+"  "+reference+"  "+prediction)
        matches += int(prediction == reference)

    return (matches/x_raw_test.shape[0])*100
    

In [107]:
class RNN():
    """Configurable character-level encoder-decoder built from Keras recurrent layers.

    The training graph is assembled in the constructor. Call ``compile`` and
    ``fit`` to train, ``build_inference_model`` to create the step-by-step
    encoder / decoder models, and ``evaluate`` to measure exact-match accuracy.

    Args:
        embedding_size: Output dimension of the input embedding.
        num_encoder_layers: Number of stacked encoder layers (at least one
            layer is always created).
        num_decoder_layers: Number of stacked decoder layers.
        hidden_layer_size: Number of units of every recurrent layer.
        cell_type: Name of the recurrent layer class looked up on ``layers``
            (e.g. "GRU" or "LSTM").
        drop_out_ratio: ``dropout`` value passed to every recurrent layer.
        in_char_size: Input vocabulary size (embedding input dimension).
        out_char_size: Output vocabulary size (one-hot width and softmax size).
        input_len: Length of the padded, integer-encoded input words.

    Decoding relies on the module-level ``get_integer_encode``,
    ``english_alpha2index``, ``hindi_alpha2index`` and ``hindi_index2alpha``.
    """

    # Decoding stops once the predicted word holds more than this many characters.
    max_decoded_length = 25

    def __init__(self,embedding_size,num_encoder_layers,num_decoder_layers,hidden_layer_size,cell_type,drop_out_ratio,in_char_size,out_char_size,input_len):

        self.embedding_size = embedding_size
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.hidden_layer_size = hidden_layer_size
        self.cell_type = cell_type
        self.drop_out_ratio = drop_out_ratio
        self.in_char_size = in_char_size
        self.out_char_size = out_char_size
        self.input_len = input_len

        self._build_rnn_network()

    @staticmethod
    def _as_state_list(states):
        """Return a ``predict`` result as a flat list (a lone array becomes a 1-element list)."""
        if isinstance(states, (list, tuple)):
            return list(states)
        return [states]

    def _build_rnn_network(self):
        """Assemble the teacher-forcing training model and keep the parts needed for inference."""
        recurrent_layer = getattr(layers,self.cell_type)

        # Embedding
        embedding_inputs = keras.Input(shape=(None,))
        embedding_layer = Embedding(self.in_char_size, self.embedding_size , input_length=self.input_len)
        encoder_inputs = embedding_layer(embedding_inputs)

        # Encoder: all layers but the last return full sequences; the last
        # one returns its final state(s), which initialise the decoder.
        self.encoder_layers = []
        for _ in range(self.num_encoder_layers-1):
            encoder_layer = recurrent_layer(self.hidden_layer_size, dropout=self.drop_out_ratio, return_sequences=True)
            encoder_inputs = encoder_layer(encoder_inputs)
            self.encoder_layers.append(encoder_layer)

        last_encoder_layer = recurrent_layer(self.hidden_layer_size, dropout=self.drop_out_ratio, return_state=True)
        # One state tensor for GRU-like cells, two (h, c) for LSTM.
        encoder_outputs,*encoder_states = last_encoder_layer(encoder_inputs)
        self.encoder_layers.append(last_encoder_layer)

        # Decoder: every layer starts from the encoder's final state(s).
        initial_decoder_inputs = keras.Input(shape=(None, self.out_char_size))
        decoder_inputs = initial_decoder_inputs

        self.decoder_layers = []

        for _ in range(self.num_decoder_layers):

            decoder_layer = recurrent_layer(self.hidden_layer_size, dropout=self.drop_out_ratio, return_sequences=True,return_state=True)
            # The returned states are not needed by the training model.
            decoder_inputs,*_ = decoder_layer(decoder_inputs,initial_state=encoder_states)
            self.decoder_layers.append(decoder_layer)

        decoder_dense = Dense(self.out_char_size, activation="softmax")
        decoder_outputs = decoder_dense(decoder_inputs)

        self.model = keras.Model([embedding_inputs, initial_decoder_inputs], decoder_outputs)
        self.embedding_inputs = embedding_inputs
        self.encoder_states = encoder_states
        self.decoder_inputs = initial_decoder_inputs
        self.decoder_dense = decoder_dense

    def compile(self,optimizer="rmsprop"):
        """Compile the training model with categorical cross-entropy and an accuracy metric."""
        self.model.compile(
        optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
        )

    def fit(self,encoder_input,decoder_input,decoder_output,batch_size = 64,epochs = 5):
        """Train the model with teacher forcing, holding out 10% of the samples for validation.

        Returns:
            The object returned by the underlying ``model.fit`` call.
        """
        return self.model.fit(
        [encoder_input, decoder_input],
        decoder_output,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.1,
        )

    def build_inference_model(self):
        """Create ``encoder_model`` and ``decoder_model`` for step-by-step decoding.

        ``decoder_model`` takes the one-hot decoder input followed by the
        state inputs of every decoder layer (layer by layer) and returns the
        character distribution followed by the new states in the same order.
        """
        self.encoder_model = Model(self.embedding_inputs,self.encoder_states)

        states_per_layer = len(self.encoder_states)
        decoder_state_inputs = []
        decoder_state_outputs = []

        layer_output = self.decoder_inputs
        for layer in self.decoder_layers :
            # Each layer gets its own state inputs and reports its own new
            # states; sharing one set across layers would overwrite the
            # lower layers' states with the last layer's during decoding.
            layer_state_inputs = [
                keras.Input(shape=(self.hidden_layer_size,)) for _ in range(states_per_layer)
            ]
            layer_output, *layer_states = layer(layer_output,initial_state=layer_state_inputs)
            decoder_state_inputs.extend(layer_state_inputs)
            decoder_state_outputs.extend(layer_states)

        decoder_outputs = self.decoder_dense(layer_output)

        self.decoder_model = Model(
            [self.decoder_inputs] + decoder_state_inputs,
            [decoder_outputs] + decoder_state_outputs
            )


    def _decode_sequence(self,word):
        """Greedily decode one input word and return the predicted string."""
        input_seq = get_integer_encode(word,english_alpha2index,self.input_len)
        input_seq = input_seq.reshape(1,self.input_len)

        encoder_states = self._as_state_list(self.encoder_model.predict(input_seq, verbose=0))
        # Mirror training: every decoder layer starts from the encoder state(s).
        states_value = encoder_states * len(self.decoder_layers)

        target_seq = np.zeros((1,1, self.out_char_size))
        target_seq[0,0, hindi_alpha2index["\t"]] = 1.0

        decoded_word = ""
        while True:
            output_tokens, *new_states = self.decoder_model.predict([target_seq] + states_value, verbose=0)

            sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

            # Index 0 is the padding entry, not a character: treat it as
            # end-of-word instead of appending the literal text "PAD".
            if sampled_token_index == hindi_alpha2index["PAD"]:
                break

            sampled_char = hindi_index2alpha[sampled_token_index]

            if sampled_char == "\n" or len(decoded_word) > self.max_decoded_length:
                break

            decoded_word += sampled_char

            # Feed the sampled character and the updated states back in.
            target_seq = np.zeros((1, 1,self.out_char_size))
            target_seq[0, 0, sampled_token_index] = 1.0

            states_value = list(new_states)

        return decoded_word

    def evaluate(self,X_test,Y_test):
        """Return the exact-match accuracy (in percent) of the model on the given pairs.

        Every (input, reference, prediction) triple is printed. An empty
        ``X_test`` yields 0.0.
        """
        total = len(X_test)
        if total == 0:
            return 0.0

        correct = 0
        for english_word,hindi_word in zip(X_test,Y_test):

            predicted_hindi_word = self._decode_sequence(english_word)
            print(english_word+"  "+hindi_word+"  "+predicted_hindi_word)
            if predicted_hindi_word == hindi_word:
                correct += 1

        # Normalise by the evaluated set itself, not by a notebook global.
        acc = (correct/total)*100
        return acc



In [114]:
# Configuration of the network built below: single-layer GRU encoder and
# decoder, with vocabulary sizes taken from the character tables.
hyperparameters = dict(
    embedding_size=125,
    num_encoder_layers=1,
    num_decoder_layers=1,
    hidden_layer_size=256,
    cell_type="GRU",
    drop_out_ratio=0.2,
    in_char_size=len(english_alpha2index),
    out_char_size=len(hindi_alpha2index),
    input_len=25,
)

net = RNN(**hyperparameters)


In [119]:
# Compile with the default optimizer and train for 10 epochs.
net.compile()
net.fit(
    encoder_input=integer_encoded_x_train,
    decoder_input=decoder_input_data,
    decoder_output=decoder_output_data,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [120]:
# Create net.encoder_model / net.decoder_model; required before net.evaluate.
net.build_inference_model()

In [121]:
# Exact-match accuracy in percent; prints one "input  reference  prediction"
# line per evaluated word.
net.evaluate(x_raw_test,y_raw_test)

ankan  अंकन  अंकन
angkor  अंगकोर  अंगकर
angira  अंगिरा  अंगीरा
angithi  अंगीठी  अंगिथि
angrej  अंग्रेज  अंग्रेज
angrejon  अंग्रेजों  अंग्रेजों
anjaam  अंजाम  अंजाम
anjam  अंजाम  अंजम
antakaran  अंतकरण  अंतकारण
antkaran  अंतकरण  अंतकारण
anralon  अंतरालों  अंरालों
antralon  अंतरालों  अंतरालों
antarkalah  अंतर्कलह  अंतरकाल
antarklah  अंतर्कलह  अंतरकाल
antkalah  अंतर्कलह  अंतकाल
andher  अंधेर  अंधर
andhera  अंधेरा  अंधारा
andheraa  अंधेरा  अंधारा
andhere  अंधेरे  अंडेरे
ambar  अंबर  अंबार
amber  अंबर  अमबर
umber  अंबर  अमबीर
ambarnath  अंबरनाथ  अंबरनाथ
ahankaar  अंहकार  अहंकार
ahankar  अंहकार  अहंकर
anhkar  अंहकार  अनहकर
akaash  अकाश  अकाश
akash  अकाश  अकाष
akeelaa  अकीला  अकेला
akila  अकीला  अकीला
ankush  अकुंश  अंकुश
aksharash  अक्षरशः  अक्षार्ष
aksharashaah  अक्षरशः  अक्षाराश
aksharashah  अक्षरशः  अक्षाराश
asharsha  अक्षरशः  अशारशा
akhandanand  अखंडानंद  अखंडंड
akhbar  अखबार  अख़बार
akhabari  अखबारी  अखबारी
akhbari  अखबारी  अख़बारी
akhabaro  अखबारो  अखबराओं
akhbaro  अखबारो  अख़बारो
akha

23.313446535107847

In [122]:
# Inspect the layers of the step-by-step decoder used for inference.
net.decoder_model.summary()

Model: "model_52"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_62 (InputLayer)           [(None, None, 131)]  0                                            
__________________________________________________________________________________________________
input_64 (InputLayer)           [(None, 256)]        0                                            
__________________________________________________________________________________________________
gru_17 (GRU)                    [(None, None, 256),  298752      input_62[0][0]                   
                                                                 input_64[0][0]                   
__________________________________________________________________________________________________
dense_20 (Dense)                (None, None, 131)    33667       gru_17[2][0]              