In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
import os

In [3]:
# The lexicon TSV splits live under <current working directory>/te/lexicons.
dataset_path = os.sep.join([os.getcwd(), 'te', 'lexicons'])
train_path = os.sep.join([dataset_path, 'te.translit.sampled.train.tsv'])
valid_path = os.sep.join([dataset_path, 'te.translit.sampled.dev.tsv'])
test_path = os.sep.join([dataset_path, 'te.translit.sampled.test.tsv'])

In [4]:
# Parallel (input, output) lists for each split; filled by the loader cells.
train_inputs, train_outputs = [], []
valid_inputs, valid_outputs = [], []
test_inputs, test_outputs = [], []
# Character vocabularies collected while reading the data.
input_chars, output_chars = set(), set()

In [5]:
# Row-filter switch read by the validation and test loader cells: when True
# every row is kept; when False those cells skip rows based on the third
# TSV column.
include_all = True

In [6]:
# Load the training split and build both character vocabularies from it.
# Each row has three tab-separated fields: target word, source word and a
# third column that is ignored here (the include_all filter is not applied
# to the training split).
with open(train_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
for line in lines:
    # Skip blank lines (this includes the empty string left after a trailing
    # newline) instead of unconditionally dropping the last element, which
    # silently lost the final record when the file had no trailing newline.
    if not line.strip():
        continue
    out, inp, _ = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    out = "\t" + out + "\n"
    train_inputs.append(inp)
    train_outputs.append(out)
    # set.update adds every character of the string; duplicates are ignored.
    input_chars.update(inp)
    output_chars.update(out)

In [7]:
# Number of training pairs loaded so far.
len(train_inputs)

58550

In [8]:
# Load the validation split. The vocabularies are built from the training
# split only, so no characters are added to input_chars/output_chars here.
with open(valid_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
for line in lines:
    # Skip blank lines (including the empty string after a trailing newline)
    # rather than always discarding the last element.
    if not line.strip():
        continue
    out, inp, a = line.split('\t')
    # `a` is read as text, so it must be converted before comparing with 1;
    # the previous `a != 1` was always True and dropped every row whenever
    # include_all was False.
    # NOTE(review): assumes the third column is an integer - confirm.
    if not include_all and int(a) != 1:
        continue
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    out = "\t" + out + "\n"
    valid_inputs.append(inp)
    valid_outputs.append(out)

In [9]:
# Load the test split. The vocabularies are built from the training split
# only, so no characters are added to input_chars/output_chars here.
with open(test_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
for line in lines:
    # Skip blank lines (including the empty string after a trailing newline)
    # rather than always discarding the last element.
    if not line.strip():
        continue
    out, inp, a = line.split('\t')
    # `a` is read as text, so it must be converted before comparing with 1;
    # the previous `a != 1` was always True and dropped every row whenever
    # include_all was False.
    # NOTE(review): assumes the third column is an integer - confirm.
    if not include_all and int(a) != 1:
        continue
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    out = "\t" + out + "\n"
    test_inputs.append(inp)
    test_outputs.append(out)

In [10]:
# Freeze the source-side vocabulary into a sorted list so that every
# character gets a stable position.
input_chars = sorted(input_chars)
num_input_chars = len(input_chars)
print(input_chars)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [11]:
# Freeze the target-side vocabulary (it includes the "\t" start and "\n"
# end markers) into a sorted list.
output_chars = sorted(output_chars)
num_output_chars = len(output_chars)
print(output_chars)

['\t', '\n', 'ం', 'ః', 'అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ఱ', 'ల', 'ళ', 'వ', 'శ', 'ష', 'స', 'హ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్', '\u200c']


In [12]:
# Training-set size and the longest source / target string in it
# (target strings already carry the start and end markers).
train_size = len(train_inputs)
max_input_size = max(map(len, train_inputs))
print(max_input_size)
max_output_size = max(map(len, train_outputs))
print(max_output_size)

25
22


In [13]:
# Character -> integer id lookups. Source ids start at 0; target ids start
# at 1, which leaves id 0 unused by any target character.
input_index = {char: position for position, char in enumerate(input_chars)}
output_index = {char: position for position, char in enumerate(output_chars, start=1)}


In [14]:
# Show the target character -> id mapping (ids start at 1).
print(output_index)

{'\t': 1, '\n': 2, 'ం': 3, 'ః': 4, 'అ': 5, 'ఆ': 6, 'ఇ': 7, 'ఈ': 8, 'ఉ': 9, 'ఊ': 10, 'ఋ': 11, 'ఎ': 12, 'ఏ': 13, 'ఐ': 14, 'ఒ': 15, 'ఓ': 16, 'ఔ': 17, 'క': 18, 'ఖ': 19, 'గ': 20, 'ఘ': 21, 'చ': 22, 'ఛ': 23, 'జ': 24, 'ఝ': 25, 'ఞ': 26, 'ట': 27, 'ఠ': 28, 'డ': 29, 'ఢ': 30, 'ణ': 31, 'త': 32, 'థ': 33, 'ద': 34, 'ధ': 35, 'న': 36, 'ప': 37, 'ఫ': 38, 'బ': 39, 'భ': 40, 'మ': 41, 'య': 42, 'ర': 43, 'ఱ': 44, 'ల': 45, 'ళ': 46, 'వ': 47, 'శ': 48, 'ష': 49, 'స': 50, 'హ': 51, 'ా': 52, 'ి': 53, 'ీ': 54, 'ు': 55, 'ూ': 56, 'ృ': 57, 'ె': 58, 'ే': 59, 'ై': 60, 'ొ': 61, 'ో': 62, 'ౌ': 63, '్': 64, '\u200c': 65}


In [15]:
#### Encoding strings as sequences of character indexes
def encode_index(inputs, index):
    """Convert each string in ``inputs`` to an array of character ids.

    Args:
        inputs: sequence of strings to encode.
        index: mapping from a single character to its integer id.

    Returns:
        A 1-D NumPy array of dtype ``object`` with one entry per input
        string; each entry is a 1-D float64 array holding the ids of that
        string's characters (an empty array for an empty string).

    Raises:
        KeyError: if a string contains a character missing from ``index``.
    """
    # Allocate the outer object array explicitly and fill it element by
    # element. Passing a list of different-length arrays to np.asarray is
    # deprecated and raises ValueError on newer NumPy releases; it also
    # collapsed into a 2-D array whenever all strings had the same length.
    data = np.empty(len(inputs), dtype=object)
    for row, text in enumerate(inputs):
        data[row] = np.array([index[char] for char in text], dtype=np.float64)
    return data
    

In [16]:
# input_data = []
# for i in range(train_size):
#     a = np.zeros(len(train_inputs[i]))
#     j = 0
#     for char in train_inputs[i]:
#         a[j] = input_index[char]
#         j += 1
#     input_data.append(a)
# input_data = np.asarray(input_data).astype(np.ndarray)

In [17]:
# Encode the training source words as id sequences, then wrap them in a
# ragged tensor (one row per word).
input_data = encode_index(inputs=train_inputs, index=input_index)
input_tensor = tf.ragged.constant(input_data)

  return array(a, dtype, copy=False, order=order)


In [18]:
# Encode the validation source words as id sequences, then wrap them in a
# ragged tensor (one row per word).
val_input_data = encode_index(inputs=valid_inputs, index=input_index)
val_input_tensor = tf.ragged.constant(val_input_data)

In [19]:
# Encode the test source words as id sequences, then wrap them in a
# ragged tensor (one row per word).
test_input_data = encode_index(inputs=test_inputs, index=input_index)
test_input_tensor = tf.ragged.constant(test_input_data)

In [20]:
# Number of encoded training source sequences.
print(len(input_data))

58550


In [21]:
# Longest source / target string in the validation and test splits
# (target strings already carry the start and end markers).
max_val__input_size = max(map(len, valid_inputs))
max_val_output_size = max(map(len, valid_outputs))
max_test_input_size = max(map(len, test_inputs))
max_test_output_size = max(map(len, test_outputs))

In [22]:
# Teacher-forcing arrays for the training split.
# decoder_input_data holds one character id per timestep (stored as float32,
# 0 where the row is shorter than max_output_size). The ids are written
# directly rather than building a full one-hot tensor and taking argmax of
# it, which gives the same values without the large temporary array.
decoder_input_data = np.zeros(
    (len(train_inputs), max_output_size), dtype="float32"
)
# decoder_output_data is the one-hot target; rows stay all-zero past the
# end of each target string.
decoder_output_data = np.zeros(
    (len(train_inputs), max_output_size, num_output_chars+1), dtype="float32"
)
for i, target_text in enumerate(train_outputs):
    for t, char in enumerate(target_text):
        decoder_input_data[i, t] = output_index[char]
        if t > 0:
            # decoder_output_data is ahead of decoder_input_data by one
            # timestep and does not include the start character.
            decoder_output_data[i, t - 1, output_index[char]] = 1.0

In [23]:
# Inspect the decoder input ids of the first training example.
decoder_input_data[0]

array([ 1.,  5.,  3., 18., 53., 32.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [24]:
# Teacher-forcing arrays for the validation split.
# decoder_val_input_data holds one character id per timestep (stored as
# float32, 0 where the row is shorter than max_val_output_size). The ids are
# written directly rather than building a full one-hot tensor and taking
# argmax of it, which gives the same values without the temporary array.
decoder_val_input_data = np.zeros(
    (len(valid_inputs), max_val_output_size), dtype="float32"
)
# decoder_val_output_data is the one-hot target; rows stay all-zero past
# the end of each target string.
decoder_val_output_data = np.zeros(
    (len(valid_inputs), max_val_output_size, num_output_chars+1), dtype="float32"
)
for i, target_text in enumerate(valid_outputs):
    for t, char in enumerate(target_text):
        # output_index is built from the training split, so a character seen
        # only in this split raises KeyError here.
        decoder_val_input_data[i, t] = output_index[char]
        if t > 0:
            # The target is ahead of the decoder input by one timestep and
            # does not include the start character.
            decoder_val_output_data[i, t - 1, output_index[char]] = 1.0

In [25]:
# Teacher-forcing arrays for the test split.
# decoder_test_input_data holds one character id per timestep (stored as
# float32, 0 where the row is shorter than max_test_output_size). The ids
# are written directly rather than building a full one-hot tensor and taking
# argmax of it, which gives the same values without the temporary array.
decoder_test_input_data = np.zeros(
    (len(test_inputs), max_test_output_size), dtype="float32"
)
# decoder_test_output_data is the one-hot target; rows stay all-zero past
# the end of each target string.
decoder_test_output_data = np.zeros(
    (len(test_inputs), max_test_output_size, num_output_chars+1), dtype="float32"
)
for i, target_text in enumerate(test_outputs):
    for t, char in enumerate(target_text):
        # output_index is built from the training split, so a character seen
        # only in this split raises KeyError here.
        decoder_test_input_data[i, t] = output_index[char]
        if t > 0:
            # The target is ahead of the decoder input by one timestep and
            # does not include the start character.
            decoder_test_output_data[i, t - 1, output_index[char]] = 1.0

In [26]:
# charinput = tf.keras.Input(shape=(None,),name="input")
# embedding = tf.keras.layers.Embedding(num_input_chars,input_embed_size, name="embedding")(charinput)

In [27]:
# model = tf.keras.Model(charinput,embedding)

In [28]:
# model.compile("rmsprop","mse")

In [29]:
# out = model.predict(input_data[0])
# print(out)

# Sample Model

In [30]:
def get_sample_model(input_embed_size, hidden_size, decoder_embed_size=64):
    """Build a character-level encoder-decoder LSTM plus its inference halves.

    Reads the module-level ``num_input_chars`` and ``num_output_chars`` for
    the vocabulary sizes.

    Args:
        input_embed_size: width of the encoder character embedding.
        hidden_size: number of units in both the encoder and decoder LSTM.
        decoder_embed_size: width of the decoder character embedding
            (defaults to 64, the value that used to be hard-coded).

    Returns:
        A tuple ``(model, encoder_model, decoder_model)``:
        ``model`` maps ``[encoder ids, decoder ids]`` to per-timestep softmax
        distributions over ``num_output_chars + 1`` classes;
        ``encoder_model`` maps encoder ids to the LSTM states ``[h, c]``;
        ``decoder_model`` maps ``[decoder ids, h, c]`` to
        ``[distributions, h, c]``. All three share the same layer objects.
    """
    # ---- training graph ----
    charinput = tf.keras.Input(shape=(None,),name="input")
    embedding = tf.keras.layers.Embedding(num_input_chars,input_embed_size, name="embedding")(charinput)

    # Only the final encoder states are used; they seed the decoder.
    encoder = tf.keras.layers.LSTM(hidden_size, return_state=True )
    _, state_h, state_c = encoder(embedding)
    encoder_states = [state_h, state_c]

    decoder_inputs = tf.keras.Input(shape=(None,),name="decoder_input")
    # mask_zero=True: id 0 in the decoder input is treated as padding.
    decoder_embedding = tf.keras.layers.Embedding(
        num_output_chars + 1, decoder_embed_size, name="decoder_embedding", mask_zero=True
    )(decoder_inputs)

    decoder_lstm = tf.keras.layers.LSTM(hidden_size, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = tf.keras.layers.Dense(num_output_chars + 1, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)
    model = tf.keras.Model([charinput,decoder_inputs],decoder_outputs)

    # ---- inference encoder: ids -> [h, c] ----
    encoder_model = tf.keras.Model(charinput, encoder_states)

    # ---- inference decoder: reuses the LSTM and Dense layers created above,
    # but takes its initial states as explicit inputs ----
    decoder_state_input_h = tf.keras.Input(shape=(hidden_size,))
    decoder_state_input_c = tf.keras.Input(shape=(hidden_size,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    return model, encoder_model, decoder_model

In [31]:
# Build the training model and its inference encoder/decoder with a
# 32-wide input embedding and 256 LSTM units.
sample_model, enc_model, dec_model = get_sample_model(32,256)

In [32]:
# Targets are one-hot per timestep, hence categorical cross-entropy.
sample_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
sample_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 32)     832         input[0][0]                      
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, None, 64)     4224        decoder_input[0][0]              
______________________________________________________________________________________________

In [33]:
# id -> character lookups, the inverse of input_index / output_index.
reverse_input_char_index = {idx: char for char, idx in input_index.items()}
reverse_target_char_index = {idx: char for char, idx in output_index.items()}
# Id 0 is not assigned to any target character; render it as a space.
reverse_target_char_index[0] = ' '

def decode_sequence(input_seq):
    """Greedily decode one encoded source sequence into a target string.

    Uses the module-level ``enc_model`` and ``dec_model``. Decoding starts
    from the "\t" start marker and feeds the most probable character back in
    at each step.

    Args:
        input_seq: a batch holding a single encoded source sequence.

    Returns:
        The decoded characters as a string. The character that ends decoding
        ("\n" or ' ') is included; decoding also stops once the string is
        longer than ``max_output_size``.
    """
    states = enc_model.predict(input_seq)
    # Decoder input for one step: a (1, 1) float32 array holding a single id.
    step_input = np.full((1, 1), output_index["\t"], dtype='float32')
    pieces = []
    while True:
        output_tokens, h, c = dec_model.predict([step_input] + states)

        # Pick the most probable character at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        pieces.append(sampled_char)

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char in ("\n", ' ') or len(pieces) > max_output_size:
            break

        # Feed the sampled character and the new states into the next step.
        step_input = np.full((1, 1), sampled_token_index, dtype='float32')
        states = [h, c]
    return "".join(pieces)

In [39]:
# Spot-check: decode the first five training inputs.
for seqid in range(5):
    # Slice rather than index so the batch dimension is kept.
    input_seq = input_tensor[seqid:seqid + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(
        "-",
        f"Input sentence: {train_inputs[seqid]}",
        f"Decoded sentence: {decoded_sentence}",
        sep="\n",
    )

  "Even though the tf.config.experimental_run_functions_eagerly "


-
Input sentence: amkita
Decoded sentence: అంకిత

-
Input sentence: ankita
Decoded sentence: అంకిత

-
Input sentence: ankitha
Decoded sentence: అంకిత

-
Input sentence: ankitam
Decoded sentence: అంకితం

-
Input sentence: ankitham
Decoded sentence: అంకితం



In [36]:
# Make TensorFlow run tf.function-wrapped code eagerly.
# NOTE(review): presumably needed for feeding the ragged encoder input;
# confirm it is still required.
tf.config.run_functions_eagerly(True)


In [37]:
# Train with teacher forcing: the model sees the source ids and the decoder
# input ids, and is fitted against the one-hot targets.
sample_model.fit(
    x=[input_tensor, decoder_input_data],
    y=decoder_output_data,
    validation_data=(
        [val_input_tensor, decoder_val_input_data],
        decoder_val_output_data,
    ),
    epochs=10,
    batch_size=64,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x22982efae48>

In [41]:
def evaluate(data_tensor, data_output, k=None):
    """Return the exact-match accuracy of ``decode_sequence`` on ``k`` examples.

    Args:
        data_tensor: encoded source sequences; example ``i`` is taken as the
            slice ``data_tensor[i:i+1]``.
        data_output: reference target strings aligned with ``data_tensor``.
            The first character of each is dropped before comparison.
        k: number of leading examples to score. ``None`` (the default) scores
            every entry of ``data_output``.

    Returns:
        The fraction of the first ``k`` examples whose decoded string equals
        the reference, or ``0.0`` when ``k`` is zero or negative (previously
        ``k == 0`` raised ZeroDivisionError).
    """
    if k is None:
        k = len(data_output)
    if k <= 0:
        return 0.0
    crct = 0
    for seqid in range(k):
        decoded_sentence = decode_sequence(data_tensor[seqid:seqid+1])
        # NOTE(review): callers pass references built as "\t" + word + "\n",
        # so [1:] removes the start marker - confirm for other callers.
        if data_output[seqid][1:] == decoded_sentence:
            crct += 1
    return crct / k

In [42]:
# Exact-match accuracy over the whole test split.
print(evaluate(test_input_tensor,test_outputs,len(test_input_data)))

0.44892987645728205
