In [None]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xvf ./dakshina_dataset_v1.0.tar

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten,Embedding,Dense
from keras.utils.vis_utils import plot_model

In [None]:
!pip install wandb
# wandb login
import wandb
wandb.login()

In [3]:
# Train / dev / test lexicon files inside the extracted dataset directory.
train_path, val_path, test_path = (
    "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv",
    "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv",
    "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv",
)

def readData(path):
    """Load a headerless tab-separated lexicon file as a list of row lists.

    Parameters
    ----------
    path : str
        Location of the TSV file to read.

    Returns
    -------
    list[list]
        One inner list per parsed row, with the columns in file order.

    Notes
    -----
    Only empty fields are treated as missing. pandas' default NA markers are
    switched off because they convert ordinary words such as "nan", "null" or
    "NA" into float NaN, which makes callers that keep only string fields
    silently drop those samples. Malformed lines are skipped
    (``on_bad_lines='skip'``).
    """
    lexicon_df = pd.read_csv(
        path,
        sep='\t',
        header=None,
        on_bad_lines='skip',
        keep_default_na=False,
        na_values=[''],
    )
    return lexicon_df.values.tolist()

In [4]:
# Analysing dataset: build character vocabularies and length limits from the training lexicon.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

trainingData = readData(train_path)
for line in trainingData:
    # Column 1 is used as the input text, column 0 as the target text.
    input_text, target_text = line[1], line[0]
    # Skip rows where either field is not a string (e.g. parsed as NaN);
    # the concatenation below would otherwise raise TypeError.
    if not (isinstance(input_text, str) and isinstance(target_text, str)):
        continue
    # Wrap the target in single spaces (presumably start/end markers for the
    # decoder -- confirm against the model code).
    target_text = " " + target_text + " "
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character; no membership pre-check is needed.
    input_characters.update(input_text)
    target_characters.update(target_text)

# The space character is part of both vocabularies even if no text contains it.
input_characters.add(' ')
target_characters.add(' ')
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

# Character -> index lookups (indices follow the sorted character order).
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# Index -> character lookups for turning predictions back into text.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


print(input_token_index)
print(target_token_index)

print("Number of samples:", len(input_texts))
num_samples = len(input_texts)
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

In [5]:
# Character encoding using Embedding layer....

# Encoder inputs embedding (Latin)
def getData(path):
    """Encode one lexicon split as integer sequences plus one-hot decoder targets.

    Relies on module-level state: ``readData``, ``input_token_index``,
    ``target_token_index``, ``max_encoder_seq_length``,
    ``max_decoder_seq_length`` and ``num_decoder_tokens``.

    Parameters
    ----------
    path : str
        Lexicon file passed to ``readData``; column 1 is used as the input
        text and column 0 as the target text.

    Returns
    -------
    tuple
        ``(EncoderInputEncodedWords, DecoderInputEncodedWords,
        decoder_target_data, input_texts, target_texts)``:

        * the first two are post-padded index arrays, padded to
          ``max_encoder_seq_length`` / ``max_decoder_seq_length``;
        * ``decoder_target_data`` is a float32 one-hot array of shape
          ``(n_samples, max_decoder_seq_length, num_decoder_tokens)`` holding
          the target shifted one step ahead of the decoder input;
        * the last two are the kept texts (targets wrapped in single spaces).

    Raises
    ------
    KeyError
        If a text contains a character that is missing from the token index.
    IndexError
        If a target is too long for the pre-sized one-hot array.
    """
    print(path)
    input_texts = []
    target_texts = []
    for line in readData(path):
        input_text, target_text = line[1], line[0]
        # Skip rows where either field is not a string (e.g. parsed as NaN);
        # the concatenation below would otherwise raise TypeError.
        if not (isinstance(input_text, str) and isinstance(target_text, str)):
            continue
        input_texts.append(input_text)
        target_texts.append(" " + target_text + " ")

    # Pad with the space token's own index rather than a hard-coded 0, so the
    # integer padding cannot drift from the space padding of the one-hot targets.
    encoder_pad = input_token_index[" "]
    decoder_pad = target_token_index[" "]

    EncoderInputEncodedWords = pad_sequences(
        [[input_token_index[ch] for ch in text] for text in input_texts],
        maxlen=max_encoder_seq_length,
        padding='post',
        value=encoder_pad,
    )
    print('EncoderInputEncodedWords.shape',EncoderInputEncodedWords.shape)
    print(EncoderInputEncodedWords[:10])

    DecoderInputEncodedWords = pad_sequences(
        [[target_token_index[ch] for ch in text] for text in target_texts],
        maxlen=max_decoder_seq_length,
        padding='post',
        value=decoder_pad,
    )
    print('DecoderInputEncodedWords.shape',DecoderInputEncodedWords.shape)
    print(DecoderInputEncodedWords[:10])

    decoder_target_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
        dtype="float32",
    )
    for i, target_text in enumerate(target_texts):
        # Targets run one step ahead of the decoder input:
        # position t holds character t + 1 of the wrapped target.
        for t, char in enumerate(target_text[1:]):
            decoder_target_data[i, t, target_token_index[char]] = 1.0
        # Every remaining time step is marked as the space token.
        decoder_target_data[i, len(target_text) - 1:, decoder_pad] = 1.0

    # Dump the first sample in full as a sanity check; skipped for an empty
    # split, where indexing row 0 would raise IndexError.
    if len(decoder_target_data):
        with np.printoptions(threshold=np.inf):
            print(decoder_target_data[0])

    return EncoderInputEncodedWords,DecoderInputEncodedWords,decoder_target_data,input_texts,target_texts

In [7]:
# Encode the train, validation and test splits with getData.
(
    encoder_input_train_data,
    decoder_input_train_data,
    decoder_target_train_data,
    train_eng,
    train_hin,
) = getData(train_path)
(
    encoder_input_val_data,
    decoder_input_val_data,
    decoder_target_val_data,
    val_eng,
    val_hin,
) = getData(val_path)
(
    encoder_input_test_data,
    decoder_input_test_data,
    decoder_target_test_data,
    test_eng,
    test_hin,
) = getData(test_path)