In [None]:
# Mount Google Drive at /content/drive so files stored there are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install wandb



In [None]:
# wandb is imported here (after the install cell above) and the runtime is
# authenticated once so later wandb.init / wandb.sweep calls can talk to the server.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mreturaj[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import os
import random
import numpy as np
import tensorflow as tf

In [None]:
# Seed every random number generator the notebook uses so that a
# "Restart & Run All" reproduces the same shuffling, weight initialisation and dropout.
random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)  # TensorFlow keeps its own global seed, independent of Python/NumPy

In [None]:
class Dataset(object):
  """Reads the transliteration lexicon files and converts them to padded id sequences.

  The train/validation/test files are expected inside `data_path` under the names
  `hi.translit.sampled.{train,dev,test}.tsv`. Character-level tokenizers are fitted
  on the first data passed through `get_training_data` (the training split) and are
  then reused for the validation and test splits.
  """

  def __init__(self, data_path):
    """
    Args:
      data_path: directory that contains the three `.tsv` lexicon files.
    """
    self.train_path = os.path.join(data_path, 'hi.translit.sampled.train.tsv')
    self.validation_path = os.path.join(data_path, 'hi.translit.sampled.dev.tsv')
    self.test_path = os.path.join(data_path, 'hi.translit.sampled.test.tsv')
    self.encoder_tokenizer = None
    self.decoder_tokenizer = None
    # Set to True once get_training_data has been called; guards tokenizer access.
    self.load_train_data = False

  @staticmethod
  def _read_file(filepath):
    """Parses one lexicon file.

    Every non-blank line must hold exactly three tab-separated fields:
    target word, source word and a third field that is ignored.

    Args:
      filepath: path of the `.tsv` file to read.

    Returns:
      (encoder_words, decoder_words): two parallel lists of strings. Source words
      get a trailing "\\n"; target words are wrapped as "\\t" + word + "\\n".

    Raises:
      ValueError: if a non-blank line does not split into exactly three fields.
    """
    encoder_words, decoder_words = [], []
    # The file contains non-ASCII text, so do not rely on the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as fp:
      for line in fp:
        line = line.strip()
        if not line:
          continue
        target, src, _ = [x.strip() for x in line.split('\t')]
        src = src + "\n"  # \n represents end_of_word
        encoder_words.append(src)
        target = "\t" + target + "\n"  # \t represents start_word and \n represents end_of_word
        decoder_words.append(target)
    return encoder_words, decoder_words

  @property
  def vocab_size(self):
    """Returns (encoder_vocab_size, decoder_vocab_size); requires the training data to be loaded."""
    assert self.load_train_data, "Seems like you want to know the vocab size even before loading train data"
    encoder_vocab_size = len(self.encoder_tokenizer.word_index) + 1 # number 0 is reserved for padding
    decoder_vocab_size = len(self.decoder_tokenizer.word_index) + 1 # number 0 is reserved for padding
    return encoder_vocab_size, decoder_vocab_size

  def _reset_tokenizer(self):
    """Forgets the fitted tokenizers and marks the training data as not loaded."""
    self.load_train_data = False
    self.encoder_tokenizer = None
    self.decoder_tokenizer = None

  def _get_tokenizer(self, encoder_words, decoder_words):
    """Returns the two tokenizers, fitting them on the given words if they do not exist yet."""
    assert self.load_train_data, "Seems like you are trying to access test data even before accessing train data !!"
    if self.encoder_tokenizer is None:
      self.encoder_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
      self.encoder_tokenizer.fit_on_texts(encoder_words)
    if self.decoder_tokenizer is None:
      self.decoder_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
      self.decoder_tokenizer.fit_on_texts(decoder_words)
    return self.encoder_tokenizer, self.decoder_tokenizer

  def _get_dataset(self, encoder_words, decoder_words):
    """Converts words to id sequences, zero-padded at the end to a common length.

    Returns:
      (encoder_input, decoder_target): two padded integer arrays.
    """
    encoder_tokenizer, decoder_tokenizer = self._get_tokenizer(encoder_words, decoder_words)
    encoder_input = encoder_tokenizer.texts_to_sequences(encoder_words)
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(encoder_input, padding='post')
    decoder_target = decoder_tokenizer.texts_to_sequences(decoder_words)
    decoder_target = tf.keras.preprocessing.sequence.pad_sequences(decoder_target, padding='post')
    return encoder_input, decoder_target

  def get_training_data(self):
    """Loads the training and validation splits.

    The training split is processed first, so the tokenizers are fitted on it.

    Returns:
      (train_encoder_input, train_decoder_target, val_encoder_input, val_decoder_target)

    Raises:
      Any exception raised while reading or tokenizing; the tokenizer state is
      reset before the exception is propagated.
    """
    try:
      self.load_train_data = True
      train_encoder_words, train_decoder_words = self._read_file(self.train_path)
      train_encoder_input, train_decoder_target = self._get_dataset(train_encoder_words, train_decoder_words)
      val_encoder_words, val_decoder_words = self._read_file(self.validation_path)
      val_encoder_input, val_decoder_target = self._get_dataset(val_encoder_words, val_decoder_words)
    except Exception:
      self._reset_tokenizer()
      raise  # bare raise keeps the original traceback intact
    return train_encoder_input, train_decoder_target, val_encoder_input, val_decoder_target

  def get_testing_data(self):
    """Loads the test split using the tokenizers fitted on the training data.

    Returns:
      (test_encoder_input, test_decoder_target)

    Raises:
      AssertionError: if `get_training_data` has not been called successfully before.
    """
    test_encoder_words, test_decoder_words = self._read_file(self.test_path)
    # _get_dataset returns exactly two arrays (encoder input, decoder target).
    test_encoder_input, test_decoder_target = self._get_dataset(test_encoder_words, test_decoder_words)
    return test_encoder_input, test_decoder_target

#### Model Experiments

In [None]:
class Attention(tf.keras.layers.Layer):
  """Additive attention: scores every encoder output against the decoder state.

  score = V(tanh(W1(decoder_state) + W2(encoder_output))), normalised with a
  softmax over axis 1 of the encoder output.
  """

  def __init__(self, latent_dim):
    """
    Args:
      latent_dim: number of units of the two projection layers W1 and W2.
    """
    super().__init__()
    self.W1 = tf.keras.layers.Dense(latent_dim)
    self.W2 = tf.keras.layers.Dense(latent_dim)
    self.V = tf.keras.layers.Dense(1)

  def call(self, decoder_state, encoder_output):
    """Computes the context vector for one decoding step.

    Args:
      decoder_state: tensor or list of tensors; they are concatenated along axis 1.
      encoder_output: encoder outputs; assumed (batch, time, features) - TODO confirm.

    Returns:
      (context_vector, attention_weights): the encoder outputs summed over axis 1
      with the attention weights applied, and the weights themselves.
    """
    # Merge all state tensors into one query and give it a length-1 axis so it
    # broadcasts against every position of the encoder output.
    query = tf.expand_dims(tf.concat(decoder_state, 1), 1)
    energies = tf.nn.tanh(self.W1(query) + self.W2(encoder_output))
    attention_weights = tf.nn.softmax(self.V(energies), axis=1)
    context_vector = tf.reduce_sum(attention_weights * encoder_output, axis=1)
    return context_vector, attention_weights

In [None]:
class BaseModel(tf.keras.Model):
  """Common base of Encoder and Decoder.

  Copies every entry of `params` onto the instance as an attribute and offers
  helpers to build a stack of recurrent layers and an all-zero initial state.
  Subclasses are expected to set `self.first_rnn` (read by `initialize_hidden_state`).
  """

  def __init__(self, params, rnn_class):
    """
    Args:
      params: dict of hyperparameters; each key becomes an instance attribute.
      rnn_class: recurrent layer class; not used here, kept for subclasses' signature.
    """
    super(BaseModel, self).__init__()
    self.set_attributes(params)

  def set_attributes(self, params):
    """Sets every key/value pair of `params` as an attribute of this model."""
    for k, v in params.items():
      setattr(self, k, v)

  def stacked_layers(self, rnn_class, num_layers):
    """Builds `num_layers` recurrent layers of `self.latent_dim` units.

    Args:
      rnn_class: recurrent layer class to instantiate.
      num_layers: total number of recurrent layers.

    Returns:
      (first_rnn, stacked_rnn): the first layer, and a functional model that chains
      the remaining layers (each preceded by dropout) or None when `num_layers` <= 1.
      The functional model outputs what its last recurrent layer returns
      (sequence output followed by the state tensors).
    """
    first_rnn = rnn_class(self.latent_dim, return_state=True, return_sequences=True)
    if num_layers <= 1:
      return first_rnn, None
    stacked_input = tf.keras.Input(shape=(None, self.latent_dim))
    stacked_output = stacked_input
    rnn_result = None
    for _ in range(1, num_layers):
      # Each extra layer consumes the (dropped-out) sequence output of the previous
      # one, so all layers are chained instead of all reading the stack input.
      stacked_output = tf.keras.layers.Dropout(self.dropout)(stacked_output)
      stacked_layer = rnn_class(self.latent_dim, return_state=True, return_sequences=True)
      rnn_result = stacked_layer(stacked_output)
      stacked_output = rnn_result[0]
    stacked_rnn = tf.keras.Model(stacked_input, rnn_result)
    return first_rnn, stacked_rnn

  def call(self, *args, **kwargs):
    """Must be implemented by subclasses."""
    raise NotImplementedError

  def initialize_hidden_state(self, batch=None):
    """Returns an all-zero initial state for `self.first_rnn`.

    Args:
      batch: number of rows of the state; defaults to `self.batch_size`.

    Returns:
      A list with one zero tensor of shape (batch, latent_dim), or two when the
      first recurrent layer is an LSTM.
    """
    if batch is None:
      batch = self.batch_size
    init = [tf.zeros((batch, self.latent_dim))]
    if isinstance(self.first_rnn, tf.keras.layers.LSTM):
      init *= 2
    return init


class Encoder(BaseModel):
  """Embeds the input ids and runs them through the recurrent stack."""

  def __init__(self, params, rnn_class):
    """
    Args:
      params: hyperparameter dict; reads encoder_vocab_size, embed_size,
        num_encoder_layers (and whatever `stacked_layers` needs).
      rnn_class: recurrent layer class used for every layer of the stack.
    """
    super().__init__(params, rnn_class)
    # mask_zero=True: id 0 is treated as padding by the embedding layer.
    self.embed = tf.keras.layers.Embedding(self.encoder_vocab_size, self.embed_size, mask_zero=True)
    self.first_rnn, self.stacked_rnn = self.stacked_layers(rnn_class, self.num_encoder_layers)

  def call(self, x, hidden):
    """Encodes a batch of id sequences.

    Args:
      x: integer ids of the input characters.
      hidden: initial state for the first recurrent layer.

    Returns:
      (output, state): sequence output and state tensors of the last recurrent layer.
    """
    embedded = self.embed(x)
    rnn_result = self.first_rnn(embedded, initial_state=hidden)
    if self.num_encoder_layers > 1:
      # Only the sequence output is passed on to the remaining layers.
      rnn_result = self.stacked_rnn(rnn_result[0])
    return (rnn_result[0], rnn_result[1:])


class Decoder(BaseModel):
  """Predicts a distribution over output characters from one-hot encoded input ids."""

  def __init__(self, params, rnn_class):
    """
    Args:
      params: hyperparameter dict; reads num_decoder_layers, decoder_vocab_size,
        use_attention and latent_dim (plus whatever `stacked_layers` needs).
      rnn_class: recurrent layer class used for every layer of the stack.
    """
    super().__init__(params, rnn_class)
    self.first_rnn, self.stacked_rnn = self.stacked_layers(rnn_class, self.num_decoder_layers)
    self.dense = tf.keras.layers.Dense(self.decoder_vocab_size, activation="softmax")
    if self.use_attention:
      self.attention = Attention(self.latent_dim)

  def call(self, x, hidden, encoder_output=None):
    """Runs the decoder on the given ids.

    Args:
      x: integer ids of the decoder input characters.
      hidden: initial state for the first recurrent layer (also the attention query).
      encoder_output: encoder outputs; only read when attention is enabled.

    Returns:
      (output, state, attention_weights): softmax probabilities over the decoder
      vocabulary, the state tensors of the last recurrent layer, and the attention
      weights (None when attention is disabled).
    """
    step_input = tf.one_hot(x, depth=self.decoder_vocab_size)
    if self.use_attention:
      context_vector, attention_weights = self.attention(hidden, encoder_output)
      # Prepend the context vector to the one-hot features of the input.
      step_input = tf.concat([tf.expand_dims(context_vector, 1), step_input], axis=-1)
    else:
      attention_weights = None
    rnn_result = self.first_rnn(step_input, initial_state=hidden)
    if self.num_decoder_layers > 1:
      rnn_result = self.stacked_rnn(rnn_result[0])
    probabilities = self.dense(rnn_result[0])
    return (probabilities, rnn_result[1:], attention_weights)

In [None]:
class Runner(object):
  """Trains and evaluates an Encoder/Decoder pair for character-level transliteration.

  Args:
    params: hyperparameter dict; forwarded to Encoder and Decoder, and read here
      for 'batch_size'.
    rnn_class: recurrent layer class forwarded to Encoder and Decoder.
    encoder_tokenizer: tokenizer of the source side (stored on the instance).
    decoder_tokenizer: tokenizer of the target side; its `word_index` / `index_word`
      mappings are used for decoding and evaluation.
  """

  def __init__(self, params, rnn_class, encoder_tokenizer, decoder_tokenizer):
    self.params = params
    self.encoder_tokenizer = encoder_tokenizer
    self.decoder_tokenizer = decoder_tokenizer
    self.encoder = Encoder(params, rnn_class)
    self.decoder = Decoder(params, rnn_class)
    self.optimizer = tf.keras.optimizers.Adam()
    # Reduction.NONE keeps one loss value per example so padding can be masked out.
    self.loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)

  @staticmethod
  def index_word(tokenizer, seq):
    """Converts a sequence of ids to a string, stopping after the first '\\n'.

    Id 0 (padding) has no character and is rendered as '?'.
    """
    result = ''
    for s in seq:
      if s == 0: # normally this id is not encountered; if it is, treat it as an unrecognized character
        result += '?'
      else:
        result += tokenizer.index_word[s]
      if result[-1] == '\n':
        break
    return result

  @staticmethod
  def word_index(tokenizer, seq, max_length):
    """Converts the characters of `seq` to ids, zero-padded at the end up to `max_length`.

    Returns:
      A numpy array of ids. It is longer than `max_length` if `seq` is.

    Raises:
      KeyError: if a character is not in `tokenizer.word_index`.
    """
    result = [tokenizer.word_index[s] for s in seq]
    result = result + [0] * (max_length - len(result))
    return np.array(result)

  def _custom_loss_function(self, real, pred):
    """Cross entropy of one decoding step with padded positions zeroed out.

    Args:
      real: target ids of this step, one per example; id 0 marks padding.
      pred: predicted probabilities returned by the decoder for this step.

    Returns:
      Scalar tensor: mean over the batch of the masked per-example losses.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0)) # True for real characters, False for the padding added to equalise sequence lengths
    loss = self.loss_obj(real, pred) # cross entropy of each example
    # The decoder keeps a length-1 axis in its output, so the per-example loss can
    # come back as (batch, 1). Multiplying that by the 1-D mask would broadcast to a
    # (batch, batch) matrix instead of masking element-wise, so align the shapes first.
    loss = tf.reshape(loss, tf.shape(real))
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask # removes the padded positions from the loss
    return tf.reduce_mean(loss)

  @tf.function
  def _train_step(self, encoder_input, decoder_target):
    """Runs one optimisation step on a batch using teacher forcing.

    Args:
      encoder_input: padded source ids of the batch.
      decoder_target: padded target ids of the batch; column 0 is the first decoder input.

    Returns:
      The summed step losses divided by the target sequence length.
    """
    loss = 0
    encoder_hidden = self.encoder.initialize_hidden_state(batch=encoder_input.shape[0])
    with tf.GradientTape() as tape:
      encoder_output, encoder_hidden = self.encoder(encoder_input, encoder_hidden)
      decoder_hidden = encoder_hidden
      decoder_input = tf.expand_dims(decoder_target[:, 0], 1)
      for t in range(1, decoder_target.shape[1]):  # unfolding in time
        pred_prob, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output)
        loss += self._custom_loss_function(decoder_target[:, t], pred_prob)
        # teacher forcing: the ground-truth character is the next decoder input
        decoder_input = tf.expand_dims(decoder_target[:, t], 1)
    batch_loss = loss / int(decoder_target.shape[1])  # normalizing in time
    trainable_variables = self.encoder.trainable_variables + self.decoder.trainable_variables
    grads = tape.gradient(loss, trainable_variables)
    self.optimizer.apply_gradients(zip(grads, trainable_variables))
    return batch_loss

  def train(self, encoder_input, decoder_target, val_encoder_input, val_decoder_target, epochs=5):
    """Trains for `epochs` epochs and evaluates on the validation data after each one.

    Args:
      encoder_input: padded source ids of the training set.
      decoder_target: padded target ids of the training set.
      val_encoder_input: padded source ids of the validation set.
      val_decoder_target: padded target ids of the validation set.
      epochs: number of passes over the training data.

    Returns:
      (train_loss, valid_accuracy): per-epoch mean batch loss and validation accuracy.
    """
    num_train_data = encoder_input.shape[0]
    batch_size = self.params['batch_size']
    indx = np.arange(num_train_data)
    train_loss, valid_accuracy = [], []
    for epoch in range(epochs):
      # Reshuffle every epoch so the batches are not identical from epoch to epoch.
      np.random.shuffle(indx)
      total_loss = 0
      step = 0
      for start in range(0, num_train_data, batch_size):
        batch_indx = indx[start:start + batch_size]
        inp, targ = encoder_input[batch_indx, :], decoder_target[batch_indx, :]
        total_loss += self._train_step(inp, targ)
        step += 1
      val_acc = self.validation_step(val_encoder_input, val_decoder_target)
      train_loss.append(total_loss/step)
      valid_accuracy.append(val_acc)
      # uncomment this line if you want to print loss/acc
      # print(f"Epoch: {epoch+1}, Loss: {total_loss/step}, val_acc: {val_acc}")
    return train_loss, valid_accuracy

  def translate(self, encoder_input, max_target_len):
    """Greedily decodes `max_target_len` ids for every input sequence.

    Args:
      encoder_input: padded source ids.
      max_target_len: number of columns of the result, including the start token.

    Returns:
      Integer numpy array of shape (batch, max_target_len); column 0 holds the id of '\\t'.
    """
    batch = encoder_input.shape[0]
    encoder_hidden = self.encoder.initialize_hidden_state(batch)
    encoder_output, decoder_hidden = self.encoder(encoder_input, encoder_hidden)
    result = np.zeros((batch, max_target_len), dtype=int)
    result[:, 0] = self.decoder_tokenizer.word_index['\t']
    decoder_input = tf.expand_dims(result[:, 0], 1)
    for t in range(1, max_target_len):
      pred_prob, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output)
      pred_id = tf.argmax(pred_prob, -1)
      result[:, t] = pred_id[:, 0]
      decoder_input = pred_id  # the prediction is fed back as the next input
    return result

  def validation_step(self, encoder_input, decoder_target):
    """Returns the fraction of inputs whose decoded word equals the target word exactly.

    Words are compared up to and including the first '\\n'. Returns 0.0 for an
    empty validation set.
    """
    num_examples = decoder_target.shape[0]
    if num_examples == 0:
      return 0.0
    max_target_len = decoder_target.shape[1]
    results = self.translate(encoder_input, max_target_len)
    val_accuracy = 0
    for r, t in zip(results, decoder_target):
      res_word = self.index_word(self.decoder_tokenizer, r)
      targ_word = self.index_word(self.decoder_tokenizer, t)
      val_accuracy += 1 if res_word == targ_word else 0
    val_accuracy /= num_examples
    return val_accuracy

In [None]:
# In most cases you only need to edit the settings in this cell; feel free to change anything else if you need to.


# Maps the `rnn_type` value of a sweep configuration to the Keras recurrent layer class.
RNN_MAP = {
    "lstm": tf.keras.layers.LSTM,
    "gru": tf.keras.layers.GRU,
    "rnn": tf.keras.layers.SimpleRNN
}

# Directory (on the mounted Google Drive) that holds the hi.translit.sampled.*.tsv files.
DATA_PATH = '/content/drive/MyDrive/IITM/collab/cs6910/RNN_data_set/dakshina_dataset_v1.0/hi/lexicons'

# Weights & Biases project/entity under which the sweep is created,
# and the number of runs the sweep agent executes.
WANDB_PROJECT = "CS6910_ASSIGNMENT_3"
WANDB_ENTITY = "cs21m003_cs21d406"
WANDB_RUNS = 20

# Number of training epochs used by every run of the sweep.
EPOCHS = 10


class WandbRunner(object):
  """Loads the data once and runs a Weights & Biases hyperparameter sweep on it."""

  def __init__(self):
    """Reads the training/validation data from DATA_PATH and keeps the fitted tokenizers."""
    dataset = Dataset(DATA_PATH)
    (self.train_encoder_input, self.train_decoder_target,
     self.val_encoder_input, self.val_decoder_target) = dataset.get_training_data()
    self.encoder_vocab_size, self.decoder_vocab_size = dataset.vocab_size
    self.encoder_tokenizer = dataset.encoder_tokenizer
    self.decoder_tokenizer = dataset.decoder_tokenizer

  def run_wandb(self):
    """Executes one sweep run: trains a model with the sampled configuration and logs its metrics."""
    wandb.init()
    config = wandb.config
    params = {
      "encoder_vocab_size": self.encoder_vocab_size,
      "decoder_vocab_size": self.decoder_vocab_size,
      "embed_size": config.inp_embed_size,
      "latent_dim": config.latent_dim,
      "num_encoder_layers": config.num_encoder_layers,
      "num_decoder_layers": config.num_decoder_layers,
      "dropout": config.dropout,
      "batch_size": config.batch_size,
      "use_attention": config.attention
    }
    runner = Runner(params, RNN_MAP[config.rnn_type], self.encoder_tokenizer, self.decoder_tokenizer)
    train_loss, valid_accuracy = runner.train(
        self.train_encoder_input, self.train_decoder_target,
        self.val_encoder_input, self.val_decoder_target,
        epochs=config.epochs)
    # Human-readable run name that encodes the sampled hyperparameters.
    wandb.run.name = "_".join([
        f"emb_{config.inp_embed_size}",
        f"ld_{config.latent_dim}",
        f"nel_{config.num_encoder_layers}",
        f"ndl_{config.num_decoder_layers}",
        f"dpt_{config.dropout}",
        f"at_{config.attention}",
        f"bs_{config.batch_size}",
        f"cell_{config.rnn_type}",
    ])
    # One log call per epoch, issued after training has finished.
    for epoch_loss, epoch_accuracy in zip(train_loss, valid_accuracy):
      wandb.log({"training_loss": epoch_loss, "validation_accuracy": epoch_accuracy})

  def do_hyperparameter_search(self):
    """Creates a random-search sweep and runs WANDB_RUNS runs of it with `run_wandb`."""
    search_space = {
        "inp_embed_size": [16, 32], # run for [64, 256] later
        "latent_dim": [16, 32], # run for [64, 256] later
        "num_encoder_layers": [1, 2, 3],
        "num_decoder_layers": [1, 2, 3],
        "dropout": [0.2, 0.3, 0.4],
        "batch_size": [32, 64],
        "attention": [False],  # Do not change this for question 1
        "rnn_type": ["lstm", "gru"], # run for rnn later
        "epochs": [EPOCHS]
    }
    sweep_config = {
        "name": "Transliteration Search",
        "method": "random",
        "metric": {
            "name": "validation_accuracy",
            "goal": "maximize"
        },
        "parameters": {name: {"values": values} for name, values in search_space.items()}
    }
    sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT, entity=WANDB_ENTITY)
    wandb.agent(sweep_id, function=self.run_wandb, count=WANDB_RUNS)

In [None]:
# Run this cell to start the W&B hyperparameter sweep.
# A single run takes about 15-20 minutes, so be patient.

# Loads the data once; the sweep agent then calls run_wandb for every sampled configuration.
wandb_runner = WandbRunner()
wandb_runner.do_hyperparameter_search()

In [None]:
## Do not run this, it is only for testing purposes

# dataset = Dataset(DATA_PATH)
# train_encoder_input, train_decoder_target, val_encoder_input, val_decoder_target = dataset.get_training_data()
# encoder_vocab_size, decoder_vocab_size = dataset.vocab_size

# params = {
#   "encoder_vocab_size": encoder_vocab_size,
#   "decoder_vocab_size": decoder_vocab_size, 
#   "embed_size": 30,
#   "latent_dim": 30,
#   "num_encoder_layers": 2,
#   "num_decoder_layers": 2,
#   "dropout": 0.2,
#   "batch_size": 32, 
#   "use_attention": False
# }

# run = Runner(params, RNN_MAP['rnn'], dataset.encoder_tokenizer, dataset.decoder_tokenizer)
# tl, va = run.train(train_encoder_input, train_decoder_target, val_encoder_input, val_decoder_target)
# print(va)  # prints validation accuracy at each epochs

In [None]:
# student's assignment:
# https://github.com/sujaybokil/CS6910-Assignment3/blob/master/DL_Assignment3_Master.ipynb

# Paperspace blog
# https://blog.paperspace.com/seq-to-seq-attention-mechanism-keras/

# stacked lstm codes:
# https://github.com/sachinruk/PyData_Keras_Talk/blob/master/cosine_LSTM.ipynb

# seq2seq model tf
# https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt

# masking and padding
# https://www.tensorflow.org/guide/keras/masking_and_padding

# transformer
# https://www.tensorflow.org/text/tutorials/transformer

# one hot encoding
# https://www.tensorflow.org/api_docs/python/tf/one_hot

# masking (can use one hot encoding with masking)
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Masking

# masked loss function
# https://stackoverflow.com/questions/56328140/how-do-i-implement-a-masked-softmax-cross-entropy-loss-function-in-keras

# char seq2seq lstm
# https://keras.io/examples/nlp/lstm_seq2seq/
