In [274]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import wandb

In [275]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [276]:
# !unzip /content/gdrive/MyDrive/aksharantar_sampled.zip > /dev/null

In [277]:
USE_CUDA = False

In [278]:
OUTPUT_LANGUAGE = 'mni'
MAX_WORD_LENGTH = 25

## READING DATA

In [279]:
'''
Below train, validation and test data are list of strings
'''
train_data=pd.read_csv('aksharantar_sampled/'+OUTPUT_LANGUAGE+'/'+OUTPUT_LANGUAGE+'_train.csv',header=None)
X_train = list(train_data[0])
Y_train = list(train_data[1])

validation_data=pd.read_csv('aksharantar_sampled/'+OUTPUT_LANGUAGE+'/'+OUTPUT_LANGUAGE+'_valid.csv',header=None)
X_valid = list(validation_data[0])
Y_valid = list(validation_data[1])

test_data=pd.read_csv('aksharantar_sampled/'+OUTPUT_LANGUAGE+'/'+OUTPUT_LANGUAGE+'_test.csv',header=None)
X_test = list(test_data[0])
Y_test = list(test_data[1])

In [280]:
SOS_token = 0
EOS_token = 1
UNK_TOKEN = 2
# ND_TOKEN = 
class Lang:
    def __init__(self, name):
        self.name = name
        self.chr2index = {}
        self.chr2count = {}
        # self.index2chr = {0: "$", 1: "*", 2: "_"}
        self.index2chr = {0: "$", 1: "*"}
        self.n_chrs = 2  # 3 if Count SOS and EOS and UNK

    def addword(self, word):
        for letter in word:
            self.addchr(letter)

    def addchr(self, chr_):
        if chr_ not in self.chr2index:
            self.chr2index[chr_] = self.n_chrs
            self.chr2count[chr_] = 1
            self.index2chr[self.n_chrs] = chr_
            self.n_chrs += 1
        else:
            self.chr2count[chr_] += 1
            
    def add_word_list(self,word_lst):
        for word in word_lst:
            self.addword(word)

In [281]:
inp_lang = Lang("eng")
inp_lang.add_word_list(X_train)

out_lang = Lang(OUTPUT_LANGUAGE)
out_lang.add_word_list(Y_train)


## DATA PROCESSING

In [282]:

def get_one_hot(char_dict,ch,len_alphabets):
  return char_dict[ch]

def data_processing(data,char_dict,len_chrs):
  ONE_HOT = []
  for word in data :
    encoded_word = [get_one_hot(char_dict,i,len_chrs) for i in word]
    encoded_word.append(EOS_token)
    encoded_word = torch.LongTensor(encoded_word)
    ONE_HOT.append(encoded_word)
  return ONE_HOT

$ - Start Character \\
\* - End Character

In [283]:
INP_LANG_SIZE = inp_lang.n_chrs
OUT_LANG_SIZE = out_lang.n_chrs
print(INP_LANG_SIZE)
print(OUT_LANG_SIZE)

28
46


In [284]:
# # These are lists of tensors corresponding to each word
inp_train = data_processing(X_train,inp_lang.chr2index,len(inp_lang.chr2index))
tgt_train = data_processing(Y_train,out_lang.chr2index,len(out_lang.chr2index))
inp_valid = data_processing(X_valid,inp_lang.chr2index,len(inp_lang.chr2index))
tgt_valid = data_processing(Y_valid,out_lang.chr2index,len(out_lang.chr2index))
inp_test = data_processing(X_test,inp_lang.chr2index,len(inp_lang.chr2index))
tgt_test = data_processing(Y_test,out_lang.chr2index,len(out_lang.chr2index))

In [285]:
# print(max([len(X_train[i]) for i in range(len(X_train))]))
# print(max([len(X_test[i]) for i in range(len(X_test))]))
# print(max([len(X_valid[i]) for i in range(len(X_valid))]))
# print(max([len(Y_train[i]) for i in range(len(Y_train))]))
# print(max([len(Y_test[i]) for i in range(len(Y_test))]))
# print(max([len(Y_valid[i]) for i in range(len(Y_valid))]))

## ENCODER

In [286]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_layer_size, num_encoder_layers, cell_type, dropout_prob, bidirectional):
      super(Encoder, self).__init__()
      '''
        self.input_size : int
        self.hidden_layer_size : int
        self.num_encoder_layers : int
        self.cell_type : string
      '''
      self.input_size = input_size
      self.hidden_layer_size = hidden_layer_size
      self.num_encoder_layers = num_encoder_layers
      self.cell_type = cell_type

      self.embedding = nn.Embedding(self.input_size, embedding_size)

      if cell_type == 'RNN':
        self.rnn = nn.RNN(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, dropout = dropout_prob, bidirectional = bidirectional)
      elif cell_type == 'LSTM':
        self.rnn = nn.LSTM(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, dropout = dropout_prob, bidirectional = bidirectional)
      elif cell_type == 'GRU':
        self.rnn = nn.GRU(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, dropout = dropout_prob, bidirectional = bidirectional)
      self.dropout = nn.Dropout(dropout_prob)
      self.D = 1
      if bidirectional == True :
        self.D = 2

    def forward(self, input, prev_hidden, prev_cell = None):
      # seq_len = len(input)
      embedded = self.embedding(input)
      embedded = embedded.view(1,1,-1)
      embedded = self.dropout(embedded)
      if self.cell_type == 'RNN':
        output, hidden = self.rnn(embedded,prev_hidden)
        
      elif self.cell_type == 'LSTM':
        output, (hidden,cell) = self.rnn(embedded,(prev_hidden,prev_cell))
        return output, (hidden,cell)
      
      elif self.cell_type == 'GRU':
        output, hidden = self.rnn(embedded,prev_hidden)
        
      return output,hidden

    def init_hidden(self):
      
      if self.cell_type == 'LSTM':
        hidden = torch.zeros(self.D*self.num_encoder_layers,1,self.hidden_layer_size)
        cell = torch.zeros(self.D*self.num_encoder_layers,1,self.hidden_layer_size)
        return hidden,cell
      else :
        hidden = torch.zeros(self.D*self.num_encoder_layers,1,self.hidden_layer_size)
      return hidden


## DECODER

In [287]:
# class Decoder

class Decoder(nn.Module):
  def __init__(self, output_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional):
    super(Decoder, self).__init__()
    '''
      self.output_size : int
      self.hidden_layer_size : int
      self.num_encoder_layers : int
      self.cell_type : string
      self.rnn : RNN,LSTM,GRU 
    '''
    self.output_size = output_size
    self.hidden_layer_size = hidden_layer_size
    self.num_layers = num_layers
    self.cell_type = cell_type
    self.embedding_size = embedding_size
    self.embedding = nn.Embedding(OUT_LANG_SIZE, embedding_size)
    self.D = 1
    if bidirectional == True :
      self.D = 2
        
    if cell_type == 'RNN':
      self.rnn = nn.RNN(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, dropout = dropout_prob, bidirectional = bidirectional)
    elif cell_type == 'LSTM':
      self.rnn = nn.LSTM(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, dropout = dropout_prob, bidirectional = bidirectional)
    elif cell_type == 'GRU':
      self.rnn = nn.GRU(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, dropout = dropout_prob, bidirectional = bidirectional)

    self.dropout = nn.Dropout(dropout_prob)
    self.fc = nn.Linear(self.D*hidden_layer_size, output_size)
    self.softmax = nn.LogSoftmax(dim = 1)
      
  def forward(self, input, prev_hidden, prev_cell = None):
    embedded = self.embedding(input)
    embedded = embedded.view(1,1,-1)
    embedded = F.relu(embedded)
    embedded = self.dropout(embedded)
    
    if self.cell_type == 'RNN':
      output, hidden = self.rnn(embedded,prev_hidden)
      
    elif self.cell_type == 'LSTM':
      output,(hidden,cell) = self.rnn(embedded,(prev_hidden,prev_cell))
    
    elif self.cell_type == 'GRU':
      output, hidden = self.rnn(embedded,prev_hidden)
    output = self.fc(output[0])
    y_pred = self.softmax(output)
    # print(y_pred.size())
    if self.cell_type == 'LSTM':
      return y_pred,(hidden,cell)
    
    return y_pred, hidden

## SEQ2SEQ

In [288]:


class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, learning_rate, clip = 5.0, teacher_forcing_ratio = 0.5):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.teacher_forcing_ratio = teacher_forcing_ratio
    self.encoder_optimizer = optim.Adam(self.encoder.parameters(), lr = learning_rate)
    self.decoder_optimizer = optim.Adam(self.decoder.parameters(), lr = learning_rate)
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    self.criterion = nn.NLLLoss()
    self.clip = clip
    self.teacher_forcing_ratio = teacher_forcing_ratio
    # self.batch_size = batch_size
      
  def forward(self, input, target):
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    inp_len = input.size()[0]
    tgt_len = target.size()[0]
    loss = 0
    '''
    input : seq_length_inp
    target : seq_length_tgt
    '''
    # Encoder
    encoder_outputs = torch.zeros(MAX_WORD_LENGTH,self.encoder.D*self.encoder.hidden_layer_size)
    if self.encoder.cell_type == 'LSTM':
      encoder_hidden,encoder_cell = self.encoder.init_hidden()
    else :
      encoder_hidden = self.encoder.init_hidden()
    
    for ei in range(inp_len):
      if self.encoder.cell_type == 'LSTM':
        encoder_output,(encoder_hidden,encoder_cell) = self.encoder(input = input[i],prev_hidden = encoder_hidden,prev_cell = encoder_cell)
      else :
        encoder_output, encoder_hidden = self.encoder(input[ei],encoder_hidden)
      encoder_outputs[ei] = encoder_output[0][0]

    # Decoder
    decoder_input = torch.LongTensor([[SOS_token]])
    decoder_hidden = encoder_hidden
    
    if self.decoder.cell_type == 'LSTM':
      decoder_cell = encoder_cell
      
    # With Teacher Forcing
    use_teacher_forcing = random.random() < self.teacher_forcing_ratio
    use_teacher_forcing = True
    if use_teacher_forcing:
      for i in range(tgt_len):
        # print(decoder_input.size(),decoder_hidden.size())
        if self.decoder.cell_type == 'LSTM':
          decoder_output,(decoder_hidden,decoder_cell) = self.decoder(input = decoder_input,prev_hidden = decoder_hidden,prev_cell = decoder_cell)
        else:
          decoder_output,decoder_hidden = self.decoder(decoder_input,decoder_hidden)
        loss += self.criterion(decoder_output.view(-1),target[i])
        decoder_input = target[i]
    else :
      for i in range(tgt_len):
        # print(decoder_input.size(),decoder_hidden.size())
        if self.decoder.cell_type == 'LSTM':
          decoder_output,(decoder_hidden,decoder_cell) = self.decoder(input = decoder_input,prev_hidden = decoder_hidden,prev_cell = decoder_cell)
        else:
          decoder_output,decoder_hidden = self.decoder(decoder_input,decoder_hidden)
        loss += self.criterion(decoder_output.view(-1),target[i])
        topv,topi = decoder_output.view(-1).topk(1)
        ni = topi[0]
        decoder_input = torch.LongTensor([[ni]])
        if ni == EOS_token : 
          break
    loss.backward()
    # torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), self.clip)
    # torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), self.clip)
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()
    return loss/tgt_len

  def predict(self,input):
    with torch.no_grad():
      inp_len = input.size()[0]
      
      # Encoder
      encoder_outputs = torch.zeros(MAX_WORD_LENGTH,self.encoder.D*self.encoder.hidden_layer_size)
      if self.encoder.cell_type == 'LSTM':
        encoder_hidden,encoder_cell = self.encoder.init_hidden()
      else :
        encoder_hidden = self.encoder.init_hidden()
        
      for ei in range(inp_len):
        if self.encoder.cell_type == 'LSTM':
          encoder_output,(encoder_hidden,encoder_cell) = self.encoder(input = input[i],prev_hidden = encoder_hidden,prev_cell = encoder_cell)
        else :
          encoder_output, encoder_hidden = self.encoder(input[ei],encoder_hidden)
        encoder_outputs[ei] += encoder_output[0,0]
        
      decoder_input = torch.LongTensor([[SOS_token]])
      decoder_hidden = encoder_hidden
      if self.decoder.cell_type == 'LSTM':
        decoder_cell = encoder_cell
      
      decoded_words = []
      # decoder_outputs = []
      pred_word = ''
      for i in range(MAX_WORD_LENGTH):
        decoder_output,decoder_hidden = self.decoder(decoder_input,decoder_hidden)
        topv,topi = decoder_output[0].topk(1)
        ni = topi[0].item()
        decoder_input = torch.LongTensor([[ni]])
        decoded_words.append(ni)
        pred_word += out_lang.index2chr[ni]
        if ni == EOS_token : 
          break
        
      return (torch.LongTensor(decoded_words),pred_word)


In [289]:
x = torch.zeros(5,3)
soft = nn.LogSoftmax(dim=1)
o = soft(x)
print(o)

tensor([[-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986],
        [-1.0986, -1.0986, -1.0986]])


## Train

In [290]:

def train_once(model):
  # Trains once on the whole dataset
  training_loss = 0
  train_length = len(inp_train)
  for i in range(train_length):
    input_tensor = inp_train[i]
    target_tensor = tgt_train[i]
    loss = model(input_tensor,target_tensor)
    training_loss += loss
    if i%200 == 0:
      print(i)  
  return training_loss

def accuracy_score(pred,actual):
  # pred,actual : list of strings/words
  return np.sum(np.array(pred) == np.array(actual))/len(pred)

def eval_acc_loss(model):
  train_pred_ = [model.predict(i) for i in inp_train]
  valid_pred_ = [model.predict(i) for i in inp_valid]
  
  train_pred = list(map(list, zip(*train_pred_)))[1]
  valid_pred = list(map(list, zip(*valid_pred_)))[1]
  
  train_accuracy = accuracy_score(train_pred,Y_train)
  valid_accuracy = accuracy_score(valid_pred,Y_valid)
  return train_accuracy,valid_accuracy

def train(model,epochs):
  for curr_epoch in range(epochs):
    train_loss = train_once(model)
    # train_accuracy,valid_accuracy,train_loss_,valid_loss = eval_acc_loss(model)
    # print("Epoch : %d, Training Loss : %f, Training Accuracy : %f" % (curr_epoch+1,train_loss,train_accuracy))
    print("Epoch : %d, Training Loss : %f" % (curr_epoch+1,train_loss))
    
    # wandb.log({ "training_accuracy" : train_accuracy,
    #           "validation_accuracy" : valid_accuracy,
    #           "training_loss" : train_loss,
    #           "validation_loss" : valid_loss,
    #           "epoch" : curr_epoch+1})


## Run

In [291]:
# define hyperparameters
input_size = inp_lang.n_chrs
embedding_size = 128
output_size = out_lang.n_chrs
hidden_layer_size = 32
num_layers = 2
cell_type = 'RNN'
dropout_prob = 0.1
learning_rate = 0.001
bidirectional = True
epochs = 10
encoder = Encoder(input_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional)
decoder = Decoder(output_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional)
model = Seq2Seq(encoder,decoder,learning_rate)
train(model,epochs)
# print(inp_train[3])
# print(model.predict(inp_train[3]))



0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400


In [84]:
x = eval_acc_loss(model)
print(x)

(0.0, 0.0)


In [170]:
print(inp_train[3])
print(model.predict(inp_train[3]))


tensor([10,  3,  8, 10, 11, 12, 10, 15,  2, 16,  8,  7,  8,  1])
tensor([]) tensor([], dtype=torch.int64)


IndexError: index 0 is out of bounds for dimension 0 with size 0

In [165]:
xr = model.predict(inp_train[1])
print(xr)

IndexError: index 0 is out of bounds for dimension 0 with size 0

In [73]:
def train_model():
    config_defaults = {
        "embedding_size" : ,
        "hidden_layer_size" : ,
        "num_layers" : ,
        "cell_type" : ,
        "dropout_prob" : ,
        "learning_rate" : ,
        "bidirectional" : ,
    }
    
    wandb.init(config=config_defaults)
    config = wandb.config
    input_size = inp_lang.n_chrs
    embedding_size = config.embedding_size
    output_size = out_lang.n_chrs
    hidden_layer_size = config.hidden_layer_size
    num_layers = config.num_layers
    cell_type = config.cell_type
    dropout_prob = config.dropout_prob
    learning_rate = config.learning_rate
    bidirectional = config.birectional
    epochs = config.epochs
    encoder = Encoder(input_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional)
    decoder = Decoder(output_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional)
    model = Seq2Seq(encoder,decoder,learning_rate)
    train(model,epochs)
    run_name = "es_{}_hl_{}_nl_{}_ct_{}_dp_{}_lr_{}_bi_{}_ep_{}".format(embedding_size,hidden_layer_size,num_layers,cell_type,dropout_prob,learning_rate,bidirectional,epochs)

    
    run_name = "es_{}_hl_{}_nl_{}_ct_{}_dp_{}_lr_{}_bi_{}_ep_{}".format(embedding_size,hidden_layer_size,num_layers,cell_type,dropout_prob,learning_rate,bidirectional,epochs)
    wandb.run.name = run_name
    wandb.run.save()
    
    
    
    

[tensor([1]),
 '*',
 tensor([[[[3.7510e-02, 4.4114e+04, 2.1766e+04, 1.5324e+04, 8.1817e+03,
            1.7772e+04, 3.0762e+04, 2.3672e+04, 3.0959e+04, 2.2608e+04,
            2.0143e+04, 1.9218e+04, 9.8513e+03, 1.8196e+04, 1.7789e+04,
            1.7351e+04, 2.0242e+04, 2.0066e+04, 1.1267e+04, 1.7333e+04,
            1.9491e+04, 1.2423e+04, 2.3236e+04, 2.0014e+04, 9.6049e+03,
            2.1037e+04, 1.4871e+04, 6.7044e+03, 1.8983e+04, 1.6248e+04,
            1.3851e+04, 8.8384e+03, 1.7105e+04, 1.0929e+04, 9.3486e+03,
            1.4532e+04, 1.1596e+04, 1.5291e+04, 1.3010e+04, 1.5571e+04,
            1.1392e+04, 5.1868e+03, 8.4933e+03, 1.1900e+03, 2.1115e+03,
            1.7188e+03]]]], grad_fn=<StackBackward0>)]

In [None]:
sweep_config = {
    # "name" : "assignment_sweeps",
    "method" : "bayes",
    "metric" :{
        "name" : "validation_accuracy",
        "goal" : "maximize"
    },
    "parameters" : {
        "embedding_size" : {
            "values" : [16,32,64,128]
        },
        "num_layers" : {
            "values" : [1,2,3,4]
        },
        "hidden_layer_size" : {
            "values" : [32,64,128]
        },
        "learning_rate" : {
            "values" : [0.001,0.0001]
        },
        "cell_type" : {
           "values" : ['RNN', 'LSTM', 'GRU'] 
        },
        "dropout" : {
            "values" : [0.1,0.2]
        },
        "bidirectional ": {
            "values" : [True,False]
        }
    }
}



In [None]:
sweep_id = wandb.sweep(sweep_config,project="CS6910_Assignment_2")
wandb.agent(sweep_id = sweep_id,function = train_model,count = 25)