In [3]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# from torch.nn.utils.rnn import masked_cross_entropy
# import langdetect
# from langdetect import detect

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
!unzip /content/gdrive/MyDrive/aksharantar_sampled.zip > /dev/null

## READING DATA

In [6]:
train_data=pd.read_csv('aksharantar_sampled/hin/hin_train.csv',header=None)
X_train = list(train_data[0])
Y_train = list(train_data[1])

validation_data=pd.read_csv('aksharantar_sampled/hin/hin_valid.csv',header=None)
X_valid = list(validation_data[0])
Y_valid = list(validation_data[1])

test_data=pd.read_csv('aksharantar_sampled/hin/hin_test.csv',header=None)
X_test = list(test_data[0])
Y_test = list(test_data[1])

## DATA PROCESSING

In [7]:
# def get_one_hot(char_dict,ch,len_alphabets):
#   input_index = char_dict[ch]
#   one_hot_tensor = np.zeros(len_alphabets)
#   one_hot_tensor[input_index] = 1
#   return one_hot_tensor.tolist()

def get_one_hot(char_dict,ch,len_alphabets):
  return char_dict[ch]

# def data_processing(data,char_dict,len_chrs):
#   ONE_HOT = []
#   for word in data :
#     word = '$' + word + '*'
#     encoded_word = [get_one_hot(char_dict,i,len_chrs) for i in word]
#     encoded_word = torch.tensor(encoded_word)
#     ONE_HOT.append(encoded_word)
#   return ONE_HOT

MAX_SEQ_LEN = 28

def add_padding(word):
  while len(word) != MAX_SEQ_LEN:
    word += '_'
  return word

def data_processing(data,char_dict,len_chrs):
  ONE_HOT = []
  for word in data :
    word = add_padding(word)
    word = '$' + word + '*'
    encoded_word = [get_one_hot(char_dict,i,len_chrs) for i in word]
    encoded_word = torch.tensor(encoded_word)
    ONE_HOT.append(encoded_word)
  return torch.stack(ONE_HOT)

$ - Start Character \\
\* - End Character

In [8]:
tgt_chrs = '$'
for i in range(2304,2432):
  tgt_chrs += chr(i)
tgt_chrs += '*_'
tgt_char_dict = {char: i for i, char in enumerate(tgt_chrs)}

inp_chrs = '$abcdefghijklmnopqrstuvwxyz*_'
inp_char_dict = {char: i for i, char in enumerate(inp_chrs)}

In [9]:
# print(len(inp_chrs))
# print(len(tgt_chrs))
INP_LANG_SIZE = len(inp_chrs)
OUT_LANG_SIZE = len(tgt_chrs)

Below is the template to create your dataset for the langauage you want to train on

In [10]:
'''
start -> unicode of the first character in the language
end -> unicode of the last character in the language

tgt_chrs = '$'
for i in range(start,end):
  tgt_chrs += chr(i)
tgt_chrs += '*_'
tgt_char_dict = {char: i for i, char in enumerate(tgt_chrs)}

inp_chrs = '$abcdefghijklmnopqrstuvwxyz*_'
inp_char_dict = {char: i for i, char in enumerate(inp_chrs)}
'''

"\nstart -> unicode of the first character in the language\nend -> unicode of the last character in the language\n\ntgt_chrs = '$'\nfor i in range(start,end):\n  tgt_chrs += chr(i)\ntgt_chrs += '*_'\ntgt_char_dict = {char: i for i, char in enumerate(tgt_chrs)}\n\ninp_chrs = '$abcdefghijklmnopqrstuvwxyz*_'\ninp_char_dict = {char: i for i, char in enumerate(inp_chrs)}\n"

In [11]:
# These are lists of tensors corresponding to each word
inp_train = data_processing(X_train,inp_char_dict,len(inp_chrs))
tgt_train = data_processing(Y_train,tgt_char_dict,len(tgt_chrs))
inp_valid = data_processing(X_valid,inp_char_dict,len(inp_chrs))
tgt_valid = data_processing(Y_valid,tgt_char_dict,len(tgt_chrs))
inp_test = data_processing(X_test,inp_char_dict,len(inp_chrs))
tgt_test = data_processing(Y_test,tgt_char_dict,len(tgt_chrs))

In [12]:
print(inp_train.size())
print(tgt_train.size())

torch.Size([51200, 30])
torch.Size([51200, 30])


In [13]:
# print(max([len(inp_train[i]) for i in range(len(inp_train))]))
# print(max([len(inp_test[i]) for i in range(len(inp_test))]))
# print(max([len(inp_valid[i]) for i in range(len(inp_valid))]))
# print(max([len(tgt_train[i]) for i in range(len(tgt_train))]))
# print(max([len(tgt_test[i]) for i in range(len(tgt_test))]))
# print(max([len(tgt_valid[i]) for i in range(len(tgt_valid))]))

In [14]:
print(inp_train[:100].size())

torch.Size([100, 30])


## ENCODER

In [15]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_layer_size, num_encoder_layers, cell_type, dropout_prob, bidirectional):
      super(Encoder, self).__init__()
      '''
        self.input_size : int
        self.hidden_layer_size : int
        self.num_encoder_layers : int
        self.cell_type : string
      '''
      self.input_size = input_size
      self.hidden_layer_size = hidden_layer_size
      self.num_encoder_layers = num_encoder_layers
      self.cell_type = cell_type
      # self.cells = {'RNN':RNN,'LSTM':LSTM,'GRU':GRU}

      # I changed this
      # self.embedding = nn.Embedding(input_size, embedding_size)
      self.embedding = nn.Embedding(INP_LANG_SIZE, embedding_size)

      # if cell_type == 'RNN':
      #   self.rnn = nn.RNN(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, nonlinearity = 'relu', dropout = dropout_prob, bidirectional = bidirectional, batch_first = True)
      # elif cell_type == 'LSTM':
      #   self.rnn = nn.LSTM(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, dropout = dropout_prob, bidirectional = bidirectional, batch_first = True)
      # elif cell_type == 'GRU':
      #   self.rnn = nn.GRU(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, dropout = dropout_prob, bidirectional = bidirectional,batch_first = True)
      
      if cell_type == 'RNN':
        self.rnn = nn.RNN(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, nonlinearity = 'relu', dropout = dropout_prob, bidirectional = bidirectional)
      elif cell_type == 'LSTM':
        self.rnn = nn.LSTM(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, dropout = dropout_prob, bidirectional = bidirectional)
      elif cell_type == 'GRU':
        self.rnn = nn.GRU(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_encoder_layers, dropout = dropout_prob, bidirectional = bidirectional)
      
      self.dropout = nn.Dropout(dropout_prob)
      # self.batch_size = batch_size
      self.D = 1
      if bidirectional == True :
        self.D = 2

    def forward(self, input, prev_hidden):
      # print("input size is",input.size())

      # print(input.size())
      # print(prev_hidden.size())
      embedded = self.embedding(input.to(torch.long))
      embedded = self.dropout(embedded)

      # print(embedded.size())

      if self.cell_type == 'RNN':
        output, hidden = self.rnn(embedded,prev_hidden)
      elif self.cell_type == 'LSTM':
        cell = torch.zeros(self.num_encoder_layers,self.batch_size,self.hidden_layer_size)
        output, (hidden,cell) = self.rnn(embedded,(prev_hidden,cell))
      elif self.cell_type == 'GRU':
        output, hidden = self.rnn(embedded,prev_hidden)

      return output,hidden

    def init_hidden(self,batch_size):
      # return torch.zeros(1,1,self.hidden_layer_size)
      if self.cell_type == 'LSTM':
        return torch.zeros(self.num_encoder_layers,batch_size,self.hidden_layer_size)
      else :
        return torch.zeros(self.num_encoder_layers,batch_size,self.hidden_layer_size)


## DECODER

In [16]:
# class Decoder

class Decoder(nn.Module):
  def __init__(self, output_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional):
    super(Decoder, self).__init__()
    '''
      self.output_size : int
      self.hidden_layer_size : int
      self.num_encoder_layers : int
      self.cell_type : string
      self.rnn : RNN,LSTM,GRU 
    '''
    self.output_size = output_size
    self.hidden_layer_size = hidden_layer_size
    self.num_layers = num_layers
    self.cell_type = cell_type
    self.embedding_size = embedding_size
    # self.batch_size = batch_size
    # self.cells = {'RNN':RNN,'LSTM':LSTM,'GRU':GRU}
    # I changed this
    # self.embedding = nn.Embedding(output_size, embedding_size)
    self.embedding = nn.Embedding(OUT_LANG_SIZE, embedding_size)

    # if cell_type == 'RNN':
    #   self.rnn = nn.RNN(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, nonlinearity = 'relu', dropout = dropout_prob, bidirectional = bidirectional, batch_first = True)
    # elif cell_type == 'LSTM':
    #   self.rnn = nn.LSTM(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, dropout = dropout_prob, bidirectional = bidirectional, batch_first = True)
    # elif cell_type == 'GRU':
    #   self.rnn = nn.GRU(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, dropout = dropout_prob, bidirectional = bidirectional, batch_first = True)

    if cell_type == 'RNN':
      self.rnn = nn.RNN(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, nonlinearity = 'relu', dropout = dropout_prob, bidirectional = bidirectional)
    elif cell_type == 'LSTM':
      self.rnn = nn.LSTM(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, dropout = dropout_prob, bidirectional = bidirectional)
    elif cell_type == 'GRU':
      self.rnn = nn.GRU(input_size = embedding_size, hidden_size = hidden_layer_size, num_layers = num_layers, dropout = dropout_prob, bidirectional = bidirectional)

    self.dropout = nn.Dropout(dropout_prob)
    self.fc = nn.Linear(hidden_layer_size, output_size)

      
  def forward(self, input, prev_hidden):
    
    batch_size = input.size(0)
    # print(input)
    embedded = self.embedding(input.to(torch.int))
    embedded = self.dropout(embedded)
    embedded = embedded.view(1,batch_size,-1)
    # print(embedded.size())
    # embedded = self.embedding(input)
    
    if self.cell_type == 'RNN':
      output, hidden = self.rnn(embedded,prev_hidden)
    elif self.cell_type == 'LSTM':
      output, hidden = self.rnn(embedded,prev_hidden)
    elif self.cell_type == 'GRU':
      output, hidden = self.rnn(embedded,prev_hidden)

    y_pred = self.fc(output)
    return y_pred, hidden

  # def init_hidden(self,batch_size):
  #   # return torch.zeros(self.num_layers,self.hidden_layer_size)
  #   if self.cell_type == 'LSTM':
  #       return torch.zeros(self.num_encoder_layers,self.batch_size,self.hidden_layer_size)
  #   else :
  #     return torch.zeros(self.num_encoder_layers,self.batch_size,self.hidden_layer_size)


## SEQ2SEQ

In [17]:

class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, learning_rate, clip = 5.0, teacher_forcing_ratio = 0.5):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.teacher_forcing_ratio = teacher_forcing_ratio
    self.encoder_optimizer = optim.Adam(self.encoder.parameters(), lr = learning_rate)
    self.decoder_optimizer = optim.Adam(self.decoder.parameters(), lr = learning_rate)
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    self.criterion = nn.CrossEntropyLoss()
    self.clip = clip
    # self.batch_size = batch_size
      
  def forward(self, input, target):
    batch_size,max_len = input.size()[0],input.size()[1]
    input = input.permute(1,0)
    target = target.permute(1,0)
    # print(target.size())
    '''
    input : max_word_length,batch_size
    target : max_word_length,batch_size
    '''
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    # Encoder
    encoder_hidden = self.encoder.init_hidden(batch_size)
    encoder_output, encoder_hidden = self.encoder(input,encoder_hidden)
    '''
    encoder_hidden : num_layers,batch_size,embedding_size
    encoder_output : max_input_length,batch_size,embedding_size
    '''

    # Decoder
    decoder_input = target[0]
    decoder_hidden = encoder_hidden[0:self.decoder.num_layers]
    decoder_outputs = torch.zeros(max_len,batch_size,self.decoder.output_size)
    loss = 0

    # With Teacher Forcing
    for i in range(1,max_len):
      decoder_output,decoder_hidden = self.decoder(decoder_input,decoder_hidden)
      loss += self.criterion(decoder_output[0],target[i])
      decoder_outputs[i] = decoder_output
      decoder_input = target[i]

    # Without Teacher Forcing
    # for i in range(1,max_len):
    #   # print(decoder_input.size())
    #   decoder_output,decoder_hidden = self.decoder(decoder_input,decoder_hidden)
    #   loss += self.criterion(decoder_output[0],target[i])
    #   decoder_outputs[i] = decoder_output
    #   decoder_input = torch.argmax(decoder_output[0],1)


    # encoder_outputs = ()
    # for i in range(input_length):
    #   encoder_output, encoder_hidden = self.encoder(input[i],encoder_hidden)
    #   encoder_outputs = encoder_outputs + (encoder_output,)
    # encoder_outputs = torch.stack(encoder_outputs)


    # decoder_input = target[0]
    # decoder_hidden = encoder_hidden
    # loss = 0
    # for i in range(1,target_length):
    #   decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
    #   loss += self.criterion(decoder_output, target[i].to(torch.long))
    #   decoder_input = target[i]
    
    # print(decoder_outputs.size())
    # loss = self.criterion()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), self.clip)
    torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), self.clip)
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()
    return loss

  def predict(input):

    pass

In [18]:
x = tgt_train[0:128]
print(x.permute(1,0).size())

torch.Size([30, 128])


## Testing

In [19]:
# Testing

# clip = 4.0
# batch_size = 128

# encodertest = Encoder(30,32,10,5,'RNN',0.1,False,batch_size = batch_size)
# decodertest = Decoder(30,32,10,5,'RNN',0.1,False,batch_size = batch_size)
# Model = Seq2Seq(Encoder,Decoder)
# # print(encodertest)

# criterion_ = nn.CrossEntropyLoss()

# # input = inp_train[0]
# # target = tgt_train[0]

# # input_length = input.size()[0]
# # target_length = target.size()[0]

# encoder_hidden = encodertest.init_hidden()

# # Initial Implementation
# # encoder_outputs = ()
# # for i in range(0,len(inp_train),batch_size):
# #   curr_batch = min(len(inp_train)-i,batch_size)
# #   encoder_output, encoder_hidden = encodertest(input[i:i+curr_batch],encoder_hidden)
# #   print(encoder_output.size())
# #   encoder_outputs = encoder_outputs + (encoder_output,)
# # encoder_outputs = torch.stack(encoder_outputs)

# encoder_output, encoder_hidden = encodertest(inp_train[0:batch_size],encoder_hidden)
# encoder_hidden = encoder_hidden.permute(1,0,2)
# # print(encoder_hidden[encoder_hidden.size()[0]-1])


# decoder_input = tgt_train[0]
# decoder_hidden = encoder_hidden[encoder_hidden.size()[0]-1]
# loss = 0
# decoder_outputs = ()
# for i in range(1,len(tgt_train)):
#   decoder_output, decoder_hidden = decodertest(decoder_input, decoder_hidden)
#   loss += criterion_(decoder_output, tgt_train[i].to(torch.long))
#   decoder_input = tgt_train[i]
# loss.backward()
# torch.nn.utils.clip_grad_norm_(encodertest.parameters(), clip)
# torch.nn.utils.clip_grad_norm_(decodertest.parameters(), clip)
# print(loss)



# encoder_optimizer.step()
# decoder_optimizer.step()



## Training

In [21]:
def train_once(model,batch_size):
  # Trains once on the whole dataset
  training_loss = 0
  # for i in range(len(inp_train)):
    # if i % 10 == 0:
    #   print(i)
  train_length = inp_train.size(0)
  for i in range(0,train_length,batch_size):
    curr_batch = min(train_length-i,batch_size)
    input_tensor = inp_train[i:i+curr_batch]
    target_tensor = tgt_train[i:i+curr_batch]
    loss = model(input_tensor,target_tensor)
    training_loss += loss
  return training_loss

def train(model,epochs,batch_size):
  for epoch in range(epochs):
    loss = train_once(model,batch_size)
    print("Epoch : %d, Training Loss : %f" % (epoch+1,loss))



## Run

In [None]:
# define hyperparameters
input_size = INP_LANG_SIZE
embedding_size = 64
output_size = OUT_LANG_SIZE
hidden_layer_size = 64
num_layers = 4
cell_type = 'RNN'
dropout_prob = 0.1
learning_rate = 0.01
bidirectional = False
batch_size = 256
epochs = 5
encoder = Encoder(input_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional)
decoder = Decoder(output_size, embedding_size, hidden_layer_size, num_layers, cell_type, dropout_prob, bidirectional)
model = Seq2Seq(encoder,decoder,learning_rate)
train(model,epochs,batch_size)
# optimizer = optim.Adam(model.parameters(),lr = learning_rate)
# criterion = nn.CrossEntropyLoss()


Epoch : 1, Training Loss : 6821.577148


In [None]:
sweep_config = {
    # "name" : "assignment_sweeps",
    "method" : "bayes",
    "metric" :{
        "name" : "validation_accuracy",
        "goal" : "maximize"
    },
    "parameters" : {
        "embedding_size" : {
            "values" : [16,32,64,128,256]
        },
        "num_encoder_layers" : {
            "values" : [1,2,3,4]
        },
        "num_decoder_layers" : {
            "values" : [1,2,3,4]
        },
        "hidden_layer_size" : {
            "values" : [16,32,64,128,256]
        },
        "learning_rate" : {
            "values" : [0.001,0.0001]
        },
        "cell_type" : {
           "values" : ['RNN', 'LSTM', 'GRU'] 
        },
        "dropout" : {
            "values" : [0.1,0.2,0.3]
        },
        "weight_initializer" : {
            "values" : ["random","xavier"]
        },
    }
}

