<a href="https://colab.research.google.com/github/Dhananjay42/cs6910-assn3/blob/main/assn3_nb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CS6910: Assignment-3
-- Submitted by Dhananjay Balakrishnan, ME19B012

## Setup and Loading the Dataset

In [3]:
import csv
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [4]:
# Colab-only: mount Google Drive under /content/gdrive/ so files stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [5]:
# Folder on the mounted Drive; the split file names are appended to this prefix when loading.
data_dir = '/content/gdrive/MyDrive/CS6910_A3/aksharantar_sampled/tam/'

In [6]:
def obtain_data(dir):
  """Read a header-less two-column CSV file of word pairs.

  Args:
    dir: Path of the CSV file to read (a file path, despite the name).

  Returns:
    A tuple (x, y) of two lists of equal length: x holds the first column
    of every row and y the second, in file order. A row with only one
    column contributes None to y.
  """
  x = []
  y = []

  # Decode explicitly as UTF-8 so non-ASCII text does not depend on the
  # platform's default encoding; newline='' is what the csv module expects.
  with open(dir, 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file, fieldnames=['x', 'y'])

    for row in reader:
      x.append(row['x'])
      y.append(row['y'])

  return x, y

In [7]:
# Load the three splits; each x_* list holds the first CSV column and each y_* list the second.
x_train, y_train = obtain_data(data_dir + 'tam_train.csv')
x_test, y_test = obtain_data(data_dir + 'tam_test.csv')
x_val, y_val = obtain_data(data_dir + 'tam_valid.csv')

In [8]:
# Sanity check: print the first two training pairs.
for i in range(0, 2):
  print(x_train[i], y_train[i])

thottacharya தொட்டாச்சார்ய
menmaithaan மென்மைதான்


In [9]:
# Reserved vocabulary indices; they match the "SOS" (0) and "EOS" (1) entries of Language.index2char.
start_token = 0
end_token = 1

In [10]:
# NOTE(review): not referenced anywhere else in the visible notebook; candidate for removal.
eng_characters = {}

In [11]:
class Language:
  """Character-level vocabulary for one language.

  Indices 0, 1 and 2 are reserved for the start-of-sequence, end-of-sequence
  and unknown-character entries; every other character receives the next
  free index the first time update_vocab sees it.
  """

  def __init__(self, name):
    """Create an empty vocabulary labelled with `name`."""
    self.name = name
    self.char2index = {}
    self.index2char = {0: "SOS", 1: "EOS", 2: "unknown"}
    # Counts the three reserved entries (SOS, EOS, unknown).
    self.n_chars = len(self.index2char)
    # Length of the longest word seen so far.
    self.max_size = 0

  def update_vocab(self, x):
    """Register every character of every word in `x` and track the longest word."""
    for word in x:
      self.max_size = max(self.max_size, len(word))

      for letter in word:
        if letter in self.char2index:
          continue
        new_index = self.n_chars
        self.char2index[letter] = new_index
        self.index2char[new_index] = letter
        self.n_chars = new_index + 1

  def get_index(self, character):
    """Return the index of `character`, or 2 (unknown) when it is not in the vocabulary."""
    return self.char2index.get(character, 2)

  def get_character(self, index):
    """Return the character stored at `index`; the unknown index 2 is rendered as '$'."""
    return '$' if index == 2 else self.index2char[index]

In [12]:
# One vocabulary per side of the transliteration pairs.
english = Language('eng')
tamil = Language('tam')

In [13]:
# Build both vocabularies from the training split only; characters first seen in the
# validation/test words are therefore mapped to the unknown index (2) by get_index.
english.update_vocab(x_train)
tamil.update_vocab(y_train)

In [14]:
def encoded_word(language, word):
  """Map `word` to a list of character indices of `language`, terminated by end_token."""
  return [language.get_index(ch) for ch in word] + [end_token]

def get_pairs(lang1, lang2, inputs, targets):
  """Encode parallel word lists into a list of (input, target) tensor pairs.

  Each word is encoded with encoded_word and stored as a long tensor of
  shape (len(word) + 1, 1) on `device`; inputs use lang1, targets use lang2.
  """
  def as_column(language, word):
    # One index per row, so indexing the tensor yields one character at a time.
    ids = encoded_word(language, word)
    return torch.tensor(ids, dtype=torch.long, device=device).view(-1, 1)

  pairs = []
  for source, target in zip(inputs, targets):
    pairs.append((as_column(lang1, source), as_column(lang2, target)))
  return pairs

def decoded_word(language, encoded_word):
  """Turn a list of indices back into a string, ignoring the final entry.

  The last element of `encoded_word` is always dropped (it is where
  encoded_word places the end token); the rest are looked up with
  language.get_character and concatenated.
  """
  body = encoded_word[:-1]
  return ''.join(map(language.get_character, body))

In [15]:
# Encode every split as (input tensor, target tensor) pairs using the two vocabularies.
train_data = get_pairs(english, tamil, x_train, y_train)
test_data = get_pairs(english, tamil, x_test, y_test)
val_data = get_pairs(english, tamil, x_val, y_val)

# RNN without Attention

In [41]:
class Encoder(nn.Module):
  """Recurrent encoder that consumes one input character index per call.

  Args:
    inp_vocab_size: Number of entries in the embedding table.
    embedding_size: Dimension of each character embedding.
    n_layers: Number of stacked recurrent layers.
    hl_size: Hidden state size of the recurrent cell.
    cell_type: One of 'RNN', 'GRU' or 'LSTM'.
    bidirectional: Forwarded to the recurrent cell.

  Raises:
    ValueError: If cell_type is not one of the supported names.
  """

  def __init__(self, inp_vocab_size, embedding_size, n_layers, hl_size, cell_type = 'RNN', bidirectional = True):
    super(Encoder, self).__init__()
    self.vocab_size = inp_vocab_size
    self.embedding_size = embedding_size
    self.n_layers = n_layers
    self.hl_size = hl_size
    self.bidirectional = bidirectional
    self.cell_type = cell_type

    cell_classes = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}
    if cell_type not in cell_classes:
      # Raise instead of calling exit(), which would tear down the notebook kernel.
      raise ValueError('Wrong Cell Type. Expected one of ' + str(sorted(cell_classes)) + ', got ' + repr(cell_type))
    self.cell = cell_classes[cell_type](self.embedding_size, self.hl_size, num_layers = self.n_layers, bidirectional = self.bidirectional).to(device)
    self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_size).to(device)

  def forward(self, input, hidden, c = 0):
    """Run the cell for a single time step.

    Args:
      input: Tensor holding one character index; its embedding is reshaped
        to (1, 1, -1), i.e. sequence length 1 and batch size 1.
      hidden: Hidden state, shaped like the result of init_hidden().
      c: LSTM cell state, shaped like `hidden`. Only used when cell_type is
        'LSTM', in which case a tensor must be supplied.

    Returns:
      (output, hidden) for 'RNN'/'GRU', or (output, hidden, c) for 'LSTM'.
    """
    embedded = self.embedding_layer(input).view(1, 1, -1)
    if self.cell_type != 'LSTM':
      output, hidden = self.cell(embedded, hidden)
      return output, hidden
    else:
      output, (hidden, c) = self.cell(embedded, (hidden, c))
      return output, hidden, c

  def init_hidden(self):
    """Return an all-zero initial state of shape (n_layers * num_directions, 1, hl_size)."""
    # A bidirectional cell keeps one state per direction for every layer.
    num_directions = 2 if self.bidirectional else 1
    return torch.zeros(self.n_layers * num_directions, 1, self.hl_size, device = device)

In [42]:
class DecoderVanilla(nn.Module):
  """Recurrent decoder without attention that emits one character distribution per call.

  Args:
    out_vocab_size: Size of the output vocabulary (embedding rows and logits).
    embedding_size: Dimension of each character embedding.
    n_layers: Number of stacked recurrent layers.
    hl_size: Hidden state size of the recurrent cell.
    cell_type: One of 'RNN', 'GRU' or 'LSTM'.
    bidirectional: Forwarded to the recurrent cell.

  Raises:
    ValueError: If cell_type is not one of the supported names.
  """

  def __init__(self, out_vocab_size, embedding_size, n_layers, hl_size, cell_type = 'RNN', bidirectional = True):
    super(DecoderVanilla, self).__init__()
    self.vocab_size = out_vocab_size
    self.embedding_size = embedding_size
    self.n_layers = n_layers
    self.hl_size = hl_size
    # A bidirectional cell concatenates both directions, doubling the output width
    # that the projection layer receives.
    num_directions = 2 if bidirectional else 1
    self.linear = nn.Linear(self.hl_size * num_directions, self.vocab_size).to(device)
    self.softmax = nn.LogSoftmax(dim=1)
    self.cell_type = cell_type
    self.bidirectional = bidirectional

    cell_classes = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}
    if cell_type not in cell_classes:
      # Raise instead of calling exit(), which would tear down the notebook kernel.
      raise ValueError('Wrong Cell Type. Expected one of ' + str(sorted(cell_classes)) + ', got ' + repr(cell_type))
    self.cell = cell_classes[cell_type](self.embedding_size, self.hl_size, num_layers = self.n_layers, bidirectional = self.bidirectional).to(device)

    self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_size).to(device)

  def forward(self, input, hidden, c = 0):
    """Run the decoder for a single time step.

    Args:
      input: Tensor holding one character index; its embedding is reshaped
        to (1, 1, -1) and passed through ReLU before the recurrent cell.
      hidden: Hidden state for the recurrent cell.
      c: LSTM cell state. Only used when cell_type is 'LSTM', in which case
        a tensor must be supplied.

    Returns:
      (log_probs, hidden) for 'RNN'/'GRU', or (log_probs, hidden, c) for
      'LSTM', where log_probs is the log-softmax over the output vocabulary.
    """
    embedded = self.embedding_layer(input).view(1, 1, -1)
    output = F.relu(embedded)

    if self.cell_type != 'LSTM':
      output, hidden = self.cell(output, hidden)
      output = self.linear(output[0])
      output = self.softmax(output)
      return output, hidden
    else:
      output, (hidden, c) = self.cell(output, (hidden, c))
      output = self.linear(output[0])
      output = self.softmax(output)
      return output, hidden, c

In [52]:
class seq2seq_vanilla():
  """Encoder-decoder transliteration model without attention, trained one pair at a time.

  Args:
    inp_language: Vocabulary of the input side; its n_chars sizes the encoder.
    out_language: Vocabulary of the output side; its n_chars sizes the decoder
      and its max_size bounds the length of generated sequences.
    embedding_size: Embedding dimension used by both encoder and decoder.
    n_layers: Number of recurrent layers in both encoder and decoder.
    hl_size: Hidden state size of both encoder and decoder.
    cell_type: 'RNN', 'GRU' or 'LSTM'.
    lr: Learning rate of the two SGD optimizers.
    teacher_forcing_ratio: Probability that a training step feeds the ground
      truth (instead of the model's own prediction) to the decoder.
    bidirectional_flag: Forwarded to both encoder and decoder.
  """

  def __init__(self, inp_language, out_language, embedding_size, n_layers, hl_size, cell_type = 'LSTM', lr = 0.01, teacher_forcing_ratio = 0.5,bidirectional_flag = False):
    self.encoder = Encoder(inp_language.n_chars, embedding_size, n_layers, hl_size, cell_type, bidirectional = bidirectional_flag)
    self.decoder = DecoderVanilla(out_language.n_chars, embedding_size, n_layers, hl_size, cell_type, bidirectional = bidirectional_flag)
    self.lr = lr
    self.teacher_forcing = teacher_forcing_ratio
    self.max_length = out_language.max_size
    self.cell_type = cell_type
    # Kept so that evaluate() decodes with this model's own output vocabulary.
    self.out_language = out_language

    self.encoder_optimizer = optim.SGD(self.encoder.parameters(), lr=self.lr)
    self.decoder_optimizer = optim.SGD(self.decoder.parameters(), lr = self.lr)

    self.loss_fn = nn.NLLLoss()

  def _encode(self, input):
    """Feed `input` to the encoder one index at a time; return the final (hidden, c)."""
    hidden = self.encoder.init_hidden()
    # Only updated for LSTM cells; other cell types pass the zeros through unused.
    c = self.encoder.init_hidden()

    for i in range(0, input.size(0)):
      if self.cell_type != 'LSTM':
        _, hidden = self.encoder(input[i], hidden)
      else:
        _, hidden, c = self.encoder(input[i], hidden, c)

    return hidden, c

  def _decode_step(self, decoder_input, hidden, c):
    """Run one decoder step; return (log_probs, hidden, c) for every cell type."""
    if self.cell_type != 'LSTM':
      output, hidden = self.decoder(decoder_input, hidden)
    else:
      output, hidden, c = self.decoder(decoder_input, hidden, c)
    return output, hidden, c

  def train_step(self, input, target):
    """Run one optimisation step on a single (input, target) pair.

    Args:
      input: Long tensor of input character indices, one per row.
      target: Long tensor of target character indices, one per row.

    Returns:
      The summed NLL loss of the step divided by the target length.
    """
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()

    target_length = target.size(0)

    # The decoder starts from the encoder's final state.
    decoder_hidden, decoder_c = self._encode(input)
    decoder_input = torch.tensor([[start_token]], device=device)

    # Decided once per pair: either the whole sequence is teacher-forced or none of it.
    use_teacher_forcing = random.random() < self.teacher_forcing

    loss = 0
    for j in range(0, target_length):
      decoder_output, decoder_hidden, decoder_c = self._decode_step(decoder_input, decoder_hidden, decoder_c)
      loss = loss + self.loss_fn(decoder_output, target[j])

      if use_teacher_forcing:
        decoder_input = target[j]
      else:
        # Feed back the model's own best guess, detached so no gradient flows through it.
        value, index = decoder_output.topk(1)
        decoder_input = index.squeeze().detach()
        if decoder_input.item() == end_token:
          break

    loss.backward()
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()

    return loss.item()/target_length

  def predict(self, input, target = None):
    """Greedily decode `input` into a list of output character indices.

    Args:
      input: Long tensor of input character indices, one per row.
      target: Unused; accepted for backward compatibility with existing callers.

    Returns:
      List of predicted indices. It ends with end_token unless the length
      limit was reached first.
    """
    with torch.no_grad():
      decoder_hidden, decoder_c = self._encode(input)
      decoder_input = torch.tensor([[start_token]], device=device)

      outputs = []
      # Encoded targets carry a trailing end token, so a word of max_length
      # characters needs max_length + 1 steps to be reproduced in full.
      for i in range(0, self.max_length + 1):
        decoder_output, decoder_hidden, decoder_c = self._decode_step(decoder_input, decoder_hidden, decoder_c)

        value, index = decoder_output.topk(1)
        decoder_input = index.squeeze().detach()
        outputs.append(decoder_input.item())
        if decoder_input.item() == end_token:
          break

      return outputs

  def evaluate(self, data):
    """Return the exact-match accuracy of the model on `data`.

    Every 500th (prediction, target) pair is printed as decoded text.

    Args:
      data: Sequence of (input tensor, target tensor) pairs.

    Returns:
      Fraction of pairs whose predicted index sequence equals the target
      sequence; 0.0 when `data` is empty.
    """
    correct = 0
    count = 0

    for pair in data:
      input = pair[0]
      target = pair[1]
      pred = self.predict(input, target)
      target = [t[0] for t in target.tolist()]

      if count%500 == 0:
        print(decoded_word(self.out_language, pred), decoded_word(self.out_language, target))
      count = count + 1

      if pred == target:
        correct = correct + 1

    # Normalise by the evaluated set itself, not by an unrelated global split.
    if count == 0:
      return 0.0
    return correct/count

In [53]:
# cell_type, lr, teacher_forcing_ratio and bidirectional_flag keep their defaults
# ('LSTM', 0.01, 0.5 and False).
model = seq2seq_vanilla(inp_language = english, out_language = tamil, embedding_size = 64, n_layers = 2, hl_size = 128)

In [50]:
# Number of single-pair training steps to run.
n_iters = 75000

In [27]:
# Number of pairs in the validation split.
print(len(val_data))

4096


In [28]:
# Draw one training pair per iteration, uniformly at random with replacement.
# NOTE(review): no random seed is set in the visible notebook, so this sample changes between runs.
training_pairs = [random.choice(train_data) for i in range(0, n_iters)]

In [54]:
# Train on the pre-sampled pairs; every `report_every` steps print the mean
# training loss of that window and the exact-match accuracy on the validation split.
report_every = 5000
train_loss = 0
for i in range(0, n_iters):
  training_pair = training_pairs[i]
  x = training_pair[0]
  y = training_pair[1]
  loss = model.train_step(x, y)
  train_loss = train_loss + loss

  if (i+1)%report_every == 0:
    print('------------------------------------------------')
    print('train loss is:', train_loss/report_every)
    # The accuracy is measured on val_data, so label it as validation accuracy.
    val_acc = model.evaluate(val_data)
    print('validation accuracy is:', val_acc)
    train_loss = 0

------------------------------------------------
train loss is: 2.7181953461752504
புப்பிப்பப்பத்க் நாணயக்குற்றிகளும்
புப்பிப்புக் கௌரவத்தை
புப்பு போயிங்
புப்பு அடிகளை
புப்பிக்் பிஸினஸ்
புப்பிப்பி்்் கோட்பாடுகளில்
புப்புக் கோவலர்
பும் தாலி
ப்ப் வண
test accuracy is: 0.0
------------------------------------------------
train loss is: 2.489743823318045
பிர்்்்்்்்்்்் நாணயக்குற்றிகளும்
தாத்தத்த கௌரவத்தை
பிர்்் போயிங்
பார்ா் அடிகளை
பி்்்்்்் பிஸினஸ்
பார்கிக்கக்் கோட்பாடுகளில்
பா்்்் கோவலர்
கா்் தாலி
கின் வண
test accuracy is: 0.00048828125
------------------------------------------------
train loss is: 2.355724639434746
துருக்ககககககககக்் நாணயக்குற்றிகளும்
துுத்தத்ுு கௌரவத்தை
கி்்்்் போயிங்
கி்்கள அடிகளை
கி்்ப்ப்்் பிஸினஸ்
குர்கககககககக் கோட்பாடுகளில்
காக்கள் கோவலர்
கில் தாலி
கின் வண
test accuracy is: 0.0
------------------------------------------------
train loss is: 2.1930159980694723
மா்்ககககககககககும் நாணயக்குற்றிகளும்
குததததததா கௌரவத்தை
பிர்கிக் போயிங்
விட்கா அடிகளை
பிர்பிர்் பிஸினஸ்
கட்ப

In [None]:
# train_loss = 0
# for i in range(0, n_epochs):
#   for training_pair in train_data:
#     x = training_pair[0]
#     y = training_pair[1]
#     loss = model.train_step(x, y)
#     train_loss = train_loss + loss

#     if (i+1)%5000 == 0:
#       print('------------------------------------------------------------')
#       print('train loss is:', train_loss/5000)
#       test_acc = model.evaluate(training_pairs[0:1000])
#       print('test accuracy is:', test_acc)
#       train_loss = 0