In [2]:
from __future__ import unicode_literals, print_function, division

from io import open
import os
import random
import re
import string
import unicodedata

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import wandb
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
# Reserved vocabulary indices shared by every Lang instance.
SOS_token = 0  # start-of-sequence marker, the decoder's first input
EOS_token = 1  # end-of-sequence marker, appended to every encoded word
# Initial upper bound on sequence length; it sizes the encoder-output buffer
# and the attention layer.
MAX_LENGTH = 40

# Weights & Biases settings.
WANDB_NOTEBOOK_NAME = 'Assignment3'
WANDB_PROJECT_NAME = 'CS6910_A3'
WANDB_ENTITY = 'cs20b004'
# wandb looks the notebook name up in the process environment, not in Python
# variables, so export it before the first wandb call. setdefault keeps any
# value that was already configured outside the notebook.
os.environ.setdefault('WANDB_NOTEBOOK_NAME', WANDB_NOTEBOOK_NAME)
wandb.login()

class Lang:
    """Character-level vocabulary for one language.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers; every other
    letter receives the next free index in order of first appearance.
    """

    def __init__(self, name):
        self.name = name
        self.letter_index = {}                    # letter -> index
        self.letter_count = {}                    # letter -> times seen
        self.index_letter = {0: "SOS", 1: "EOS"}  # index -> letter
        self.n_letters = 2                        # SOS and EOS are pre-counted

    def add_word(self, word):
        """Register every letter of ``word`` in the vocabulary."""
        for ch in word:
            self.add_letter(ch)

    def add_letter(self, letter):
        """Count ``letter``, giving it a fresh index the first time it is seen."""
        if letter in self.letter_index:
            self.letter_count[letter] += 1
            return
        new_index = self.n_letters
        self.letter_index[letter] = new_index
        self.letter_count[letter] = 1
        self.index_letter[new_index] = letter
        self.n_letters = new_index + 1

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcs20b004[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
def read_words(output_lang, type = 'train', reverse = False):
    """Load the rows of one split of the sampled Aksharantar data.

    Reads ``aksharantar_sampled/<output_lang>/<output_lang>_<type>.csv``
    (UTF-8) and splits every line on commas.

    Args:
        output_lang: Language code used for both the directory and file prefix.
        type: Split name inserted into the file name (e.g. 'train', 'valid').
        reverse: When True, the fields of every row are returned in reverse order.

    Returns:
        A list holding one list of string fields per line of the file.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    path = 'aksharantar_sampled/%s/%s_%s.csv' % (output_lang, output_lang, type)
    # The context manager closes the handle as soon as the text is read;
    # a bare open(...).read() leaves that to the garbage collector.
    with open(path, encoding='utf-8') as csv_file:
        lines = csv_file.read().strip().split('\n')
    pairs = [line.split(',') for line in lines]
    if reverse:
        pairs = [pair[::-1] for pair in pairs]
    return pairs

def init_lang(lang1, lang2, type = 'train', reverse = False):
    """Load the word pairs for ``lang2`` and build both vocabularies.

    The vocabularies are filled from fixed alphabets rather than from the data:
    the input side gets 'a'..'z', the output side gets 128 consecutive code
    points starting at U+0900 (the Unicode Devanagari block).

    Args:
        lang1: Name given to the input vocabulary.
        lang2: Name given to the output vocabulary; also selects the data file.
        type: Split name forwarded to ``read_words``.
        reverse: Forwarded to ``read_words``.

    Returns:
        Tuple ``(input_lang, output_lang, pairs)``.
    """
    pairs = read_words(lang2, type, reverse)
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    # Input alphabet: the 26 lowercase Latin letters.
    for offset in range(26):
        input_lang.add_letter(chr(ord('a') + offset))

    # Output alphabet: U+0900 .. U+097F.
    for code_point in range(0x0900, 0x0900 + 128):
        output_lang.add_letter(chr(code_point))

    return input_lang, output_lang, pairs

In [5]:
# Grow MAX_LENGTH until it covers every training word plus its EOS marker.
# Source words count as well as targets: the encoder-output buffer has
# MAX_LENGTH rows and is indexed by source position, EOS included.
pairs = read_words('mar',type = 'train')
for p in pairs:
    MAX_LENGTH = max(MAX_LENGTH, len(p[0]) + 1, len(p[1]) + 1)
print("MAX_LENGTH: ", MAX_LENGTH)

MAX_LENGTH:  40


In [6]:
# def indexesFromSentence(lang, sentence):
#     return [lang.word2index[word] for word in sentence.split(' ')]
def word_to_index(lang, word):
    """Return the vocabulary index of every letter of ``word``, in order.

    Raises KeyError for a letter missing from ``lang.letter_index``.
    """
    indices = []
    for ch in word:
        indices.append(lang.letter_index[ch])
    return indices


# def tensorFromSentence(lang, sentence):
#     indexes = indexesFromSentence(lang, sentence)
#     indexes.append(EOS_token)
#     return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def word_to_tensor(lang, word):
    """Encode ``word`` as a column tensor of letter indices ending with EOS.

    Returns a ``torch.long`` tensor of shape (len(word) + 1, 1) on ``device``.
    """
    indexes = word_to_index(lang, word) + [EOS_token]
    flat = torch.tensor(indexes, dtype=torch.long, device=device)
    return flat.view(-1, 1)


# def tensorsFromPair(pair):
#     input_tensor = tensorFromSentence(input_lang, pair[0])
#     target_tensor = tensorFromSentence(output_lang, pair[1])
#     return (input_tensor, target_tensor)

def pair_to_tensor(input_lang,output_lang, pair):
    """Encode a (source, target) word pair as a tuple of index tensors.

    ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with
    ``output_lang``.
    """
    source_word, target_word = pair[0], pair[1]
    return (
        word_to_tensor(input_lang, source_word),
        word_to_tensor(output_lang, target_word),
    )


In [7]:
class EncoderRNN(nn.Module):
    """Recurrent encoder that consumes one input symbol per call.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        embedding_size: Width of each embedding vector.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        type: Recurrent cell to build: 'gru', 'lstm' or 'rnn'.
        nonlinearity: Activation of the 'rnn' cell ('tanh' or 'relu'); it has
            no effect on 'gru' and 'lstm'.

    Raises:
        ValueError: If ``type`` is not one of the supported cells.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, type = 'gru', nonlinearity = 'tanh'):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.type = type
        # The attribute name depends on the cell type, which keeps the
        # state_dict keys of previously saved models valid.
        if self.type == 'gru':
            self.gru = nn.GRU(embedding_size, hidden_size, num_layers = num_layers)
        elif self.type == 'lstm':
            self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers = num_layers)
        elif self.type == 'rnn':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers = num_layers, nonlinearity = nonlinearity)
        else:
            # Without this check forward() would return the bare embedding.
            raise ValueError("type must be 'gru', 'lstm' or 'rnn', got %r" % (type,))

    def forward(self, input, hidden, cell):
        """Encode a single symbol.

        Args:
            input: Tensor holding one symbol index.
            hidden: Hidden state of shape (num_layers, 1, hidden_size).
            cell: LSTM cell state of the same shape; returned untouched by the
                'gru' and 'rnn' cells.

        Returns:
            Tuple ``(output, hidden, cell)``; ``output`` has shape
            (1, 1, hidden_size).
        """
        # Treat the symbol as a sequence of length 1 with batch size 1.
        output = self.embedding(input).view(1, 1, -1)
        if self.type == 'gru':
            output, hidden = self.gru(output, hidden)
        elif self.type == 'lstm':
            output, (hidden, cell) = self.lstm(output, (hidden, cell))
        elif self.type == 'rnn':
            output, hidden = self.rnn(output, hidden)
        return output, hidden, cell

    def initHidden(self):
        """Return an all-zero hidden state on the same device as the weights."""
        return torch.zeros(self.num_layers, 1, self.hidden_size, device=self.embedding.weight.device)

    def init_cell(self):
        """Return an all-zero cell state (same shape as the hidden state)."""
        return self.initHidden()

In [8]:
class DecoderRNN(nn.Module):
    """Recurrent decoder without attention, producing one symbol per call.

    Args:
        hidden_size: Width of the recurrent hidden state.
        embedding_size: Width of each embedding vector.
        output_size: Size of the output vocabulary.
        num_layers: Number of stacked recurrent layers.
        nonlinearity: Activation of the 'rnn' cell ('tanh' or 'relu'); it has
            no effect on 'gru' and 'lstm'.
        dropout_p: Accepted for interface compatibility; this decoder builds
            no dropout layer.
        type: Recurrent cell to build: 'gru', 'lstm' or 'rnn'.

    Raises:
        ValueError: If ``type`` is not one of the supported cells.
    """

    def __init__(self, hidden_size, embedding_size, output_size, num_layers, nonlinearity = 'tanh', dropout_p = 0.1, type = 'gru'):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.num_layers = num_layers
        self.type = type
        # The attribute name depends on the cell type, which keeps the
        # state_dict keys of previously saved models valid.
        if self.type == 'gru':
            self.gru = nn.GRU(embedding_size, hidden_size, num_layers = num_layers)
        elif self.type == 'lstm':
            self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers = num_layers)
        elif self.type == 'rnn':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers = num_layers, nonlinearity = nonlinearity)
        else:
            # Without this check forward() would skip the recurrent layer.
            raise ValueError("type must be 'gru', 'lstm' or 'rnn', got %r" % (type,))

        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell):
        """Decode a single step.

        Args:
            input: Tensor holding the index of the previous output symbol.
            hidden: Hidden state of shape (num_layers, 1, hidden_size).
            cell: LSTM cell state of the same shape; returned untouched by the
                'gru' and 'rnn' cells.

        Returns:
            Tuple ``(output, hidden, cell)`` where ``output`` holds the
            log-probabilities over the output vocabulary, shape (1, output_size).
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        if self.type == 'gru':
            output, hidden = self.gru(output, hidden)
        elif self.type == 'lstm':
            output, (hidden, cell) = self.lstm(output, (hidden, cell))
        elif self.type == 'rnn':
            output, hidden = self.rnn(output, hidden)

        output = self.softmax(self.out(output[0]))
        return output, hidden, cell

    def initHidden(self):
        """Return an all-zero hidden state on the same device as the weights."""
        return torch.zeros(self.num_layers, 1, self.hidden_size, device=self.embedding.weight.device)

    def init_cell(self):
        """Return an all-zero cell state (same shape as the hidden state)."""
        return self.initHidden()

In [9]:
class AttnDecoderRNN(nn.Module):
    """Recurrent decoder with attention over a fixed-length encoder buffer.

    Args:
        hidden_size: Width of the recurrent hidden state.
        embedding_size: Width of each embedding vector.
        output_size: Size of the output vocabulary.
        num_layers: Number of stacked recurrent layers.
        nonlinearity: Activation of the 'rnn' cell ('tanh' or 'relu'); it has
            no effect on 'gru' and 'lstm'.
        dropout_p: Dropout probability applied to the embedded input.
        max_length: Number of attention slots; ``encoder_outputs`` passed to
            ``forward`` must have exactly this many rows.
        type: Recurrent cell to build: 'gru', 'lstm' or 'rnn'.

    Raises:
        ValueError: If ``type`` is not one of the supported cells.
    """

    def __init__(self, hidden_size, embedding_size, output_size, num_layers, nonlinearity = 'tanh', dropout_p=0.1, max_length=MAX_LENGTH, type = 'gru'):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.num_layers = num_layers
        self.embedding = nn.Embedding(self.output_size, self.embedding_size)
        # Scores one weight per encoder position from [embedding ; hidden].
        self.attn = nn.Linear(self.hidden_size + self.embedding_size, self.max_length)
        # Projects [embedding ; attended context] down to the recurrent input width.
        self.attn_combine = nn.Linear(self.hidden_size + self.embedding_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.type = type

        # The attribute name depends on the cell type, which keeps the
        # state_dict keys of previously saved models valid.
        if self.type == 'gru':
            self.gru = nn.GRU(hidden_size, hidden_size, num_layers = num_layers)
        elif self.type == 'lstm':
            self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers = num_layers)
        elif self.type == 'rnn':
            self.rnn = nn.RNN(hidden_size, hidden_size, num_layers = num_layers, nonlinearity = nonlinearity)
        else:
            # Without this check forward() would skip the recurrent layer.
            raise ValueError("type must be 'gru', 'lstm' or 'rnn', got %r" % (type,))

        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode a single step while attending over ``encoder_outputs``.

        Args:
            input: Tensor holding the index of the previous output symbol.
            hidden: Hidden state of shape (num_layers, 1, hidden_size).
            cell: LSTM cell state of the same shape; returned untouched by the
                'gru' and 'rnn' cells.
            encoder_outputs: Tensor of shape (max_length, hidden_size).

        Returns:
            Tuple ``(output, hidden, cell, attn_weights)``: log-probabilities of
            shape (1, output_size), the updated states, and the attention
            weights of shape (1, max_length).
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention is scored against the first layer's hidden state.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs: (1, 1, L) x (1, L, H) -> (1, 1, H).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        if self.type == 'gru':
            output, hidden = self.gru(output, hidden)
        elif self.type == 'lstm':
            output, (hidden, cell) = self.lstm(output, (hidden, cell))
        elif self.type == 'rnn':
            output, hidden = self.rnn(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, cell, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state on the same device as the weights."""
        return torch.zeros(self.num_layers, 1, self.hidden_size, device=self.embedding.weight.device)

    def init_cell(self):
        """Return an all-zero cell state (same shape as the hidden state)."""
        return self.initHidden()

In [10]:
class Transliterator():
    """Character-level encoder-decoder transliteration model.

    Wraps an ``EncoderRNN`` and either an ``AttnDecoderRNN`` or a plain
    ``DecoderRNN`` and provides training (``fit``), accuracy evaluation
    (``eval``) and greedy decoding (``predict``).

    Args:
        encoder_hp: Dict with keys 'input_size', 'embedding_size',
            'hidden_size', 'num_layers' and 'type'.
        decoder_hp: Dict with keys 'hidden_size', 'embedding_size',
            'output_size', 'num_layers' and 'type'.
        attn: Use the attention decoder when True.
    """

    def __init__(self, encoder_hp, decoder_hp, attn = True):
        self.encoder = EncoderRNN(
                        encoder_hp['input_size'],
                        encoder_hp['embedding_size'],
                        encoder_hp['hidden_size'],
                        encoder_hp['num_layers'],
                        type = encoder_hp['type']).to(device)
        self.attn = attn
        # Both decoders take the same leading arguments, so only the class differs.
        decoder_class = AttnDecoderRNN if attn else DecoderRNN
        self.decoder = decoder_class(
                        decoder_hp['hidden_size'],
                        decoder_hp['embedding_size'],
                        decoder_hp['output_size'],
                        decoder_hp['num_layers'],
                        type = decoder_hp['type']).to(device)

    def _encode(self, input_tensor, max_length):
        """Run the encoder over ``input_tensor`` and build the decoder's start state.

        Returns ``(encoder_outputs, decoder_hidden, decoder_cell)`` where
        ``encoder_outputs`` has ``max_length`` rows (zero past the input length).
        """
        encoder_hidden = self.encoder.initHidden()
        encoder_cell = self.encoder.init_cell()
        encoder_outputs = torch.zeros(max_length, self.encoder.hidden_size, device=device)

        for ei in range(input_tensor.size(0)):
            encoder_output, encoder_hidden, encoder_cell = self.encoder(input_tensor[ei], encoder_hidden, encoder_cell)
            encoder_outputs[ei] = encoder_output[0, 0]

        # Every decoder layer starts from the encoder's top-layer state, so the
        # encoder and decoder may have different depths.
        decoder_layers = self.decoder.num_layers
        decoder_hidden = torch.cat([encoder_hidden[-1].unsqueeze(0)] * decoder_layers, 0)
        decoder_cell = torch.cat([encoder_cell[-1].unsqueeze(0)] * decoder_layers, 0)
        return encoder_outputs, decoder_hidden, decoder_cell

    def _decode_step(self, decoder_input, decoder_hidden, decoder_cell, encoder_outputs):
        """Run one decoder step; the attention entry is None for the plain decoder."""
        if self.attn:
            return self.decoder(decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
        decoder_output, decoder_hidden, decoder_cell = self.decoder(decoder_input, decoder_hidden, decoder_cell)
        return decoder_output, decoder_hidden, decoder_cell, None

    def train(self, input_tensor, target_tensor, max_length=MAX_LENGTH):
        """Run one optimisation step on a single (input, target) pair.

        Teacher forcing is chosen per call with probability
        ``self.teacher_forcing_ratio``. Requires ``fit`` to have set up the
        optimizers, criterion and vocabularies.

        Returns:
            Tuple ``(loss per target symbol, greedily decoded word)``. The
            decoded word stops at the first predicted EOS.
        """
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        target_length = target_tensor.size(0)
        encoder_outputs, decoder_hidden, decoder_cell = self._encode(input_tensor, max_length)
        decoder_input = torch.tensor([[SOS_token]], device=device)

        loss = 0
        decoded_word = ''
        word_finished = False
        use_teacher_forcing = random.random() < self.teacher_forcing_ratio

        for di in range(target_length):
            # A single call site keeps the (hidden, cell) argument order
            # identical for the teacher-forced and free-running modes.
            decoder_output, decoder_hidden, decoder_cell, _ = self._decode_step(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
            topi = decoder_output.topk(1)[1]
            loss += self.criterion(decoder_output, target_tensor[di])

            predicted = topi.item()
            if predicted == EOS_token:
                if not use_teacher_forcing:
                    break  # free-running decoding ends at the model's own EOS
                # Teacher forcing keeps stepping for the loss, but the decoded
                # word ends here so it can be compared with the target.
                word_finished = True
            elif not word_finished:
                decoded_word += self.output_lang.index_letter[predicted]

            if use_teacher_forcing:
                decoder_input = target_tensor[di]
            else:
                decoder_input = topi.squeeze().detach()

        loss.backward()

        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        return loss.item() / target_length, decoded_word

    def fit( self, train_io_pair, val_io_pair, input_lang, output_lang, n_epochs,optimizer = 'adam', criterion = nn.NLLLoss(), print_every=1000, learning_rate=0.0001, teacher_forcing_ratio = 0.5, use_wanb = False):
        """Train the model and save both networks under ``Saved_models/``.

        Training stops early once at least three epochs have run and the
        validation accuracy or the training accuracy dropped in the last epoch.

        Args:
            train_io_pair: Sequence of (input word, target word) training pairs.
            val_io_pair: Sequence of (input word, target word) validation pairs.
            input_lang: Vocabulary used to encode the input words.
            output_lang: Vocabulary used to encode and decode the target words.
            n_epochs: Maximum number of passes over ``train_io_pair``.
            optimizer: 'adam' or 'sgd'.
            criterion: Loss applied to each decoder step.
            print_every: Stored on the instance; not used by the training loop.
            learning_rate: Learning rate of both optimizers.
            teacher_forcing_ratio: Probability of teacher forcing per sample.
            use_wanb: Log per-epoch metrics to Weights & Biases when True.

        Raises:
            ValueError: If ``optimizer`` is neither 'adam' nor 'sgd'.
        """
        self.input_lang = input_lang
        self.output_lang = output_lang

        if optimizer == 'adam':
            optimizer_class = optim.Adam
        elif optimizer == 'sgd':
            optimizer_class = optim.SGD
        else:
            # Fail here instead of with an AttributeError inside train().
            raise ValueError("optimizer must be 'adam' or 'sgd', got %r" % (optimizer,))
        self.encoder_optimizer = optimizer_class(self.encoder.parameters(), lr=learning_rate)
        self.decoder_optimizer = optimizer_class(self.decoder.parameters(), lr=learning_rate)

        self.criterion = criterion
        self.learning_rate = learning_rate
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.n_epochs = n_epochs
        self.print_every = print_every

        self.all_losses = []
        self.total_loss = 0

        training_pairs = [pair_to_tensor(self.input_lang, self.output_lang, pair) for pair in train_io_pair]
        train_accs = []
        val_accs = []
        for epc in range(1,n_epochs + 1):
            correct = 0
            for pair, (input_tensor, target_tensor) in zip(train_io_pair, training_pairs):
                loss, decoder_output = self.train(input_tensor, target_tensor)
                self.total_loss += loss
                self.all_losses.append(loss)
                if decoder_output == pair[1]:
                    correct += 1
            train_acc = self.eval(train_io_pair)
            train_accs.append(train_acc)
            val_acc = self.eval(val_io_pair)
            val_accs.append(val_acc)
            # Take the epoch mean before the accumulator is reset so the
            # printed and the logged loss agree.
            epoch_loss = self.total_loss / len(train_io_pair)
            print("Epoch: ", epc, " Loss: ", epoch_loss,
                  "Training Accuracy: ", correct/len(train_io_pair),
                  "Validation Accuracy: ", val_acc)
            self.total_loss = 0
            if(use_wanb):
                wandb.log({
                    "epoch" : epc,
                    "train_loss" : epoch_loss,
                    "Training Accuracy": train_acc,
                    "Validation Accuracy": val_acc
                    })
            if len(val_accs) > 2 and val_accs[-1] < val_accs[-2]:
                break
            if len(train_accs) > 2 and train_accs[-1] < train_accs[-2]:
                break

        # torch.save does not create missing directories.
        os.makedirs('Saved_models', exist_ok=True)
        torch.save(self.encoder.state_dict(), f'Saved_models/val_acc:{val_accs[-1]}encoder.pth')
        torch.save(self.decoder.state_dict(), f'Saved_models/val_acc:{val_accs[-1]}decoder.pth')

    def eval(self, io_pairs):
        """Return the fraction of pairs whose prediction equals the target exactly.

        An empty ``io_pairs`` yields 0.0.
        """
        if len(io_pairs) == 0:
            return 0.0
        correct = 0
        for pair in io_pairs:
            output, _ = self.predict(pair[0])
            if output == pair[1]:
                correct += 1
        return correct / len(io_pairs)

    def predict(self, input_word, max_length=MAX_LENGTH):
        """Greedily transliterate ``input_word``.

        The networks are switched to evaluation mode for the duration of the
        call, so the attention decoder's dropout is inactive, and their
        previous modes are restored afterwards.

        Returns:
            Tuple ``(decoded word, attention weights)``. The weights hold one
            row per decoding step and stay zero for the plain decoder.
        """
        encoder_was_training = self.encoder.training
        decoder_was_training = self.decoder.training
        self.encoder.eval()
        self.decoder.eval()
        try:
            with torch.no_grad():
                input_tensor = word_to_tensor(self.input_lang, input_word)
                encoder_outputs, decoder_hidden, decoder_cell = self._encode(input_tensor, max_length)

                decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
                decoded_word = ''
                decoder_attentions = torch.zeros(max_length, max_length)

                for di in range(max_length):
                    decoder_output, decoder_hidden, decoder_cell, decoder_attention = self._decode_step(
                        decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
                    if decoder_attention is not None:
                        decoder_attentions[di] = decoder_attention.data
                    topi = decoder_output.data.topk(1)[1]
                    if topi.item() == EOS_token:
                        break
                    decoded_word += self.output_lang.index_letter[topi.item()]
                    decoder_input = topi.squeeze().detach()

                return decoded_word, decoder_attentions[:di + 1]
        finally:
            # fit() calls predict between epochs, so training mode must come back.
            self.encoder.train(encoder_was_training)
            self.decoder.train(decoder_was_training)


In [11]:
# Build both vocabularies together with the training pairs, then load the
# validation split.
input_lang, output_lang, train_pairs = init_lang('eng', 'mar', type='train', reverse=False)
validation_pairs = read_words('mar', type='valid')

# GRU encoder feeding an LSTM attention decoder of the same width.
encoder_hp = dict(
    input_size=input_lang.n_letters,
    embedding_size=64,
    hidden_size=512,
    num_layers=1,
    type='gru',
)
decoder_hp = dict(
    hidden_size=512,
    embedding_size=64,
    output_size=output_lang.n_letters,
    num_layers=1,
    type='lstm',
)

model = Transliterator(encoder_hp=encoder_hp, decoder_hp=decoder_hp, attn=True)

# Quick run: a single epoch over the first 1000 training pairs.
model.fit(
    train_pairs[:1000],
    validation_pairs,
    input_lang,
    output_lang,
    optimizer='adam',
    n_epochs=1,
    learning_rate=0.0003,
    teacher_forcing_ratio=0.5,
)

Epoch:  1  Loss:  2.929136398452687 Training Accuracy:  0.0 Validation Accuracy:  0.000244140625


In [40]:
# Random-search sweep that maximises the logged "Validation Accuracy".
# Each entry below lists the candidate values of one hyperparameter.
sweep_config = {
    'method': 'random',  # 'grid' is the alternative
    'metric': {'name': 'Validation Accuracy', 'goal': 'maximize'},
    'parameters': {
        hp_name: {'values': hp_values}
        for hp_name, hp_values in [
            ('embedding_size', [64, 128]),
            ('hidden_size', [128, 256, 384, 512]),
            ('encoder_num_layers', [1, 2, 3]),
            ('encoder_type', ['gru', 'lstm']),
            ('decoder_num_layers', [1, 2, 3]),
            ('decoder_type', ['gru', 'lstm']),
            ('learning_rate', [0.0001, 0.0006, 0.001, 0.0003]),
            ('teacher_forcing_ratio', [0.5, 0.6, 0.7, 0.8, 0.9]),
            ('optimizer', ['adam', 'sgd']),
            ('epochs', [10]),
        ]
    },
}

sweep_id = wandb.sweep(sweep_config, project="CS6910_Assignment3")

def train():
    """Train and log one non-attention Transliterator for a single sweep trial.

    Hyperparameters come from ``wandb.config``; ``config_defaults`` supplies a
    value for every key so the function also works when a key is not swept.
    Reads the module-level ``input_lang``, ``output_lang``, ``train_pairs``
    and ``validation_pairs``.
    """
    config_defaults = {
        'embedding_size': 64,
        'hidden_size': 256,
        'encoder_num_layers': 1,
        'encoder_type': 'gru',
        'decoder_num_layers': 1,
        'decoder_type': 'gru',
        'learning_rate': 0.0001,
        'teacher_forcing_ratio': 0.5,
        'optimizer' : 'adam',
        'epochs' : 5
    }

    # Hand the defaults to wandb; values chosen by the sweep take precedence.
    run = wandb.init(config=config_defaults)
    config = wandb.config
    name_str = f"nle_{wandb.config['encoder_num_layers']}_nld_{wandb.config['decoder_num_layers']}_lr_{wandb.config['learning_rate']}_eu_{wandb.config['encoder_type']}_du_{wandb.config['decoder_type']}"
    wandb.run.name = name_str
    model = Transliterator(
        encoder_hp={
            'input_size': input_lang.n_letters,
            'embedding_size' : config.embedding_size,
            'hidden_size': config.hidden_size,
            'num_layers': config.encoder_num_layers,
            'type': config.encoder_type},
        decoder_hp={
            'hidden_size': config.hidden_size,
            'embedding_size': config.embedding_size,
            'output_size': output_lang.n_letters,
            'num_layers': config.decoder_num_layers,
            'type': config.decoder_type},
        attn = False)
    # Forward the swept optimizer; otherwise fit() always falls back to 'adam'
    # and that sweep dimension has no effect.
    model.fit(train_pairs, validation_pairs, input_lang, output_lang, n_epochs = config.epochs, optimizer = config.optimizer, learning_rate = config.learning_rate, teacher_forcing_ratio = config.teacher_forcing_ratio, use_wanb = True)
    run.finish()

# Run five trials of the sweep with train() as the trial function, then close
# any run that is still open.
wandb.agent(
    sweep_id,
    train,
    count=5,
    project="CS6910_Assignment3",
    entity="cs20b004",
)
wandb.finish()


Create sweep with ID: xx5umkfg
Sweep URL: https://wandb.ai/cs20b004/CS6910_Assignment3/sweeps/xx5umkfg


[34m[1mwandb[0m: Agent Starting Run: n8w8fde7 with config:
[34m[1mwandb[0m: 	decoder_num_layers: 1
[34m[1mwandb[0m: 	decoder_type: gru
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	encoder_num_layers: 1
[34m[1mwandb[0m: 	encoder_type: gru
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 384
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.8
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
