## Assignment 3 - CS20B021 - CS6910

### Importing Libraries

In [11]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os
import time
import math
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import wandb
# Log in to Weights & Biases before any run or sweep is created further down.
wandb.login()

True

### Global Variables and Helper Functions

In [12]:
# Length bound used when filtering word pairs and when decoding a prediction.
MAX_LEN = 40
# Reserved vocabulary indices for the start-of-sequence / end-of-sequence markers.
SOS_token, EOS_token = 0, 1

def filterPair(p):
    """Return True when both words of the pair `p` are shorter than MAX_LEN.

    `p[0]` and `p[1]` are measured with `len`; the second word is only
    inspected when the first one passes.
    """
    if len(p[0]) >= MAX_LEN:
        return False
    return len(p[1]) < MAX_LEN

def filterPairs(pairs):
    """Return a new list holding only the pairs accepted by `filterPair`."""
    return list(filter(filterPair, pairs))

def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with '%d', so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for work that started at `since`.

    `since` is a `time.time()` timestamp and `percent` is the fraction of the
    work already done; the remaining time is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

### Data Preprocessing

In [13]:
# Read the train / validation / test splits from CSV files in the working directory
train_data = pd.read_csv('tel_train.csv')
valid_data = pd.read_csv('tel_valid.csv')
test_data = pd.read_csv('tel_test.csv')

# Keep the first two columns of every row as a [source, target] pair and
# drop every pair in which either word has MAX_LEN or more characters
train_pairs = filterPairs([[row[0], row[1]] for row in train_data.values])
valid_pairs = filterPairs([[row[0], row[1]] for row in valid_data.values])
test_pairs = filterPairs([[row[0], row[1]] for row in test_data.values])

### Language Class

In [14]:
class Lang:
    """Character vocabulary for one side of the word pairs.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers; every new
    character receives the next free index in order of first appearance.
    """

    def __init__(self, type):
        self.type = type
        self.char2index = {}   # character -> index
        self.char2count = {}   # character -> number of occurrences seen
        self.index2char = {0: "SOS", 1: "EOS"}   # index -> character / marker
        self.n_chars = 2  # vocabulary size, SOS and EOS included

    def addChar(self, char):
        """Count one occurrence of `char`, registering it first if it is new."""
        if char in self.char2index:
            self.char2count[char] += 1
            return

        new_index = self.n_chars
        self.char2index[char] = new_index
        self.char2count[char] = 1
        self.index2char[new_index] = char
        self.n_chars = new_index + 1

    def addWord(self, word):
        """Feed every character of `word` to `addChar`, in order."""
        for symbol in word:
            self.addChar(symbol)

# Build both character vocabularies from the training pairs only.
input_lang, output_lang = Lang('input'), Lang('output')

for pair in train_pairs:
    source_word, target_word = pair[0], pair[1]
    input_lang.addWord(source_word)
    output_lang.addWord(target_word)

def indexesFromWord(lang, word):
    """Map each character of `word` to its index in `lang.char2index`.

    Characters that are missing from the vocabulary are skipped instead of
    raising KeyError: the vocabularies are built from the training pairs only
    and contain no "unknown" slot, so a word from another split may hold a
    character that was never registered.

    Args:
        lang: object exposing a `char2index` mapping (e.g. a `Lang`).
        word: iterable of characters.

    Returns:
        List of integer indices, in the order the known characters appear.
    """
    char2index = lang.char2index
    return [char2index[char] for char in word if char in char2index]

def tensorFromWord(lang, word):
    """Encode `word` as a column tensor of character indices ending in EOS.

    Returns a `torch.long` tensor of shape (len(indices) + 1, 1) placed on the
    module-level `device`.
    """
    token_ids = [*indexesFromWord(lang, word), EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensorFromPair(pair):
    """Return `(input_tensor, target_tensor)` for a [source, target] word pair.

    The source word is encoded with `input_lang`, the target with `output_lang`.
    """
    source_word, target_word = pair[0], pair[1]
    encoded_source = tensorFromWord(input_lang, source_word)
    encoded_target = tensorFromWord(output_lang, target_word)
    return (encoded_source, encoded_target)

### Encoder Class

In [15]:
class Encoder(nn.Module):
    """Embeds one input character at a time and feeds it to a recurrent cell.

    Args:
        input_size: number of entries in the input vocabulary.
        hidden_size: hidden state width of the recurrent cell.
        embed_size: width of the character embedding.
        num_layers: number of stacked recurrent layers.
        cell_type: one of 'gru', 'rnn' or 'lstm'.
        dropout: dropout between stacked recurrent layers.
        bidirectional: whether the recurrent cell is bidirectional.

    Raises:
        ValueError: if `cell_type` is not one of the supported names.
    """

    # Recurrent layer class for every supported `cell_type`.
    _CELLS = {'gru': nn.GRU, 'rnn': nn.RNN, 'lstm': nn.LSTM}

    def __init__(self, input_size, hidden_size = 128, embed_size = 64, num_layers = 1, cell_type = 'gru', dropout = 0.1, bidirectional = False):
        super(Encoder, self).__init__()
        # Class Variables
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.cell_type = cell_type

        # Layers and Cells Initialized with parameters
        self.embedding = nn.Embedding(input_size, embed_size)
        if cell_type not in self._CELLS:
            raise ValueError('Invalid cell type specified')
        # PyTorch only applies RNN dropout between stacked layers and warns when
        # a non-zero value is combined with a single layer, so pass 0 in that case.
        cell_dropout = self.dropout if self.num_layers > 1 else 0.0
        self.cell = self._CELLS[cell_type](
            input_size = self.embed_size,
            hidden_size = self.hidden_size,
            num_layers = self.num_layers,
            dropout = cell_dropout,
            bidirectional = self.bidirectional,
        )

    def forward(self, input, hidden, cell):
        """Run a single time step.

        Args:
            input: index tensor holding one character.
            hidden: hidden state from the previous step (or `initHidden()`).
            cell: LSTM cell state; passed through untouched for 'gru' / 'rnn'.

        Returns:
            `(output, hidden, cell)` as produced by the recurrent cell.
        """
        # Reshape to (seq_len=1, batch=1, embed_size) as the cell expects.
        embedded = self.embedding(input).view(1, 1, -1)

        # Forward pass through the cell
        if self.cell_type == 'lstm':
            output, (hidden, cell) = self.cell(embedded, (hidden, cell))
        else:
            output, hidden = self.cell(embedded, hidden)

        return output, hidden, cell

    def initHidden(self):
        """Return a zero state of shape (num_layers * num_directions, 1, hidden_size).

        The tensor is created on the same device as the module's parameters.
        """
        num_directions = 2 if self.bidirectional else 1
        return torch.zeros(
            self.num_layers * num_directions, 1, self.hidden_size,
            device=self.embedding.weight.device,
        )

### Decoder Class

In [16]:
class Decoder(nn.Module):
    """Predicts the next output character from the previous one and a state.

    Args:
        output_size: number of entries in the output vocabulary.
        hidden_size: hidden state width of the recurrent cell.
        embed_size: width of the character embedding.
        num_layers: number of stacked recurrent layers.
        cell_type: one of 'gru', 'rnn' or 'lstm'.
        dropout: dropout between stacked recurrent layers.
        bidirectional: whether the recurrent cell is bidirectional.

    Raises:
        ValueError: if `cell_type` is not one of the supported names.
    """

    # Recurrent layer class for every supported `cell_type`.
    _CELLS = {'gru': nn.GRU, 'rnn': nn.RNN, 'lstm': nn.LSTM}

    def __init__(self, output_size,hidden_size = 128, embed_size = 64, num_layers = 1, cell_type = 'gru', dropout = 0.1, bidirectional = False):
        super(Decoder, self).__init__()
        # Class Variables
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.cell_type = cell_type

        # Layers and Cells Initialized with parameters
        self.embedding = nn.Embedding(output_size, embed_size)
        if cell_type not in self._CELLS:
            raise ValueError('Invalid cell type specified')
        # PyTorch only applies RNN dropout between stacked layers and warns when
        # a non-zero value is combined with a single layer, so pass 0 in that case.
        cell_dropout = self.dropout if self.num_layers > 1 else 0.0
        self.cell = self._CELLS[cell_type](
            input_size = self.embed_size,
            hidden_size = self.hidden_size,
            num_layers = self.num_layers,
            dropout = cell_dropout,
            bidirectional = self.bidirectional,
        )
        # A bidirectional cell emits both directions concatenated, so the
        # projection must accept hidden_size * num_directions features.
        num_directions = 2 if self.bidirectional else 1
        self.out = nn.Linear(self.hidden_size * num_directions, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: index tensor holding the previous output character.
            hidden: hidden state from the previous step.
            cell: LSTM cell state; passed through untouched for 'gru' / 'rnn'.

        Returns:
            `(output, hidden, cell)` where `output` holds log-probabilities of
            shape (1, output_size).
        """
        # Reshape to (seq_len=1, batch=1, embed_size) as the cell expects.
        output = self.embedding(input).view(1, 1, -1)

        # Forward pass through the cell
        if self.cell_type == 'lstm':
            output, (hidden, cell) = self.cell(output, (hidden, cell))
        else:
            output, hidden = self.cell(output, hidden)

        # Project the single time step to the vocabulary and take log-softmax
        output = self.softmax(self.out(output[0]))
        return output, hidden, cell

    def initHidden(self):
        """Return a zero state of shape (num_layers * num_directions, 1, hidden_size).

        The tensor is created on the same device as the module's parameters.
        """
        num_directions = 2 if self.bidirectional else 1
        return torch.zeros(
            self.num_layers * num_directions, 1, self.hidden_size,
            device=self.embedding.weight.device,
        )


### Attention Decoder (not implemented in this notebook)

### Sequence to Sequence Model

In [17]:
class Seq2Seq():
    """Character-level encoder-decoder model with its training and evaluation loops.

    Builds an `Encoder` and a `Decoder` sized from the module-level `input_lang`
    and `output_lang` vocabularies, one optimizer per network and an NLL loss.
    Words are processed one at a time.

    Args:
        hidden_size, embed_size, num_layers, cell_type, dropout, bidirectional:
            forwarded unchanged to both `Encoder` and `Decoder`.
        optimizer: 'sgd' or 'adam'.
        lr: learning rate for both optimizers.

    Raises:
        ValueError: if `optimizer` is not 'sgd' or 'adam'.
    """
    def __init__(self, hidden_size = 128, embed_size = 64, num_layers = 1, cell_type = 'gru', dropout = 0.1, bidirectional = False, optimizer = 'sgd', lr = 0.01):

        # Class Variables
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.num_layers = num_layers
        self.cell_type = cell_type
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.optimizer = optimizer
        self.lr = lr

        # Initialize Encoder and Decoder
        self.encoder = Encoder(input_lang.n_chars, hidden_size, embed_size, num_layers, cell_type, dropout, bidirectional).to(device)
        self.decoder = Decoder(output_lang.n_chars, hidden_size, embed_size, num_layers, cell_type, dropout, bidirectional).to(device)

        # Initialize Optimizer
        if optimizer == 'sgd':
            self.encoder_optimizer = optim.SGD(self.encoder.parameters(), lr = self.lr)
            self.decoder_optimizer = optim.SGD(self.decoder.parameters(), lr = self.lr)
        elif optimizer == 'adam':
            self.encoder_optimizer = optim.Adam(self.encoder.parameters(), lr = self.lr)
            self.decoder_optimizer = optim.Adam(self.decoder.parameters(), lr = self.lr)
        else:
            raise ValueError('Invalid optimizer specified')

        # Initialize Criterion
        self.criterion = nn.NLLLoss()

    def _encode(self, input_tensor):
        """Feed `input_tensor` through the encoder one step at a time.

        Returns the final `(hidden, cell)` state, which seeds the decoder.
        """
        encoder_hidden = self.encoder.initHidden()
        encoder_cell = self.encoder.initHidden()
        for ei in range(input_tensor.size(0)):
            _, encoder_hidden, encoder_cell = self.encoder(input_tensor[ei], encoder_hidden, encoder_cell)
        return encoder_hidden, encoder_cell

    def train(self, input_tensor, target_tensor):
        """Run one optimisation step on a single (input, target) tensor pair.

        With probability 0.5 the decoder is fed the ground-truth character at
        every step (teacher forcing); otherwise it is fed its own prediction
        and decoding stops once it predicts EOS.

        Returns:
            The summed loss divided by the target length.
        """
        # Evaluation switches the networks to eval mode, so switch back here.
        self.encoder.train()
        self.decoder.train()

        # Zero Gradients
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        target_length = target_tensor.size(0)
        loss = 0

        # Forward pass through the encoder; its final state seeds the decoder
        decoder_hidden, decoder_cell = self._encode(input_tensor)
        decoder_input = torch.tensor([[SOS_token]], device=device)

        # Use Teacher Forcing
        use_teacher_forcing = random.random() < 0.5

        # Forward pass through the decoder
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_cell = self.decoder(decoder_input, decoder_hidden, decoder_cell)
            loss += self.criterion(decoder_output, target_tensor[di])
            if use_teacher_forcing:
                decoder_input = target_tensor[di]
            else:
                topv, topi = decoder_output.topk(1)
                # Detach so gradients do not flow through the fed-back prediction
                decoder_input = topi.squeeze().detach()
                if decoder_input.item() == EOS_token:
                    break

        # Backpropagate Loss
        loss.backward()

        # Update Parameters
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        return loss.item() / target_length

    def evaluate(self, word: str):
        """Greedily decode `word`.

        Returns:
            List of predicted characters. The list ends with '<EOS>' when the
            decoder predicted EOS within MAX_LEN steps; otherwise it holds
            MAX_LEN characters and no '<EOS>' marker.
        """
        # Eval mode disables dropout between stacked layers so that the
        # prediction is deterministic.
        self.encoder.eval()
        self.decoder.eval()

        with torch.no_grad():
            input_tensor = tensorFromWord(input_lang, word)
            decoder_hidden, decoder_cell = self._encode(input_tensor)
            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoded_words = []
            for di in range(MAX_LEN):
                decoder_output, decoder_hidden, decoder_cell = self.decoder(
                    decoder_input, decoder_hidden, decoder_cell)
                topv, topi = decoder_output.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                decoded_words.append(output_lang.index2char[topi.item()])
                decoder_input = topi.squeeze().detach()

            return decoded_words

    def accuracy(self, pairs):
        """Return the fraction of `pairs` whose source word decodes exactly to its target.

        A prediction counts as correct only when it reproduces the target and
        then terminates with EOS. Progress is printed every 1000 pairs.
        Returns 0.0 when `pairs` is empty.
        """
        correct = 0
        total = 0
        for pair in pairs:
            output_sentence = ''.join(self.evaluate(pair[0]))
            # `evaluate` appends the '<EOS>' marker, so the reference needs it too
            if output_sentence == pair[1] + '<EOS>':
                correct += 1
            total += 1
            if total % 1000 == 0:
                print(total)
        return correct / total if total else 0.0

    def trainEpoch(self, epochs = 5, Log = False):
        """Train on `train_pairs` for up to `epochs` epochs, then score `test_pairs`.

        After every epoch the train and validation accuracy are computed (and
        logged to wandb when `Log` is true). Training stops early as soon as
        either accuracy drops below its value from the previous epoch. When the
        last validation accuracy exceeds 0.4 both state dicts are saved in the
        working directory. The test accuracy is logged to wandb when `Log` is true.
        """
        prev_train_acc = 0
        prev_val_acc = 0
        val_acc = 0  # stays 0 when no epoch runs, so the checks below are safe

        # The tensors do not change between epochs, so build them once
        training_pairs = [tensorFromPair(pair) for pair in train_pairs]
        n_pairs = len(training_pairs)

        # Train for given number of epochs
        for epoch in range(1, epochs + 1):
            print('Epoch: ', epoch)

            # Per-epoch timer and loss accumulators
            start = time.time()
            print_loss_total = 0  # Reset every 1000 steps
            log_loss_total = 0

            # Train for one epoch
            for i, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
                loss = self.train(input_tensor, target_tensor)
                print_loss_total += loss
                log_loss_total += loss

                # Print Progress
                if i % 1000 == 0:
                    print_loss_avg = print_loss_total / 1000
                    print_loss_total = 0
                    print('%s (%d %d%%) %.4f' % (timeSince(start, i / n_pairs),
                                                i, i / n_pairs * 100, print_loss_avg))

            # Training Loss (mean per-pair loss of this epoch)
            train_loss = log_loss_total / n_pairs if n_pairs else 0.0

            # Train Accuracy
            train_acc = self.accuracy(train_pairs)
            print('Train Accuracy: ', train_acc)

            # Validation Accuracy
            val_acc = self.accuracy(valid_pairs)
            print('Validation Accuracy: ', val_acc)

            # Log to wandb before the early-stop check so the last epoch is recorded
            if Log:
                wandb.log({
                            "train_loss": train_loss,
                            "train_acc": train_acc,
                            "val_acc": val_acc,
                            "epoch": epoch
                            })

            # Check to end
            if val_acc < prev_val_acc or train_acc < prev_train_acc:
                break

            # Update previous accuracy
            prev_train_acc = train_acc
            prev_val_acc = val_acc

        if val_acc > 0.4:
            suffix = '{}|{}|{}|{}|{}|{}|{}|{}'.format(epochs, self.hidden_size, self.embed_size, self.cell_type, self.num_layers, self.dropout, self.optimizer, self.lr)
            torch.save(self.encoder.state_dict(), 'encoder.pth' + suffix)
            torch.save(self.decoder.state_dict(), 'decoder.pth' + suffix)

        # Test Accuracy
        test_acc = self.accuracy(test_pairs)
        #print('Test Accuracy: ', test_acc)

        # Log to wandb
        if Log:
            wandb.log({"test_acc": test_acc})



### Trial Runs

In [18]:
# Quick trial: constructor defaults with a 256-wide hidden state, two epochs, no wandb logging.
model = Seq2Seq(hidden_size=256)
model.trainEpoch(epochs=2)



Epoch:  1
0m 17s (- 14m 24s) (1000 1%) 2.7920
330m 35s (- 8132m 13s) (2000 3%) 2.7206
330m 50s (- 5315m 32s) (3000 5%) 2.7131
331m 6s (- 3907m 1s) (4000 7%) 2.7144
331m 21s (- 3061m 44s) (5000 9%) 2.7055
331m 37s (- 2498m 10s) (6000 11%) 2.6244
331m 52s (- 2095m 31s) (7000 13%) 2.5846
332m 8s (- 1793m 30s) (8000 15%) 2.4512
332m 23s (- 1558m 32s) (9000 17%) 2.3442
332m 39s (- 1370m 32s) (10000 19%) 2.2176
332m 55s (- 1216m 39s) (11000 21%) 2.0815
333m 11s (- 1088m 23s) (12000 23%) 1.9967
333m 27s (- 979m 48s) (13000 25%) 1.9125
333m 43s (- 886m 42s) (14000 27%) 1.8249
333m 58s (- 805m 59s) (15000 29%) 1.7751
334m 14s (- 735m 18s) (16000 31%) 1.6739
334m 30s (- 672m 55s) (17000 33%) 1.6345
334m 46s (- 617m 27s) (18000 35%) 1.6030
335m 2s (- 567m 47s) (19000 37%) 1.5525
335m 19s (- 523m 4s) (20000 39%) 1.4923
335m 35s (- 482m 35s) (21000 41%) 1.4193
335m 51s (- 445m 45s) (22000 42%) 1.4691
336m 7s (- 412m 6s) (23000 44%) 1.3911
336m 24s (- 381m 14s) (24000 46%) 1.3983
336m 40s (- 352m 49

KeyboardInterrupt: 

### Sweeps

In [9]:
# Bayesian search over the listed hyper-parameter values, maximising the
# `val_acc` metric that the training loop logs to wandb.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_acc', 'goal': 'maximize'},
    'parameters': {
        name: {'values': choices}
        for name, choices in [
            ('hidden_size', [128, 256, 512]),
            ('embed_size', [64, 128, 256]),
            ('num_layers', [1, 2, 3]),
            ('cell_type', ['gru', 'rnn', 'lstm']),
            ('dropout', [0.1, 0.2, 0.3]),
            ('optimizer', ['sgd', 'adam']),
            ('lr', [0.01, 0.001, 0.0001]),
            ('epoch', [5]),
        ]
    },
}

# Register the sweep with the project and keep its id for the agent
sweep_id = wandb.sweep(sweep_config, project="CS20B021_A3")

def train_sweep_run():
    """Train one model with the hyper-parameters the sweep assigned to this run.

    The run is used as a context manager so that it is always closed, even
    when training raises; otherwise a failed run would be left open.
    """
    with wandb.init() as run:
        config = wandb.config
        # Name the run after its configuration so runs are easy to tell apart
        run.name = "epoch:{}|hid:{}|embed:{}|cell:{}|nlayer:{}|drop:{}|opt:{}|lr:{}".format(config.epoch, config.hidden_size, config.embed_size, config.cell_type, config.num_layers, config.dropout, config.optimizer, config.lr)

        model = Seq2Seq(hidden_size = config.hidden_size, embed_size = config.embed_size, num_layers = config.num_layers, cell_type = config.cell_type, dropout = config.dropout, optimizer = config.optimizer, lr = config.lr)
        model.trainEpoch(epochs = config.epoch, Log = True)

# Launch an agent that executes `train_sweep_run` for 10 sweep runs.
wandb.agent(
    sweep_id,
    train_sweep_run,
    count=10,
    project="CS20B021_A3",
)

    

Create sweep with ID: aj7uc0ln
Sweep URL: https://wandb.ai/chathur/CS20B021_A3/sweeps/aj7uc0ln


[34m[1mwandb[0m: Agent Starting Run: 8go649jw with config:
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 256
[34m[1mwandb[0m: 	epoch: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch:  1
0m 33s (- 28m 11s) (1000 1%) 2.9314
1m 5s (- 26m 44s) (2000 3%) 2.8027
1m 39s (- 26m 42s) (3000 5%) 2.7877
2m 16s (- 26m 55s) (4000 7%) 2.7643
2m 55s (- 27m 5s) (5000 9%) 2.7325
3m 37s (- 27m 16s) (6000 11%) 2.7013
4m 19s (- 27m 20s) (7000 13%) 2.6868
5m 6s (- 27m 37s) (8000 15%) 2.6523
6m 0s (- 28m 9s) (9000 17%) 2.6392
6m 51s (- 28m 14s) (10000 19%) 2.6188
7m 37s (- 27m 52s) (11000 21%) 2.6144
8m 27s (- 27m 38s) (12000 23%) 2.5971
9m 17s (- 27m 17s) (13000 25%) 2.5783
10m 6s (- 26m 51s) (14000 27%) 2.5820
10m 49s (- 26m 8s) (15000 29%) 2.5683
11m 32s (- 25m 23s) (16000 31%) 2.5316
12m 14s (- 24m 38s) (17000 33%) 2.5070
12m 57s (- 23m 53s) (18000 35%) 2.5009
13m 39s (- 23m 9s) (19000 37%) 2.5029
14m 22s (- 22m 24s) (20000 39%) 2.4838
15m 3s (- 21m 39s) (21000 41%) 2.4462
15m 45s (- 20m 54s) (22000 42%) 2.4640
16m 27s (- 20m 10s) (23000 44%) 2.4836
17m 7s (- 19m 24s) (24000 46%) 2.4895
17m 49s (- 18m 40s) (25000 48%) 2.4840
18m 32s (- 17m 58s) (26000 50%) 2.4700
19m 18s (- 17