In [1]:
import torch
from torch import nn
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
import copy
from torch.utils.data import Dataset, DataLoader
import random
import wandb

In [2]:
# Credentials must not live in the notebook. Called without arguments,
# wandb.login() picks the key up from the WANDB_API_KEY environment variable
# or the local netrc entry, and otherwise prompts for it interactively.
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcs23m059[0m ([33mcs23m059-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Locations of the three CSV splits of the Telugu dataset (Kaggle input directory).
train_csv, val_csv, test_csv = (
    "/kaggle/input/telugu/te_train.csv",
    "/kaggle/input/telugu/te_val.csv",
    "/kaggle/input/telugu/te_test.csv",
)

In [5]:
# Data loading: the CSV files carry no header row, so columns are addressed by
# position. Column 0 is used as the model input and column 1 as the target.
train_data = pd.read_csv(train_csv, header=None)
val_data = pd.read_csv(val_csv, header=None)
test_data = pd.read_csv(test_csv, header=None)

train_input, train_output = (train_data[col].to_numpy() for col in (0, 1))
val_input, val_output = (val_data[col].to_numpy() for col in (0, 1))

In [6]:
def pre_processing(train_input, train_output, source_max_len=30, target_max_len=23):
    """Build character vocabularies and fixed-length index tensors for the training data.

    Each word is rewritten in place as ``"{" + word`` followed by as many ``"}"``
    characters as are needed to reach the fixed row length, and every character
    is then replaced by its vocabulary index.  Indices are assigned in first-seen
    order, independently for the source and the target side.

    Args:
        train_input: Indexable, mutable sequence of source strings.  It is
            modified in place, so pass a copy to keep the raw data intact.
        train_output: Sequence of target strings aligned with ``train_input``;
            also modified in place.
        source_max_len: Row length of the source tensor, counting the leading ``"{"``.
        target_max_len: Row length of the target tensor, counting the leading ``"{"``.

    Returns:
        dict with the keys ``all_characters`` / ``char_num_map`` / ``num_char_map``
        (source vocabulary as list, char->index and index->char),
        ``source_charToNum`` (int tensor of shape
        ``(len(train_input), source_max_len)`` on ``device``), ``source_data``
        (the padded source strings), the same five entries for the target side
        under ``all_characters_2`` / ``char_num_map_2`` / ``num_char_map_2`` /
        ``val_charToNum`` / ``target_data``, and the two vocabulary sizes
        ``source_len`` / ``target_len``.

    Raises:
        ValueError: If the two sequences differ in length, or if a word has more
            than ``max_len - 1`` characters and therefore cannot fit in a row.
    """
    if len(train_input) != len(train_output):
        raise ValueError(
            f"source and target must be aligned: got {len(train_input)} source "
            f"and {len(train_output)} target entries"
        )

    def _encode(words, max_len, side):
        """Pad ``words`` in place and return (characters, char->num, num->char, index tensor)."""
        characters, char_to_num, num_to_char = [], {}, {}
        rows = []
        for i in range(len(words)):
            n_fill = max_len - 1 - len(words[i])
            if n_fill < 0:
                raise ValueError(
                    f"{side} word at row {i} has {len(words[i])} characters; "
                    f"at most {max_len - 1} fit in a row of length {max_len}"
                )
            words[i] = "{" + words[i] + "}" * n_fill
            row = []
            for char in words[i]:
                # Dict lookup instead of scanning the character list for every character.
                index = char_to_num.get(char)
                if index is None:
                    index = len(characters)
                    characters.append(char)
                    char_to_num[char] = index
                    num_to_char[index] = char
                row.append(index)
            rows.append(row)
        # One host-to-device transfer for the whole split instead of one per row.
        # The reshape keeps the (0, max_len) shape when there are no rows at all.
        indices = torch.tensor(rows, dtype=torch.int, device=device).reshape(len(words), max_len)
        return characters, char_to_num, num_to_char, indices

    src_chars, src_char_to_num, src_num_to_char, src_indices = _encode(
        train_input, source_max_len, "source")
    tgt_chars, tgt_char_to_num, tgt_num_to_char, tgt_indices = _encode(
        train_output, target_max_len, "target")

    return {
        "all_characters": src_chars,
        "char_num_map": src_char_to_num,
        "num_char_map": src_num_to_char,
        "source_charToNum": src_indices,
        "source_data": train_input,
        "all_characters_2": tgt_chars,
        "char_num_map_2": tgt_char_to_num,
        "num_char_map_2": tgt_num_to_char,
        "val_charToNum": tgt_indices,
        "target_data": train_output,
        "source_len": len(src_chars),
        "target_len": len(tgt_chars),
    }

# Preprocessing pads the strings in place, so it is given shallow copies and
# the raw arrays loaded from the CSV stay untouched.
data = pre_processing(
    train_input=copy.copy(train_input),
    train_output=copy.copy(train_output),
)

In [7]:
def pre_processing_validation(val_input, val_output, source_max_len=30, target_max_len=23):
    """Encode the validation split with the vocabularies built from the training data.

    Each word is rewritten in place as ``"{" + word`` followed by enough ``"}"``
    characters to reach the fixed row length, then mapped to indices through the
    module-level ``data["char_num_map"]`` (source) and ``data["char_num_map_2"]``
    (target).

    Args:
        val_input: Indexable, mutable sequence of source strings (modified in
            place, so pass a copy).
        val_output: Sequence of target strings aligned with ``val_input``
            (modified in place as well).
        source_max_len: Row length of the source tensor, counting the leading ``"{"``.
        target_max_len: Row length of the target tensor, counting the leading ``"{"``.

    Returns:
        dict with ``source_charToNum`` and ``val_charToNum``: int tensors on
        ``device`` of shape ``(len(val_input), source_max_len)`` and
        ``(len(val_output), target_max_len)``.

    Raises:
        ValueError: If the two sequences differ in length, or a word is too long
            to fit in a row.
        KeyError: If a word contains a character that is absent from the
            training vocabulary.
    """
    if len(val_input) != len(val_output):
        raise ValueError(
            f"source and target must be aligned: got {len(val_input)} source "
            f"and {len(val_output)} target entries"
        )

    def _encode(words, max_len, char_to_num, side):
        """Pad ``words`` in place and return their index tensor."""
        rows = []
        for i in range(len(words)):
            n_fill = max_len - 1 - len(words[i])
            if n_fill < 0:
                raise ValueError(
                    f"{side} word at row {i} has {len(words[i])} characters; "
                    f"at most {max_len - 1} fit in a row of length {max_len}"
                )
            words[i] = "{" + words[i] + "}" * n_fill
            try:
                rows.append([char_to_num[char] for char in words[i]])
            except KeyError as err:
                raise KeyError(
                    f"{side} word at row {i} contains character {err.args[0]!r}, "
                    "which is not in the training vocabulary"
                ) from err
        # One host-to-device transfer for the whole split instead of one per row.
        # The reshape keeps the (0, max_len) shape when there are no rows at all.
        return torch.tensor(rows, dtype=torch.int, device=device).reshape(len(words), max_len)

    return {
        "source_charToNum": _encode(val_input, source_max_len, data["char_num_map"], "source"),
        "val_charToNum": _encode(val_output, target_max_len, data["char_num_map_2"], "target"),
    }

# The validation strings are padded in place as well, hence the shallow copies.
data2 = pre_processing_validation(
    val_input=copy.copy(val_input),
    val_output=copy.copy(val_output),
)

In [8]:
class MyDataset(Dataset):
    """Serve two index-aligned collections as (source, target) samples."""

    def __init__(self, x, y):
        # Both collections are stored as given: no copy, no length check.
        self.source, self.target = x, y

    def __len__(self):
        # The source side alone determines the number of samples.
        sample_count = len(self.source)
        return sample_count

    def __getitem__(self, idx):
        # Look up the same position on both sides and hand them back as a pair.
        sample = (self.source[idx], self.target[idx])
        return sample

def dataLoaderFun(dataName, batch_size):
    """Return a shuffling DataLoader over the pre-encoded index tensors.

    ``dataName == 'train'`` selects the training tensors held in the module-level
    ``data`` dict; every other value selects the validation tensors in ``data2``.
    """
    encoded_split = data if dataName == 'train' else data2
    dataset = MyDataset(encoded_split["source_charToNum"], encoded_split['val_charToNum'])
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return loader

class Encoder(nn.Module):
    """Embedding layer followed by a stacked, optionally bidirectional, recurrent network.

    Args:
        inputDim: Number of rows in the embedding table (source vocabulary size).
        embSize: Embedding dimension, which is also the recurrent input size.
        encoderLayers: Number of stacked recurrent layers.
        hiddenLayerNuerons: Hidden size of each recurrent layer.
        cellType: ``'GRU'`` or ``'LSTM'``; any other value selects a plain ``nn.RNN``.
        bidirection: The exact string ``"Yes"`` makes the network bidirectional;
            any other value keeps it unidirectional.
    """

    def __init__(self, inputDim, embSize, encoderLayers, hiddenLayerNuerons, cellType, bidirection):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(inputDim, embSize)
        self.encoderLayers = encoderLayers
        self.hiddenLayerNuerons = hiddenLayerNuerons
        self.bidirection = bidirection

        is_bidirectional = bidirection == "Yes"
        self.num_directions = 2 if is_bidirectional else 1

        # Unknown cell types fall back to the vanilla RNN.
        recurrent_class = {'GRU': nn.GRU, 'LSTM': nn.LSTM}.get(cellType, nn.RNN)
        self.rnn = recurrent_class(embSize, hiddenLayerNuerons,
                                   num_layers=encoderLayers,
                                   bidirectional=is_bidirectional,
                                   batch_first=True)

    def forward(self, currentInput, prevState):
        """Embed the index batch and run it through the recurrent stack.

        Returns whatever the recurrent module returns: ``(output, final_state)``.
        """
        embdInput = self.embedding(currentInput)
        return self.rnn(embdInput, prevState)

    def getInitialState(self, batch_size):
        """Return an all-zero state of shape (layers * directions, batch_size, hidden).

        The tensor is allocated on the device (and with the dtype) of the
        encoder's own parameters, so it always matches the module no matter
        where the module was moved, and needs no global ``device`` variable.
        """
        reference = self.embedding.weight
        return torch.zeros(self.encoderLayers * self.num_directions,
                           batch_size,
                           self.hiddenLayerNuerons,
                           device=reference.device,
                           dtype=reference.dtype)

class Decoder(nn.Module):
    """Embedding -> stacked recurrent network -> dropout -> linear -> log-softmax.

    ``cellType`` picks ``nn.GRU`` or ``nn.LSTM``; every other value falls back
    to ``nn.RNN``.  The module is always unidirectional.
    """

    def __init__(self, outputDim, embSize, hiddenLayerNuerons, decoderLayers, cellType, dropout_p):
        super().__init__()
        self.embedding = nn.Embedding(outputDim, embSize)
        self.decoderLayers = decoderLayers

        known_cells = {'GRU': nn.GRU, 'LSTM': nn.LSTM}
        recurrent_class = known_cells[cellType] if cellType in known_cells else nn.RNN
        self.rnn = recurrent_class(
            embSize,
            hiddenLayerNuerons,
            num_layers=decoderLayers,
            batch_first=True,
        )

        # Projection from the hidden size onto the output vocabulary.
        self.fc = nn.Linear(hiddenLayerNuerons, outputDim)
        # Normalises over the last (vocabulary) axis of the 3-D output.
        self.softmax = nn.LogSoftmax(dim=2)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, currentInput, prevState):
        """Return (log-probabilities over the output vocabulary, next recurrent state)."""
        recurrent_out, next_state = self.rnn(self.embedding(currentInput), prevState)
        # Dropout is applied to the recurrent output before the projection.
        log_probs = self.softmax(self.fc(self.dropout(recurrent_out)))
        return log_probs, next_state

def init_decoder_state(encoder_state, encoderLayers, decoderLayers, cellType):
    """Adapt the encoder's final state to the number of decoder layers.

    If the encoder has at least as many layers as the decoder, the last
    ``decoderLayers`` entries along dim 0 are kept.  Otherwise the full state is
    used and its last layer is repeated until ``decoderLayers`` entries exist.
    For ``cellType == 'LSTM'`` the state is an ``(h, c)`` pair, both halves are
    adapted the same way and a tuple is returned; for every other cell type a
    single tensor goes in and comes out.
    """
    def _fit_layers(layer_states):
        if encoderLayers >= decoderLayers:
            return layer_states[-decoderLayers:]
        missing = decoderLayers - encoderLayers
        top_layer = layer_states[-1:]
        return torch.cat([layer_states, *([top_layer] * missing)], dim=0)

    if cellType != 'LSTM':
        return _fit_layers(encoder_state)

    hidden, cell = encoder_state
    return (_fit_layers(hidden), _fit_layers(cell))

def _encode_batch(encoder, sourceBatch, cellType, bidirection):
    """Run the encoder over one batch and return one final state per encoder layer.

    The source indices are passed to the encoder unchanged: a bidirectional
    recurrent layer already reads the sequence in both directions on its own,
    so the indices must not be mixed with a reversed copy of themselves.
    When ``bidirection == "Yes"`` the forward and backward final states of each
    layer are summed, giving ``encoder.encoderLayers`` entries along dim 0.
    For ``cellType == 'LSTM'`` the state is an ``(h, c)`` tuple.
    """
    batch_size = sourceBatch.size(0)
    initial_state = encoder.getInitialState(batch_size)
    if cellType == 'LSTM':
        initial_state = (initial_state, torch.zeros_like(initial_state))

    _, final_state = encoder(sourceBatch, initial_state)
    if bidirection != "Yes":
        return final_state

    def _sum_directions(state):
        # (layers * 2, batch, hidden) -> (layers, 2, batch, hidden) -> sum over directions.
        return state.view(encoder.encoderLayers, 2, batch_size, -1).sum(1)

    if cellType == 'LSTM':
        return (_sum_directions(final_state[0]), _sum_directions(final_state[1]))
    return _sum_directions(final_state)


def _decode_batch(decoder, decoderState, targetBatch, lossFunction, teacher_forcing):
    """Decode one batch step by step.

    At step 0 the decoder is fed the first target column.  At every later step
    ``i`` it is fed either the previous target character ``targetBatch[:, i - 1]``
    (teacher forcing) or its own previous prediction, and is scored against
    ``targetBatch[:, i]``.  Feeding ``targetBatch[:, i]`` itself would hand the
    decoder the very label it is asked to predict.

    Returns:
        (loss summed over the steps, predicted indices of shape (batch, seq_len)).
    """
    batch_size, sequence_len = targetBatch.shape
    decoderInput = targetBatch[:, 0].reshape(batch_size, 1)
    loss = 0
    predictions = []

    for i in range(sequence_len):
        if i > 0 and teacher_forcing:
            decoderInput = targetBatch[:, i - 1].reshape(batch_size, 1)

        log_probs, decoderState = decoder(decoderInput, decoderState)
        step_log_probs = log_probs[:, -1, :]
        loss = loss + lossFunction(step_log_probs, targetBatch[:, i].type(dtype=torch.long))

        # Greedy choice; kept as (batch, 1) so it works for a batch of one too.
        _, top_indices = step_log_probs.topk(1)
        decoderInput = top_indices.detach()
        predictions.append(decoderInput[:, 0])

    return loss, torch.stack(predictions, dim=1)


def train(embSize, encoderLayers, decoderLayers, hiddenLayerNuerons, cellType, bidirection, dropout, epochs, batchsize, learningRate, optimizer, tf_ratio):
    """Train an encoder-decoder pair and log per-epoch metrics to W&B.

    Args:
        embSize: Embedding size used by both encoder and decoder.
        encoderLayers: Number of recurrent layers in the encoder.
        decoderLayers: Number of recurrent layers in the decoder.
        hiddenLayerNuerons: Hidden size of every recurrent layer.
        cellType: ``'GRU'``, ``'LSTM'`` or anything else for a plain RNN.
        bidirection: ``"Yes"`` for a bidirectional encoder, any other value otherwise.
        dropout: Dropout probability applied to the decoder's recurrent output.
        epochs: Number of passes over the training data.
        batchsize: Batch size for the training and validation loaders.
        learningRate: Learning rate of both optimizers.
        optimizer: ``'Adam'`` selects Adam; any other value selects NAdam.
        tf_ratio: Probability, drawn once per batch, of decoding that batch
            with teacher forcing.

    After every epoch ``epoch``, ``train_loss``, ``train_accuracy``,
    ``validation_loss`` and ``validation_accuracy`` are sent to ``wandb.log``.
    Accuracy counts a sequence as correct only if every position matches.
    """
    dataLoader = dataLoaderFun("train", batchsize)
    lossFunction = nn.NLLLoss()

    encoder = Encoder(data["source_len"], embSize, encoderLayers,
                     hiddenLayerNuerons, cellType, bidirection).to(device)
    decoder = Decoder(data["target_len"], embSize, hiddenLayerNuerons,
                     decoderLayers, cellType, dropout).to(device)

    optimizer_class = optim.Adam if optimizer == 'Adam' else optim.NAdam
    encoderOptimizer = optimizer_class(encoder.parameters(), lr=learningRate)
    decoderOptimizer = optimizer_class(decoder.parameters(), lr=learningRate)

    for epoch in range(epochs):
        train_accuracy = 0
        train_loss = 0
        total_samples = 0

        for sourceBatch, targetBatch in dataLoader:
            encoderCurrentState = _encode_batch(encoder, sourceBatch, cellType, bidirection)
            decoderCurrState = init_decoder_state(
                encoderCurrentState, encoderLayers, decoderLayers, cellType
            )

            # One draw per batch: the whole batch is either teacher-forced or free-running.
            teacher_forcing = random.random() < tf_ratio
            loss, Output = _decode_batch(
                decoder, decoderCurrState, targetBatch, lossFunction, teacher_forcing
            )

            encoderOptimizer.zero_grad()
            decoderOptimizer.zero_grad()
            loss.backward()
            encoderOptimizer.step()
            decoderOptimizer.step()

            train_accuracy += (Output == targetBatch).all(dim=1).sum().item()
            train_loss += (loss.item() / targetBatch.shape[1])
            total_samples += targetBatch.size(0)

        val_acc, val_loss = validationAccuracy(encoder, decoder, batchsize, tf_ratio, cellType, bidirection)

        # Batch and sample counts come straight from a loader object; there is
        # no need to iterate over the whole validation set just to count it.
        val_loader = dataLoaderFun("validation", batchsize)
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss / max(len(dataLoader), 1),
            "train_accuracy": train_accuracy / max(total_samples, 1),
            "validation_loss": val_loss / max(len(val_loader), 1),
            "validation_accuracy": val_acc / max(len(val_loader.dataset), 1)
        })

def validationAccuracy(encoder, decoder, batchsize, tf_ratio, cellType, bidirection):
    """Evaluate the model on the validation split.

    Decoding always uses the model's own previous prediction as the next input,
    and runs in eval mode without gradient tracking.  ``tf_ratio`` is accepted
    only to keep the signature stable and is ignored: teacher forcing during
    evaluation would feed ground-truth characters to the decoder and inflate
    the reported metrics.

    Returns:
        (number of sequences predicted exactly, sum over batches of the loss
        averaged over the sequence positions).  Both are un-normalised totals;
        the caller divides by the sample and batch counts.
    """
    dataLoader = dataLoaderFun("validation", batchsize)
    lossFunction = nn.NLLLoss()
    validation_accuracy = 0
    validation_loss = 0

    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            for sourceBatch, targetBatch in dataLoader:
                encoderCurrentState = _encode_batch(encoder, sourceBatch, cellType, bidirection)
                decoderCurrState = init_decoder_state(
                    encoderCurrentState, encoder.encoderLayers, decoder.decoderLayers, cellType
                )
                loss, Output = _decode_batch(
                    decoder, decoderCurrState, targetBatch, lossFunction, False
                )
                validation_accuracy += (Output == targetBatch).all(dim=1).sum().item()
                validation_loss += (loss.item() / targetBatch.shape[1])
    finally:
        # Always hand the modules back in training mode, even if evaluation failed.
        encoder.train()
        decoder.train()
    return validation_accuracy, validation_loss

In [9]:
def main_fun():
    """Run one sweep trial: open a W&B run and train with the run's config values."""
    wandb.init(project='DA6401_Assignment_3')
    cfg = wandb.config
    # Collected in the positional order that train() expects.
    hyperparameters = (
        cfg.embSize,
        cfg.encoderLayers,
        cfg.decoderLayers,
        cfg.hiddenLayerNuerons,
        cfg.cellType,
        cfg.bidirection,
        cfg.dropout,
        cfg.epochs,
        cfg.batchsize,
        cfg.learningRate,
        cfg.optimizer,
        cfg.tf_ratio,
    )
    train(*hyperparameters)

# Bayesian sweep that maximises the logged validation accuracy. Each
# hyperparameter is given as a plain list of candidates and wrapped into the
# {'values': [...]} form while the configuration is assembled.
sweep_params = dict(
    method='bayes',
    name='DA6401_Assignment_3',
    metric=dict(goal='maximize', name='validation_accuracy'),
    parameters={
        hyperparameter: {'values': candidates}
        for hyperparameter, candidates in {
            'embSize': [16, 32, 64],
            'encoderLayers': [1, 5, 10],
            'decoderLayers': [1, 5, 10],
            'hiddenLayerNuerons': [64, 256, 512],
            'cellType': ['GRU', 'RNN', 'LSTM'],
            'bidirection': ['no', 'Yes'],
            'dropout': [0, 0.2, 0.3],
            'epochs': [10, 15],
            'batchsize': [32, 64],
            'learningRate': [1e-2, 1e-3, 1e-4],
            'optimizer': ['Adam', 'Nadam'],
            'tf_ratio': [0.2, 0.4, 0.5],
        }.items()
    },
)

In [10]:
# Register the sweep configuration with W&B and keep the returned sweep ID.
sweepId = wandb.sweep(
    sweep=sweep_params,
    project='DA6401_Assignment_3',
)

Create sweep with ID: d6bed4g5
Sweep URL: https://wandb.ai/cs23m059-iit-madras/DA6401_Assignment_3/sweeps/d6bed4g5


In [10]:
# Run the agent on the sweep registered in this notebook (sweepId) rather than
# on a hard-coded ID left over from an earlier session. That way the trials use
# the search space defined in sweep_params and a fresh kernel can reproduce them.
wandb.agent(sweepId, function=main_fun, count=30, entity="cs23m059-iit-madras", project="DA6401_Assignment_3")

[34m[1mwandb[0m: Agent Starting Run: ebfih8y0 with config:
[34m[1mwandb[0m: 	batchsize: 32
[34m[1mwandb[0m: 	bidirection: Yes
[34m[1mwandb[0m: 	cellType: GRU
[34m[1mwandb[0m: 	decoderLayers: 5
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embSize: 32
[34m[1mwandb[0m: 	encoderLayers: 10
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hiddenLayerNuerons: 512
[34m[1mwandb[0m: 	learningRate: 0.0001
[34m[1mwandb[0m: 	optimizer: Nadam
[34m[1mwandb[0m: 	tf_ratio: 0.5


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
