# Attention Seq2Seq Report Tester Notebook
This notebook is used to take the best model from the sweep retrain the model using appropriate callbacks and then predict on the test set and save it and also create some visualizations if required. Without much details lets get into the assignment.

In [1]:
# Importing the necessary libraries #
# Importing the necessary libraries needed
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision.transforms.functional as Fn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset, Dataset
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
import wandb
wandb.login(key = "5ef7c4bbfa350a2ffd3c198cb9289f544e3a0910")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mae21b105[0m ([33mRough[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Data preparation
# Loading the dataset
df_train = pd.read_csv('/kaggle/input/dl-a3-tamil/ta_lexicons/ta.translit.sampled.train.tsv', sep='\t',  header=None, names=["native","latin","count"])
df_test = pd.read_csv('/kaggle/input/dl-a3-tamil/ta_lexicons/ta.translit.sampled.test.tsv', sep='\t',  header=None, names=["native","latin","count"])
df_val = pd.read_csv('/kaggle/input/dl-a3-tamil/ta_lexicons/ta.translit.sampled.dev.tsv', sep='\t',  header=None, names=["native","latin","count"])

In [None]:
# Preparing the dataset for the model to fit #
class Dataset_Tamil(Dataset):
    def __init__(self, dataframe, build_vocab=True, input_token_index=None, output_token_index=None,
                 max_enc_seq_len=0, max_dec_seq_len=0):
        
        # Input variables
        self.input_df = dataframe
        self.input_words = []
        self.output_words = []
        # Characters of the language
        self.input_characters = set()
        self.output_characters = set()

        # Iterating thorough the rows
        for _, row in self.input_df.iterrows():
            input_word = str(row["latin"])
            output_word = "\t" + str(row["native"]) + "\n"
            self.input_words.append(input_word)
            self.output_words.append(output_word)
        
        if build_vocab:
            self.build_vocab()
        else:
            # Token index for sequence building
            self.input_token_index = input_token_index
            self.output_token_index = output_token_index
            # Heuristics lengths for the encoder decoder
            self.max_enc_seq_len = max_enc_seq_len
            self.max_dec_seq_len = max_dec_seq_len

        # Finding the encoder/decoder tokens 
        self.total_encoder_tokens = len(self.input_token_index)
        self.total_decoder_tokens = len(self.output_token_index)

    def build_vocab(self):
        # Building the vocabulary
        self.input_characters = sorted(set(" ".join(self.input_words)))
        self.output_characters = sorted(set(" ".join(self.output_words)))
        # Adding the padding character if not present
        if " " not in self.input_characters:
            self.input_characters.append(" ")
        if " " not in self.output_characters:
            self.output_characters.append(" ")

        # Fitting/Finding the necessary values from training data
        self.input_token_index = {char: i for i, char in enumerate(self.input_characters)}
        self.output_token_index = {char: i for i, char in enumerate(self.output_characters)}

        self.input_token_index_reversed = {i: char for i, char in enumerate(self.input_characters)}
        self.output_token_index_reversed = {i: char for i, char in enumerate(self.output_characters)}

        self.max_enc_seq_len = max(len(txt) for txt in self.input_words)
        self.max_dec_seq_len = max(len(txt) for txt in self.output_words)

    def __len__(self):
        return len(self.input_words)
    
    def __getitem__(self, index):
        input_word = self.input_words[index]
        output_word = self.output_words[index]

        # Finding the input for each stages of the network
        encoder_input = np.zeros((self.max_enc_seq_len, self.total_encoder_tokens), dtype=np.float32)
        decoder_input = np.zeros((self.max_dec_seq_len, self.total_decoder_tokens), dtype=np.float32)
        decoder_output = np.zeros((self.max_dec_seq_len, self.total_decoder_tokens), dtype=np.float32)

        for t, char in enumerate(input_word):
            if char in self.input_token_index:
                encoder_input[t, self.input_token_index[char]] = 1.0
        for t in range(len(input_word), self.max_enc_seq_len):
            encoder_input[t, self.input_token_index[" "]] = 1.0

        for t, char in enumerate(output_word):
            if char in self.output_token_index:
                decoder_input[t, self.output_token_index[char]] = 1.0
                if t > 0:
                    decoder_output[t - 1, self.output_token_index[char]] = 1.0
        # Fill remaining positions with space character
        for t in range(len(output_word), self.max_dec_seq_len):
            decoder_input[t, self.output_token_index[" "]] = 1.0
            
        for t in range(len(output_word) - 1, self.max_dec_seq_len):
            decoder_output[t, self.output_token_index[" "]] = 1.0

        return (
            torch.from_numpy(encoder_input),
            torch.from_numpy(decoder_input),
            torch.from_numpy(decoder_output)
        )

In [None]:
# Model classes definitions #
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.3, cell_type="RNN", num_layers=1, bi_directional=False):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.cell_type = cell_type.upper()
        self.dropout = dropout
        self.num_layers = num_layers

        if self.cell_type == 'LSTM':
            self.enc = nn.LSTM(input_size, hidden_size, batch_first=True, dropout=self.dropout, num_layers=self.num_layers, bidirectional=bi_directional)
        elif self.cell_type == 'GRU':
            self.enc = nn.GRU(input_size, hidden_size, batch_first=True, dropout=self.dropout, num_layers=self.num_layers, bidirectional=bi_directional)
        else:
            self.enc = nn.RNN(input_size, hidden_size, batch_first=True, dropout=self.dropout, num_layers=self.num_layers, bidirectional=bi_directional)

    def forward(self, x):
        if self.cell_type == "LSTM":
            hidden, (hn, cn) = self.enc(x)
            return hidden, (hn, cn)
        else:
            hidden, out = self.enc(x)
            return hidden, out
        
class Attention_Mechanism(nn.Module):
    def __init__(self, hidden_dim, device="cpu"):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.softmax = nn.Softmax(dim=1)
        self.tanh = nn.Tanh()
        # Creating the matrices for attention calculation
        self.V_att = nn.Parameter(torch.randn(size=(self.hidden_dim, 1), device=device)*0.1)
        self.U_att = nn.Parameter(torch.randn(size=(self.hidden_dim, self.hidden_dim), device=device)*0.1)
        self.W_att = nn.Parameter(torch.randn(size=(self.hidden_dim, self.hidden_dim), device=device)*0.1)

    def forward(self, st_1, c_j, mask):
        # Compute the attention scores and softmax
        """
        st_1 : input of size (bx1xd)
        c_j : input of size (bxLxd)
        """
        #print(st_1.shape, c_j.shape)
        inside = self.tanh(torch.matmul(c_j, self.W_att) + torch.matmul(st_1, self.U_att))
        #print(inside.shape)
        scores = torch.matmul(inside, self.V_att).squeeze(2)
        #print(scores.shape)
        scores[mask] = -torch.inf

        attention = self.softmax(scores)
        return attention
    
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.3, cell_type='RNN', num_layers=1):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.cell_type = cell_type.upper()
        self.dropout = dropout
        self.num_layers = num_layers

        if self.cell_type == 'LSTM':
            self.dec = nn.LSTM(input_size, hidden_size, batch_first=True, dropout=self.dropout, num_layers=self.num_layers)
        elif self.cell_type == 'GRU':
            self.dec = nn.GRU(input_size, hidden_size, batch_first=True, dropout=self.dropout, num_layers=self.num_layers)
        else:
            self.dec = nn.RNN(input_size, hidden_size, batch_first=True, dropout=self.dropout, num_layers=self.num_layers)

    def forward(self, x, states):
        if states == None:
            hidden, out = self.dec(x)
            return hidden, out
        elif type(states) == tuple:
            hidden, (hn, cn) = self.dec(x, states)
            return hidden, (hn, cn)
        else:
            hidden, out = self.dec(x, states)
            return hidden, out
        
class Attention_Seq2Seq(nn.Module):
    def __init__(self, input_token_index, output_token_index, max_dec_seq_len, embedding_dim,hidden_size_enc, bi_directional=False,
            nature="train", enc_cell="LSTM", dec_cell="LSTM", num_layers=1,dropout=0.2, device="cpu"):
        super().__init__()

        self.input_index_token = input_token_index
        self.output_index_token = output_token_index
        self.max_dec_seq_len = max_dec_seq_len
        self.nature = nature
        self.enc_cell_type = enc_cell.upper()
        self.dec_cell_type = dec_cell.upper()
        self.num_layers= num_layers
        self.bi_directional = bi_directional
        self.hidden_size_enc = hidden_size_enc
        self.hidden_size_dec = (1 + int(self.bi_directional == True))*hidden_size_enc
        self.embedding = nn.Linear(in_features=len(self.input_index_token), out_features=embedding_dim)
        self.embedding_act = nn.Tanh()
        self.encoder = Encoder(input_size=embedding_dim, hidden_size=hidden_size_enc, dropout=dropout, cell_type=enc_cell, num_layers=num_layers, bi_directional=self.bi_directional).to(device)
        self.attention = Attention_Mechanism(hidden_dim=self.hidden_size_dec)
        self.decoder = Decoder(input_size=len(self.output_index_token)+self.hidden_size_dec, hidden_size=self.hidden_size_dec, dropout=dropout, cell_type=dec_cell, num_layers=num_layers).to(device)
        self.device = device
        self.loss_fn = nn.CrossEntropyLoss()
        self.fc = nn.Linear(in_features=self.hidden_size_dec, out_features=len(output_token_index))

    def forward(self, batch):
        ENC_IN, DEC_IN, DEC_OUT = batch
        ENC_IN = ENC_IN.to(self.device)
        DEC_IN = DEC_IN.to(self.device)

        batch_size = ENC_IN.size(0)
        input_embedding = self.embedding_act(self.embedding(ENC_IN))
        mask_ = torch.argmax(ENC_IN, 2) == 2
        hidden_enc, states_enc = self.encoder(input_embedding)

        # Final matrix
        final_out = torch.zeros(batch_size, self.max_dec_seq_len, len(self.output_index_token), device=self.device)

        # Initial decoder input (with start token)
        in_ = DEC_IN[:, 0:1, :].clone()
        for t in range(self.max_dec_seq_len):
            if t==0:
                out_step, states_dec = self.decoder(torch.cat((in_, hidden_enc[:,-1,:].unsqueeze(1)), dim=2), None) 
            else:
                # input for next input
                in_ = DEC_IN[:, t, :].unsqueeze(1).clone()
                att_scores = self.attention(out_step, hidden_enc, mask_)

                in_ = torch.cat((in_, torch.bmm(att_scores.unsqueeze(1), hidden_enc)), dim=2)
                # Output
                out_step, states_dec = self.decoder(in_, states_dec)  

            logits_step = self.fc(out_step.squeeze(1))            
            final_out[:, t, :] = logits_step
   
        return final_out
    
    def predict_greedy(self, batch):
        ENC_IN, DEC_IN, DEC_OUT = batch
        ENC_IN = ENC_IN.to(self.device)
        DEC_IN = DEC_IN.to(self.device)

        batch_size = ENC_IN.size(0)
        input_embedding = self.embedding_act(self.embedding(ENC_IN))
        mask_ = torch.argmax(ENC_IN, 2) == 2
        hidden_enc, states_enc = self.encoder(input_embedding)

        # Final matrix
        final_out = torch.zeros(batch_size, self.max_dec_seq_len, len(self.output_index_token), device=self.device)

        # Initial decoder input (with start token)
        in_ = torch.zeros(batch_size, 1, len(self.output_index_token), device=self.device)
        in_[:, 0, 0] = 1.0

        for t in range(self.max_dec_seq_len):
            if t==0:
                out_step, states_dec = self.decoder(torch.cat((in_, hidden_enc[:,-1,:].unsqueeze(1)), dim=2), None)  
            else:
                out_step, states_dec = self.decoder(in_, states_dec)  

            logits_step = self.fc(out_step.squeeze(1))            
            final_out[:, t, :] = logits_step

            # Greedy argmax for next input
            top1 = torch.argmax(logits_step, dim=1)               
            in_ = torch.zeros(batch_size, 1, len(self.output_index_token), device=self.device)
            in_[torch.arange(batch_size), 0, top1] = 1.0
            att_scores = self.attention(out_step, hidden_enc, mask_)

            in_ = torch.cat((in_, torch.bmm(att_scores.unsqueeze(1), hidden_enc)), dim=2)
        return final_out

In [None]:
# Fucntion for validation of the model # 
def validate_seq2seq(model, val_loader, device, val_type = "greedy", beam_width=None):
    model.eval()
    total_loss = 0.0
    correct_chars = 0
    total_chars = 0
    correct_words = 0
    total_words = 0
    loss_fn = nn.CrossEntropyLoss(ignore_index=2)

    with torch.no_grad():
        tqdm_progress = tqdm(val_loader, desc="Predicting...")
        for batch in tqdm_progress:
            ENC_IN, DEC_IN, DEC_OUT = batch
            ENC_IN = ENC_IN.to(device)
            DEC_IN = DEC_IN.to(device)
            DEC_OUT = DEC_OUT.to(device)

            # Forward pass
            decoder_output = model(batch)

            # Compute loss
            vocab_size = decoder_output.size(-1)
            decoder_output = decoder_output.view(-1, vocab_size)
            decoder_target_indices = DEC_OUT.argmax(dim=-1).view(-1)

            loss = loss_fn(decoder_output, decoder_target_indices)
            total_loss += loss.item()

            # Character-wise accuracy
            if val_type == "greedy":
                decoder_output = model.predict_greedy(batch)
            else:
                decoder_output = model.predict_beam_search(batch, beam_width=beam_width)

            #print(decoder_output.shape)
            pred_tokens = decoder_output.argmax(dim=2)
            true_tokens = DEC_OUT.argmax(dim=2)
            #print(pred_tokens.shape)
            #print(true_tokens.shape)
            
            mask = true_tokens != 2  # Ignore PAD tokens
            correct_chars += (pred_tokens[mask] == true_tokens[mask]).sum().item()
            total_chars += mask.sum().item()

            mask = true_tokens != 2  # Ignore PAD tokens
            #print(mask.shape)
            total_words += decoder_output.shape[0]
            #print(pred_tokens[mask].shape)
            chk_words = (mask.int() - (pred_tokens == true_tokens).int())
            chk_words[mask == False] = 0
            correct_words += (chk_words.sum(dim = 1) == 0).sum().item()

    avg_loss = total_loss / len(val_loader)
    accuracy = correct_chars / total_chars if total_chars > 0 else 0.0
    word_acc = correct_words / total_words if total_words > 0 else 0.0
    return avg_loss, accuracy, word_acc

In [None]:
# Trainloop
def train_seq2seq(model, train_loader, val_loader, optimizer, num_epochs, device, beam_sizes = [3,5], run=None):
    loss_fn = nn.CrossEntropyLoss(ignore_index=2)  # 2 is the padding index
    max_val_char_acc = 0
    max_val_word_acc = 0
    print("Training of the model has started...")
    counter = 0
    patience = 7
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        tqdm_loader = tqdm(train_loader, desc=f"Epoch : {epoch + 1} ", ncols=100)

        for batch in tqdm_loader:
            ENC_IN, DEC_IN, DEC_OUT = batch
            ENC_IN = ENC_IN.to(device)
            DEC_IN = DEC_IN.to(device)
            DEC_OUT = DEC_OUT.to(device)
            # Move to device
            decoder_output = model(batch)

            # Reshape for loss
            decoder_output = decoder_output.view(-1, decoder_output.size(-1))
            decoder_target_indices = DEC_OUT.argmax(dim=-1).view(-1)

            loss = loss_fn(decoder_output, decoder_target_indices)
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            tqdm_loader.set_postfix({"Train Loss": loss.item()})

        avg_loss = epoch_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_loss:.4f}")

        val_loss, val_acc, val_word_acc = validate_seq2seq(model, val_loader, device)
        print(f"Epoch [{epoch+1}/{num_epochs}] | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val Word Acc: {val_word_acc:.4f}")

        if run is not None:
            run.log({"train_loss_epoch" : avg_loss, "val_loss_epoch" : val_loss, "val_char_acc" : val_acc, "val_word_acc" : val_word_acc})

        if val_word_acc > max_val_word_acc:
            max_val_char_acc = val_acc
            max_val_word_acc = val_word_acc
            torch.save(model.state_dict(),"Attention_Best_model.pth")
            counter = 0
        else:
            counter += 1

        if counter > patience:
            break

    if run is not None:
        run.summary["max_val_char_acc"] = max_val_char_acc
        run.summary["max_val_word_acc"] = max_val_word_acc



In [16]:
torch.cuda.empty_cache()
config = {
        "learning_rate" : 0.001,
        "dropout_rnn" : 0.2, 
        "batch_size" :  256,
        "epochs" : 20,
        "embedding_dim" : 256,
        "num_layers" : 2,
        "hidden_size_enc" : 128,
        "enc_cell_type" : "GRU",
        "dec_cell_type" : "RNN",
        "bi_directional" : True,
    }
run = wandb.init(entity="A3_DA6401_DL", project="Attention_RNN", name="Best Model Attention S2S", config=config)

# Loading the datasets and dataloaders
train_dataset = Dataset_Tamil(df_train)
val_dataset = Dataset_Tamil(df_val, build_vocab=False, input_token_index=train_dataset.input_token_index, 
                            output_token_index=train_dataset.output_token_index, max_enc_seq_len=train_dataset.max_enc_seq_len,
                            max_dec_seq_len=train_dataset.max_dec_seq_len)
test_dataset = Dataset_Tamil(df_test, build_vocab=False, input_token_index=train_dataset.input_token_index, 
                            output_token_index=train_dataset.output_token_index, max_enc_seq_len=train_dataset.max_enc_seq_len,
                            max_dec_seq_len=train_dataset.max_dec_seq_len)

train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Attention_Seq2Seq(input_token_index=train_dataset.input_token_index, output_token_index=train_dataset.output_token_index, max_dec_seq_len=train_dataset.max_dec_seq_len,
                embedding_dim=config["embedding_dim"], hidden_size_enc=config["hidden_size_enc"], bi_directional=config["bi_directional"], enc_cell=config["enc_cell_type"], dec_cell=config["dec_cell_type"], 
                num_layers=config["num_layers"], dropout=config["dropout_rnn"], device=device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

train_seq2seq(model, train_loader, val_loader, optimizer, num_epochs=config["epochs"], device=device, run=run)

Training of the model has started...


Epoch : 1 : 100%|████████████████████████████████| 267/267 [00:27<00:00,  9.75it/s, Train Loss=1.13]


Epoch [1/20] | Train Loss: 2.1953


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.88it/s]


Epoch [1/20] | Val Loss: 1.0201 | Val Acc: 0.5758 | Val Word Acc: 0.0437


Epoch : 2 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.73it/s, Train Loss=0.393]


Epoch [2/20] | Train Loss: 0.6831


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.91it/s]


Epoch [2/20] | Val Loss: 0.4789 | Val Acc: 0.8109 | Val Word Acc: 0.4160


Epoch : 3 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.73it/s, Train Loss=0.323]


Epoch [3/20] | Train Loss: 0.4055


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.87it/s]


Epoch [3/20] | Val Loss: 0.4016 | Val Acc: 0.8299 | Val Word Acc: 0.4777


Epoch : 4 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.62it/s, Train Loss=0.242]


Epoch [4/20] | Train Loss: 0.3340


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.55it/s]


Epoch [4/20] | Val Loss: 0.3748 | Val Acc: 0.8424 | Val Word Acc: 0.5301


Epoch : 5 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.56it/s, Train Loss=0.245]


Epoch [5/20] | Train Loss: 0.2916


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.61it/s]


Epoch [5/20] | Val Loss: 0.3508 | Val Acc: 0.8493 | Val Word Acc: 0.5515


Epoch : 6 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.66it/s, Train Loss=0.246]


Epoch [6/20] | Train Loss: 0.2615


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.67it/s]


Epoch [6/20] | Val Loss: 0.3392 | Val Acc: 0.8464 | Val Word Acc: 0.5553


Epoch : 7 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.63it/s, Train Loss=0.264]


Epoch [7/20] | Train Loss: 0.2466


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.67it/s]


Epoch [7/20] | Val Loss: 0.3242 | Val Acc: 0.8575 | Val Word Acc: 0.5859


Epoch : 8 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.61it/s, Train Loss=0.154]


Epoch [8/20] | Train Loss: 0.2173


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.65it/s]


Epoch [8/20] | Val Loss: 0.3244 | Val Acc: 0.8559 | Val Word Acc: 0.5812


Epoch : 9 : 100%|███████████████████████████████| 267/267 [00:27<00:00,  9.62it/s, Train Loss=0.208]


Epoch [9/20] | Train Loss: 0.2048


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.64it/s]


Epoch [9/20] | Val Loss: 0.3116 | Val Acc: 0.8651 | Val Word Acc: 0.6102


Epoch : 10 : 100%|██████████████████████████████| 267/267 [00:27<00:00,  9.61it/s, Train Loss=0.188]


Epoch [10/20] | Train Loss: 0.1887


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.76it/s]


Epoch [10/20] | Val Loss: 0.3146 | Val Acc: 0.8632 | Val Word Acc: 0.6067


Epoch : 11 : 100%|██████████████████████████████| 267/267 [00:27<00:00,  9.69it/s, Train Loss=0.133]


Epoch [11/20] | Train Loss: 0.1781


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.75it/s]


Epoch [11/20] | Val Loss: 0.3227 | Val Acc: 0.8603 | Val Word Acc: 0.6051


Epoch : 12 : 100%|██████████████████████████████| 267/267 [00:27<00:00,  9.68it/s, Train Loss=0.246]


Epoch [12/20] | Train Loss: 0.1647


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.71it/s]


Epoch [12/20] | Val Loss: 0.3167 | Val Acc: 0.8678 | Val Word Acc: 0.6164


Epoch : 13 : 100%|██████████████████████████████| 267/267 [00:27<00:00,  9.63it/s, Train Loss=0.122]


Epoch [13/20] | Train Loss: 0.1603


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.49it/s]


Epoch [13/20] | Val Loss: 0.3198 | Val Acc: 0.8637 | Val Word Acc: 0.6127


Epoch : 14 : 100%|██████████████████████████████| 267/267 [00:27<00:00,  9.54it/s, Train Loss=0.188]


Epoch [14/20] | Train Loss: 0.1545


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.35it/s]


Epoch [14/20] | Val Loss: 0.3204 | Val Acc: 0.8586 | Val Word Acc: 0.6111


Epoch : 15 : 100%|██████████████████████████████| 267/267 [00:28<00:00,  9.49it/s, Train Loss=0.135]


Epoch [15/20] | Train Loss: 0.1433


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.51it/s]


Epoch [15/20] | Val Loss: 0.3353 | Val Acc: 0.8630 | Val Word Acc: 0.6096


Epoch : 16 : 100%|██████████████████████████████| 267/267 [00:28<00:00,  9.44it/s, Train Loss=0.116]


Epoch [16/20] | Train Loss: 0.1331


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.50it/s]


Epoch [16/20] | Val Loss: 0.3285 | Val Acc: 0.8651 | Val Word Acc: 0.6225


Epoch : 17 : 100%|██████████████████████████████| 267/267 [00:28<00:00,  9.49it/s, Train Loss=0.112]


Epoch [17/20] | Train Loss: 0.1311


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.37it/s]


Epoch [17/20] | Val Loss: 0.3320 | Val Acc: 0.8590 | Val Word Acc: 0.6115


Epoch : 18 : 100%|██████████████████████████████| 267/267 [00:28<00:00,  9.48it/s, Train Loss=0.155]


Epoch [18/20] | Train Loss: 0.1285


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.42it/s]


Epoch [18/20] | Val Loss: 0.3427 | Val Acc: 0.8567 | Val Word Acc: 0.6070


Epoch : 19 : 100%|██████████████████████████████| 267/267 [00:27<00:00,  9.54it/s, Train Loss=0.129]


Epoch [19/20] | Train Loss: 0.1268


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.58it/s]


Epoch [19/20] | Val Loss: 0.3309 | Val Acc: 0.8632 | Val Word Acc: 0.6130


Epoch : 20 : 100%|██████████████████████████████| 267/267 [00:27<00:00,  9.55it/s, Train Loss=0.125]


Epoch [20/20] | Train Loss: 0.1103


Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.58it/s]

Epoch [20/20] | Val Loss: 0.3385 | Val Acc: 0.8693 | Val Word Acc: 0.6244





In [18]:
model_best = Attention_Seq2Seq(input_token_index=train_dataset.input_token_index, output_token_index=train_dataset.output_token_index, max_dec_seq_len=train_dataset.max_dec_seq_len,
                embedding_dim=config["embedding_dim"], hidden_size_enc=config["hidden_size_enc"], bi_directional=config["bi_directional"], enc_cell=config["enc_cell_type"], dec_cell=config["dec_cell_type"], 
                num_layers=config["num_layers"], dropout=config["dropout_rnn"], device=device).to(device)

model_best.load_state_dict(torch.load("/kaggle/working/Attention_Best_model.pth", weights_only=True))

<All keys matched successfully>

In [19]:
validate_seq2seq(model_best, val_loader, device)

Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.57it/s]


(0.33853095521529514, 0.8693159257318023, 0.6244324007616816)

In [21]:
# Test data
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)
_, test_char_acc, test_word_acc = validate_seq2seq(model_best, test_loader, device)

print(test_char_acc, test_word_acc)

if run is not None:
    run.summary["test_char_acc"] = test_char_acc
    run.summary["test_word_acc"] = test_word_acc

Predicting...: 100%|██████████| 27/27 [00:02<00:00, 10.61it/s]

0.8589120478549397 0.5996503496503497





In [22]:
def write_outputs(model, data_loader, filename_csv):
    tqdm_progress = tqdm(data_loader, desc="Writing...")

    list_in, list_out, list_pred = [], [], []
    for batch in tqdm_progress:
        ENC_IN, DEC_IN, DEC_OUT = batch
        out_pred = model.predict_greedy(batch)

        for ix in range(ENC_IN.shape[0]):
            str_in, str_out, str_pred = "", "", ""
            input_word_vec = ENC_IN[ix].argmax(1)
            output_word_vec = DEC_OUT[ix].argmax(1)
            output_pred_vec = out_pred[ix].argmax(1)

            for jx in range(train_dataset.max_dec_seq_len):
                char = train_dataset.output_token_index_reversed[output_word_vec[jx].item()]
                if char == "\n":
                    break
                str_out += char                

            for jx in range(train_dataset.max_dec_seq_len):
                char = train_dataset.output_token_index_reversed[output_pred_vec[jx].item()]
                if char == "\n":
                    break
                str_pred += char      

            for jx in range(train_dataset.max_enc_seq_len):
                char = train_dataset.input_token_index_reversed[input_word_vec[jx].item()]
                if char == " ":
                    break
                str_in += char     

            list_in.append(str_in)
            list_out.append(str_out)
            list_pred.append(str_pred)

    pds = pd.DataFrame(data = {"english" : list_in, "truth tamil" : list_out, "pred tamil" : list_pred})
    pds.to_csv(filename_csv, encoding="utf-8", index=False)       

In [23]:
write_outputs(model, test_loader, "Attention_predictions_test.csv")

Writing...: 100%|██████████| 27/27 [00:03<00:00,  8.89it/s]
