In [1]:
import wandb
wandb.login(key="5fb34431b405eb21dc0f263e5b3cf2c15fdc7471")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma24m004[0m ([33mma24m004-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import wandb
from torch.nn.utils.rnn import pad_sequence

# Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = getattr(nn, cell_type)(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        if isinstance(hidden, tuple):  # LSTM
            return hidden[0].contiguous(), hidden[1].contiguous()
        return hidden.contiguous(), None  # RNN, GRU

# Decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, cell_type, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = getattr(nn, cell_type)(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell=None):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        if cell is not None:
            hidden = hidden.contiguous()
            cell = cell.contiguous()
            output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        else:
            hidden = hidden.contiguous()
            output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

# Seq2Seq Model with Beam Search
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)

        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            teacher_force = np.random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        return outputs

    def predict(self, src, max_len=30, beam_size=3):
        self.eval()
        batch_size = src.size(0)
        hidden, cell = self.encoder(src)
        outputs = []

        for i in range(batch_size):
            h = hidden[:, i:i+1].contiguous()
            c = cell[:, i:i+1].contiguous() if cell is not None else None
            beams = [(torch.tensor([1], device=self.device), 0.0, h, c)]  # [sequence, score, hidden, cell]
            for _ in range(max_len):
                new_beams = []
                for seq, score, h, c in beams:
                    input = seq[-1].unsqueeze(0)
                    output, h_new, c_new = self.decoder(input, h, c)
                    probs = torch.log_softmax(output, dim=1).squeeze(0)
                    topk = torch.topk(probs, beam_size)
                    for idx, prob in zip(topk.indices, topk.values):
                        new_seq = torch.cat([seq, idx.unsqueeze(0)])
                        new_beams.append((new_seq, score + prob.item(), h_new.contiguous(), c_new.contiguous() if c_new is not None else None))
                beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]
                if beams[0][0][-1].item() == 2:  # Stop if <EOS>
                    break
            outputs.append(beams[0][0][1:])
        return outputs

# Dataset
class DakshinaDataset(Dataset):
    def __init__(self, data, input_vocab, output_vocab):
        self.data = data
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = [self.input_vocab.get(c, self.input_vocab['<UNK>']) for c in self.data.iloc[idx, 1]] + [self.input_vocab['<EOS>']]
        tgt = [self.output_vocab['<SOS>']] + [self.output_vocab.get(c, self.output_vocab['<UNK>']) for c in self.data.iloc[idx, 0]] + [self.output_vocab['<EOS>']]
        return torch.tensor(src), torch.tensor(tgt)

# Vocab Creation
def create_vocab(data, column):
    vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
    for seq in data[column]:
        if pd.notna(seq):
            for char in seq:
                if char not in vocab:
                    vocab[char] = len(vocab)
    return vocab

# Collate
def pad_collate(batch):
    src_batch, tgt_batch = zip(*batch)
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=0)
    tgt_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=0)
    return src_padded, tgt_padded


In [3]:

# Train
def train_model(config=None):
    with wandb.init(config=config):
        config = wandb.config
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Load data
        train_data = pd.read_csv('/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv', sep='\t', header=None, dtype=str).dropna()
        dev_data = pd.read_csv('/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv', sep='\t', header=None, dtype=str).dropna()

        # Create vocabularies
        input_vocab = create_vocab(train_data, 1)  # English (source)
        output_vocab = create_vocab(train_data, 0)  # Hindi (target)

        # Prepare datasets and loaders
        train_dataset = DakshinaDataset(train_data, input_vocab, output_vocab)
        dev_dataset = DakshinaDataset(dev_data, input_vocab, output_vocab)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)
        dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)

        # Initialize model
        encoder = Encoder(len(input_vocab), config.emb_dim, config.hidden_dim, config.num_layers, config.cell_type, config.dropout)
        decoder = Decoder(len(output_vocab), config.emb_dim, config.hidden_dim, config.num_layers, config.cell_type, config.dropout)
        model = Seq2Seq(encoder, decoder, device).to(device)

        # Loss and optimizer
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss(ignore_index=0)

        best_val_accuracy = 0

        # Training loop
        for epoch in range(config.epochs):
            model.train()
            train_loss, train_correct, train_total = 0, 0, 0
            for src, tgt in train_loader:
                src, tgt = src.to(device), tgt.to(device)
                optimizer.zero_grad()
                output = model(src, tgt)
                output = output[:, 1:].reshape(-1, output.shape[-1])
                tgt = tgt[:, 1:].reshape(-1)
                loss = criterion(output, tgt)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                preds = output.argmax(dim=1)
                non_pad_mask = tgt != 0
                train_correct += (preds == tgt)[non_pad_mask].sum().item()
                train_total += non_pad_mask.sum().item()

            train_accuracy = train_correct / train_total

            # Validation
            model.eval()
            val_loss, val_correct, val_total = 0, 0, 0
            with torch.no_grad():
                for src, tgt in dev_loader:
                    src, tgt = src.to(device), tgt.to(device)
                    output = model(src, tgt, teacher_forcing_ratio=0)
                    output = output[:, 1:].reshape(-1, output.shape[-1])
                    tgt = tgt[:, 1:].reshape(-1)
                    loss = criterion(output, tgt)
                    val_loss += loss.item()
                    preds = output.argmax(dim=1)
                    non_pad_mask = tgt != 0
                    val_correct += (preds == tgt)[non_pad_mask].sum().item()
                    val_total += non_pad_mask.sum().item()

            val_accuracy = val_correct / val_total
            patience_counter = 0
            # Save best model
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                torch.save(model.state_dict(), '/kaggle/working/best_model.pt')
            else:
                patience_counter += 1
                if patience_counter >= 3:
                    print("Early stopping triggered.")
                    break

            # Log metrics to wandb
            wandb.log({
                'epoch': epoch,
                'train_loss': train_loss / len(train_loader),
                'val_loss': val_loss / len(dev_loader),
                'train_accuracy': train_accuracy,
                'val_accuracy': val_accuracy
            })

            # Log sample predictions
            src_sample, tgt_sample = next(iter(dev_loader))
            src_sample, tgt_sample = src_sample.to(device), tgt_sample.to(device)
            preds = model.predict(src_sample[:5], beam_size=config.beam_size)

            inv_input_vocab = {v: k for k, v in input_vocab.items()}
            inv_output_vocab = {v: k for k, v in output_vocab.items()}
            table = wandb.Table(columns=["Input", "Target", "Prediction"])
            for i in range(len(preds)):
                input_str = ''.join([inv_input_vocab.get(id.item(), '?') for id in src_sample[i] if id.item() not in [0, input_vocab['<EOS>']]])
                target_str = ''.join([inv_output_vocab.get(id.item(), '?') for id in tgt_sample[i] if id.item() not in [0, output_vocab['<EOS>']]])
                pred_str = ''.join([inv_output_vocab.get(id.item(), '?') for id in preds[i]])
                table.add_data(input_str, target_str, pred_str)
            wandb.log({"Predictions": table})


In [5]:

# Sweep
sweep_config = {
    'method': 'random',
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        'emb_dim': {'values': [16, 32, 64, 256]},
        'hidden_dim': {'values': [16, 32, 64, 256]},
        'num_layers': {'values': [1, 2, 3]},
        'cell_type': {'values': ['RNN', 'GRU', 'LSTM']},
        'dropout': {'values': [0, 0.2, 0.3]},
        'epochs': {'values': [5, 10]},
        'beam_size': {'values': [1, 3, 5]}
    }
}

sweep_id = wandb.sweep(sweep_config, project='DL_ASSIGNMENT_3_RNN')
wandb.agent(sweep_id, train_model, count=60)
wandb.finish()

Create sweep with ID: pd9x2ajs
Sweep URL: https://wandb.ai/ma24m004-iit-madras/DL_ASSIGNMENT_3_RNN/sweeps/pd9x2ajs


[34m[1mwandb[0m: Agent Starting Run: krv2nmsb with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▂▄▅▅▆▇▇██
train_loss,█▆▅▄▃▃▂▂▁▁
val_accuracy,▁▂▃▄▅▆▇▇██
val_loss,█▇▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.51096
train_loss,1.70208
val_accuracy,0.46767
val_loss,1.88986


[34m[1mwandb[0m: Agent Starting Run: daggptac with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆▇█
train_loss,█▃▂▂▁
val_accuracy,▁▄███
val_loss,█▅▁▁▁

0,1
epoch,4.0
train_accuracy,0.24528
train_loss,2.86821
val_accuracy,0.21969
val_loss,3.021


[34m[1mwandb[0m: Agent Starting Run: kityiqav with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▃▅▇█
train_loss,█▆▄▂▁
val_accuracy,▁▃▆▇█
val_loss,█▆▃▂▁

0,1
epoch,4.0
train_accuracy,0.46836
train_loss,1.85042
val_accuracy,0.45897
val_loss,1.86345


[34m[1mwandb[0m: Agent Starting Run: d91ku0oa with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇▇███
train_loss,█▄▃▂▂▂▂▁▁▁
val_accuracy,▁▅▇▇▅█▆▆▇█
val_loss,█▄▂▁▃▁▂▂▂▁

0,1
epoch,9.0
train_accuracy,0.26538
train_loss,2.7159
val_accuracy,0.22605
val_loss,2.94219


[34m[1mwandb[0m: Agent Starting Run: 1n7yz9wm with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▁▄▆▇█
val_loss,█▅▃▂▁

0,1
epoch,4.0
train_accuracy,0.4867
train_loss,1.73923
val_accuracy,0.48681
val_loss,1.73488


[34m[1mwandb[0m: Agent Starting Run: n8uskvhs with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▆▆▇▇▇▇█▇
train_loss,█▄▃▃▂▂▂▁▁▁
val_accuracy,▁▅▄▅▆▇▄▅▇█
val_loss,█▄▄▄▂▁▃▃▂▁

0,1
epoch,9.0
train_accuracy,0.27507
train_loss,2.6867
val_accuracy,0.24838
val_loss,2.85513


[34m[1mwandb[0m: Agent Starting Run: eziys4zb with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▅▆▆▇▇▇██
train_loss,█▅▄▃▂▂▂▁▁▁
val_accuracy,▁▃▄▅▆▇▇▇██
val_loss,█▆▄▃▃▂▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.46552
train_loss,1.89394
val_accuracy,0.41351
val_loss,2.12707


[34m[1mwandb[0m: Agent Starting Run: jkipv6s1 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▆▅▄▃▂▂▂▁▁
val_accuracy,▁▂▄▅▆▆▇▇██
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.4941
train_loss,1.70071
val_accuracy,0.45916
val_loss,1.82521


[34m[1mwandb[0m: Agent Starting Run: do1f2u9n with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▆▆▇▇████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.73982
train_loss,0.84941
val_accuracy,0.67849
val_loss,1.12693


[34m[1mwandb[0m: Agent Starting Run: 62btgdkr with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁
val_accuracy,▁▄▅▆▇▇████
val_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.46095
train_loss,1.80774
val_accuracy,0.39471
val_loss,2.08209


[34m[1mwandb[0m: Agent Starting Run: g5a4juxs with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆▇█
train_loss,█▄▃▂▁
val_accuracy,▁▄▅▅█
val_loss,█▆▄▃▁

0,1
epoch,4.0
train_accuracy,0.30315
train_loss,2.56054
val_accuracy,0.26161
val_loss,2.77817


[34m[1mwandb[0m: Agent Starting Run: vyd10e69 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▆▇▇▇▇▇▇█
train_loss,█▄▃▂▁▁▂▂▂▁
val_accuracy,▁▄▆█▅█▆▅▇▅
val_loss,█▆▃▁▄▂▃▄▃▄

0,1
epoch,9.0
train_accuracy,0.256
train_loss,2.80947
val_accuracy,0.21457
val_loss,3.03175


[34m[1mwandb[0m: Agent Starting Run: i7ub1zxo with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▃▁▂▅█
val_loss,▇█▆▂▁

0,1
epoch,4.0
train_accuracy,0.27279
train_loss,2.67694
val_accuracy,0.23704
val_loss,2.93087


[34m[1mwandb[0m: Agent Starting Run: 3nlcu3xs with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▁▄▆▇█
val_loss,█▅▃▂▁

0,1
epoch,4.0
train_accuracy,0.55941
train_loss,1.47354
val_accuracy,0.53816
val_loss,1.55123


[34m[1mwandb[0m: Agent Starting Run: kp7law0l with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆██
train_loss,█▅▃▂▁
val_accuracy,▁▆▇██
val_loss,█▅▃▂▁

0,1
epoch,4.0
train_accuracy,0.29671
train_loss,2.57123
val_accuracy,0.2565
val_loss,2.7822


[34m[1mwandb[0m: Agent Starting Run: wucczmap with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▁▄▆▇█
val_loss,█▅▃▂▁

0,1
epoch,4.0
train_accuracy,0.51987
train_loss,1.62403
val_accuracy,0.51385
val_loss,1.63891


[34m[1mwandb[0m: Agent Starting Run: jfz9rifo with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▄▃▂▁
val_accuracy,▁▄▆▇█
val_loss,█▅▃▂▁

0,1
epoch,4.0
train_accuracy,0.56894
train_loss,1.42768
val_accuracy,0.51296
val_loss,1.64224


[34m[1mwandb[0m: Agent Starting Run: 8fbz68zf with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▂▂▁
val_accuracy,▁▅▇██
val_loss,█▄▂▁▁

0,1
epoch,4.0
train_accuracy,0.73885
train_loss,0.86265
val_accuracy,0.68795
val_loss,1.05917


[34m[1mwandb[0m: Agent Starting Run: wp80w1kk with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▁▃▄▆█
val_loss,█▇▄▂▁

0,1
epoch,4.0
train_accuracy,0.30418
train_loss,2.51736
val_accuracy,0.29908
val_loss,2.55402


[34m[1mwandb[0m: Agent Starting Run: dkigx8tc with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▅▇█
train_loss,█▄▃▂▁
val_accuracy,▁▃▂▆█
val_loss,██▇▄▁

0,1
epoch,4.0
train_accuracy,0.24791
train_loss,2.85222
val_accuracy,0.21734
val_loss,3.02932


[34m[1mwandb[0m: Agent Starting Run: 83oxt40b with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▄▅█▆▃▅▁▂▆▆
val_loss,▅▄▁▃▇▄█▅▄▃

0,1
epoch,9.0
train_accuracy,0.23501
train_loss,2.89542
val_accuracy,0.18548
val_loss,3.18117


[34m[1mwandb[0m: Agent Starting Run: 0ox3rrvj with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▆▇██
train_loss,█▃▂▁▁
val_accuracy,▁▆▇██
val_loss,█▃▁▁▁

0,1
epoch,4.0
train_accuracy,0.8589
train_loss,0.47966
val_accuracy,0.70857
val_loss,1.09114


[34m[1mwandb[0m: Agent Starting Run: lv0wb6nf with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▁▄▆▇█
val_loss,█▄▃▂▁

0,1
epoch,4.0
train_accuracy,0.60381
train_loss,1.32254
val_accuracy,0.57742
val_loss,1.40278


[34m[1mwandb[0m: Agent Starting Run: 8elza42z with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆▇█
train_loss,█▄▃▁▁
val_accuracy,▁▃▆▇█
val_loss,█▆▂▂▁

0,1
epoch,4.0
train_accuracy,0.26246
train_loss,2.75951
val_accuracy,0.23453
val_loss,2.94255


[34m[1mwandb[0m: Agent Starting Run: erjx8gtx with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▆▇██
train_loss,█▄▂▁▁
val_accuracy,▃▄▆▁█
val_loss,█▆▄█▁

0,1
epoch,4.0
train_accuracy,0.26925
train_loss,2.72099
val_accuracy,0.23466
val_loss,2.95966


[34m[1mwandb[0m: Agent Starting Run: fny9uefy with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆▇█
train_loss,█▄▂▂▁
val_accuracy,▁▅▆▇█
val_loss,█▄▃▂▁

0,1
epoch,4.0
train_accuracy,0.62863
train_loss,1.20378
val_accuracy,0.58062
val_loss,1.3641


[34m[1mwandb[0m: Agent Starting Run: ft18xqv3 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▄▆▆▇▇███
train_loss,█▆▄▃▂▂▂▁▁▁
val_accuracy,▁▃▅▆▆▇▇███
val_loss,█▅▄▃▃▂▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.68801
train_loss,1.05611
val_accuracy,0.60166
val_loss,1.36891


[34m[1mwandb[0m: Agent Starting Run: w6zydejo with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▆▄▃▃▂▂▁▁▁
val_accuracy,▁▂▃▅▅▆▇▇██
val_loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.48772
train_loss,1.68609
val_accuracy,0.45557
val_loss,1.83927


[34m[1mwandb[0m: Agent Starting Run: ikjp3zui with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▇▇█
train_loss,█▄▂▂▁
val_accuracy,▁▅▇▆█
val_loss,█▅▂▂▁

0,1
epoch,4.0
train_accuracy,0.26648
train_loss,2.74294
val_accuracy,0.22961
val_loss,2.93372


[34m[1mwandb[0m: Agent Starting Run: gtdpiqca with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▄▅▅▆▇▇██
train_loss,█▅▄▄▃▃▂▂▁▁
val_accuracy,▁▁▄▃▅▆▅█▇█
val_loss,██▆▆▅▄▄▂▁▁

0,1
epoch,9.0
train_accuracy,0.25933
train_loss,2.78432
val_accuracy,0.24222
val_loss,2.91559


[34m[1mwandb[0m: Agent Starting Run: hd7ws3yv with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▆▇▇█████
val_loss,█▄▃▂▁▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.81118
train_loss,0.63639
val_accuracy,0.67327
val_loss,1.17496


[34m[1mwandb[0m: Agent Starting Run: lszcmlh7 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▃▃▂▂▂▁▁▁
val_accuracy,▁▃▅▆▇▇▇███
val_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.68313
train_loss,1.04601
val_accuracy,0.57475
val_loss,1.4424


[34m[1mwandb[0m: Agent Starting Run: w1659ueu with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▄▅▅▆▇▇▇█
train_loss,█▆▅▄▃▃▂▂▁▁
val_accuracy,▁▂▃▄▅▅▆▇██
val_loss,█▇▆▅▄▃▃▂▁▁

0,1
epoch,9.0
train_accuracy,0.42805
train_loss,1.99516
val_accuracy,0.40731
val_loss,2.08023


[34m[1mwandb[0m: Agent Starting Run: 1pq51mq6 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▇▇█
train_loss,█▄▂▁▁
val_accuracy,▁▅▆▇█
val_loss,█▄▂▂▁

0,1
epoch,4.0
train_accuracy,0.6452
train_loss,1.15078
val_accuracy,0.60133
val_loss,1.31344


[34m[1mwandb[0m: Agent Starting Run: nvyedji4 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▅▆▇▇▇███
train_loss,█▆▄▂▂▂▁▁▁▁
val_accuracy,▃▁▅█▆█▇▇▇▇
val_loss,▇█▃▂▂▁▁▁▂▁

0,1
epoch,9.0
train_accuracy,0.28514
train_loss,2.60523
val_accuracy,0.22226
val_loss,2.92541


[34m[1mwandb[0m: Agent Starting Run: doceojxg with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▅▆▇▇▇▇▇██
val_loss,█▃▂▁▂▂▂▃▁▃

0,1
epoch,9.0
train_accuracy,0.88009
train_loss,0.39747
val_accuracy,0.71784
val_loss,1.11826


[34m[1mwandb[0m: Agent Starting Run: c6kr1ieo with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▁▄▆▇█
val_loss,█▆▄▂▁

0,1
epoch,4.0
train_accuracy,0.40643
train_loss,2.21089
val_accuracy,0.38702
val_loss,2.32224


[34m[1mwandb[0m: Agent Starting Run: 4gxmbw2t with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▄▃▂▁
val_accuracy,▁▄▅▇█
val_loss,█▅▄▂▁

0,1
epoch,4.0
train_accuracy,0.33958
train_loss,2.46079
val_accuracy,0.3163
val_loss,2.61417


[34m[1mwandb[0m: Agent Starting Run: 1w3d861a with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆██
train_loss,█▄▂▂▁
val_accuracy,▁▅▇█▇
val_loss,█▄▂▁▁

0,1
epoch,4.0
train_accuracy,0.28565
train_loss,2.6621
val_accuracy,0.23839
val_loss,2.89108


[34m[1mwandb[0m: Agent Starting Run: 93i6xr98 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▆▇█
train_loss,█▅▃▂▁
val_accuracy,▁▄▆▆█
val_loss,█▅▃▂▁

0,1
epoch,4.0
train_accuracy,0.3187
train_loss,2.41063
val_accuracy,0.32128
val_loss,2.41775


[34m[1mwandb[0m: Agent Starting Run: v4pvu293 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▅▆▆▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁
val_accuracy,▁▃▅▆▇▇▇▇██
val_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.65204
train_loss,1.13116
val_accuracy,0.59968
val_loss,1.33295


[34m[1mwandb[0m: Agent Starting Run: gn6ms7q9 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▇▇▇████
train_loss,█▅▄▃▂▂▁▁▁▁
val_accuracy,▅▁▇▇▇█▇███
val_loss,▄█▂▂▂▁▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.25853
train_loss,2.78705
val_accuracy,0.22176
val_loss,2.99498


[34m[1mwandb[0m: Agent Starting Run: wb8clsx5 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▅▆▆▇▇███
train_loss,█▆▄▃▂▂▂▁▁▁
val_accuracy,▁▄▅▆▇▇▇███
val_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.65941
train_loss,1.11861
val_accuracy,0.6143
val_loss,1.2791


[34m[1mwandb[0m: Agent Starting Run: dt51xpbp with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▆▆▆▇▇██
train_loss,█▅▄▃▂▂▂▁▁▁
val_accuracy,▃▁▅▆▅▇▆█▇█
val_loss,▇█▄▃▃▂▃▁▂▁

0,1
epoch,9.0
train_accuracy,0.25376
train_loss,2.81557
val_accuracy,0.23301
val_loss,2.95409


[34m[1mwandb[0m: Agent Starting Run: 7vgeo0oo with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▂▃▄▅▆▇███
train_loss,█▆▅▄▄▂▂▁▁▁
val_accuracy,▁▃▂▂▆▅▆█▇▇
val_loss,█▆▆▆▃▃▂▁▂▂

0,1
epoch,9.0
train_accuracy,0.33491
train_loss,2.43999
val_accuracy,0.26794
val_loss,2.85393


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇█████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▅▇▇▇█▇██▆
val_loss,█▅▃▂▂▁▂▁▁▂

0,1
epoch,9.0
train_accuracy,0.2594
train_loss,2.79907
val_accuracy,0.21306
val_loss,3.04769


[34m[1mwandb[0m: Agent Starting Run: 12th7s4z with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▇██
train_loss,█▅▂▁▁
val_accuracy,▁▆▆█▅
val_loss,█▃▂▁▃

0,1
epoch,4.0
train_accuracy,0.26372
train_loss,2.72263
val_accuracy,0.21431
val_loss,3.00501


[34m[1mwandb[0m: Agent Starting Run: wi4c51tv with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▆▇▇█
train_loss,█▃▂▂▁
val_accuracy,▁▇█▄▅
val_loss,█▂▁▂▂

0,1
epoch,4.0
train_accuracy,0.26216
train_loss,2.76131
val_accuracy,0.21339
val_loss,3.02459


[34m[1mwandb[0m: Agent Starting Run: xctkwo1h with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▃▅▇█
train_loss,█▆▄▂▁
val_accuracy,▁▅▂▇█
val_loss,█▆▆▂▁

0,1
epoch,4.0
train_accuracy,0.26219
train_loss,2.73049
val_accuracy,0.22318
val_loss,2.96941


[34m[1mwandb[0m: Agent Starting Run: venlvfv6 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
val_accuracy,▁▄▅▆▇▇████
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.7036
train_loss,0.96201
val_accuracy,0.65055
val_loss,1.16326


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1z29g3gu with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▅▆█
train_loss,█▅▄▃▁
val_accuracy,▁▆▄▆█
val_loss,█▄▅▄▁

0,1
epoch,4.0
train_accuracy,0.28857
train_loss,2.62565
val_accuracy,0.23611
val_loss,2.93243


[34m[1mwandb[0m: Agent Starting Run: mc3njpq9 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆▇█
train_loss,█▄▃▂▁
val_accuracy,▁▅▆▇█
val_loss,█▄▃▂▁

0,1
epoch,4.0
train_accuracy,0.31008
train_loss,2.52793
val_accuracy,0.26174
val_loss,2.71952


[34m[1mwandb[0m: Agent Starting Run: kyijqhft with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇█████
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy,▁▄▆▆▇▆█▅▇█
val_loss,█▅▃▃▂▂▂▃▁▁

0,1
epoch,9.0
train_accuracy,0.25104
train_loss,2.84759
val_accuracy,0.22176
val_loss,3.02357


[34m[1mwandb[0m: Agent Starting Run: 0fkyvu3x with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆▇█
train_loss,█▄▃▂▁
val_accuracy,▁▁█▆▇
val_loss,█▆▂▂▁

0,1
epoch,4.0
train_accuracy,0.27449
train_loss,2.69676
val_accuracy,0.225
val_loss,2.92445


[34m[1mwandb[0m: Agent Starting Run: 6dq6yrff with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▅▆█
train_loss,█▃▃▂▁
val_accuracy,███▄▁
val_loss,▂▁▃▇█

0,1
epoch,4.0
train_accuracy,0.23022
train_loss,2.94461
val_accuracy,0.17634
val_loss,3.2139


[34m[1mwandb[0m: Agent Starting Run: prj57ist with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 1




0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▅▆▆▇█████
val_loss,█▄▂▂▁▁▁▁▁▂

0,1
epoch,9.0
train_accuracy,0.84043
train_loss,0.52321
val_accuracy,0.69386
val_loss,1.12716


[34m[1mwandb[0m: Agent Starting Run: wtn6je29 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 2


0,1
epoch,▁▃▅▆█
train_accuracy,▁▅▆▇█
train_loss,█▄▃▂▁
val_accuracy,▁▅▆▄█
val_loss,█▅▅▄▁

0,1
epoch,4.0
train_accuracy,0.24527
train_loss,2.86508
val_accuracy,0.20286
val_loss,3.08607


[34m[1mwandb[0m: Agent Starting Run: vwl6tx81 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▅▆▇▇█████
val_loss,█▃▂▁▁▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.87349
train_loss,0.41591
val_accuracy,0.73123
val_loss,1.06099


[34m[1mwandb[0m: Agent Starting Run: zjw5zoye with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	num_layers: 1


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▆▄▄▃▂▂▁▁▁
val_accuracy,▁▃▄▅▆▆▇▇██
val_loss,█▆▅▄▃▂▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.57864
train_loss,1.40791
val_accuracy,0.48845
val_loss,1.74818


[34m[1mwandb[0m: Agent Starting Run: iuvhz5nx with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	num_layers: 3


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▆▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▆▆▇▇████
val_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.70723
train_loss,0.94979
val_accuracy,0.65576
val_loss,1.15449


In [None]:
import random
import torch
from torch.utils.data import DataLoader
import pandas as pd
import wandb
from IPython.display import display, HTML

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize Wandb
wandb.init(project="DL_ASSIGNMENT_3_RNN", name="vanilla-test-inference")

# Load train data to recreate vocabularies
train_data = pd.read_csv(
    '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv',
    sep='\t', header=None, dtype=str).dropna()

# Create vocabularies
input_vocab = create_vocab(train_data, 1)  # English
output_vocab = create_vocab(train_data, 0)  # Hindi
inv_input_vocab = {v: k for k, v in input_vocab.items()}
inv_output_vocab = {v: k for k, v in output_vocab.items()}

# Load test data
test_data = pd.read_csv(
    '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv',
    sep='\t', header=None, dtype=str).dropna()
test_dataset = DakshinaDataset(test_data, input_vocab, output_vocab)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)

# Model Config
emb_dim = 64
hidden_dim = 256
num_layers = 3
cell_type = 'LSTM'
dropout = 0.3
beam_size = 3

# Instantiate and Load Model
encoder = Encoder(len(input_vocab), emb_dim, hidden_dim, num_layers, cell_type, dropout)
decoder = Decoder(len(output_vocab), emb_dim, hidden_dim, num_layers, cell_type, dropout)
model = Seq2Seq(encoder, decoder, device).to(device)
model.load_state_dict(torch.load('/kaggle/working/best_model.pt', map_location=device, weights_only=True))
model.eval()

# Inference
correct_words = 0
total_words = 0
total_correct_chars = 0
total_chars = 0
predictions = []

with torch.no_grad():
    for src, tgt in test_loader:
        src, tgt = src.to(device), tgt.to(device)
        batch_size = src.size(0)
        preds = model.predict(src, max_len=30, beam_size=beam_size)

        for i in range(batch_size):
            pred_indices = preds[i].tolist()
            if 2 in pred_indices:
                pred_indices = pred_indices[:pred_indices.index(2)]
            pred_str = ''.join([inv_output_vocab.get(idx, '?') for idx in pred_indices])

            tgt_indices = tgt[i, 1:].tolist()
            if 2 in tgt_indices:
                tgt_indices = tgt_indices[:tgt_indices.index(2)]
            tgt_str = ''.join([inv_output_vocab.get(idx, '?') for idx in tgt_indices])

            input_indices = src[i].tolist()
            input_str = ''.join([inv_input_vocab.get(idx, '?') for idx in input_indices if idx not in [0, input_vocab['<EOS>']]])

            is_correct_word = pred_str == tgt_str
            correct_words += is_correct_word
            total_words += 1

            correct_chars_sample = sum(1 for p, t in zip(pred_str, tgt_str) if p == t)
            total_correct_chars += correct_chars_sample
            total_chars += len(tgt_str)

            predictions.append({
                'input': input_str,
                'target': tgt_str,
                'prediction': pred_str,
                'correct_word': is_correct_word,
                'correct_chars': correct_chars_sample,
                'total_chars': len(tgt_str)
            })

# Accuracies
word_accuracy = (correct_words / total_words) * 100
char_accuracy = (total_correct_chars / total_chars) * 100 if total_chars > 0 else 0

# Log metrics to Wandb
wandb.log({
    "test_word_accuracy": word_accuracy,
    "test_char_accuracy": char_accuracy
})

print(f"\nTest Word Accuracy: {word_accuracy:.2f}%")
print(f"Test Character Accuracy: {char_accuracy:.2f}%")

# Convert predictions to DataFrame
df_predictions = pd.DataFrame(predictions)

# Save All Predictions - CSV (no color)
csv_path = '/kaggle/working/predictions_vanilla.csv'
df_predictions.to_csv(csv_path, index=False)
print(f"CSV saved to {csv_path}")

# Save All Predictions - HTML (no color)
all_html_path = '/kaggle/working/predictions_all.html'
df_predictions.to_html(all_html_path, index=False)
print(f"All HTML (no color) saved to {all_html_path}")

# Color Function for Sample
def highlight_row(row):
    color = 'background-color: #d4edda;' if row['correct_word'] else 'background-color: #f8d7da;'
    return [color] * len(row)

# Save Sample Predictions - HTML (colored)
sample_df = df_predictions.sample(n=min(10, len(df_predictions)), random_state=42)
styled_sample = sample_df.style.apply(highlight_row, axis=1)\
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])\
    .set_properties(**{
        'text-align': 'left',
        'padding': '8px',
        'font-size': '14px',
        'border': '1px solid #ccc'
    }).hide(axis="index")

sample_html_path = '/kaggle/working/predictions_sample_colored.html'
with open(sample_html_path, 'w', encoding='utf-8') as f:
    f.write(f"<h3>Sample Predictions (Color-Coded)</h3>\n{styled_sample.to_html()}")
print(f"Sample colored HTML saved to {sample_html_path}")

# Log to Wandb
artifact = wandb.Artifact('predictions_vanilla', type='predictions')
artifact.add_file(csv_path)
artifact.add_file(all_html_path)
artifact.add_file(sample_html_path)
wandb.log_artifact(artifact)

# Display Sample in Notebook
display(HTML("<h3>Vanilla Sample Predictions (Color-Coded)</h3>"))
display(styled_sample)

# Finish Wandb
wandb.finish()


## Setup For Devanagari Lipi

In [10]:
!fc-list | grep Devanagari

In [11]:
!wget https://noto-website-2.storage.googleapis.com/pkgs/NotoSansDevanagari-hinted.zip
!unzip NotoSansDevanagari-hinted.zip

--2025-05-19 16:03:53--  https://noto-website-2.storage.googleapis.com/pkgs/NotoSansDevanagari-hinted.zip
Resolving noto-website-2.storage.googleapis.com (noto-website-2.storage.googleapis.com)... 142.250.125.207, 173.194.194.207, 142.251.184.207, ...
Connecting to noto-website-2.storage.googleapis.com (noto-website-2.storage.googleapis.com)|142.250.125.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6201435 (5.9M) [application/zip]
Saving to: ‘NotoSansDevanagari-hinted.zip’


2025-05-19 16:03:54 (66.5 MB/s) - ‘NotoSansDevanagari-hinted.zip’ saved [6201435/6201435]

Archive:  NotoSansDevanagari-hinted.zip
  inflating: LICENSE_OFL.txt         
  inflating: NotoSansDevanagari-Black.ttf  
  inflating: NotoSansDevanagari-Bold.ttf  
  inflating: NotoSansDevanagari-Condensed.ttf  
  inflating: NotoSansDevanagari-CondensedBlack.ttf  
  inflating: NotoSansDevanagari-CondensedBold.ttf  
  inflating: NotoSansDevanagari-CondensedExtraBold.ttf  
  inflating: NotoSansD

In [17]:
import os
import matplotlib.pyplot as plt
from matplotlib import font_manager, rcParams

# Use 'Agg' backend for non-GUI environments (optional but safe)
import matplotlib
matplotlib.use("Agg")

# Path to the Noto Sans Devanagari font (adjust if needed)
font_path = "/kaggle/input/notosans-devanagiri/static/NotoSansDevanagari-Regular.ttf"

if os.path.exists(font_path):
    font_manager.fontManager.addfont(font_path)
    dev_font = font_manager.FontProperties(fname=font_path)
    rcParams['font.family'] = dev_font.get_name()
    print(f"✅ Loaded font: {dev_font.get_name()}")
else:
    print("❌ Devanagari font not found. Falling back to default.")
    dev_font = None

# Test plot to render "भारत"
plt.figure(figsize=(6, 2))
plt.text(0.5, 0.5, "भारत", fontsize=30, ha='center', fontproperties=dev_font)
plt.title("Test: Devanagari Font Rendering", fontproperties=dev_font)
plt.axis('off')
plt.tight_layout()
plt.savefig("/kaggle/working/devanagari_test.png")
plt.show()


✅ Loaded font: Noto Sans Devanagari


In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.font_manager as fm
import wandb

# Initialize WandB (if not already initialized)
wandb.init(project="DL_ASSIGNMENT_3_RNN", name="char-wise-confusion", reinit=True)

# Font for Hindi (Devanagari)
font_path = "/kaggle/input/notosans-devanagiri/static/NotoSansDevanagari-Regular.ttf"
prop = fm.FontProperties(fname=font_path)
plt.rcParams['font.family'] = prop.get_name()

# Sample 10 rows from the predictions DataFrame
sample_df = df_predictions.sample(n=10, random_state=42)

# Set up a 4x3 grid of plots
fig, axes = plt.subplots(4, 3, figsize=(18, 20))
axes = axes.flatten()

for idx, row in enumerate(sample_df.itertuples()):
    target = list(row.target)
    pred = list(row.prediction)

    # Pad shorter sequence with "_" to make lengths match
    max_len = max(len(target), len(pred))
    target += ['_'] * (max_len - len(target))
    pred += ['_'] * (max_len - len(pred))

    labels = sorted(set(target + pred))
    cm = confusion_matrix(target, pred, labels=labels)

    ax = axes[idx]
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',  # Changed colormap here
                xticklabels=labels, yticklabels=labels, ax=ax, cbar=True,
                annot_kws={"size": 10, "weight": 'bold', "color": "black"})

    # Add titles with corresponding word strings
    ax.set_title(f"Sample {idx + 1}\nTarget: {row.target}\nPred: {row.prediction}",
                 fontproperties=prop, fontsize=10, color='black')
    ax.set_xlabel("Predicted", fontproperties=prop, color='black')
    ax.set_ylabel("Target", fontproperties=prop, color='black')
    ax.tick_params(axis='x', rotation=45, labelsize=9)
    ax.tick_params(axis='y', rotation=0, labelsize=9)

# Hide any unused subplots
for j in range(len(sample_df), len(axes)):
    fig.delaxes(axes[j])

# Save figure
plt.tight_layout()
char_confusion_path = '/kaggle/working/van_confusion.png'
plt.savefig(char_confusion_path, dpi=300)
print(f"Character-wise confusion matrix grid saved to {char_confusion_path}")

# Log to WandB
wandb.log({"char_confusion_matrix_grid": wandb.Image(char_confusion_path)})


Character-wise confusion matrix grid saved to /kaggle/working/van_confusion.png


## Model with attention