In [3]:
import numpy as np

# Special tokens for sequence processing
START_TOKEN = '\t'
END_TOKEN = '\n'
PAD_TOKEN = ' '

def load_dataset(file_path, as_characters=False):
    """
    Loads data from a TSV file and returns input-output pairs.
    If as_characters is True, returns lists of characters; otherwise, returns strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip().split('\t') for line in file if line.strip()]
    # Assuming the format: target \t input
    sources = [entry[1] for entry in lines]
    targets = [entry[0] for entry in lines]
    if as_characters:
        sources = [list(text) for text in sources]
        targets = [list(text) for text in targets]
    return sources, targets

def vectorize_sequences(sequences, max_length, char_to_index):
    """
    Converts sequences of characters to sequences of integer indices, padded to max_length.
    """
    return np.array([
        [char_to_index.get(char, char_to_index[PAD_TOKEN]) for char in seq] +
        [char_to_index[PAD_TOKEN]] * (max_length - len(seq))
        for seq in sequences
    ])

def prepare_data(inputs, max_input_len, input_vocab, targets=None, max_output_len=None, output_vocab=None):
    """
    Prepares encoder and decoder inputs and targets for training sequence models.
    """
    encoder_input = vectorize_sequences(inputs, max_input_len, input_vocab)

    decoder_input = None
    decoder_target = None

    if targets is not None and max_output_len is not None and output_vocab is not None:
        # Add start and end tokens to targets
        decoder_input = np.array([
            [output_vocab[START_TOKEN]] +
            [output_vocab[char] for char in seq] +
            [output_vocab[END_TOKEN]] +
            [output_vocab[PAD_TOKEN]] * (max_output_len - len(seq) - 2)
            for seq in targets
        ])
        decoder_target = np.zeros((decoder_input.shape[0], max_output_len, len(output_vocab)), dtype='float32')
        for i, seq in enumerate(decoder_input):
            for t in range(1, len(seq)):
                decoder_target[i, t-1, seq[t]] = 1.0
            # Pad the rest with PAD_TOKEN
            decoder_target[i, t:, output_vocab[PAD_TOKEN]] = 1.0

    return encoder_input, decoder_input, decoder_target

def build_vocabularies(train_src, train_tgt, val_src, val_tgt):
    """
    Builds character-level vocabularies for both input and output languages.
    Returns encoding/decoding dictionaries and max sequence lengths.
    """
    input_chars = set(char for seq in train_src + val_src for char in seq)
    output_chars = set(char for seq in train_tgt + val_tgt for char in seq)

    input_vocab = {char: idx for idx, char in enumerate(sorted(input_chars | {PAD_TOKEN}))}
    input_vocab_dec = sorted(input_vocab, key=lambda k: input_vocab[k])
    max_input_len = max(len(seq) for seq in train_src + val_src)

    # Output vocab includes special tokens
    output_vocab = {START_TOKEN: 0, END_TOKEN: 1}
    idx = 2
    for char in sorted(output_chars | {PAD_TOKEN}):
        if char not in output_vocab:
            output_vocab[char] = idx
            idx += 1
    output_vocab_dec = sorted(output_vocab, key=lambda k: output_vocab[k])
    max_output_len = max(len(seq) for seq in train_tgt + val_tgt) + 2  # +2 for start/end tokens

    print(f"Training samples: {len(train_src)}")
    print(f"Validation samples: {len(val_src)}")
    print(f"Unique input tokens: {len(input_vocab)}")
    print(f"Unique output tokens: {len(output_vocab)}")
    print(f"Max input length: {max_input_len}")
    print(f"Max output length: {max_output_len}")

    return input_vocab, input_vocab_dec, output_vocab, output_vocab_dec, max_input_len, max_output_len

# File paths (update as needed)
train_path = '/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv'
val_path = '/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv'
test_path = '/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv'

# Load datasets
train_inputs, train_targets = load_dataset(train_path)
val_inputs, val_targets = load_dataset(val_path)
test_inputs, test_targets = load_dataset(test_path)

# Build vocabularies and get sequence lengths
(input_vocab, input_vocab_dec, output_vocab, output_vocab_dec,
 max_input_len, max_output_len) = build_vocabularies(train_inputs, train_targets, val_inputs, val_targets)

# Prepare data for model input
train_enc_in, train_dec_in, train_dec_out = prepare_data(
    train_inputs, max_input_len, input_vocab, train_targets, max_output_len, output_vocab)
val_enc_in, val_dec_in, val_dec_out = prepare_data(
    val_inputs, max_input_len, input_vocab, val_targets, max_output_len, output_vocab)
test_enc_in, test_dec_in, test_dec_out = prepare_data(
    test_inputs, max_input_len, input_vocab, test_targets, max_output_len, output_vocab)


Training samples: 44204
Validation samples: 4358
Unique input tokens: 27
Unique output tokens: 66
Max input length: 20
Max output length: 21


In [4]:
print(input_vocab)
print(output_vocab)

{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{'\t': 0, '\n': 1, ' ': 2, 'ँ': 3, 'ं': 4, 'ः': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ए': 13, 'ऐ': 14, 'ऑ': 15, 'ओ': 16, 'औ': 17, 'क': 18, 'ख': 19, 'ग': 20, 'घ': 21, 'ङ': 22, 'च': 23, 'छ': 24, 'ज': 25, 'झ': 26, 'ञ': 27, 'ट': 28, 'ठ': 29, 'ड': 30, 'ढ': 31, 'ण': 32, 'त': 33, 'थ': 34, 'द': 35, 'ध': 36, 'न': 37, 'प': 38, 'फ': 39, 'ब': 40, 'भ': 41, 'म': 42, 'य': 43, 'र': 44, 'ल': 45, 'व': 46, 'श': 47, 'ष': 48, 'स': 49, 'ह': 50, '़': 51, 'ा': 52, 'ि': 53, 'ी': 54, 'ु': 55, 'ू': 56, 'ृ': 57, 'ॅ': 58, 'े': 59, 'ै': 60, 'ॉ': 61, 'ो': 62, 'ौ': 63, '्': 64, 'ॐ': 65}


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Dataset Class
class TransliterationDataset(Dataset):
    def __init__(self, enc_inputs, dec_inputs, dec_targets):
        self.enc_inputs = torch.LongTensor(enc_inputs).to(device)
        self.dec_inputs = torch.LongTensor(dec_inputs).to(device)
        self.dec_targets = torch.FloatTensor(dec_targets).to(device)

    def __len__(self):
        return len(self.enc_inputs)

    def __getitem__(self, idx):
        return self.enc_inputs[idx], self.dec_inputs[idx], self.dec_targets[idx]

# 2. Attention Mechanism
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: (batch_size, dec_hid_dim)
        # encoder_outputs: (batch_size, src_len, enc_hid_dim)
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)  # (batch_size, src_len, dec_hid_dim)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # (batch_size, src_len, dec_hid_dim)
        attention = self.v(energy).squeeze(2)  # (batch_size, src_len)
        return torch.softmax(attention, dim=1)

# 3. Encoder (Modified for Attention)
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout, cell_type, bidirectional=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        
        if cell_type == "LSTM":
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, 
                              dropout=dropout, bidirectional=bidirectional, batch_first=True)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers,
                             dropout=dropout, bidirectional=bidirectional, batch_first=True)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers,
                             dropout=dropout, bidirectional=bidirectional, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)  # (batch_size, seq_len, emb_size)
        outputs, hidden = self.rnn(embedded)
        return outputs, hidden

# 4. Decoder with Attention
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout, cell_type, enc_hid_dim):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.attention = Attention(enc_hid_dim * 2, hidden_size)  # Account for bidirectional
        
        rnn_input_size = embedding_size + enc_hid_dim * 2  # embedding + context
        
        if cell_type == "LSTM":
            self.rnn = nn.LSTM(rnn_input_size, hidden_size, num_layers, 
                              dropout=dropout, batch_first=True)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(rnn_input_size, hidden_size, num_layers,
                             dropout=dropout, batch_first=True)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(rnn_input_size, hidden_size, num_layers,
                             dropout=dropout, batch_first=True)
            
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, encoder_outputs):
        # x: (batch_size, 1)
        # hidden: (num_layers, batch_size, hidden_size) or tuple for LSTM
        # encoder_outputs: (batch_size, src_len, enc_hid_dim * 2)
        
        embedded = self.embedding(x)  # (batch_size, 1, emb_size)
        
        # Get last layer's hidden state
        if self.cell_type == 'LSTM':
            h_n = hidden[0][-1]  # (batch_size, hidden_size)
        else:
            h_n = hidden[-1]  # (batch_size, hidden_size)
        
        # Calculate attention weights
        attn_weights = self.attention(h_n, encoder_outputs)  # (batch_size, src_len)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # (batch_size, 1, enc_hid_dim*2)
        
        # Combine input with context
        rnn_input = torch.cat([embedded, context], dim=2)  # (batch_size, 1, emb_size + enc_hid_dim*2)
        
        # RNN step
        output, hidden = self.rnn(rnn_input, hidden)
        prediction = self.fc(output.squeeze(1))  # (batch_size, output_size)
        
        return prediction, hidden

# Modified Seq2Seq Class with Proper Hidden State Handling
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, n_enc_layers, n_dec_layers, cell_type):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.n_enc_layers = n_enc_layers
        self.n_dec_layers = n_dec_layers
        
        # Bridge layer for bidirectional encoder and layer mismatch
        if encoder.bidirectional:
            self.bridge = nn.Linear(2 * encoder.hidden_size, decoder.hidden_size)
        else:
            self.bridge = nn.Identity()

    def _adapt_hidden(self, hidden):
        """Process encoder hidden states for decoder initialization"""
        if self.cell_type == 'LSTM':
            h, c = hidden
            
            # Handle bidirectional
            if self.encoder.bidirectional:
                # Reshape: (n_enc_layers*2, batch, hidden) -> (n_enc_layers, 2, batch, hidden)
                h = h.view(self.n_enc_layers, 2, -1, self.encoder.hidden_size)
                c = c.view(self.n_enc_layers, 2, -1, self.encoder.hidden_size)
                # Combine directions and project
                h = self.bridge(torch.cat([h[:,0], h[:,1]], dim=-1))
                c = self.bridge(torch.cat([c[:,0], c[:,1]], dim=-1))

            # Handle layer mismatch by padding with zeros
            if h.size(0) < self.n_dec_layers:
                pad_size = self.n_dec_layers - h.size(0)
                h = torch.cat([
                    h,
                    torch.zeros(pad_size, h.size(1), h.size(2)).to(device)
                ], dim=0)
                c = torch.cat([
                    c,
                    torch.zeros(pad_size, c.size(1), c.size(2)).to(device)
                ], dim=0)
            else:
                h = h[:self.n_dec_layers]
                c = c[:self.n_dec_layers]

            return (h.contiguous(), c.contiguous())
        
        else:  # For GRU/RNN
            if self.encoder.bidirectional:
                hidden = hidden.view(self.n_enc_layers, 2, -1, self.encoder.hidden_size)
                hidden = self.bridge(torch.cat([hidden[:,0], hidden[:,1]], dim=-1))
            
            # Handle layer mismatch
            if hidden.size(0) < self.n_dec_layers:
                pad_size = self.n_dec_layers - hidden.size(0)
                hidden = torch.cat([
                    hidden,
                    torch.zeros(pad_size, hidden.size(1), hidden.size(2)).to(device)
                ], dim=0)
            else:
                hidden = hidden[:self.n_dec_layers]
            
            return hidden.contiguous()

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.size(0)
        trg_len = trg.size(1)
        
        # Encode source sequence
        encoder_outputs, hidden = self.encoder(src)
        hidden = self._adapt_hidden(hidden)
        
        # Initialize decoder
        inputs = trg[:, 0].unsqueeze(1)
        outputs = torch.zeros(batch_size, trg_len, self.decoder.output_size).to(device)
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(inputs, hidden, encoder_outputs)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            inputs = trg[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)
            
        return outputs

# Usage Example
# Initialize model components
enc_hid_dim = 256
dec_hid_dim = 256
bidirectional = True

encoder = Encoder(
    input_size=len(input_vocab),
    embedding_size=256,
    hidden_size=enc_hid_dim,
    num_layers=2,
    dropout=0.2,
    cell_type='LSTM',
    bidirectional=bidirectional
).to(device)

decoder = Decoder(
    output_size=len(output_vocab),
    embedding_size=256,
    hidden_size=dec_hid_dim,
    num_layers=3,
    dropout=0.2,
    cell_type='LSTM',
    enc_hid_dim=enc_hid_dim
).to(device)

model = Seq2Seq(encoder, decoder, 2, 3, 'LSTM').to(device)

print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(27, 256)
    (rnn): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(66, 256)
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
      (v): Linear(in_features=256, out_features=1, bias=False)
    )
    (rnn): LSTM(768, 256, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=256, out_features=66, bias=True)
  )
  (bridge): Linear(in_features=512, out_features=256, bias=True)
)


In [6]:
def train(model, dataloader, optimizer, criterion, clip=1.0):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0
    
    for enc_in, dec_in, dec_out in tqdm(dataloader, desc='Training'):
        optimizer.zero_grad()
        
        # Forward pass
        output = model(enc_in, dec_in)  # (batch_size, trg_len, output_dim)
        
        # Reshape for loss calculation
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)  # (batch*(trg_len-1), output_dim)
        
        # Convert one-hot targets to class indices
        targets = dec_out[:, 1:].argmax(dim=2).reshape(-1)  # (batch*(trg_len-1))
        
        # Calculate loss
        loss = criterion(output, targets)
        
        # Backpropagation
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        # Calculate metrics
        epoch_loss += loss.item()
        preds = output.argmax(1)
        correct += (preds == targets).sum().item()
        total += targets.size(0)
    
    return epoch_loss / len(dataloader), correct / total

def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for enc_in, dec_in, dec_out in tqdm(dataloader, desc='Evaluating'):
            output = model(enc_in, dec_in, teacher_forcing_ratio=0.0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            targets = dec_out[:, 1:].argmax(dim=2).reshape(-1)
            
            loss = criterion(output, targets)
            
            epoch_loss += loss.item()
            preds = output.argmax(1)
            correct += (preds == targets).sum().item()
            total += targets.size(0)
    
    return epoch_loss / len(dataloader), correct / total

# Initialize model components
enc_hid_dim = 128
dec_hid_dim = 256
bidirectional = True

encoder = Encoder(
    input_size=len(input_vocab),
    embedding_size=128,
    hidden_size=enc_hid_dim,
    num_layers=2,
    dropout=0.4,
    cell_type='RNN',
    bidirectional=True
).to(device)

decoder = Decoder(
    output_size=len(output_vocab),
    embedding_size=128,
    hidden_size=dec_hid_dim,
    num_layers=3,
    dropout=0.2,
    cell_type='RNN',
    enc_hid_dim=enc_hid_dim
).to(device)

# 6. Data Loaders
train_dataset = TransliterationDataset(train_enc_in, train_dec_in, train_dec_out)
val_dataset = TransliterationDataset(val_enc_in, val_dec_in, val_dec_out)
test_dataset = TransliterationDataset(test_enc_in, test_dec_in, test_dec_out)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


model = Seq2Seq(encoder, decoder, 2, 3, 'RNN').to(device)
# Initialize with proper loss function
criterion = nn.CrossEntropyLoss(ignore_index=output_vocab[PAD_TOKEN])


# Restart training with corrected loss
optimizer = optim.Adam(model.parameters())

for epoch in range(1):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    
    print(f'Epoch {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')


Training: 100%|██████████| 691/691 [00:33<00:00, 20.47it/s]
Evaluating: 100%|██████████| 69/69 [00:01<00:00, 60.04it/s]

Epoch 01
	Train Loss: 2.183 | Train Acc: 13.02%
	 Val. Loss: 1.465 |  Val. Acc: 16.86%





In [7]:
import wandb

wandb.login(key='43cc4a6022bf573f56ea92522b3e44bac7bd28b6')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33majay-madkami-iitm[0m ([33majay-madkami-iitm-indian-institute-of-technology-mad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [26]:
# Define the sweep configuration
sweep_config = {
    'method': 'bayes',  # Random search
    'metric': {
        'name': 'accuracy',
        'goal': 'maximize'  # Objective: Maximize accuracy
    },
    'parameters': {
        'cell_type': {'values': ['RNN', 'GRU', 'LSTM']},
        'n_enc_layers': {'values': [2]},
        'n_dec_layers': {'values': [3]},
        'hidden_layer_size': {'values': [16, 32, 64, 256]},
        'emb_size': {'values': [16, 32, 64, 256]},
        'bidirectional': {'values': [True,False]},
        'dropout': {'values': [0.2, 0.3]},
        'epochs': {'values': [3]} 
    }
}


# Initialize a new wandb sweep
sweep_id = wandb.sweep(sweep_config, project="Assignment 3")

Create sweep with ID: i0pmz3un
Sweep URL: https://wandb.ai/ajay-madkami-iitm-indian-institute-of-technology-mad/Assignment%203/sweeps/i0pmz3un


In [27]:
import wandb

# 6. Data Loaders
train_dataset = TransliterationDataset(train_enc_in, train_dec_in, train_dec_out)
val_dataset = TransliterationDataset(val_enc_in, val_dec_in, val_dec_out)
test_dataset = TransliterationDataset(test_enc_in, test_dec_in, test_dec_out)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

def main():
    # Initialize W&B run
    wandb.init()
    hyperparams = wandb.config
    
    # Model parameters from sweep
    encoder = Encoder(
        input_size=len(input_vocab),
        embedding_size=hyperparams.emb_size,
        hidden_size=hyperparams.hidden_layer_size,
        num_layers=hyperparams.n_enc_layers,
        dropout=hyperparams.dropout,
        cell_type=hyperparams.cell_type,
        bidirectional=hyperparams.bidirectional
    ).to(device)
    
    decoder = Decoder(
        output_size=len(output_vocab),
        embedding_size=hyperparams.emb_size,
        hidden_size=hyperparams.hidden_layer_size,
        num_layers=hyperparams.n_dec_layers,
        dropout=hyperparams.dropout,
        cell_type=hyperparams.cell_type,
        enc_hid_dim=hyperparams.hidden_layer_size
    ).to(device)


    model = Seq2Seq(encoder, decoder, hyperparams.n_enc_layers, hyperparams.n_dec_layers, hyperparams.cell_type).to(device)
    
    # Initialize with proper loss function
    criterion = nn.CrossEntropyLoss(ignore_index=output_vocab[PAD_TOKEN])
    # Restart training with corrected loss
    optimizer = optim.Adam(model.parameters())

    
    # Training loop
    for epoch in range(hyperparams.epochs):
        train_loss, train_acc = train(model, train_loader, optimizer, criterion)
        val_loss, val_acc = evaluate(model, val_loader, criterion)
        # Log metrics
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc
        })
        
        print(f'Epoch {epoch+1:02}')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')
    
    # Test evaluation
    # test_loss, test_acc = evaluate(model, test_loader, criterion)
    # wandb.log({"test_loss": test_loss, "test_acc": test_acc})
    # print(f'\nFinal Test Accuracy: {test_acc*100:.2f}%')

# Run the sweep
wandb.agent(sweep_id, function=main, count=1) 
wandb.finish() 

[34m[1mwandb[0m: Agent Starting Run: na9y830j with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	n_dec_layers: 3
[34m[1mwandb[0m: 	n_enc_layers: 2


Training: 100%|██████████| 691/691 [00:37<00:00, 18.57it/s]
Evaluating: 100%|██████████| 69/69 [00:01<00:00, 52.81it/s]


Epoch 01
	Train Loss: 2.480 | Train Acc: 10.92%
	 Val. Loss: 1.802 |  Val. Acc: 14.53%


Training: 100%|██████████| 691/691 [00:37<00:00, 18.61it/s]
Evaluating: 100%|██████████| 69/69 [00:01<00:00, 52.84it/s]


Epoch 02
	Train Loss: 1.497 | Train Acc: 17.55%
	 Val. Loss: 1.341 |  Val. Acc: 17.68%


Training: 100%|██████████| 691/691 [00:36<00:00, 18.73it/s]
Evaluating: 100%|██████████| 69/69 [00:01<00:00, 52.87it/s]

Epoch 03
	Train Loss: 1.221 | Train Acc: 19.50%
	 Val. Loss: 1.230 |  Val. Acc: 18.48%





0,1
epoch,▁▅█
train_acc,▁▆█
train_loss,█▃▁
val_acc,▁▇█
val_loss,█▂▁

0,1
epoch,2.0
train_acc,0.19504
train_loss,1.22121
val_acc,0.18475
val_loss,1.23013
