In [1]:
import numpy as np

# Special tokens for sequence processing
START_TOKEN = '\t'
END_TOKEN = '\n'
PAD_TOKEN = ' '

def load_dataset(file_path, as_characters=False):
    """
    Loads data from a TSV file and returns input-output pairs.
    If as_characters is True, returns lists of characters; otherwise, returns strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip().split('\t') for line in file if line.strip()]
    # Assuming the format: target \t input
    sources = [entry[1] for entry in lines]
    targets = [entry[0] for entry in lines]
    if as_characters:
        sources = [list(text) for text in sources]
        targets = [list(text) for text in targets]
    return sources, targets

def vectorize_sequences(sequences, max_length, char_to_index):
    """
    Converts sequences of characters to sequences of integer indices, padded to max_length.
    """
    return np.array([
        [char_to_index.get(char, char_to_index[PAD_TOKEN]) for char in seq] +
        [char_to_index[PAD_TOKEN]] * (max_length - len(seq))
        for seq in sequences
    ])

def prepare_data(inputs, max_input_len, input_vocab, targets=None, max_output_len=None, output_vocab=None):
    """
    Prepares encoder and decoder inputs and targets for training sequence models.
    """
    encoder_input = vectorize_sequences(inputs, max_input_len, input_vocab)

    decoder_input = None
    decoder_target = None

    if targets is not None and max_output_len is not None and output_vocab is not None:
        # Add start and end tokens to targets
        decoder_input = np.array([
            [output_vocab[START_TOKEN]] +
            [output_vocab[char] for char in seq] +
            [output_vocab[END_TOKEN]] +
            [output_vocab[PAD_TOKEN]] * (max_output_len - len(seq) - 2)
            for seq in targets
        ])
        decoder_target = np.zeros((decoder_input.shape[0], max_output_len, len(output_vocab)), dtype='float32')
        for i, seq in enumerate(decoder_input):
            for t in range(1, len(seq)):
                decoder_target[i, t-1, seq[t]] = 1.0
            # Pad the rest with PAD_TOKEN
            decoder_target[i, t:, output_vocab[PAD_TOKEN]] = 1.0

    return encoder_input, decoder_input, decoder_target

def build_vocabularies(train_src, train_tgt, val_src, val_tgt):
    """
    Builds character-level vocabularies for both input and output languages.
    Returns encoding/decoding dictionaries and max sequence lengths.
    """
    input_chars = set(char for seq in train_src + val_src for char in seq)
    output_chars = set(char for seq in train_tgt + val_tgt for char in seq)

    input_vocab = {char: idx for idx, char in enumerate(sorted(input_chars | {PAD_TOKEN}))}
    input_vocab_dec = sorted(input_vocab, key=lambda k: input_vocab[k])
    max_input_len = max(len(seq) for seq in train_src + val_src)

    # Output vocab includes special tokens
    output_vocab = {START_TOKEN: 0, END_TOKEN: 1}
    idx = 2
    for char in sorted(output_chars | {PAD_TOKEN}):
        if char not in output_vocab:
            output_vocab[char] = idx
            idx += 1
    output_vocab_dec = sorted(output_vocab, key=lambda k: output_vocab[k])
    max_output_len = max(len(seq) for seq in train_tgt + val_tgt) + 2  # +2 for start/end tokens

    print(f"Training samples: {len(train_src)}")
    print(f"Validation samples: {len(val_src)}")
    print(f"Unique input tokens: {len(input_vocab)}")
    print(f"Unique output tokens: {len(output_vocab)}")
    print(f"Max input length: {max_input_len}")
    print(f"Max output length: {max_output_len}")

    return input_vocab, input_vocab_dec, output_vocab, output_vocab_dec, max_input_len, max_output_len

# File paths (update as needed)
train_path = '/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv'
val_path = '/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv'
test_path = '/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv'

# Load datasets
train_inputs, train_targets = load_dataset(train_path)
val_inputs, val_targets = load_dataset(val_path)
test_inputs, test_targets = load_dataset(test_path)

# Build vocabularies and get sequence lengths
(input_vocab, input_vocab_dec, output_vocab, output_vocab_dec,
 max_input_len, max_output_len) = build_vocabularies(train_inputs, train_targets, val_inputs, val_targets)

# Prepare data for model input
train_enc_in, train_dec_in, train_dec_out = prepare_data(
    train_inputs, max_input_len, input_vocab, train_targets, max_output_len, output_vocab)
val_enc_in, val_dec_in, val_dec_out = prepare_data(
    val_inputs, max_input_len, input_vocab, val_targets, max_output_len, output_vocab)
test_enc_in, test_dec_in, test_dec_out = prepare_data(
    test_inputs, max_input_len, input_vocab, test_targets, max_output_len, output_vocab)


Training samples: 44204
Validation samples: 4358
Unique input tokens: 27
Unique output tokens: 66
Max input length: 20
Max output length: 21


In [2]:
print(input_vocab)
print(output_vocab)

{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{'\t': 0, '\n': 1, ' ': 2, 'ँ': 3, 'ं': 4, 'ः': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ए': 13, 'ऐ': 14, 'ऑ': 15, 'ओ': 16, 'औ': 17, 'क': 18, 'ख': 19, 'ग': 20, 'घ': 21, 'ङ': 22, 'च': 23, 'छ': 24, 'ज': 25, 'झ': 26, 'ञ': 27, 'ट': 28, 'ठ': 29, 'ड': 30, 'ढ': 31, 'ण': 32, 'त': 33, 'थ': 34, 'द': 35, 'ध': 36, 'न': 37, 'प': 38, 'फ': 39, 'ब': 40, 'भ': 41, 'म': 42, 'य': 43, 'र': 44, 'ल': 45, 'व': 46, 'श': 47, 'ष': 48, 'स': 49, 'ह': 50, '़': 51, 'ा': 52, 'ि': 53, 'ी': 54, 'ु': 55, 'ू': 56, 'ृ': 57, 'ॅ': 58, 'े': 59, 'ै': 60, 'ॉ': 61, 'ो': 62, 'ौ': 63, '्': 64, 'ॐ': 65}


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import random

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:

# Dataset class with device transfer
class TransliterationDataset(Dataset):
    def __init__(self, enc_inputs, dec_inputs, dec_targets):
        self.enc_inputs = torch.LongTensor(enc_inputs).to(device)
        self.dec_inputs = torch.LongTensor(dec_inputs).to(device)
        self.dec_targets = torch.FloatTensor(dec_targets).to(device)

    def __len__(self):
        return len(self.enc_inputs)

    def __getitem__(self, idx):
        return self.enc_inputs[idx], self.dec_inputs[idx], self.dec_targets[idx]

# 2. Encoder
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout, cell_type, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.cell_type = cell_type
        self.bidirectional = bidirectional
        
        if cell_type == "LSTM":
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, 
                              dropout=dropout, bidirectional=bidirectional, batch_first=True)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers,
                             dropout=dropout, bidirectional=bidirectional, batch_first=True)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers,
                             dropout=dropout, bidirectional=bidirectional, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, hidden = self.rnn(embedded)
        return outputs, hidden

# 3. Decoder
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout, cell_type):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.cell_type = cell_type
        
        if cell_type == "LSTM":
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, 
                              dropout=dropout, batch_first=True)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers,
                             dropout=dropout, batch_first=True)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers,
                             dropout=dropout, batch_first=True)
            
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        output, hidden = self.rnn(x, hidden)
        prediction = self.fc(output)
        return prediction, hidden

# 4. Seq2Seq Model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, n_enc_layers, n_dec_layers, cell_type):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.n_enc_layers = n_enc_layers
        self.n_dec_layers = n_dec_layers

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc.out_features
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(device)
        _, hidden = self.encoder(src)
        
        # Handle layer mismatch
        if self.n_enc_layers != self.n_dec_layers:
            if self.cell_type == 'LSTM':
                hidden = (hidden[0][:self.n_dec_layers], hidden[1][:self.n_dec_layers])
            else:
                hidden = hidden[:self.n_dec_layers]
        
        input = trg[:, 0].unsqueeze(1)
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output.squeeze(1)
            
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(2)
            input = trg[:, t].unsqueeze(1) if teacher_force else top1
            
        return outputs


In [5]:
# 5. Training/Evaluation Functions
def calculate_accuracy(preds, targets):
    preds = preds.argmax(dim=2)
    correct = (preds == targets).float()
    mask = (targets != 0).float()
    return (correct * mask).sum() / mask.sum()

def train(model, dataloader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for enc_in, dec_in, dec_out in tqdm(dataloader, desc='Training'):
        optimizer.zero_grad()
        output = model(enc_in, dec_in)
        loss = criterion(output[:,1:].reshape(-1, output.shape[-1]), 
                        dec_out[:,1:].reshape(-1, dec_out.shape[-1]))
        loss.backward()
        optimizer.step()
        
        acc = calculate_accuracy(output[:,1:], dec_in[:,1:])
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(dataloader), epoch_acc/(2*len(dataloader))

In [6]:
def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for enc_in, dec_in, dec_out in tqdm(dataloader, desc='Evaluating'):
            output = model(enc_in, dec_in, teacher_forcing_ratio=0)
            loss = criterion(output[:,1:].reshape(-1, output.shape[-1]), 
                            dec_out[:,1:].reshape(-1, dec_out.shape[-1]))
            acc = calculate_accuracy(output[:,1:], dec_in[:,1:])
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss/len(dataloader), epoch_acc/(2*len(dataloader))

In [7]:
# 6. Data Loaders
train_dataset = TransliterationDataset(train_enc_in, train_dec_in, train_dec_out)
val_dataset = TransliterationDataset(val_enc_in, val_dec_in, val_dec_out)
test_dataset = TransliterationDataset(test_enc_in, test_dec_in, test_dec_out)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Reinitialize model with fixed architecture
encoder = Encoder(len(input_vocab), 256, 256, 2, 0.2, 'RNN', bidirectional=True).to(device)
decoder = Decoder(len(output_vocab), 256, 256, 3, 0.2, 'RNN').to(device)
model = Seq2Seq(encoder, decoder, 2, 3, 'RNN').to(device)

# Restart training
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

for epoch in range(1):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    
    print(f'Epoch {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')


Training: 100%|██████████| 691/691 [00:22<00:00, 30.32it/s]
Evaluating: 100%|██████████| 69/69 [00:00<00:00, 74.00it/s]

Epoch 01
	Train Loss: 0.033 | Train Acc: 32.09%
	 Val. Loss: 0.024 |  Val. Acc: 33.20%





In [8]:
# # Prediction function with proper device handling
# def predict(model, input_seq, output_vocab, output_vocab_dec, max_output_len, device):
#     model.eval()
#     input_tensor = torch.LongTensor(input_seq).unsqueeze(0).to(device)
    
#     with torch.no_grad():
#         _, hidden = model.encoder(input_tensor)
        
#         # Handle layer mismatch and bidirectional
#         if model.n_enc_layers != model.n_dec_layers:
#             if model.cell_type == 'LSTM':
#                 hidden = (hidden[0][:model.n_dec_layers], hidden[1][:model.n_dec_layers])
#             else:
#                 hidden = hidden[:model.n_dec_layers]
        
#         decoder_input = torch.LongTensor([[output_vocab[START_TOKEN]]]).to(device)
#         decoded_chars = []
        
#         for _ in range(max_output_len):
#             output, hidden = model.decoder(decoder_input, hidden)
#             topi = output.argmax(-1)
#             predicted_idx = topi.item()
            
#             if predicted_idx == output_vocab[END_TOKEN]:
#                 break
                
#             decoded_char = output_vocab_dec[predicted_idx]
#             decoded_chars.append(decoded_char)
#             decoder_input = torch.LongTensor([[predicted_idx]]).to(device)
    
#     return ''.join(decoded_chars)

# # Generate predictions for first 100 test samples
# num_samples = 4500
# predictions = []

# for i in tqdm(range(num_samples), desc="Generating Predictions"):
#     input_seq = test_enc_in[i]
#     target_text = test_targets[i]
#     predicted_text = predict(
#         model, 
#         input_seq, 
#         output_vocab, 
#         output_vocab_dec, 
#         max_output_len, 
#         device
#     )
#     predictions.append((test_inputs[i], target_text, predicted_text))

# # Display sample predictions
# print("\nSample Predictions:")
# for i in range(10):
#     src, tgt, pred = predictions[i]
#     print(f"Input: {src}")
#     print(f"Target: {tgt}")
#     print(f"Predicted: {pred}")
#     print("-" * 50)

# # Calculate accuracy
# correct = sum(1 for _, tgt, pred in predictions if pred == tgt)
# print(f"\nAccuracy on {num_samples} samples: {correct/num_samples:.2%}")


In [9]:
import wandb

wandb.login(key='43cc4a6022bf573f56ea92522b3e44bac7bd28b6')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33majay-madkami-iitm[0m ([33majay-madkami-iitm-indian-institute-of-technology-mad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [12]:
# Define the sweep configuration
sweep_config = {
    'method': 'bayes',  # Random search
    'metric': {
        'name': 'accuracy',
        'goal': 'maximize'  # Objective: Maximize accuracy
    },
    'parameters': {
        'cell_type': {'values': ['RNN', 'GRU', 'LSTM']},
        'n_enc_layers': {'values': [1, 2, 3]},
        'n_dec_layers': {'values': [1, 2, 3]},
        'hidden_layer_size': {'values': [16, 32, 64, 256]},
        'emb_size': {'values': [16, 32, 64, 256]},
        'bidirectional': {'values': [True,False]},
        'dropout': {'values': [0.2, 0.3]},
        'epochs': {'values': [3, 5]} 
    }
}

# Initialize a new wandb sweep
sweep_id = wandb.sweep(sweep_config, project="Assignment 3")

Create sweep with ID: 50qxhud8
Sweep URL: https://wandb.ai/ajay-madkami-iitm-indian-institute-of-technology-mad/Assignment%203/sweeps/50qxhud8


In [15]:
import wandb

def main():
    # Initialize W&B run
    wandb.init()
    hyperparams = wandb.config
    
    # Model parameters from sweep
    encoder = Encoder(
        input_size=len(input_vocab),
        embedding_size=hyperparams.emb_size,
        hidden_size=hyperparams.hidden_layer_size,
        num_layers=hyperparams.n_enc_layers,
        dropout=hyperparams.dropout,
        cell_type=hyperparams.cell_type,
        bidirectional=hyperparams.bidirectional
    ).to(device)

    decoder = Decoder(
        output_size=len(output_vocab),
        embedding_size=hyperparams.emb_size,
        hidden_size=hyperparams.hidden_layer_size,
        num_layers=hyperparams.n_dec_layers,
        dropout=hyperparams.dropout,
        cell_type=hyperparams.cell_type
    ).to(device)

    model = Seq2Seq(
        encoder, 
        decoder,
        n_enc_layers=hyperparams.n_enc_layers,
        n_dec_layers=hyperparams.n_dec_layers,
        cell_type=hyperparams.cell_type
    ).to(device)

    # Training setup
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()
    
    # Training loop
    for epoch in range(hyperparams.epochs):
        train_loss, train_acc = train(model, train_loader, optimizer, criterion)
        val_loss, val_acc = evaluate(model, val_loader, criterion)
        # Log metrics
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc
        })
        
        print(f'Epoch {epoch+1:02}')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')
    
    # Test evaluation
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    # wandb.log({"test_loss": test_loss, "test_acc": test_acc})
    # print(f'\nFinal Test Accuracy: {test_acc*100:.2f}%')

# Run the sweep
wandb.agent(sweep_id, function=main, count=2) 

[34m[1mwandb[0m: Agent Starting Run: yyogsgo7 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	hidden_layer_size: 32
[34m[1mwandb[0m: 	n_dec_layers: 3
[34m[1mwandb[0m: 	n_enc_layers: 2


Training: 100%|██████████| 691/691 [00:21<00:00, 31.72it/s]
Evaluating: 100%|██████████| 69/69 [00:00<00:00, 111.02it/s]


Epoch 01
	Train Loss: 0.078 | Train Acc: 31.19%
	 Val. Loss: 0.035 |  Val. Acc: 32.62%


Training: 100%|██████████| 691/691 [00:21<00:00, 31.99it/s]
Evaluating: 100%|██████████| 69/69 [00:00<00:00, 113.11it/s]


Epoch 02
	Train Loss: 0.030 | Train Acc: 31.90%
	 Val. Loss: 0.026 |  Val. Acc: 32.65%


Training: 100%|██████████| 691/691 [00:21<00:00, 31.81it/s]
Evaluating: 100%|██████████| 69/69 [00:00<00:00, 107.97it/s]


Epoch 03
	Train Loss: 0.025 | Train Acc: 32.36%
	 Val. Loss: 0.024 |  Val. Acc: 33.20%


Evaluating: 100%|██████████| 71/71 [00:00<00:00, 114.27it/s]


0,1
epoch,▁▅█
train_acc,▁▅█
train_loss,█▂▁
val_acc,▁▁█
val_loss,█▂▁

0,1
epoch,2.0
train_acc,0.32357
train_loss,0.02465
val_acc,0.33204
val_loss,0.02412


[34m[1mwandb[0m: Agent Starting Run: 4foondc0 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	n_dec_layers: 3
[34m[1mwandb[0m: 	n_enc_layers: 2


Training: 100%|██████████| 691/691 [00:31<00:00, 22.13it/s]
Evaluating: 100%|██████████| 69/69 [00:00<00:00, 85.42it/s]


Epoch 01
	Train Loss: 0.029 | Train Acc: 32.61%
	 Val. Loss: 0.020 |  Val. Acc: 34.43%


Training: 100%|██████████| 691/691 [00:30<00:00, 22.36it/s]
Evaluating: 100%|██████████| 69/69 [00:00<00:00, 85.37it/s]


Epoch 02
	Train Loss: 0.020 | Train Acc: 33.42%
	 Val. Loss: 0.019 |  Val. Acc: 34.09%


Training: 100%|██████████| 691/691 [00:31<00:00, 22.28it/s]
Evaluating: 100%|██████████| 69/69 [00:00<00:00, 82.83it/s]


Epoch 03
	Train Loss: 0.019 | Train Acc: 33.73%
	 Val. Loss: 0.018 |  Val. Acc: 34.41%


Evaluating: 100%|██████████| 71/71 [00:00<00:00, 84.64it/s]


0,1
epoch,▁▅█
train_acc,▁▆█
train_loss,█▂▁
val_acc,█▁█
val_loss,█▄▁

0,1
epoch,2.0
train_acc,0.33729
train_loss,0.01913
val_acc,0.34407
val_loss,0.01752
