In [None]:
#imports
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import json
import numpy as np
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download
from collections import Counter, defaultdict
import os
import zipfile
from tqdm.notebook import tqdm
import unicodedata
import pickle
import random
import multiprocessing as mp
import traceback

# Seed every RNG this kernel may touch so "Restart Kernel -> Run All" is repeatable.
# NOTE: in multi-GPU mode the workers are spawned as fresh processes and worker.py
# sets no seed itself, so these seeds only cover code running in this kernel.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)  # numpy's global RNG was previously left unseeded
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [2]:
# NOTE(review): 'high' is expected to permit reduced-precision (e.g. TF32-style)
# float32 matmuls in exchange for speed — confirm against the PyTorch docs for the
# installed version and GPU.
torch.set_float32_matmul_precision('high')

  _C._set_float32_matmul_precision(precision)


In [3]:
# Tunable hyperparameters. Most of these are packed into the `hyperparameters` dict
# in the driver cell and read by worker.train_language via hp.get(...).
hidden_size = 333          # model width (d_model); 333 = 9 * 37, so it divides evenly by nheads
seq_length = 150           # NOTE(review): not forwarded in the `hyperparameters` dict
batch_size = 64            # NOTE(review): not forwarded in the `hyperparameters` dict
dropout_rate = 0.15
learning_rate = 1e-4
clip_value = 1.0           # max gradient norm used for clipping in the training loop
weight_decay = 5e-6
epochs = 50
accumulation_steps = 2     # NOTE(review): not forwarded; the training loop steps every batch
nheads= 9
num_layers_enc= 4
num_layers_dec= 4
dim_feedforward= 4 * hidden_size

In [4]:
# Pick the primary device and count the visible GPUs. `num_gpus` is what the driver
# cell uses to choose between one process per language (num_gpus > 1) and
# sequential training in this process.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
print(f"Available GPUs: {num_gpus}")
print(f"Primary device: {device}")

Available GPUs: 1
Primary device: cuda


In [5]:
def is_kaggle():
    """Return True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set."""
    return os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None

# Path prefix for every file this notebook writes: "/kaggle/working/" when running
# on Kaggle, otherwise an empty prefix (paths relative to the current directory).
base = "/kaggle/working/" if is_kaggle() else ""
base

''

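Writing the helper code to separate Python scripts makes GPU multiprocessing straightforward: worker processes started with the `spawn` method import their target function from a module file.
The `%%writefile` cell magic saves each of the following cells as a script in the current directory.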

In [None]:
%%writefile utils.py
# Utility module: data loading, checkpoint saving, and decoding (greedy / beam search) helpers.
import math
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import os
import re
import unicodedata
import json
from huggingface_hub import hf_hub_download
from tqdm import tqdm
import pickle

# Word-boundary marker that detokenize_sentence turns back into a plain space
# (presumably the SentencePiece-style convention — TODO confirm against the tokenizer).
WHITESPACE_MARKER = "▁"

def _decode_token_text(token_bytes):
    if isinstance(token_bytes, bytes):
        try:
            return token_bytes.decode("utf-8")
        except UnicodeDecodeError:
            return str(token_bytes)[2:-1]
    return str(token_bytes)

def detokenize_sentence(tokens, tokenizer_data):
    """Turn a sequence of token ids back into a whitespace-normalised string.

    Args:
        tokens: iterable of integer token ids. Ids outside ``[0, len(vocab))`` are
            ignored.
        tokenizer_data: 2-tuple whose first element is the vocabulary, indexable
            by token id. Entries are normally ``bytes``; ``str`` entries are also
            accepted and are encoded as UTF-8.

    Returns:
        The decoded text with ``<s>``, ``</s>``, ``<pad>`` and ``<unk>`` dropped,
        WHITESPACE_MARKER replaced by a space, and runs of whitespace collapsed.
        Invalid UTF-8 byte sequences are replaced rather than raising.
    """
    vocab, _ = tokenizer_data
    special_tokens = ("<s>", "</s>", "<pad>", "<unk>")
    vocab_len = len(vocab)
    byte_array = bytearray()
    for token_id in tokens:
        if not 0 <= token_id < vocab_len:
            continue
        piece = vocab[token_id]
        # Decode only the pieces actually used instead of the whole vocabulary
        # on every call (the old behaviour was O(vocab) per sentence).
        if _decode_token_text(piece) in special_tokens:
            continue
        if isinstance(piece, str):
            # bytearray.extend() rejects str; _decode_token_text already accepts
            # str entries, so support them here as well.
            piece = piece.encode("utf-8")
        byte_array.extend(piece)
    detok = byte_array.decode('utf-8', errors='replace')
    detok = detok.replace(WHITESPACE_MARKER, " ").strip()
    return re.sub(r'\s+', ' ', detok)

def encode_and_pad(vocab_map, sent_ids, max_length):
    """Wrap ``sent_ids`` in <s> ... </s> and right-pad with <pad>.

    The sentence is cut to at most ``max_length - 2`` ids so that, together with
    the two boundary tokens, the result has ``max_length`` entries; shorter
    sentences are filled up with the <pad> id after </s>.
    """
    sos = [vocab_map["<s>"]]
    eos = [vocab_map["</s>"]]
    pad = [vocab_map["<pad>"]]
    body = sent_ids[:max_length - 2]
    # A non-positive repeat count yields an empty list, i.e. no padding.
    padding = pad * (max_length - 2 - len(body))
    return sos + body + eos + padding

def prepare(lang, vocab_size=15000):
    """Download and unpickle the prepared data bundle for ``lang``.

    Fetches ``{lang}_prepared_data.pkl`` from the Hugging Face dataset repo and
    returns ``(id_val, en_tokenizer_data, target_tokenizer_data, train_dl, test_ds)``
    exactly as stored in the pickle.

    ``vocab_size`` is accepted but not used by this function.

    SECURITY NOTE(review): ``pickle.load`` can execute arbitrary code embedded in
    the file, so this is only safe while the dataset repo is trusted.
    """
    pkl_path = hf_hub_download(
        repo_id="Arnab-Datta-240185/CS779-Capstone-Project",
        filename=f"{lang}_prepared_data.pkl",
        repo_type="dataset",
    )
    with open(pkl_path, "rb") as fh:
        bundle = pickle.load(fh)
    wanted = ("id_val", "en_tokenizer_data", "target_tokenizer_data", "train_dl", "test_ds")
    return tuple(bundle[key] for key in wanted)

def save_model(encoder, decoder, lang, epoch=None):
    """Write the encoder and decoder state dicts to ``.pt`` files.

    Files are named ``encoder_{lang}[_epoch{N}].pt`` / ``decoder_{lang}[_epoch{N}].pt``
    and go to ``/kaggle/working/`` when KAGGLE_KERNEL_RUN_TYPE is set, otherwise
    to the current directory. The ``_epoch{N}`` part is added only when ``epoch``
    is given.
    """
    if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
        base = "/kaggle/working/"
    else:
        base = ""
    suffix = "" if epoch is None else f"_epoch{epoch}"
    encoder_weights = encoder.state_dict()
    torch.save(encoder_weights, f"{base}encoder_{lang}{suffix}.pt")
    decoder_weights = decoder.state_dict()
    torch.save(decoder_weights, f"{base}decoder_{lang}{suffix}.pt")
    print(f"Model saved for {lang}{suffix}")

def _subsequent_mask(sz, device):
    # Additive mask for causal self-attention: -inf above diagonal
    mask = torch.full((sz, sz), float('-inf'), device=device)
    mask = torch.triu(mask, 1)
    return mask

def _makes_repeat_ngram(seq, n, new_tok):
    if n <= 0:
        return False
    tmp = seq + [new_tok]
    if len(tmp) < n:
        return False
    last = tuple(tmp[-n:])
    for i in range(len(tmp) - n):
        if tuple(tmp[i:i+n]) == last:
            return True
    return False

@torch.no_grad()
def beam_search_translate_transformer(
    encoder, decoder, src_tensor,
    SOS_ID, EOS_ID,
    src_pad_id=None,
    beam_width=5, max_length=150,
    length_penalty=0.7,
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
    min_length=5,
    temperature=1.0,
):
    """Beam-search decode one source sentence.

    Args:
        encoder, decoder: callables with the keyword interface used below
            (``src_key_padding_mask`` / ``tgt_mask`` / ``memory_key_padding_mask``).
        src_tensor: source token ids; ``translate`` passes a ``[1, S]`` tensor.
        SOS_ID, EOS_ID: start / end token ids of the target vocabulary.
        src_pad_id: source padding id used to build the key-padding mask;
            ``None`` disables masking.
        beam_width: number of live hypotheses kept per step, and also the number
            of top tokens expanded per hypothesis.
        max_length: maximum number of decoding steps.
        length_penalty: hypotheses are ranked by ``score / len(tokens) ** length_penalty``.
        no_repeat_ngram_size: candidates that would repeat an n-gram of this size
            are dropped; a falsy value disables the check.
        repetition_penalty: ``log(repetition_penalty)`` is subtracted from the
            log-probability of every token already in a hypothesis.
        min_length: EOS is suppressed while ``step < min_length``.
        temperature: logits are divided by this value (floored at 1e-6).

    Returns:
        The best hypothesis as a list of token ids, with SOS removed and cut at
        the first EOS.
    """
    device = src_tensor.device
    src_key_padding_mask = (src_tensor == src_pad_id) if (src_pad_id is not None) else None
    memory = encoder(src_tensor, src_key_padding_mask=src_key_padding_mask)  # [1, S, d]

    # Each beam is (token_list, cumulative_log_prob); finished ones move to `completed`.
    beams = [([SOS_ID], 0.0)]
    completed = []

    for step in range(1, max_length + 1):
        candidates = []
        B = len(beams)
        # Replicate the encoder output (and its padding mask) once per live beam.
        mem = memory.repeat(B, 1, 1)
        if src_key_padding_mask is not None:
            mem_kpm = src_key_padding_mask.repeat(B, 1)
        else:
            mem_kpm = None

        # Every live beam grows by exactly one token per step, so all beams share the
        # same length here and the SOS_ID fill value never ends up acting as padding.
        max_len = max(len(b[0]) for b in beams)
        T = max_len
        tgt_tensor = torch.full((B, T), SOS_ID, device=device, dtype=torch.long)
        for i, (tokens, _) in enumerate(beams):
            tgt_tensor[i, :len(tokens)] = torch.tensor(tokens, device=device, dtype=torch.long)

        tgt_mask = _subsequent_mask(T, device)

        # The whole prefix is re-decoded each step; only the last position is used.
        logits = decoder(
            tgt_tensor,
            mem,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=None,
            memory_key_padding_mask=mem_kpm
        )  # [B, T, V]
        step_logits = logits[:, -1, :] / max(temperature, 1e-6)
        log_probs = torch.log_softmax(step_logits, dim=-1)

        for b_idx, (tokens, score) in enumerate(beams):
            # Penalise tokens already present in this hypothesis (applied in log space,
            # after the softmax, so the row is no longer normalised).
            if repetition_penalty != 1.0 and tokens:
                uniq = list(set(tokens))
                log_probs[b_idx, uniq] = log_probs[b_idx, uniq] - math.log(repetition_penalty)

            # Forbid ending the sentence too early.
            if step < min_length:
                log_probs[b_idx, EOS_ID] = -1e9

            top_log_probs, top_idx = log_probs[b_idx].topk(beam_width)
            for i in range(beam_width):
                tok = top_idx[i].item()
                # The n-gram filter runs after top-k, so a beam can contribute fewer
                # than beam_width candidates (possibly none).
                if no_repeat_ngram_size and _makes_repeat_ngram(tokens, no_repeat_ngram_size, tok):
                    continue
                new_tokens = tokens + [tok]
                new_score = score + top_log_probs[i].item()
                if tok == EOS_ID:
                    completed.append((new_tokens, new_score))
                else:
                    candidates.append((new_tokens, new_score))

        # Nothing left to extend but at least one finished hypothesis: stop.
        if not candidates and completed:
            break

        def norm_score(s, l):
            return s / (max(1, l) ** length_penalty)

        if candidates:
            candidates.sort(key=lambda x: norm_score(x[1], len(x[0])), reverse=True)
            beams = candidates[:beam_width]
        else:
            # No candidates and no completed hypotheses: keep the previous beams.
            break

        # Stop early once enough finished hypotheses have been collected.
        if len(completed) >= beam_width:
            break

    # Prefer finished hypotheses; fall back to the live beams if none finished.
    final = completed if completed else beams
    final.sort(key=lambda x: (x[0][-1] == EOS_ID, x[1] / (max(1, len(x[0])) ** length_penalty)), reverse=True)
    best_tokens = final[0][0]
    # Strip SOS and cut at the first EOS.
    trimmed = []
    for t in best_tokens:
        if t == SOS_ID:
            continue
        if t == EOS_ID:
            break
        trimmed.append(t)
    return trimmed

@torch.no_grad()
def _greedy_decode(encoder, decoder, input_tensor, SOS_ID, EOS_ID, src_pad_id=None, max_length=150):
    """Greedy (arg-max) decoding of one ``[1, S]`` source tensor; returns ids without SOS/EOS."""
    src_kpm = (input_tensor == src_pad_id) if src_pad_id is not None else None
    mem = encoder(input_tensor, src_key_padding_mask=src_kpm)
    result_tokens = []
    cur = torch.tensor([[SOS_ID]], device=input_tensor.device, dtype=torch.long)
    for _ in range(max_length):
        tgt_mask = _subsequent_mask(cur.size(1), cur.device)
        # Mask padded source positions in cross-attention, exactly as the training
        # loop and the beam search do; previously the greedy path left them visible.
        logits = decoder(cur, mem, tgt_mask=tgt_mask, memory_key_padding_mask=src_kpm)
        next_id = logits[:, -1, :].argmax(-1)
        nid = next_id.item()
        if nid == EOS_ID:
            break
        result_tokens.append(nid)
        cur = torch.cat([cur, next_id.view(1, 1)], dim=1)
    return result_tokens


@torch.no_grad()
def translate(encoder, decoder, test_ds, SOS_ID, EOS_ID, target_tokenizer_data, src_tokenizer_data,
              use_beam_search=True, beam_width=5, max_length=150):
    """Translate every item of ``test_ds`` and return the detokenised sentences.

    Args:
        encoder, decoder: trained modules; both are switched to eval mode.
        test_ds: indexable dataset whose items hold the source id tensor at index 0.
        SOS_ID, EOS_ID: start / end ids of the target vocabulary.
        target_tokenizer_data: tokenizer data used to detokenise the output ids.
        src_tokenizer_data: tokenizer data whose vocabulary (element 0) is searched
            for ``<pad>`` to build the source padding mask.
        use_beam_search: use beam search (default) or plain greedy decoding.
        beam_width: beam size for beam search.
        max_length: maximum number of generated tokens per sentence.

    Returns:
        List of translated strings, one per dataset item, in dataset order.

    Beam search stays best-effort: if it raises for a sentence, that sentence is
    decoded greedily instead. The fallback is no longer silent — the first error
    and the total number of fallbacks are reported.
    """
    encoder.eval()
    decoder.eval()
    val_outs = []
    device = next(encoder.parameters()).device

    src_vocab_map = {_decode_token_text(tok): i for i, tok in enumerate(src_tokenizer_data[0])}
    SRC_PAD_ID = src_vocab_map.get("<pad>", None)

    beam_failures = 0
    for i in tqdm(range(len(test_ds)), desc=f"Translating"):
        input_tensor = test_ds[i][0].unsqueeze(0).to(device)  # [1, S]
        result_tokens = None
        if use_beam_search:
            try:
                result_tokens = beam_search_translate_transformer(
                    encoder, decoder, input_tensor,
                    SOS_ID, EOS_ID,
                    src_pad_id=SRC_PAD_ID,
                    beam_width=beam_width, max_length=max_length,
                    length_penalty=0.7,
                    no_repeat_ngram_size=3,
                    repetition_penalty=1.2,
                    min_length=5,
                    temperature=1.0
                )
            except Exception as exc:
                beam_failures += 1
                if beam_failures == 1:
                    # Report the first failure only, to avoid flooding the log.
                    tqdm.write(f"Beam search failed on sample {i} ({exc!r}); falling back to greedy decoding.")
        if result_tokens is None:
            result_tokens = _greedy_decode(
                encoder, decoder, input_tensor,
                SOS_ID, EOS_ID,
                src_pad_id=SRC_PAD_ID,
                max_length=max_length,
            )

        translated_sentence = detokenize_sentence(result_tokens, target_tokenizer_data)
        val_outs.append(translated_sentence)

    if beam_failures:
        tqdm.write(f"Beam search fell back to greedy decoding for {beam_failures} of {len(test_ds)} sentences.")

    return val_outs

Writing utils.py


In [7]:
%%writefile worker.py
import torch
import torch.nn as nn
import math
import pandas as pd
import os
from tqdm import tqdm
import traceback

from utils import prepare, save_model, translate, _decode_token_text

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a [B, T, d] input, then apply dropout.

    The table is precomputed for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape [1, max_len, d_model]. Even feature
    indices carry sines and odd indices cosines; an odd ``d_model`` therefore has
    one more sine column than cosine columns.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        positions = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)  # [max_len, 1]
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float32)
        div_term = torch.exp(even_dims * (-math.log(10000.0) / d_model))
        angles = positions * div_term  # one column per sine feature
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        # d_model // 2 cosine columns: all of `angles` for even d_model, all but the
        # last column for odd d_model.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', table.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x):
        # x: [B, T, d]; the encoding is broadcast over the batch dimension.
        positional = self.pe[:, :x.size(1), :]
        return self.dropout(x + positional)

class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of ``nn.TransformerEncoderLayer``.

    Sub-modules are created in the order ``embed``, ``pos``, ``enc``; attribute
    names are part of the saved state_dict keys.
    """

    def __init__(self, src_vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout, pad_id):
        super().__init__()
        self.pad_id = pad_id
        self.embed = nn.Embedding(src_vocab_size, d_model, padding_idx=pad_id)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        self.enc = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                activation='relu',
            ),
            num_layers=num_layers,
        )

    def forward(self, src, src_key_padding_mask=None):
        """Encode ``src`` ([B, S] token ids) into a [B, S, d] memory tensor."""
        embedded = self.pos(self.embed(src))  # [B, S, d]
        return self.enc(embedded, src_key_padding_mask=src_key_padding_mask)

class TransformerDecoder(nn.Module):
    """Token embedding + positional encoding + ``nn.TransformerDecoder`` + vocabulary projection.

    Sub-modules are created in the order ``embed``, ``pos``, ``dec``, ``generator``;
    attribute names are part of the saved state_dict keys.
    """

    def __init__(self, d_model, tgt_vocab_size, nhead, num_layers, dim_feedforward, dropout, pad_id):
        super().__init__()
        self.pad_id = pad_id
        self.embed = nn.Embedding(tgt_vocab_size, d_model, padding_idx=pad_id)
        self.pos = PositionalEncoding(d_model, dropout=dropout)
        self.dec = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                activation='relu',
            ),
            num_layers=num_layers,
        )
        self.generator = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, tgt, memory, tgt_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Return vocabulary logits [B, T, V] for target ids ``tgt`` ([B, T]) given ``memory``."""
        embedded = self.pos(self.embed(tgt))  # [B, T, d]
        hidden = self.dec(
            embedded,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )  # [B, T, d]
        return self.generator(hidden)

def _subsequent_mask(sz, device):
    mask = torch.full((sz, sz), float('-inf'), device=device)
    mask = torch.triu(mask, 1)
    return mask

def train_language(lang, device_id, hp):
    """Train an encoder/decoder pair for ``lang``, then translate the test set.

    Args:
        lang: language name; used to pick the data file, to name checkpoints, and
            (first letter only) to name the output CSV ``answers{lang[0]}.csv``.
        device_id: CUDA device index; a negative value (or no CUDA) selects the CPU.
        hp: dict of hyperparameters read with ``hp.get(key, default)``.

    Side effects:
        Writes a checkpoint after every epoch with a finite average loss, a final
        checkpoint, and the translations CSV.

    Any exception is caught, printed with its traceback, and NOT re-raised, so
    the caller always sees a normal return.
    """
    try:
        if device_id >= 0 and torch.cuda.is_available():
            device = torch.device(f'cuda:{device_id}')
            torch.cuda.set_device(device_id)
        else:
            device = torch.device('cpu')

        base = "/kaggle/working/" if "KAGGLE_KERNEL_RUN_TYPE" in os.environ else ""

        print(f"[{lang}] Preparing data on {device}...")
        # NOTE(review): prepare() accepts vocab_size but does not use it.
        id_val, en_tok_data, target_tok_data, train_dl, test_ds = prepare(lang, vocab_size=hp.get('vocab_size', 15000))

        # Vocab sizes and special IDs
        en_vocab_size = len(en_tok_data[0])
        de_vocab_size = len(target_tok_data[0])
        de_vocab_map = {_decode_token_text(tok): i for i, tok in enumerate(target_tok_data[0])}
        en_vocab_map = {_decode_token_text(tok): i for i, tok in enumerate(en_tok_data[0])}

        SOS_ID = de_vocab_map["<s>"]
        EOS_ID = de_vocab_map["</s>"]
        PAD_ID_TGT = de_vocab_map["<pad>"]
        PAD_ID_SRC = en_vocab_map["<pad>"]

        # Transformer hyperparameters
        d_model = hp.get('hidden_size', 300)
        nhead = hp.get('nheads', 8)
        num_layers_enc = hp.get('num_layers_enc', 4)
        num_layers_dec = hp.get('num_layers_dec', 4)
        dim_ff = hp.get('dim_feedforward', 4 * d_model)
        dropout = hp.get('dropout_rate', 0.1)

        encoder = TransformerEncoder(en_vocab_size, d_model, nhead, num_layers_enc, dim_ff, dropout, PAD_ID_SRC).to(device)
        decoder = TransformerDecoder(d_model, de_vocab_size, nhead, num_layers_dec, dim_ff, dropout, PAD_ID_TGT).to(device)

        # Summed token loss with PAD targets ignored; normalised per batch below.
        criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID_TGT, reduction='sum')
        # Encoder and decoder have separate optimizers with identical settings.
        enc_optimizer = torch.optim.Adam(encoder.parameters(), lr=hp.get('learning_rate', 1e-4), weight_decay=hp.get('weight_decay', 5e-6))
        dec_optimizer = torch.optim.Adam(decoder.parameters(), lr=hp.get('learning_rate', 1e-4), weight_decay=hp.get('weight_decay', 5e-6))

        epochs = hp.get('epochs', 60)
        clip_value = hp.get('clip_value', 1.0)

        for epoch in range(epochs):
            encoder.train()
            decoder.train()
            epoch_losses = []

            for batch in tqdm(train_dl, desc=f"Epoch {epoch+1} {lang}"):
                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                src = batch[0].to(device)  # [B, S]
                tgt = batch[1].to(device)  # [B, S]

                # Teacher forcing: feed tokens 0..S-2, predict tokens 1..S-1.
                tgt_in = tgt[:, :-1]      # [B, S-1], starts with <s>
                tgt_out = tgt[:, 1:]      # [B, S-1], next tokens including </s>

                src_key_padding_mask = (src == PAD_ID_SRC)            # [B, S]
                tgt_key_padding_mask = (tgt_in == PAD_ID_TGT)         # [B, S-1]
                T = tgt_in.size(1)
                tgt_mask = _subsequent_mask(T, device)                # [T, T]

                memory = encoder(src, src_key_padding_mask=src_key_padding_mask)  # [B, S, d]
                logits = decoder(
                    tgt_in, memory,
                    tgt_mask=tgt_mask,
                    tgt_key_padding_mask=tgt_key_padding_mask,
                    memory_key_padding_mask=src_key_padding_mask
                )  # [B, T, V]

                # Mean loss per non-pad target token (clamped to avoid division by zero).
                loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
                nonpad = (tgt_out != PAD_ID_TGT).sum().clamp_min(1)
                loss = loss / nonpad

                # Skip the batch entirely (no backward, no optimizer step) on a NaN loss.
                if torch.isnan(loss):
                    continue

                loss.backward()
                # Gradient norms are clipped separately for the encoder and the decoder.
                torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=clip_value)
                torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=clip_value)
                enc_optimizer.step()
                dec_optimizer.step()

                if not math.isnan(loss.item()):
                    epoch_losses.append(loss.item())

            # inf marks an epoch in which every batch was skipped.
            avg_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('inf')
            print(f"Epoch {epoch+1} [{lang}] Avg loss: {avg_loss:.4f}")

            # Checkpoint after every epoch that produced a finite average loss.
            if avg_loss < float('inf'):
                save_model(encoder, decoder, lang, epoch+1)

        save_model(encoder, decoder, lang)
        print(f"\n[{lang}] Translating with beam search...")
        # Pass both tokenizers so translate can build src masks
        val_outs = translate(encoder, decoder, test_ds, SOS_ID, EOS_ID, target_tok_data, en_tok_data, use_beam_search=True)

        # File name uses only the first letter of the language, e.g. answersB.csv.
        df = pd.DataFrame({"ID": id_val, "Translation": val_outs})
        df.to_csv(f"{base}answers{lang[0]}.csv", index=False)
        print(f"[{lang}] Saved translations.")

    except Exception:
        # Best-effort: report the failure but do not propagate it to the caller.
        print(f"--- FATAL ERROR IN PROCESS FOR {lang} ---")
        traceback.print_exc()

Writing worker.py


In [None]:
if __name__ == '__main__':
    # Driver: train one model per language (one process per GPU when several GPUs
    # are available, otherwise sequentially), then merge the per-language CSVs
    # into the final tab-separated submission and zip it.
    try:
        import sys
        # Make the utils.py / worker.py written by the %%writefile cells importable.
        sys.path.insert(0, base if base else './')

        if num_gpus > 1:
            try:
                mp.set_start_method('spawn', force=True)
                print(f"Multi-GPU mode: Using {num_gpus} GPUs with multiprocessing")
            except RuntimeError:
                print("Multiprocessing context already set.")
        else:
            print(f"Single GPU/CPU mode: Running languages sequentially")
    except Exception as e:
        print(f"Setup warning: {e}")

    hyperparameters = {
        "vocab_size": 15000,
        "hidden_size": hidden_size,
        "dropout_rate": dropout_rate,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "epochs": epochs,
        "clip_value": clip_value,
        "nheads": nheads,
        "num_layers_enc": num_layers_enc,
        "num_layers_dec": num_layers_dec,
        "dim_feedforward": dim_feedforward
    }
    langs = ["Bengali", "Hindi"]

    if num_gpus > 1:
        from worker import train_language
        processes = []
        for i, lang in enumerate(langs):
            # Round-robin the languages over the available GPUs.
            device_id = i % num_gpus
            process = mp.Process(target=train_language, args=(lang, device_id, hyperparameters))
            processes.append(process)
            process.start()
            print(f"Launched process for {lang} on GPU {device_id}")

        for process in processes:
            process.join()
    else:
        from worker import train_language
        device_id = 0 if num_gpus == 1 else -1
        for lang in langs:
            print(f"\nTraining {lang} on {'GPU 0' if device_id >= 0 else 'CPU'}...")
            train_language(lang, device_id, hyperparameters)

    print("\nAll training processes have completed.")

    try:
        # keep_default_na=False + str dtype: an empty translation (or literal text
        # such as "NA" / "null") must stay a string. With the pandas defaults it is
        # parsed as NaN and would be written to the submission as "nan".
        read_opts = {"keep_default_na": False, "dtype": {"Translation": str}}
        df1 = pd.read_csv(f"{base}answersB.csv", **read_opts)
        df2 = pd.read_csv(f"{base}answersH.csv", **read_opts)
        df3 = pd.concat([df1, df2])
        combined_csv = f"{base}answersBH.csv"
        df3.to_csv(combined_csv, index=False)
        print(f"Combined results saved to {combined_csv}")

        answer = f"{base}answer.csv"
        zip_filename = f"{base}submission.zip"

        # NOTE(review): translations are wrapped in double quotes without escaping
        # embedded quotes or tabs — confirm this is what the submission format expects.
        with open(answer, "w", encoding='utf-8') as f:
            f.write("ID\tTranslation\n")
            for row_id, translation in zip(df3["ID"], df3["Translation"]):
                f.write(f'{row_id}\t"{translation}"\n')

        with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as zf:
            zf.write(answer, arcname=os.path.basename(answer))

        print(f"Created final submission file: {zip_filename}")

    except FileNotFoundError:
        print("\nError: Could not find output CSVs. Check logs for errors.")
    except Exception as e:
        print(f"An error occurred while combining results: {e}")
        traceback.print_exc()

Multi-GPU mode: Using 2 GPUs with multiprocessing
Launched process for Bengali on GPU 0
Launched process for Hindi on GPU 1




[Bengali] Preparing data on cuda:0...
[Hindi] Preparing data on cuda:1...


Epoch 1 Bengali:   7%|▋         | 78/1075 [00:35<07:31,  2.21it/s]