In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset archive is later read from
# the MyDrive folder underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Create the local working directory; -p makes this a no-op when it already exists.
!mkdir -p /content/data

# Extract the corpus archive from Drive into the working directory.
# -o overwrites existing files without prompting, so the cell can be re-run.
!unzip -o "/content/drive/MyDrive/hicm.zip" -d /content/data

Archive:  /content/drive/MyDrive/hicm.zip
  inflating: /content/data/hicm/hicm_corpus.test.txt  
  inflating: /content/data/hicm/hicm_corpus.train.txt  
  inflating: /content/data/hicm/hicm_corpus.valid.txt  
  inflating: /content/data/hicm/hicm_unigram_32000.model  
  inflating: /content/data/hicm/hicm_unigram_32000.vocab  
  inflating: /content/data/hicm/test.txt  
  inflating: /content/data/hicm/train.txt  
  inflating: /content/data/hicm/valid.txt  


In [None]:
# ============================================================
# 1. Setup
# ============================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import numpy as np
from pathlib import Path
import time

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# ============================================================
# 2. File Paths
# ============================================================
BASE = Path("/content/data/hicm")

# Source-side files (model input), one sentence per line.
train_inp = BASE.joinpath("hicm_corpus.train.txt")
test_inp = BASE.joinpath("hicm_corpus.test.txt")
valid_inp = BASE.joinpath("hicm_corpus.valid.txt")

# Target-side files (reference output), line-aligned with the files above.
train_out = BASE.joinpath("train.txt")
test_out = BASE.joinpath("test.txt")
valid_out = BASE.joinpath("valid.txt")

# ============================================================
# 3. Tokenization and vocab
# ============================================================
def tokenize(text):
    """Split a sentence into whitespace-delimited tokens.

    Leading and trailing whitespace is removed first; runs of whitespace act
    as a single separator. An empty or all-whitespace string yields ``[]``.
    """
    trimmed = text.strip()
    tokens = trimmed.split()
    return tokens

def build_vocab(file, add_special_tokens=True, limit=None, min_freq=1):
    """Build a token -> integer id mapping from a one-sentence-per-line text file.

    Args:
        file: Path of the UTF-8 text file to scan.
        add_special_tokens: When true, ids 0-3 are reserved for ``<PAD>``,
            ``<UNK>``, ``<BOS>`` and ``<EOS>`` (in that order).
        limit: If truthy, only the first ``limit`` lines are scanned.
        min_freq: Minimum number of occurrences (within the scanned lines) a
            token needs in order to receive an id. The default of 1 keeps
            every token, which reproduces the previous behaviour exactly.
            Raising it shrinks the vocabulary, and with it the embedding and
            output layers; dropped tokens are mapped to ``<UNK>`` by the
            encoders.

    Returns:
        dict mapping each kept token to a dense id. Ids are assigned in order
        of first appearance in the file, after the special tokens.
    """
    from collections import Counter

    # Counter keeps insertion order, so iterating it later visits tokens in
    # first-seen order, which is the order ids were assigned in originally.
    counts = Counter()
    with open(file, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if limit and i >= limit:
                break
            counts.update(tokenize(line))

    vocab = {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3} if add_special_tokens else {}
    for tok, freq in counts.items():
        # `tok not in vocab` protects the reserved ids if the corpus itself
        # contains a literal special-token string.
        if freq >= min_freq and tok not in vocab:
            vocab[tok] = len(vocab)
    return vocab

# Number of leading sentence pairs used for vocabulary building and training.
# It must be defined: the calls below (and the training dataset) read it, so a
# fresh kernel would otherwise stop with a NameError.
# A falsy value (None or 0) disables the cap in build_vocab and HinEngDataset.
TRAIN_LIMIT = 4000000  # change this to limit the number of training samples

src_vocab = build_vocab(train_inp, limit=TRAIN_LIMIT)
tgt_vocab = build_vocab(train_out, limit=TRAIN_LIMIT)
# Reverse lookup (id -> token) used to turn decoder output back into text.
inv_tgt_vocab = {v:k for k,v in tgt_vocab.items()}

SRC_VOCAB_SIZE = len(src_vocab)
TGT_VOCAB_SIZE = len(tgt_vocab)
print("Hindi vocab size:", SRC_VOCAB_SIZE)
print("English vocab size:", TGT_VOCAB_SIZE)

# ============================================================
# 4. Encoding functions
# ============================================================
# Maximum number of whitespace tokens kept per sentence by encode_src and
# encode_tgt; translate() also uses it as the cap on decoding steps.
MAX_LEN = 40

def encode_src(sentence):
    """Convert a source sentence into a list of source-vocabulary ids.

    The sentence is whitespace-tokenized and cut to at most ``MAX_LEN``
    tokens; tokens missing from ``src_vocab`` become the ``<UNK>`` id.
    No ``<BOS>``/``<EOS>`` markers are added on the source side.
    """
    kept_tokens = tokenize(sentence)[:MAX_LEN]
    ids = []
    for tok in kept_tokens:
        ids.append(src_vocab.get(tok, src_vocab["<UNK>"]))
    return ids

def encode_tgt(sentence):
    """Convert a target sentence into ids wrapped in ``<BOS>`` ... ``<EOS>``.

    The sentence is whitespace-tokenized and cut to at most ``MAX_LEN``
    tokens before the two markers are added, so the result has at most
    ``MAX_LEN + 2`` ids. Unknown tokens become the ``<UNK>`` id.
    """
    kept_tokens = tokenize(sentence)[:MAX_LEN]
    ids = [tgt_vocab["<BOS>"]]
    for tok in kept_tokens:
        ids.append(tgt_vocab.get(tok, tgt_vocab["<UNK>"]))
    ids.append(tgt_vocab["<EOS>"])
    return ids

# ============================================================
# 5. Dataset and collate
# ============================================================
class HinEngDataset(Dataset):
    """Line-aligned parallel corpus held in memory as two lists of strings.

    Item ``i`` is the pair ``(encode_src(src_lines[i]), encode_tgt(tgt_lines[i]))``;
    encoding happens lazily in ``__getitem__``.

    Args:
        src_file: Path of the UTF-8 source-side file, one sentence per line.
        tgt_file: Path of the UTF-8 target-side file, line-aligned with ``src_file``.
        limit: If truthy, only the first ``limit`` lines of each file are kept.

    Raises:
        AssertionError: if the two sides end up with different line counts.
    """

    def __init__(self, src_file, tgt_file, limit=None):
        self.src_lines = self._read_lines(src_file)
        self.tgt_lines = self._read_lines(tgt_file)
        if limit:
            self.src_lines = self.src_lines[:limit]
            self.tgt_lines = self.tgt_lines[:limit]
        assert len(self.src_lines) == len(self.tgt_lines), (
            f"source/target line counts differ: "
            f"{len(self.src_lines)} vs {len(self.tgt_lines)}"
        )
        print(f"Loaded {len(self.src_lines):,} samples")

    @staticmethod
    def _read_lines(path):
        """Return the lines of ``path`` without their trailing newline.

        The file is iterated line by line instead of using ``str.splitlines``:
        ``splitlines`` also breaks on characters such as U+2028, U+0085 and
        form feed, which can occur inside a sentence and would shift one side
        of the corpus relative to the other. Iteration splits on newlines
        only, matching how ``build_vocab`` walks the same files. The ``with``
        block also guarantees the handle is closed.
        """
        with open(path, "r", encoding="utf-8") as f:
            return [line.rstrip("\n") for line in f]

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        return encode_src(self.src_lines[idx]), encode_tgt(self.tgt_lines[idx])

def collate_fn(batch):
    """Pad a list of ``(src_ids, tgt_ids)`` pairs into batch tensors.

    Each side is right-padded with its vocabulary's ``<PAD>`` id up to the
    longest sequence of that side in the batch.

    Returns:
        A tuple ``(src_tensor, src_mask, tgt_tensor)``, all moved to ``device``:
        ``src_tensor`` is ``[B, max_src]`` long, ``src_mask`` is ``[B, max_src]``
        bool with True on real tokens and False on padding, and ``tgt_tensor``
        is ``[B, max_tgt]`` long.
    """
    src_batch, tgt_batch = zip(*batch)
    max_src = max(map(len, src_batch))
    max_tgt = max(map(len, tgt_batch))

    src_pad = src_vocab["<PAD>"]
    tgt_pad = tgt_vocab["<PAD>"]

    # Pad in plain Python, then build each tensor in a single call.
    src_rows = [list(seq) + [src_pad] * (max_src - len(seq)) for seq in src_batch]
    mask_rows = [[True] * len(seq) + [False] * (max_src - len(seq)) for seq in src_batch]
    tgt_rows = [list(seq) + [tgt_pad] * (max_tgt - len(seq)) for seq in tgt_batch]

    src_tensor = torch.tensor(src_rows, dtype=torch.long)
    src_mask = torch.tensor(mask_rows, dtype=torch.bool)
    tgt_tensor = torch.tensor(tgt_rows, dtype=torch.long)

    return src_tensor.to(device), src_mask.to(device), tgt_tensor.to(device)

# Training pairs, capped at TRAIN_LIMIT lines; shuffle=True reorders them each epoch.
train_ds = HinEngDataset(train_inp, train_out, limit=TRAIN_LIMIT)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)

# Full test split, served in file order (no limit, no shuffling).
test_ds = HinEngDataset(test_inp, test_out)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)

# ============================================================
# 6. Seq2Seq Model with Bahdanau Attention
# ============================================================
class BahdanauAttention(nn.Module):
    """Additive attention: ``score = v^T tanh(W_d * s + W_e * h)``.

    Args:
        hidden_dim: Size of both the decoder state and each encoder output.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.W_decoder = nn.Linear(hidden_dim, hidden_dim)
        self.W_encoder = nn.Linear(hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1)

    def forward(self, decoder_hidden, encoder_outputs, src_mask):
        """Compute a context vector over the encoder outputs.

        Args:
            decoder_hidden: ``[B, hidden_dim]`` query state.
            encoder_outputs: ``[B, src_len, hidden_dim]``.
            src_mask: ``[B, src_len]`` bool, True = valid token, False = padding.

        Returns:
            ``(context, attention_weights)`` with shapes ``[B, hidden_dim]``
            and ``[B, src_len]``; each row of the weights sums to 1.
        """
        # Project the decoder state once and let broadcasting spread it over
        # the source positions, instead of copying it src_len times and
        # projecting every copy.
        dec_proj = self.W_decoder(decoder_hidden).unsqueeze(1)  # [B, 1, hidden_dim]
        enc_proj = self.W_encoder(encoder_outputs)              # [B, src_len, hidden_dim]

        energy = torch.tanh(dec_proj + enc_proj)                # [B, src_len, hidden_dim]
        scores = self.v(energy).squeeze(-1)                     # [B, src_len]
        # A large negative score drives the softmax weight of padding to ~0.
        scores = scores.masked_fill(~src_mask, -1e10)

        attention_weights = F.softmax(scores, dim=1)            # [B, src_len]
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)  # [B, hidden_dim]

        return context, attention_weights

class Encoder(nn.Module):
    """Bidirectional LSTM encoder whose two directions together span ``hidden_dim``.

    Args:
        vocab_size: Number of rows in the source embedding table.
        emb_dim: Embedding size.
        hidden_dim: Size of each output vector (each direction uses ``hidden_dim // 2``).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout on the embeddings and, when ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim // 2, num_layers,
                           bidirectional=True, dropout=dropout if num_layers > 1 else 0, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Encode a padded batch of source ids.

        Args:
            src: ``[B, src_len]`` token ids, right-padded.
            src_mask: ``[B, src_len]`` bool, True on real tokens. It is assumed
                to mark a left-aligned prefix of each row (as produced by
                right-padding). ``None`` disables length handling and runs the
                LSTM over every position.

        Returns:
            ``(outputs, hidden, cell)`` where ``outputs`` is
            ``[B, src_len, hidden_dim]`` (zeros at padded positions when a mask
            is given) and ``hidden``/``cell`` are ``[num_layers, B, hidden_dim]``.
        """
        embedded = self.dropout(self.embed(src))  # [B, src_len, emb_dim]

        if src_mask is None:
            outputs, (hidden, cell) = self.lstm(embedded)
        else:
            # Pack by true length so neither direction ever consumes PAD
            # embeddings. Without this, the final states handed to the decoder
            # depend on how much padding the batch happened to need.
            # clamp(min=1): packing rejects zero-length rows, so an empty
            # source is treated as a single (PAD) step.
            lengths = src_mask.sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False)
            packed_out, (hidden, cell) = self.lstm(packed)
            # total_length keeps the time axis equal to src_len so the output
            # still lines up with src_mask in the attention module.
            outputs, _ = nn.utils.rnn.pad_packed_sequence(
                packed_out, batch_first=True, total_length=src.size(1))
        # outputs: [B, src_len, hidden_dim]
        # hidden, cell: [num_layers*2, B, hidden_dim//2]

        # Combine bidirectional hidden states
        hidden = self._bridge(hidden)  # [num_layers, B, hidden_dim]
        cell = self._bridge(cell)

        return outputs, hidden, cell

    def _bridge(self, states):
        """Concatenate forward/backward states of each layer along the feature axis.

        ``states`` is ``[num_layers*2, B, hidden_dim//2]`` with the two
        directions of a layer stored consecutively; the result is
        ``[num_layers, B, hidden_dim]``.
        """
        num_layers = states.size(0) // 2
        states = states.view(num_layers, 2, states.size(1), states.size(2))
        return torch.cat([states[:, 0], states[:, 1]], dim=-1)  # [num_layers, B, hidden_dim]

class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Each call consumes one target token per batch row and returns the logits
    for the next token together with the updated recurrent state.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, dropout=0.1):
        super().__init__()
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.attention = BahdanauAttention(hidden_dim)
        # The LSTM input is the token embedding concatenated with the context vector.
        self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, num_layers,
                            dropout=inter_layer_dropout, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden, cell, encoder_outputs, src_mask):
        """Run one decoding step.

        Args:
            input_token: ``[B]`` ids of the previous target token.
            hidden, cell: ``[num_layers, B, hidden_dim]`` recurrent state.
            encoder_outputs: ``[B, src_len, hidden_dim]``.
            src_mask: ``[B, src_len]`` bool mask forwarded to the attention module.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` is
            ``[B, vocab_size]`` unnormalised scores.
        """
        token_emb = self.dropout(self.embed(input_token))  # [B, emb_dim]

        # The query is the top layer's state from the previous step.
        query = hidden[-1]
        context, _ = self.attention(query, encoder_outputs, src_mask)  # [B, hidden_dim]

        # Join along the feature axis, then add the length-1 time axis.
        step_input = torch.cat((token_emb, context), dim=-1).unsqueeze(1)  # [B, 1, emb_dim + hidden_dim]

        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))  # [B, 1, hidden_dim]

        prediction = self.out(step_output.squeeze(1))  # [B, vocab_size]
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder over a target sequence.

    Note that ``src_vocab`` and ``tgt_vocab`` are vocabulary *sizes* (ints),
    not the vocabulary dictionaries.
    """

    def __init__(self, src_vocab, tgt_vocab, emb_dim=256, hidden_dim=256, num_layers=2, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(src_vocab, emb_dim, hidden_dim, num_layers, dropout)
        self.decoder = Decoder(tgt_vocab, emb_dim, hidden_dim, num_layers, dropout)
        self.tgt_vocab_size = tgt_vocab

    def forward(self, src, src_mask, tgt, teacher_forcing_ratio=0.5):
        """Return next-token logits for every target position after the first.

        Args:
            src: ``[B, src_len]`` source ids.
            src_mask: ``[B, src_len]`` bool mask for the source.
            tgt: ``[B, tgt_len]`` target ids whose first column is ``<BOS>``.
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the gold token instead of the
                model's own argmax as the next input.

        Returns:
            ``[B, tgt_len - 1, vocab_size]`` logits; position ``i`` predicts ``tgt[:, i + 1]``.
        """
        encoder_outputs, hidden, cell = self.encoder(src, src_mask)

        step_logits = []
        input_token = tgt[:, 0]  # <BOS> column

        for t in range(1, tgt.size(1)):
            prediction, hidden, cell = self.decoder(
                input_token, hidden, cell, encoder_outputs, src_mask)
            step_logits.append(prediction)

            # One coin flip per step decides the next input for all rows.
            if torch.rand(1).item() < teacher_forcing_ratio:
                input_token = tgt[:, t]
            else:
                input_token = prediction.argmax(dim=1)

        return torch.stack(step_logits, dim=1)  # [B, tgt_len-1, vocab_size]

# Instantiate the model with the default sizes declared in Seq2Seq.__init__
# (emb_dim=256, hidden_dim=256, num_layers=2, dropout=0.1) and move it to `device`.
model = Seq2Seq(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
# Target positions holding <PAD> are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<PAD>"])

# Total number of parameter elements across the model.
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

# ============================================================
# 7. Training Loop
# ============================================================
def train_epoch(epoch_num):
    """Run one full pass over ``train_loader`` and return the mean batch loss.

    Uses the module-level ``model``, ``optimizer`` and ``criterion``.
    ``epoch_num`` is only used to label the progress bar.
    """
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc=f"Seq2Seq Epoch {epoch_num}")
    for src, src_mask, tgt in progress:
        optimizer.zero_grad()

        # logits: [B, tgt_len-1, vocab_size]; they are scored against the
        # target shifted left by one, i.e. with the <BOS> column dropped.
        logits = model(src, src_mask, tgt)
        gold = tgt[:, 1:]
        loss = criterion(logits.reshape(-1, TGT_VOCAB_SIZE), gold.reshape(-1))

        loss.backward()
        # Rescale gradients so that their global norm is at most 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress.set_postfix({'loss': f'{batch_loss:.4f}'})
    return running_loss / len(train_loader)

# Train for two epochs; `epoch` is 0-based, so it is shifted by one for display.
for epoch in range(2):
    loss = train_epoch(epoch+1)
    print(f"Epoch {epoch+1} | Avg Loss: {loss:.4f}")

# ============================================================
# 8. Inference (Greedy Decoding)
# ============================================================
def translate(sentence):
    """Greedily decode a translation for one source sentence.

    The module-level ``model`` is switched to eval mode (and left there).
    Decoding stops at the first ``<EOS>`` or after ``MAX_LEN`` steps.

    Args:
        sentence: Raw source text; it is tokenized and truncated by ``encode_src``.

    Returns:
        The predicted target tokens joined by single spaces. A sentence with
        no tokens (empty or whitespace only) yields ``""``.
    """
    model.eval()

    src_tokens = encode_src(sentence)
    if not src_tokens:
        # A zero-length source would reach the LSTM as an empty sequence,
        # which it does not accept; there is nothing to translate anyway.
        return ""

    with torch.no_grad():
        src_ids = torch.tensor([src_tokens], dtype=torch.long, device=device)
        # Batch of one, so there is no padding: every position is valid.
        mask = torch.ones(1, src_ids.size(1), dtype=torch.bool, device=device)

        # Encode
        encoder_outputs, hidden, cell = model.encoder(src_ids, mask)

        # Decode
        input_token = torch.tensor([tgt_vocab["<BOS>"]], dtype=torch.long, device=device)
        eos_id = tgt_vocab["<EOS>"]
        translated = []

        for _ in range(MAX_LEN):
            prediction, hidden, cell = model.decoder(input_token, hidden, cell, encoder_outputs, mask)
            next_tok = prediction.argmax(dim=1)
            next_id = next_tok.item()

            if next_id == eos_id:
                break

            translated.append(inv_tgt_vocab.get(next_id, "<UNK>"))
            input_token = next_tok

    return " ".join(translated)

# ============================================================
# 9. BLEU Evaluation
# ============================================================
!pip install -q sacrebleu

from sacrebleu.metrics import BLEU

def calculate_bleu(num_samples=500):
    """Translate the first ``num_samples`` test sentences and report corpus BLEU.

    Reads the module-level ``test_inp`` / ``test_out`` files, translates each
    source line with ``translate`` and scores the predictions against the
    single reference per line using sacrebleu's ``BLEU``. Also prints up to
    five example translations.

    Args:
        num_samples: Number of leading test lines to evaluate; ``None`` uses all lines.

    Returns:
        The corpus-level BLEU score as a float.
    """
    # Files are iterated line by line (newline-delimited only, same as
    # build_vocab) and closed by the context managers.
    with open(test_inp, "r", encoding="utf-8") as f:
        test_src_lines = [line.rstrip("\n") for line in f][:num_samples]
    with open(test_out, "r", encoding="utf-8") as f:
        test_tgt_lines = [line.rstrip("\n") for line in f][:num_samples]

    print(f"\nCalculating BLEU on {len(test_src_lines)} test samples...")

    predictions = []
    references = []

    for src_text, ref_text in tqdm(zip(test_src_lines, test_tgt_lines), total=len(test_src_lines)):
        pred_text = translate(src_text)
        predictions.append(pred_text)
        references.append(ref_text)

    bleu = BLEU()
    # corpus_score takes a list of reference streams; there is one reference per sentence.
    score = bleu.corpus_score(predictions, [references])

    print(f"\n{'='*60}")
    print(f"Seq2Seq BLEU: {score.score:.2f}")
    print(f"{'='*60}")

    print("\nSample Translations:")
    # Bounded by what was actually translated, so small num_samples values
    # (or a short test file) cannot index past the end of the lists.
    for i in range(min(5, len(predictions))):
        print(f"\n{i+1}. Source: {test_src_lines[i]}")
        print(f"   Predicted: {predictions[i]}")
        print(f"   Reference: {references[i]}")

    return score.score

# Evaluate on the first 500 test sentences and keep the corpus BLEU score.
bleu_score = calculate_bleu(500)

Using device: cuda
Hindi vocab size: 391146
English vocab size: 206248
Loaded 4,000,000 samples
Loaded 2,507 samples
Total parameters: 208,175,785


Seq2Seq Epoch 1: 100%|██████████| 62500/62500 [2:51:00<00:00,  6.09it/s, loss=3.8249]


Epoch 1 | Avg Loss: 5.0503


Seq2Seq Epoch 2: 100%|██████████| 62500/62500 [2:51:52<00:00,  6.06it/s, loss=3.8258]


Epoch 2 | Avg Loss: 3.8537
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h
Calculating BLEU on 500 test samples...


100%|██████████| 500/500 [00:10<00:00, 45.80it/s]


Seq2Seq BLEU: 1.83

Sample Translations:

1. Source: आपकी Car में black box?
   Predicted: In the the Black Selector
   Reference: A black box in your car?

2. Source: जबकि America के road planner, ध्वस्त होते हुए highway system को सुधारने के लिए money की कमी से जूझ रहे हैं, वहीं बहुत-से people इसका solution छोटे से black box में देख रहे हैं, जो आपकी car के dashboard पर सफ़ाई से fit हो जाता है।
   Predicted: While the road of the the the the the to the the the the system to the the the the the the the the the black black lines in the black of the the the the the the the
   Reference: As America's road planners struggle to find the cash to mend a crumbling highway system, many are beginning to see a solution in a little black box that fits neatly by the dashboard of your car.

3. Source: यह device, जो motor driver द्वारा vehicle चलाए गए प्रत्येक mile को track करती है तथा उस information को officers को transmit करती है, आजकल America की प्रमुख roads का वित्त-पोषण करने के लिए पुराने हो चुक


