In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import json
import numpy as np
import re
import pandas as pd
import heapq
from collections import defaultdict
from tqdm import tqdm
from google.colab import drive

In [None]:

# Load the raw train / validation dictionaries.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) instead of relying on
# the platform default, because the targets are later processed as Bengali/Hindi text.
with open('/kaggle/input/new-data/train_data1.json', 'r', encoding='utf-8') as file:
    data_train = json.load(file)

with open('/kaggle/input/new-data/val_data1.json', 'r', encoding='utf-8') as file:
    data_val = json.load(file)

In [None]:
def extract_language_data_train(data, language_pair):
    """
    Collect the parallel sentences stored under one language pair.

    Args:
        data (dict): Mapping of language-pair name -> split name -> entry id -> entry,
            where each entry provides the keys "source" and "target".
        language_pair (str): Name of the pair to extract (e.g. "English-Bengali").

    Returns:
        tuple: (sources, targets, ids), three parallel lists covering every split found
        under the pair, in iteration order. All three are empty if the pair is absent.
    """
    sources, targets, ids = [], [], []

    for pair_name, splits in data.items():
        if pair_name != language_pair:
            continue
        # The split names themselves are not needed, only their entries.
        for entries in splits.values():
            for entry_id, entry in entries.items():
                sources.append(entry["source"])
                targets.append(entry["target"])
                ids.append(entry_id)

    return sources, targets, ids



In [None]:
def extract_language_data_val(data, language_pair):
    """
    Collect the source sentences (no targets) stored under one language pair.

    Args:
        data (dict): Mapping of language-pair name -> split name -> entry id -> entry,
            where each entry provides the key "source".
        language_pair (str): Name of the pair to extract (e.g. "English-Hindi").

    Returns:
        tuple: (sources, ids), two parallel lists covering every split found under the
        pair, in iteration order. Both are empty if the pair is absent.
    """
    matching_pairs = (splits for name, splits in data.items() if name == language_pair)
    entries = [
        (entry_id, entry)
        for splits in matching_pairs
        for split_entries in splits.values()
        for entry_id, entry in split_entries.items()
    ]

    source_lst = [entry["source"] for _, entry in entries]
    ids_lst = [entry_id for entry_id, _ in entries]
    return source_lst, ids_lst




In [None]:
# English-Bengali: training pairs with ids, then validation sources with ids.
(source_train_ben,
 target_train_ben,
 train_ids_ben) = extract_language_data_train(data_train, "English-Bengali")
(source_val_ben,
 val_ids_ben) = extract_language_data_val(data_val, "English-Bengali")

# English-Hindi: same extraction for the second language pair.
(source_train_hin,
 target_train_hin,
 train_ids_hin) = extract_language_data_train(data_train, "English-Hindi")
(source_val_hin,
 val_ids_hin) = extract_language_data_val(data_val, "English-Hindi")


In [None]:
class Node:
    """One cell of the doubly linked token list that represents a word during BPE training."""

    def __init__(self, token_id, pos=0):
        self.token_id = token_id
        self.prev = None
        self.next = None
        # Index of the node in corpus order. Only used to visit merge sites left to right,
        # which makes training reproducible (set iteration order over nodes is not).
        self.pos = pos


class BPETokenizer:
    """
    Byte-pair-encoding tokenizer over whitespace-separated words.

    Each word is split into characters followed by the end-of-word marker "</w>".
    Training repeatedly fuses the most frequent adjacent token pair into a new token
    until the vocabulary reaches the requested size or no pair is left.
    """

    def __init__(self):
        self.vocab = {}          # token id -> token string
        self.inverse_vocab = {}  # token string -> token id
        self.merges = {}         # (left id, right id) -> merged id, in learning order
        self.token_to_id = {}    # token string -> id, rebuilt at the end of train()
        self.id_to_token = {}    # id -> token string, rebuilt at the end of train()

    def initialize_vocab(self):
        """Reset the vocabulary to the four reserved tokens and return the next free id."""
        reserved = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]
        self.vocab = {i: tok for i, tok in enumerate(reserved)}
        self.inverse_vocab = {tok: i for i, tok in self.vocab.items()}
        return len(self.vocab)

    def build_corpus(self, text, next_token_id):
        """
        Turn `text` into one linked list of character nodes per word.

        Unseen characters (and the "</w>" marker) are added to the vocabulary on the fly.

        Args:
            text (str): Training text; words are separated by whitespace.
            next_token_id (int): First id available for new vocabulary entries.

        Returns:
            tuple: (corpus, next_token_id) where corpus is the list of head nodes, one per
            word, and next_token_id is the first id still unused.
        """
        words = [list(w) + ["</w>"] for w in text.strip().split()]
        corpus = []
        pos = 0
        for w in words:
            head, prev = None, None
            for ch in w:
                if ch not in self.inverse_vocab:
                    self.vocab[next_token_id] = ch
                    self.inverse_vocab[ch] = next_token_id
                    next_token_id += 1
                node = Node(self.inverse_vocab[ch], pos)
                pos += 1
                if prev:
                    prev.next, node.prev = node, prev
                else:
                    head = node
                prev = node
            corpus.append(head)
        return corpus, next_token_id

    def count_pairs(self, corpus):
        """
        Index every adjacent token pair of the corpus.

        Returns:
            defaultdict(set): (left id, right id) -> set of the LEFT nodes of each occurrence.
        """
        pair_occurrences = defaultdict(set)
        for head in corpus:
            node = head
            while node and node.next:
                pair_occurrences[(node.token_id, node.next.token_id)].add(node)
                node = node.next
        return pair_occurrences

    def merge_pair(self, pair, new_id, pair_occurrences, heap):
        """
        Fuse every occurrence of `pair` into the new token `new_id`, in place.

        The left node of each occurrence takes the new id and the right node is unlinked.
        The occurrence index and the heap are updated for the neighbouring pairs that
        disappear / appear as a consequence.

        Args:
            pair (tuple): (left id, right id) to merge.
            new_id (int): Id assigned to the merged token.
            pair_occurrences (defaultdict(set)): Index built by count_pairs, mutated here.
            heap (list): heapq of (-count, pair) candidates; new pairs are pushed onto it.
        """
        t1, t2 = pair
        new_tok = self.vocab[t1] + self.vocab[t2]
        self.vocab[new_id] = new_tok
        self.inverse_vocab[new_tok] = new_id

        # Left-to-right order decides how overlapping runs such as "a a a" are paired;
        # sorting by position keeps that decision independent of set iteration order.
        affected = sorted(pair_occurrences[pair], key=lambda n: n.pos)
        pair_occurrences[pair].clear()

        for node in affected:
            removed = node.next
            # Skip sites invalidated earlier in this loop (e.g. the middle node of "a a a",
            # which has already been absorbed and unlinked).
            if removed is None or node.token_id != t1 or removed.token_id != t2:
                continue
            left, right = node.prev, removed.next

            # Pairs that stop existing. Each pair is indexed by its LEFT node, so the
            # (t2, right) occurrence is registered under `removed`, not under `node`.
            if left is not None:
                pair_occurrences[(left.token_id, t1)].discard(left)
            if right is not None:
                pair_occurrences[(t2, right.token_id)].discard(removed)

            # Splice: `node` becomes the merged token, `removed` leaves the list.
            node.token_id = new_id
            node.next = right
            if right is not None:
                right.prev = node
            # Fully detach the absorbed node so it can never pass the validity check above.
            removed.prev = removed.next = None

            # Pairs that start existing.
            if left is not None:
                new = (left.token_id, new_id)
                pair_occurrences[new].add(left)
                heapq.heappush(heap, (-len(pair_occurrences[new]), new))
            if right is not None:
                new = (new_id, right.token_id)
                pair_occurrences[new].add(node)
                heapq.heappush(heap, (-len(pair_occurrences[new]), new))

    def train(self, text, vocab_size=5000):
        """
        Learn merges from `text` until the vocabulary holds `vocab_size` tokens.

        The size counts the four reserved tokens, every distinct character, the "</w>"
        marker and the learned merges. Training stops early when no pair remains.
        Any previously learned state is discarded.
        """
        self.merges = {}  # retraining must not keep merges that refer to stale ids
        next_id = self.initialize_vocab()
        corpus, next_id = self.build_corpus(text, next_id)
        pair_occurrences = self.count_pairs(corpus)
        heap = [(-len(nodes), pair) for pair, nodes in pair_occurrences.items()]
        heapq.heapify(heap)
        while len(self.vocab) < vocab_size and heap:
            neg_freq, pair = heapq.heappop(heap)
            current = len(pair_occurrences[pair])
            if current == 0:
                continue
            if current != -neg_freq:
                # Stale heap entry: re-queue the pair with its real count instead of
                # dropping it, otherwise a pair whose count only shrank is never merged.
                heapq.heappush(heap, (-current, pair))
                continue
            self.merges[pair] = next_id
            self.merge_pair(pair, next_id, pair_occurrences, heap)
            next_id += 1
        self.token_to_id = {tok: tid for tid, tok in self.vocab.items()}
        self.id_to_token = {tid: tok for tid, tok in self.vocab.items()}

    def tokenize(self, text):
        """
        Encode `text` as a flat list of token ids (no <SOS>/<EOS> added).

        Characters missing from the vocabulary map to <UNK>. Within each word the learned
        merges are applied at the leftmost position where one matches, and the scan is
        repeated until no merge applies.
        """
        words = [list(w) + ["</w>"] for w in text.strip().split()]
        tokens = []
        for w in words:
            ids = [self.inverse_vocab.get(ch, self.token_to_id["<UNK>"]) for ch in w]
            merged = True
            while merged:
                merged, i = False, 0
                while i < len(ids) - 1:
                    pair = (ids[i], ids[i + 1])
                    if pair in self.merges:
                        ids[i] = self.merges[pair]
                        ids.pop(i + 1)
                        merged = True
                    else:
                        i += 1
            tokens.extend(ids)
        return tokens

    def decode(self, token_ids):
        """
        Turn token ids back into a space-separated string.

        The four reserved tokens are skipped; a token ending in "</w>" closes the current
        word. Ids missing from the vocabulary are rendered as the literal text "<UNK>".
        """
        specials = {self.token_to_id.get("<PAD>"), self.token_to_id.get("<UNK>"),
                    self.token_to_id.get("<SOS>"), self.token_to_id.get("<EOS>")}
        words, cur = [], []
        for tid in token_ids:
            if tid in specials:
                continue
            tok = self.id_to_token.get(tid, "<UNK>")
            if tok.endswith("</w>"):
                cur.append(tok[:-4])
                word = "".join(cur)
                # A lone "</w>" (e.g. after skipped <UNK> characters) yields an empty
                # word; dropping it avoids doubled spaces in the output.
                if word:
                    words.append(word)
                cur = []
            else:
                cur.append(tok)
        if cur:
            words.append("".join(cur))
        return " ".join(words).strip()


In [None]:
def preprocess_text(text, lang="en"):
    """
    Clean and normalize text before tokenization.

    Args:
        text (str): Raw sentence.
        lang (str): "hi" keeps Devanagari (U+0900-U+097F), "bn" keeps Bengali
            (U+0980-U+09FF) plus the danda; any other value keeps word characters and
            lower-cases the result. Whitespace and the punctuation . , ! ? are always kept.

    Returns:
        str: Filtered text with every whitespace run collapsed to a single space and no
        leading or trailing whitespace.
    """
    text = re.sub(r'[“”‘’]', '"', text)
    if lang == "hi":
        text = re.sub(r'[^\u0900-\u097F\s.,!?]', '', text)
    elif lang == "bn":
        text = re.sub(r'[^\u0980-\u09FF\s.,!?।]', '', text)
    else:
        text = re.sub(r'[^\w\s.,!?]', '', text)
        text = text.lower()
    # Normalize whitespace last: deleting characters above can leave doubled or trailing
    # spaces (e.g. "a - b" -> "a  b") that an earlier collapse would not catch.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# English-Bengali pair: one tokenizer per side, each trained on its own preprocessed
# training sentences joined into a single space-separated corpus.
eng_ben_bpe = BPETokenizer()
eng_ben_bpe.train(
    " ".join(preprocess_text(sentence, "en") for sentence in source_train_ben),
    vocab_size=20000,
)

ben_bpe = BPETokenizer()
ben_bpe.train(
    " ".join(preprocess_text(sentence, "bn") for sentence in target_train_ben),
    vocab_size=20000,
)

In [None]:
# Manual check: tokenize one Bengali sentence with the Bengali tokenizer, decode the ids
# back to text and print the result for visual comparison with the input.
bengali_text = "এই জায়গাগুলো দেখতে ভুলো না।"
tokens = ben_bpe.tokenize(bengali_text)
decoded_sentence = ben_bpe.decode(tokens)
print(decoded_sentence)

In [None]:
# English-Hindi pair: one tokenizer per side, each trained on its own preprocessed
# training sentences joined into a single space-separated corpus.
eng_hin_bpe = BPETokenizer()
eng_hin_bpe.train(
    " ".join(preprocess_text(sentence, "en") for sentence in source_train_hin),
    vocab_size=20000,
)

hin_bpe = BPETokenizer()
hin_bpe.train(
    " ".join(preprocess_text(sentence, "hi") for sentence in target_train_hin),
    vocab_size=20000,
)

In [None]:
# Manual check: tokenize one Hindi sentence with the Hindi tokenizer, print the ids,
# then decode them back to text and print the result for comparison with the input.
hindi_text = "इन जगहों को देखना मत भूलना।"
tokens = hin_bpe.tokenize(hindi_text)
print("Tokens:", tokens)
decoded_sentence = hin_bpe.decode(tokens)
print("Decoded Sentence:", decoded_sentence)

In [None]:
# Guarantee that both target-side tokenizers expose the special tokens used for padding
# and sequence delimiting. After BPETokenizer.train() they are normally already present,
# so this loop only acts as a safety net.
for tokenizer in [ben_bpe, hin_bpe]:
    for t in ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]:
        if t not in tokenizer.token_to_id:
            # Allocate past the largest id in use. len(token_to_id) is not a safe choice:
            # token_to_id is keyed by token string, so it can hold fewer entries than
            # there are ids, and the computed id could overwrite a real token.
            new_id = max(tokenizer.id_to_token, default=-1) + 1
            tokenizer.token_to_id[t] = new_id
            tokenizer.id_to_token[new_id] = t

In [None]:
seq_length = 30  # length of every encoded sequence, <SOS>/<EOS> included


def encode_and_pad(tokenizer, sent, max_length):
    """
    Encode a sentence as <SOS> + tokens + <EOS>, right-padded with <PAD>.

    Args:
        tokenizer: Object exposing `token_to_id` (with "<SOS>", "<EOS>", "<PAD>") and
            `tokenize(str) -> list[int]`.
        sent (str): Sentence to encode.
        max_length (int): Total output length, special tokens included.

    Returns:
        list[int]: Exactly `max_length` ids (for max_length >= 2); the token part is
        truncated to `max_length - 2` ids when the sentence is too long.
    """
    ids = tokenizer.token_to_id
    sos_id, eos_id, pad_id = ids["<SOS>"], ids["<EOS>"], ids["<PAD>"]

    body = tokenizer.tokenize(sent)[:max_length - 2]
    # Zero (or a negative count, which also yields no padding) once the body fills the budget.
    padding = [pad_id] * (max_length - 2 - len(body))
    return [sos_id, *body, eos_id, *padding]

In [None]:
def _encode_sentences(tokenizer, sentences, lang=None):
    """
    Encode every sentence to a fixed-length id list of `seq_length` entries.

    When `lang` is given, each sentence first goes through preprocess_text(sentence, lang),
    i.e. the same normalization the tokenizers were trained on. Without it, characters
    the training corpus never contained (upper-case English letters, filtered symbols)
    would all be encoded as <UNK>.
    """
    if lang is not None:
        sentences = [preprocess_text(sent, lang) for sent in sentences]
    return [encode_and_pad(tokenizer, sent, seq_length) for sent in sentences]


# ---- English-Bengali ----
en_train_encoded_ben = _encode_sentences(eng_ben_bpe, source_train_ben, "en")
de_train_encoded_ben = _encode_sentences(ben_bpe, target_train_ben, "bn")
en_val_encoded_ben = _encode_sentences(eng_ben_bpe, source_val_ben, "en")
# The validation data provides no target sentences (only sources and ids are extracted),
# so the target side is a placeholder built from the raw source text, exactly as before.
de_val_encoded_ben = _encode_sentences(ben_bpe, source_val_ben)

# ---- English-Hindi ----
en_train_encoded_hin = _encode_sentences(eng_hin_bpe, source_train_hin, "en")
de_train_encoded_hin = _encode_sentences(hin_bpe, target_train_hin, "hi")
en_val_encoded_hin = _encode_sentences(eng_hin_bpe, source_val_hin, "en")
# Placeholder targets, see the Bengali note above.
de_val_encoded_hin = _encode_sentences(hin_bpe, source_val_hin)

# Every row has the same length, so each list converts to a 2-D integer array.
train_x_ben = np.array(en_train_encoded_ben)
train_y_ben = np.array(de_train_encoded_ben)
test_x_ben = np.array(en_val_encoded_ben)
test_y_ben = np.array(de_val_encoded_ben)

train_x_hin = np.array(en_train_encoded_hin)
train_y_hin = np.array(de_train_encoded_hin)
test_x_hin = np.array(en_val_encoded_hin)
test_y_hin = np.array(de_val_encoded_hin)

batch_size = 100

# Training loaders shuffle and drop the last incomplete batch; validation loaders keep
# the original order and every sample so predictions stay aligned with the extracted ids.
train_ds_ben = TensorDataset(torch.from_numpy(train_x_ben), torch.from_numpy(train_y_ben))
test_ds_ben = TensorDataset(torch.from_numpy(test_x_ben), torch.from_numpy(test_y_ben))
train_dl_ben = DataLoader(train_ds_ben, shuffle=True, batch_size=batch_size, drop_last=True)
test_dl_ben = DataLoader(test_ds_ben, shuffle=False, batch_size=batch_size)

train_ds_hin = TensorDataset(torch.from_numpy(train_x_hin), torch.from_numpy(train_y_hin))
test_ds_hin = TensorDataset(torch.from_numpy(test_x_hin), torch.from_numpy(test_y_hin))
train_dl_hin = DataLoader(train_ds_hin, shuffle=True, batch_size=batch_size, drop_last=True)
test_dl_hin = DataLoader(test_ds_hin, shuffle=False, batch_size=batch_size)

In [None]:
# # import torch
# # import torch.nn as nn
# # import torch.optim as optim
# # import numpy as np
# # import pandas as pd

# # class EncoderRNN(nn.Module):
# #     def __init__(self, input_size, hidden_size):
# #         super(EncoderRNN, self).__init__()
# #         self.hidden_size = hidden_size
# #         self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
# #         self.dropout = nn.Dropout(0.3)  # CHANGE: Added dropout for regularization
# #         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2, batch_first=True, dropout=0.3)  # CHANGE: 2-layer GRU

# #     def forward(self, x, hidden):
# #         embedded = self.dropout(self.embedding(x))
# #         output, hidden = self.gru(embedded, hidden)
# #         return output, hidden

# #     def init_hidden(self, batch_size, device):
# #         return torch.zeros(2, batch_size, self.hidden_size, device=device)  # CHANGE: num_layers=2


# # class DecoderRNN(nn.Module):
# #     def __init__(self, hidden_size, output_size):
# #         super(DecoderRNN, self).__init__()
# #         self.hidden_size = hidden_size
# #         self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)
# #         self.dropout = nn.Dropout(0.3)  
# #         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2, batch_first=True, dropout=0.3)  # CHANGE
# #         self.fc = nn.Linear(hidden_size, output_size)

# #     def forward(self, x, hidden):
# #         embedded = self.dropout(self.embedding(x))
# #         output, hidden = self.gru(embedded, hidden)
# #         output = self.fc(output)
# #         return output, hidden



# # def train_model(encoder, decoder, train_dl, tokenizer, epochs, lr=0.0025, teacher_forcing_ratio=0.8):  #CHANGE: lower lr
# #     print("training started : ")

# #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# #     print(device)
# #     encoder.to(device)
# #     decoder.to(device)
# #     criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id["<PAD>"])
# #     enc_opt = optim.AdamW(encoder.parameters(), lr=lr, weight_decay=1e-4)  #CHANGE: AdamW optimizer
# #     dec_opt = optim.AdamW(decoder.parameters(), lr=lr, weight_decay=1e-4)
# #     scheduler = optim.lr_scheduler.ReduceLROnPlateau(enc_opt, 'min', patience=2, factor=0.5)  #CHANGE: LR scheduler
# #     # criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id["<PAD>"])
# #     # enc_opt = optim.Adagrad(encoder.parameters(), lr=lr, weight_decay=1e-5)
# #     # dec_opt = optim.Adagrad(decoder.parameters(), lr=lr, weight_decay=1e-5)
# #     # scheduler = optim.lr_scheduler.ReduceLROnPlateau(enc_opt, 'min', patience=3, factor=0.5)


# #     for epoch in range(epochs):
# #         encoder.train()
# #         decoder.train()
# #         total_loss = 0

# #         for src, tgt in train_dl:
# #             batch_size = src.size(0)
# #             src, tgt = src.to(device), tgt.to(device)
# #             enc_hidden = encoder.init_hidden(batch_size, device)

# #             enc_opt.zero_grad()
# #             dec_opt.zero_grad()

# #             # Encode
# #             _, hidden = encoder(src, enc_hidden)

# #             # Initialize decoder input with <SOS>
# #             dec_input = tgt[:, 0].unsqueeze(1)
# #             dec_hidden = hidden
# #             outputs = []

# #             # Step-by-step decoding
# #             for t in range(1, tgt.size(1)):
# #                 out, dec_hidden = decoder(dec_input, dec_hidden)
# #                 pred = out[:, -1, :]
# #                 outputs.append(pred.unsqueeze(1))

# #                 teacher_force = np.random.rand() < teacher_forcing_ratio
# #                 next_input = tgt[:, t] if teacher_force else pred.argmax(1)
# #                 dec_input = next_input.unsqueeze(1)

# #             outputs = torch.cat(outputs, dim=1)
# #             loss = criterion(outputs.reshape(-1, outputs.shape[-1]), tgt[:, 1:].reshape(-1))
# #             loss.backward()

# #             # CHANGE: Gradient clipping for stability
# #             torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1)
# #             torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1)
# #             enc_opt.step()
# #             dec_opt.step()

# #             total_loss += loss.item()

# #         avg_loss = total_loss / len(train_dl)
# #         scheduler.step(avg_loss)  # CHANGE: adjust learning rate dynamically
# #         print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, LR: {enc_opt.param_groups[0]['lr']:.6f}")


# # def translate(encoder, decoder, dataloader, tokenizer, seq_length=40):
# #     encoder.eval()
# #     decoder.eval()
# #     device = next(encoder.parameters()).device
# #     SOS = tokenizer.token_to_id["<SOS>"]
# #     EOS = tokenizer.token_to_id["<EOS>"]
# #     preds = []

# #     with torch.no_grad():
# #         for src, _ in dataloader:
# #             batch_size = src.size(0)
# #             src = src.to(device)
# #             enc_hidden = encoder.init_hidden(batch_size, device)
# #             _, hidden = encoder(src, enc_hidden)

# #             dec_input = torch.full((batch_size, 1), SOS, dtype=torch.long, device=device)
# #             dec_hidden = hidden
# #             seq_preds = []

# #             for _ in range(seq_length):
# #                 dec_out, dec_hidden = decoder(dec_input, dec_hidden)
# #                 dec_out = dec_out.argmax(-1)
# #                 seq_preds.append(dec_out)
# #                 dec_input = dec_out

# #             seq_preds = torch.stack(seq_preds, dim=1)
# #             for s in seq_preds:
# #                ids = [i.item() for i in s if i.item() not in (SOS, EOS, tokenizer.token_to_id["<PAD>"])]
# #                preds.append(tokenizer.decode(ids))
# #     return preds


# # hidden_size = 256
# # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # encoder_input_size_ben = max(np.max(en_train_encoded_ben), np.max(en_val_encoded_ben)) + 1
# # decoder_output_size_ben = max(np.max(de_train_encoded_ben), np.max(de_val_encoded_ben)) + 1

# # encoder_ben = EncoderRNN(encoder_input_size_ben, hidden_size).to(device)
# # decoder_ben = DecoderRNN(hidden_size, decoder_output_size_ben).to(device)

# # print("Training Bengali model...")
# # train_model(encoder_ben, decoder_ben, train_dl_ben, ben_bpe, epochs=5, lr=0.001, teacher_forcing_ratio=0.6)  #CHANGE: longer training, higher TF ratio



# # encoder_input_size_hin = max(np.max(en_train_encoded_hin), np.max(en_val_encoded_hin)) + 1
# # decoder_output_size_hin = max(np.max(de_train_encoded_hin), np.max(de_val_encoded_hin)) + 1

# # encoder_hin = EncoderRNN(encoder_input_size_hin, hidden_size).to(device)
# # decoder_hin = DecoderRNN(hidden_size, decoder_output_size_hin).to(device)

# # print("\nTraining Hindi model...")
# # train_model(encoder_hin, decoder_hin, train_dl_hin, hin_bpe, epochs=5, lr=0.001, teacher_forcing_ratio=0.6)  #CHANGE



# # print("\nGenerating Bengali translations...")
# # val_outs_ben = translate(encoder_ben, decoder_ben, test_dl_ben, ben_bpe)
# # df_ben = pd.DataFrame({"ID": val_ids_ben, "Translation": val_outs_ben})
# # df_ben.to_csv("answer_ben.csv", index=False)
# # print("Saved Bengali predictions → answer_ben.csv")

# # print("\nGenerating Hindi translations...")
# # val_outs_hin = translate(encoder_hin, decoder_hin, test_dl_hin, hin_bpe)
# # df_hin = pd.DataFrame({"ID": val_ids_hin, "Translation": val_outs_hin})
# # df_hin.to_csv("answer_hi.csv", index=False)
# # print("Saved Hindi predictions → answer_hi.csv")


# import torch
# import torch.nn as nn
# import torch.optim as optim
# import numpy as np
# import pandas as pd



# class EncoderRNN(nn.Module):
#     def __init__(self, input_size, hidden_size=512, num_layers=2, emb_dropout=0.2, rnn_dropout=0.2):
#         super(EncoderRNN, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers

#         self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
#         self.dropout = nn.Dropout(emb_dropout)
#         self.lstm = nn.LSTM(
#             input_size=hidden_size,
#             hidden_size=hidden_size,
#             num_layers=num_layers,
#             batch_first=True,
#             dropout=rnn_dropout if num_layers > 1 else 0.0
#         )

#     def forward(self, x, hidden):
#         """
#         x: (B, T) long
#         hidden:  LSTM (h0, c0)
#         returns:
#           outputs: (B, T, H)
#           hidden: (hT, cT)
#         """
#         B = x.size(0)
#         device = x.device

#         h0, c0 = self._to_lstm_hidden(hidden, B, device)
#         emb = self.dropout(self.embedding(x))  # (B,T,H)
#         outputs, (hT, cT) = self.lstm(emb, (h0, c0))
#         return outputs, (hT, cT)

#     def init_hidden(self, batch_size, device):
#         """
#         Backward-compatible: returns GRU-style h0 (num_layers,B,H).
#         We create c0 internally in forward().
#         """
#         return torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)

#     def _to_lstm_hidden(self, hidden, batch_size, device):
#         if isinstance(hidden, tuple):
#             h0, c0 = hidden
#         else:
#             h0 = hidden
#             c0 = torch.zeros_like(h0, device=device)
#         return h0, c0


# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size=512, output_size=None, num_layers=2, emb_dropout=0.3, rnn_dropout=0.3):
#         super(DecoderRNN, self).__init__()
#         assert output_size is not None, "output_size (target vocab size)"
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.output_size = output_size

#         self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)
#         self.dropout = nn.Dropout(emb_dropout)
#         self.lstm = nn.LSTM(
#             input_size=hidden_size,
#             hidden_size=hidden_size,
#             num_layers=num_layers,
#             batch_first=True,
#             dropout=rnn_dropout if num_layers > 1 else 0.0
#         )
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x, hidden):
#         """
#         x: (B,T) or (B,) or (1,B)
#         hidden: LSTM (h,c)
#         returns:
#           logits: (B,T,V) or (B,V) for single-step
#           hidden: (hT, cT)
#         """
#         x, squeeze_back = self._normalize_input_shape(x)
#         B = x.size(0)
#         device = x.device

#         h0, c0 = self._to_lstm_hidden(hidden, B, device)

#         emb = self.dropout(self.embedding(x))        # (B,T,H)
#         out, (hT, cT) = self.lstm(emb, (h0, c0))     # (B,T,H)
#         logits = self.fc(out)                        # (B,T,V)

#         if squeeze_back:
#             logits = logits.squeeze(1)               # (B,V)
#         return logits, (hT, cT)

#     def _normalize_input_shape(self, x):
#         squeeze_back = False
#         if x.dim() == 1:
#             x = x.unsqueeze(1)         # (B,) -> (B,1)
#             squeeze_back = True
#         elif x.dim() == 2 and x.size(0) == 1:
#             x = x.transpose(0, 1)      # (1,B) -> (B,1)
#             squeeze_back = True
#         return x, squeeze_back

#     def _to_lstm_hidden(self, hidden, batch_size, device):
#         if isinstance(hidden, tuple):
#             return hidden
#         else:
#             h = hidden
#             c = torch.zeros_like(h, device=device)
#             return (h, c)


# def train_model(encoder, decoder, train_dl, tokenizer, epochs=600, lr=0.0001, teacher_forcing_ratio=0.78):
#     """
#     train_dl yield: (src, tgt)
#       - src: (B, T_src) LongTensor
#       - tgt: (B, T_tgt) LongTensor with tgt[:,0] == <SOS>
#     tokenizer: used only to get PAD id for loss ignore
#     """
#     print("training started : ")
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     # print("device:", device)

#     encoder.to(device)
#     decoder.to(device)

#     pad_id = tokenizer.token_to_id["<PAD>"]
#     criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

#     enc_opt = optim.AdamW(encoder.parameters(), lr=lr, weight_decay=1e-4)
#     dec_opt = optim.AdamW(decoder.parameters(), lr=lr, weight_decay=1e-4)
#     scheduler = optim.lr_scheduler.ReduceLROnPlateau(enc_opt, mode='min', patience=2, factor=0.5)

#     for epoch in range(epochs):
#         encoder.train()
#         decoder.train()
#         total_loss = 0.0

#         for src, tgt in train_dl:
#             src, tgt = src.to(device), tgt.to(device)
#             batch_size = src.size(0)

#             # init encoder hidden (GRU-style h0; c0 is created internally)
#             enc_hidden = encoder.init_hidden(batch_size, device)

#             enc_opt.zero_grad(set_to_none=True)
#             dec_opt.zero_grad(set_to_none=True)

#             # Encode
#             _, enc_last = encoder(src, enc_hidden)    # enc_last is (hT, cT)

#             # Decode with teacher forcing
#             dec_hidden = enc_last
#             dec_input = tgt[:, 0]                     # (B,) <SOS>
#             outputs = []

#             for t in range(1, tgt.size(1)):
#                 # Decoder step: returns logits (B,V) for single-step input
#                 step_logits, dec_hidden = decoder(dec_input, dec_hidden)   # (B,V)
#                 outputs.append(step_logits.unsqueeze(1))                   # (B,1,V)

#                 teacher_force = (np.random.rand() < teacher_forcing_ratio)
#                 next_input = tgt[:, t] if teacher_force else step_logits.argmax(-1)
#                 dec_input = next_input

#             outputs = torch.cat(outputs, dim=1)       # (B, T_tgt-1, V)
#             loss = criterion(
#                 outputs.reshape(-1, outputs.size(-1)),   # (B*(T-1), V)
#                 tgt[:, 1:].reshape(-1)                   # (B*(T-1),)
#             )

#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
#             torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1.0)
#             enc_opt.step()
#             dec_opt.step()

#             total_loss += float(loss.item())

#         avg_loss = total_loss / max(1, len(train_dl))
#         scheduler.step(avg_loss)
#         print(f"Epoch {epoch+1}/{epochs} | loss: {avg_loss:.4f} | lr: {enc_opt.param_groups[0]['lr']:.6f}")


# @torch.no_grad()
# def translate(encoder, decoder, dataloader, tokenizer, seq_length=40):
#     encoder.eval()
#     decoder.eval()
#     device = next(encoder.parameters()).device

#     SOS = tokenizer.token_to_id["<SOS>"]
#     EOS = tokenizer.token_to_id["<EOS>"]
#     PAD = tokenizer.token_to_id["<PAD>"]

#     preds = []
#     for src, _ in dataloader:
#         src = src.to(device)
#         B = src.size(0)

#         enc_hidden = encoder.init_hidden(B, device)
#         _, enc_last = encoder(src, enc_hidden)

#         dec_hidden = enc_last
#         dec_input = torch.full((B,), SOS, dtype=torch.long, device=device)

#         seq_preds = []
#         for _ in range(seq_length):
#             step_logits, dec_hidden = decoder(dec_input, dec_hidden)  # (B,V)
#             next_ids = step_logits.argmax(-1)                         # (B,)
#             seq_preds.append(next_ids.unsqueeze(1))                   # (B,1)
#             dec_input = next_ids

#         seq_preds = torch.cat(seq_preds, dim=1)  # (B, seq_length)

#         # detokenize each sequence
#         for s in seq_preds:
#             ids = [i.item() for i in s if i.item() not in (SOS, EOS, PAD)]
#             preds.append(tokenizer.decode(ids))
#     return preds

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# ======================
#  BiLSTM Encoder (no extra methods; only __init__ and forward)
# ======================

class EncoderRNN(nn.Module):
    """
    Bidirectional multi-layer LSTM encoder with a bridge to the decoder state.

    Args:
        input_size: source vocabulary size (number of embedding rows).
        hidden_size: model width; must be even because each direction of the LSTM
            uses hidden_size // 2 units, so the concatenated output is hidden_size wide.
        num_layers: number of stacked LSTM layers (also the number of decoder layers
            the initial state is replicated for).
        emb_dropout: dropout applied to the embeddings.
        rnn_dropout: inter-layer LSTM dropout (only used when num_layers > 1).
        padding_idx: embedding row reserved for padding.
    """
    def __init__(self, input_size, hidden_size=512, num_layers=2,
                 emb_dropout=0.2, rnn_dropout=0.2, padding_idx=0):
        super().__init__()
        assert hidden_size % 2 == 0, "hidden_size must be even for bidirectional encoder."
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.h_per_dir = hidden_size // 2

        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=padding_idx)
        self.dropout = nn.Dropout(emb_dropout)
        inter_layer_dropout = rnn_dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(
            input_size=hidden_size,
            hidden_size=self.h_per_dir,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=inter_layer_dropout,
        )

        # Bridge layers: project the fused top-layer states to the decoder's initial
        # hidden state and cell state respectively.
        self.fc_hidden = nn.Linear(hidden_size, hidden_size)
        self.fc_cell = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        """
        Encode a batch of source token ids.

        Args:
            x: LongTensor of shape (B, T_src).

        Returns:
            (outputs, (h0, c0)) where outputs is (B, T_src, hidden_size) and h0 / c0 are
            (num_layers, B, hidden_size) initial decoder states, identical for every layer.
        """
        embedded = self.dropout(self.embedding(x))
        outputs, (h_n, c_n) = self.rnn(embedded)

        def bridge(states, projection):
            # states[-2] / states[-1] are the forward / backward states of the top layer.
            fused = torch.cat((states[-2], states[-1]), dim=1)       # (B, hidden_size)
            initial = torch.tanh(projection(fused))                  # (B, hidden_size)
            return initial.unsqueeze(0).repeat(self.num_layers, 1, 1)

        return outputs, (bridge(h_n, self.fc_hidden), bridge(c_n, self.fc_cell))


# ======================
#  Luong-Attention Decoder (no extra methods; only __init__ and forward)
# ======================

class DecoderRNN(nn.Module):
    """
    Single-step unidirectional LSTM decoder with bilinear ("general") attention.

    The attention query is the top-layer hidden state carried over from the previous
    step; the resulting context vector is concatenated with the token embedding and fed
    into the LSTM.

    Args:
        hidden_size: decoder width; must equal the width of the encoder outputs.
        output_size: target vocabulary size (required).
        num_layers: number of stacked LSTM layers.
        emb_dropout: dropout applied to the token embedding.
        rnn_dropout: inter-layer LSTM dropout (only used when num_layers > 1).
        padding_idx: embedding row reserved for padding.
    """
    def __init__(self, hidden_size=512, output_size=None, num_layers=2,
                 emb_dropout=0.3, rnn_dropout=0.3, padding_idx=0):
        super().__init__()
        assert output_size is not None, "output_size must be provided for decoder."

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=padding_idx)
        self.dropout = nn.Dropout(emb_dropout)

        # score(h_t, h_s) = h_t^T W h_s, with W applied to the encoder side.
        self.attn_proj_enc = nn.Linear(hidden_size, hidden_size, bias=False)

        # Each step consumes [embedding ; context], hence the doubled input width.
        inter_layer_dropout = rnn_dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(
            input_size=2 * hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, encoder_outputs):
        """
        Run one decoding step.

        Args:
            x: previous token ids, shaped (B,), (B, 1) or (1, B).
            hidden: tuple (h, c), each (num_layers, B, hidden_size).
            encoder_outputs: (B, T_src, hidden_size).

        Returns:
            (logits, (h_next, c_next)) with logits of shape (B, output_size).
        """
        # Bring the token ids to shape (B, 1).
        if x.dim() == 1:
            x = x.unsqueeze(1)
        elif x.dim() == 2 and x.size(0) == 1:
            x = x.transpose(0, 1)

        prev_h, prev_c = hidden
        query = prev_h[-1].unsqueeze(2)                              # (B, hidden, 1)

        # Attention distribution over the source positions.
        keys = self.attn_proj_enc(encoder_outputs)                   # (B, T, hidden)
        scores = torch.bmm(keys, query).squeeze(2)                   # (B, T)
        weights = torch.softmax(scores, dim=1)                       # (B, T)

        # Weighted sum of the encoder outputs.
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)  # (B, hidden)

        token_emb = self.dropout(self.embedding(x).squeeze(1))       # (B, hidden)
        step_input = torch.cat([token_emb, context], dim=1).unsqueeze(1)  # (B, 1, 2*hidden)

        out, (h_next, c_next) = self.rnn(step_input, (prev_h, prev_c))
        logits = self.fc_out(out.squeeze(1))                         # (B, V)
        return logits, (h_next, c_next)


# ==============
#  Seq2Seq wrapper (internal)
# ==============

class Seq2Seq(nn.Module):
    """Bundle an encoder and an attention decoder behind a two-call API.

    ``encode`` runs the encoder once and remembers its result; ``decode_step``
    then hands the remembered encoder outputs to the decoder on every step, so
    callers only need to thread the recurrent state through their own loop.
    """

    # Annotations are string forward references so that defining this class
    # does not require the encoder/decoder classes to be bound at that moment.
    def __init__(self, encoder: "EncoderRNN", decoder: "DecoderRNN"):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Result of the latest encode() call, (enc_outputs, (h0, c0)); None until then.
        self._enc_cache = None

    def encode(self, src):
        """Run the encoder on ``src``, cache its return value and return it.

        As used by ``decode_step``, the encoder is expected to return a pair
        whose first element is the encoder outputs.
        """
        self._enc_cache = self.encoder(src)   # (enc_outputs, (h0,c0))
        return self._enc_cache

    def decode_step(self, prev_tokens, hidden):
        """Advance the decoder by one step using the cached encoder outputs.

        Args:
            prev_tokens: token ids fed to the decoder for this step.
            hidden: recurrent state to continue from.

        Returns:
            (logits, next_hidden) exactly as produced by the decoder.

        Raises:
            RuntimeError: if called before ``encode`` has populated the cache.
        """
        if self._enc_cache is None:
            # Without this guard the unpacking below fails with an opaque
            # "cannot unpack non-iterable NoneType" TypeError.
            raise RuntimeError(
                "Seq2Seq.decode_step() called before encode(): no encoder outputs are cached."
            )
        enc_outputs, _ = self._enc_cache
        logits, next_hidden = self.decoder(prev_tokens, hidden, enc_outputs)
        return logits, next_hidden


# ======================
#  Training / Inference (unchanged signatures)
# ======================

def train_model(encoder, decoder, train_dl, tokenizer, epochs=20, lr=3e-4, teacher_forcing_ratio=0.7):
    """
    Train an encoder/decoder pair with step-wise (partially teacher-forced) decoding.

    Unchanged signature. Uses Seq2Seq internally.

    Args:
        encoder, decoder: modules wrapped in ``Seq2Seq``; they are moved to the
            selected device and their parameters are updated in place.
        train_dl: iterable of ``(src, tgt)`` tensor batches. ``tgt[:, 0]`` is
            fed as the first decoder input and ``tgt[:, 1:]`` are the targets.
        tokenizer: object whose ``token_to_id`` mapping contains ``"<PAD>"``;
            that id is ignored by the loss.
        epochs: number of passes over ``train_dl``.
        lr: initial AdamW learning rate for both encoder and decoder.
        teacher_forcing_ratio: per-step probability of feeding the gold token
            instead of the model's own argmax prediction.

    Returns:
        None. Progress is printed once per epoch.
    """
    print("training started : ")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    model = Seq2Seq(encoder.to(device), decoder.to(device)).to(device)

    PAD = tokenizer.token_to_id["<PAD>"]
    criterion = nn.CrossEntropyLoss(ignore_index=PAD)

    # torch.optim is reachable from the `torch` import, so no separate alias is needed.
    enc_opt = torch.optim.AdamW(model.encoder.parameters(), lr=lr, weight_decay=1e-4)
    dec_opt = torch.optim.AdamW(model.decoder.parameters(), lr=lr, weight_decay=1e-4)
    # One plateau scheduler per optimizer: scheduling only the encoder would
    # leave the decoder at the initial learning rate for the whole run.
    schedulers = [
        torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', patience=2, factor=0.5)
        for opt in (enc_opt, dec_opt)
    ]

    for epoch in range(epochs):
        model.train()
        total = 0.0

        for src, tgt in train_dl:
            src, tgt = src.to(device), tgt.to(device)

            enc_opt.zero_grad(set_to_none=True)
            dec_opt.zero_grad(set_to_none=True)

            # ---- encode ----
            # Encoder outputs are cached inside the model; only the state is needed here.
            _, dec_hidden = model.encode(src)

            # ---- decode (teacher forcing) ----
            dec_inp = tgt[:, 0]                          # (B,) <SOS>
            logits_steps = []
            for t in range(1, tgt.size(1)):
                step_logits, dec_hidden = model.decode_step(dec_inp, dec_hidden)  # (B,V)
                logits_steps.append(step_logits.unsqueeze(1))
                # Coin flip per step: gold token vs. the model's own greedy guess.
                use_tf = (np.random.rand() < teacher_forcing_ratio)
                dec_inp = tgt[:, t] if use_tf else step_logits.argmax(-1)

            logits = torch.cat(logits_steps, dim=1)                # (B, T-1, V)
            loss = criterion(logits.reshape(-1, logits.size(-1)),  # (B*(T-1), V)
                             tgt[:, 1:].reshape(-1))               # (B*(T-1),)

            loss.backward()
            # Encoder and decoder gradients are clipped independently.
            torch.nn.utils.clip_grad_norm_(model.encoder.parameters(), 1.0)
            torch.nn.utils.clip_grad_norm_(model.decoder.parameters(), 1.0)
            enc_opt.step()
            dec_opt.step()

            total += float(loss.item())

        avg = total / max(1, len(train_dl))
        # Both schedulers see the same metric, so the two learning rates stay in sync.
        for scheduler in schedulers:
            scheduler.step(avg)
        print(f"Epoch {epoch+1}/{epochs} | loss: {avg:.4f} | lr: {enc_opt.param_groups[0]['lr']:.6f}")


@torch.no_grad()
def translate(encoder, decoder, dataloader, tokenizer, seq_length=40):
    """
    Greedy-decode one output sequence for every source row in ``dataloader``.

    Unchanged signature. Uses Seq2Seq internally.

    Args:
        encoder, decoder: trained modules; run on the encoder's current device
            and switched to eval mode.
        dataloader: iterable of 2-tuples whose first element is the source
            tensor batch (the second element is ignored).
        tokenizer: object with a ``token_to_id`` mapping containing
            ``"<SOS>"``, ``"<EOS>"`` and ``"<PAD>"``, and a ``decode(ids)`` method.
        seq_length: maximum number of tokens generated per sequence.

    Returns:
        list with one ``tokenizer.decode(...)`` result per source row, in
        iteration order. Each sequence is cut at its first ``<EOS>``; special
        ids are removed before decoding.
    """
    device = next(encoder.parameters()).device
    model = Seq2Seq(encoder.to(device), decoder.to(device)).eval()

    SOS = tokenizer.token_to_id["<SOS>"]
    EOS = tokenizer.token_to_id["<EOS>"]
    PAD = tokenizer.token_to_id["<PAD>"]
    special_ids = (SOS, EOS, PAD)

    preds = []
    for src, _ in dataloader:
        src = src.to(device)
        # Encoder outputs are cached inside the model; only the state is threaded here.
        _, dec_hidden = model.encode(src)

        B = src.size(0)
        dec_inp = torch.full((B,), SOS, dtype=torch.long, device=device)
        finished = torch.zeros(B, dtype=torch.bool, device=device)

        steps = []
        for _step in range(seq_length):
            step_logits, dec_hidden = model.decode_step(dec_inp, dec_hidden)  # (B,V)
            next_ids = step_logits.argmax(-1)
            steps.append(next_ids)
            # Stop as soon as every row has produced <EOS>; later tokens are discarded anyway.
            finished |= next_ids == EOS
            if bool(finished.all()):
                break
            dec_inp = next_ids

        # One Python list of ids per row; an empty row when no step was taken.
        rows = torch.stack(steps, dim=1).tolist() if steps else [[] for _ in range(B)]
        for row in rows:
            # Whatever follows the first <EOS> is not part of the translation,
            # so truncate before filtering.
            if EOS in row:
                row = row[:row.index(EOS)]
            preds.append(tokenizer.decode([tok for tok in row if tok not in special_ids]))
    return preds

# ---- Hyperparameters shared by both language pairs ----
HIDDEN_SIZE = 1024  # passed as hidden_size to encoder and decoder; also embedding dim
EPOCHS = 70         # training epochs per language pair
LR = 0.0001         # initial learning rate handed to train_model
TF_RATIO = 0.78     # teacher-forcing probability handed to train_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Bengali
# Vocabulary sizes are derived as (largest token id in the encoded train/val arrays) + 1.
# NOTE(review): presumably en_* hold source-side ids and de_* target-side ids — confirm
# against the cell that builds them.
encoder_input_size_ben = int(max(np.max(en_train_encoded_ben), np.max(en_val_encoded_ben))) + 1
decoder_output_size_ben = int(max(np.max(de_train_encoded_ben), np.max(de_val_encoded_ben))) + 1

encoder_ben = EncoderRNN(encoder_input_size_ben, hidden_size=HIDDEN_SIZE).to(device)
decoder_ben = DecoderRNN(hidden_size=HIDDEN_SIZE, output_size=decoder_output_size_ben).to(device)

print("Training Bengali model...")
train_model(encoder_ben, decoder_ben, train_dl_ben, ben_bpe,
            epochs=EPOCHS, lr=LR, teacher_forcing_ratio=TF_RATIO)

# Hindi
# Same recipe as Bengali, with the Hindi arrays, dataloader and tokenizer.
encoder_input_size_hin = int(max(np.max(en_train_encoded_hin), np.max(en_val_encoded_hin))) + 1
decoder_output_size_hin = int(max(np.max(de_train_encoded_hin), np.max(de_val_encoded_hin))) + 1

encoder_hin = EncoderRNN(encoder_input_size_hin, hidden_size=HIDDEN_SIZE).to(device)
decoder_hin = DecoderRNN(hidden_size=HIDDEN_SIZE, output_size=decoder_output_size_hin).to(device)

print("\nTraining Hindi model...")
train_model(encoder_hin, decoder_hin, train_dl_hin, hin_bpe,
            epochs=EPOCHS, lr=LR, teacher_forcing_ratio=TF_RATIO)

# ---- Inference: decode the test dataloaders and save one CSV per language ----
# NOTE(review): predictions are paired positionally with val_ids_*; this assumes
# test_dl_* iterates in the same order as val_ids_* (no shuffling) — confirm.
print("\nGenerating Bengali translations...")
val_outs_ben = translate(encoder_ben, decoder_ben, test_dl_ben, ben_bpe)
df_ben = pd.DataFrame({"ID": val_ids_ben, "Translation": val_outs_ben})
df_ben.to_csv("answer_ben.csv", index=False)
print("Saved Bengali predictions → answer_ben.csv")

print("\nGenerating Hindi translations")
val_outs_hin = translate(encoder_hin, decoder_hin, test_dl_hin, hin_bpe)
df_hin = pd.DataFrame({"ID": val_ids_hin, "Translation": val_outs_hin})
df_hin.to_csv("answer_hi.csv", index=False)
print("Saved Hindi predictions → answer_hi.csv")

In [None]:
# Merge the per-language prediction files into the single tab-separated submission.
print("\nCreating final submission file")
import pandas as pd
# NOTE(review): the training cell writes answer_ben.csv / answer_hi.csv to the working
# directory, while these paths point at Kaggle input datasets — confirm they hold those files.
df_ben = pd.read_csv("/kaggle/input/erfiof")
df_hin = pd.read_csv("/kaggle/input/fbthth")

# Combine Bengali first, then Hindi
combined_data = pd.concat([df_ben, df_hin], axis=0, ignore_index=True)

# Save in the exact format required: a header line, then one "ID<TAB>Translation" row each.
# The encoding is set explicitly because the translations are Bengali/Hindi text.
with open("/kaggle/working/answer.csv", "w", encoding="utf-8") as f:
    f.write("ID\tTranslation\n")
    for row_id, translation in zip(combined_data["ID"], combined_data["Translation"]):
        # read_csv turns an empty prediction into NaN; write it back as an empty
        # field rather than the literal text "nan".
        if pd.isna(translation):
            translation = ""
        f.write(f"{row_id}\t{translation}\n")

print("Final submission file created → answer.csv")
print(f"Total rows: {combined_data.shape[0] + 1} (including header)")  # +1 for header
print(f"Bengali entries: {len(df_ben)}")
print(f"Hindi entries: {len(df_hin)}")