In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import spacy
from utils import build_phrase_vocab
from preprocessing import preprocess_with_phrases  # fixed typo: pre_processing -> preprocessing

In [None]:
# Length settings forwarded to preprocess_with_phrases; sequence_len is also
# the fixed length used later for padding and for the model configuration.
min_len = 5
sequence_len = 128

# Toy parallel corpus — replace with your own English-Hindi sentence pairs.
en_texts = [
    "Hello world.",
    "How are you?",
]
hi_texts = [
    "नमस्ते दुनिया।",
    "आप कैसे हैं?",
]

# Preprocess both sides of the corpus. Phrase extraction is performed on the
# English sentences; the resulting tags come back as phrase_tags.
en_proc, hi_proc, phrase_tags = preprocess_with_phrases(
    en_texts,
    hi_texts,
    min_len,
    sequence_len,
)

# NOTE(review): build_phrase_vocab is called with no arguments — confirm the
# tag vocabulary is fixed and does not need to be derived from phrase_tags.
phrase2idx = build_phrase_vocab()

In [None]:
class CharPhraseDataset(Dataset):
    """Parallel dataset of index-encoded sequences plus phrase tags.

    Every item is a triple of tensors ``(x, y, p)``. Each one is truncated or
    right-padded with 0 so that it holds exactly ``sequence_len`` entries.
    In this notebook ``x`` is the English side and ``y`` the Hindi side.

    Args:
        x: indexable collection of source sequences, iterated symbol by symbol.
        y: indexable collection of target sequences, iterated symbol by symbol.
        phrases: indexable collection of phrase-tag sequences.
        sequence_len: fixed length of every returned tensor.
        ch2i: mapping from symbol to index, shared by ``x`` and ``y``;
            symbols missing from it are encoded as 0.
        phrase2idx: mapping from phrase tag to index; missing tags become 0.
    """

    def __init__(self, x, y, phrases, sequence_len, ch2i, phrase2idx):
        self.x = x
        self.y = y
        self.phrases = phrases
        self.sequence_len = sequence_len
        self.ch2i = ch2i
        self.phrase2idx = phrase2idx

    def __len__(self):
        """Return the number of examples, measured on the source side."""
        return len(self.x)

    def _encode(self, symbols, table):
        """Map ``symbols`` through ``table`` (0 when absent) and fit to ``sequence_len``."""
        limit = self.sequence_len
        ids = [table.get(symbol, 0) for symbol in symbols]
        # A negative repeat count yields an empty list, so over-long inputs
        # are simply cut by the slice and receive no padding.
        return ids[:limit] + [0] * (limit - len(ids))

    def __getitem__(self, idx):
        """Return the ``(x, y, p)`` tensors for example ``idx``."""
        source = self._encode(self.x[idx], self.ch2i)
        target = self._encode(self.y[idx], self.ch2i)
        tags = self._encode(self.phrases[idx], self.phrase2idx)
        return torch.tensor(source), torch.tensor(target), torch.tensor(tags)

In [None]:
# Character inventory over the preprocessed English and Hindi text combined.
chars = set(
    ''.join(en_proc + hi_proc)
)
# Index 0 is reserved for '<pad>'; the characters follow in sorted order.
ch2i = dict(
    (symbol, index)
    for index, symbol in enumerate(['<pad>', *sorted(chars)])
)

In [None]:
# Wrap the preprocessed corpus for training: English (en_proc) is the source
# side, Hindi (hi_proc) the target side, phrase_tags the phrase feature.
dataset = CharPhraseDataset(
    x=en_proc,
    y=hi_proc,
    phrases=phrase_tags,
    sequence_len=sequence_len,
    ch2i=ch2i,
    phrase2idx=phrase2idx,
)

In [None]:
from transformer import TransformerConfig, TransformerWithPhrase

# Model for English-to-Hindi translation. The vocabulary size comes from the
# shared character table and the phrase vocabulary size from phrase2idx.
# phrase_emb_dim is passed to both the config and the model constructor, so
# keep the two values in sync when changing it.
mconfig = TransformerConfig(
    vocab_size=len(ch2i),
    sequence_len=sequence_len,
    nblock=4,
    nhead=8,
    embed_dim=256,
    phrase_emb_dim=16,
)
model = TransformerWithPhrase(
    mconfig,
    phrase_vocab_size=len(phrase2idx),
    phrase_emb_dim=16,
)

In [None]:
from trainer import Trainer, TrainerConfig

# Training run for English-to-Hindi translation; uses CUDA when PyTorch
# reports a GPU and falls back to the CPU otherwise.
trainer_config = TrainerConfig(
    max_epochs=10,
    batch_size=64,
    learning_rate=3e-4,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
trainer = Trainer(model, dataset, trainer_config)
trainer.train()

In [None]:
# Example: translate a few held-out English sentences to Hindi.
test_sents = ["This is a test.", "Translate this sentence."]


def _fit_to_len(ids, length, pad_id=0):
    """Truncate or right-pad `ids` so the result has exactly `length` items.

    Mirrors the truncate-then-pad rule in CharPhraseDataset.__getitem__, so
    inference inputs have the same fixed length as training inputs even when
    a sentence is longer than `length`.
    """
    clipped = list(ids)[:length]
    return clipped + [pad_id] * (length - len(clipped))


# NOTE(review): extract_7_phrases is not imported or defined in any visible
# cell — confirm which module provides it and import it in the first cell,
# otherwise this line raises NameError on a fresh kernel.
test_phrases = [extract_7_phrases(s) for s in test_sents]  # phrase extraction on English

# Encode tags and characters (unknown entries map to 0) and force every row
# to sequence_len so the rows stack into a rectangular tensor.
test_p = [
    _fit_to_len([phrase2idx.get(tag, 0) for tag in tags], sequence_len)
    for tags in test_phrases
]
test_x = [
    _fit_to_len([ch2i.get(c, 0) for c in s], sequence_len)
    for s in test_sents
]

device = trainer_config.device
test_x = torch.tensor(test_x, dtype=torch.long).to(device)
test_p = torch.tensor(test_p, dtype=torch.long).to(device)

# Switch to inference mode so layers such as dropout are disabled during
# generation. Assumes model is a torch.nn.Module — TODO confirm.
model.eval()
with torch.no_grad():
    translations = model.generate(test_x, test_p)

# Decoding: convert output indices to text
# Build inverse mapping
i2ch = {i: c for c, i in ch2i.items()}

def decode(indices):
    return ''.join([i2ch.get(idx, '') for idx in indices if idx != 0])

for sent in translations.cpu().numpy():
    print(decode(sent))