In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import spacy
from utils import build_phrase_vocab
from preprocessing import preprocess_with_phrases  # fixed typo: pre_processing -> preprocessing

In [None]:
# Length limits handed to preprocess_with_phrases (used there for filtering) and
# reused later as the fixed pad/trim length of every model input.
sequence_len = 128  # Maximum number of characters in a sentence
min_len = 5         # Minimum number of characters in a sentence

# Parallel English-Hindi training corpus: en_texts[i] is paired with hi_texts[i].
en_texts = ["Hello world.", "How are you?"]  # List of English sentences
hi_texts = ["नमस्ते दुनिया।", "आप कैसे हैं?"]  # List of corresponding Hindi sentences

# The two lists are paired purely by position, so a length mismatch would
# silently misalign every following pair. Fail loudly instead.
assert len(en_texts) == len(hi_texts), (
    f"en_texts has {len(en_texts)} sentences but hi_texts has {len(hi_texts)}"
)

# Preprocess the data: clean, filter by length, and extract phrase tags from English
en_proc, hi_proc, phrase_tags = preprocess_with_phrases(en_texts, hi_texts, min_len, sequence_len)

# CharPhraseDataset indexes all three outputs with one shared index, so they
# must stay aligned and must not be empty after filtering.
assert len(en_proc) == len(hi_proc) == len(phrase_tags), (
    "preprocess_with_phrases returned misaligned outputs: "
    f"{len(en_proc)} / {len(hi_proc)} / {len(phrase_tags)}"
)
assert len(en_proc) > 0, "no sentence pairs survived preprocessing; check min_len / sequence_len"

# Build a vocabulary for phrase tags
phrase2idx = build_phrase_vocab()

In [None]:
class CharPhraseDataset(Dataset):
    """Parallel character-level dataset that also carries phrase tags.

    Indexing yields a triple of fixed-length index tensors
    ``(source_chars, target_chars, phrase_tags)``. Index 0 is used both as
    padding and as the fallback for symbols missing from a vocabulary.

    Args:
        x: Source sentences (each an iterable of characters).
        y: Target sentences, aligned with ``x`` by position.
        phrases: Per-sentence sequences of phrase tags, aligned with ``x``.
        sequence_len: Length every returned sequence is trimmed/padded to.
        ch2i: Mapping from character to integer index.
        phrase2idx: Mapping from phrase tag to integer index.
    """

    def __init__(self, x, y, phrases, sequence_len, ch2i, phrase2idx):
        self.x = x
        self.y = y
        self.phrases = phrases
        self.sequence_len = sequence_len
        self.ch2i = ch2i
        self.phrase2idx = phrase2idx

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.x)

    def _encode(self, symbols, vocab):
        """Map ``symbols`` through ``vocab`` (unknown -> 0), then trim/pad to ``sequence_len``."""
        limit = self.sequence_len
        ids = [vocab.get(symbol, 0) for symbol in symbols][:limit]
        # A non-positive repeat count yields an empty list, so full rows get no padding.
        ids.extend([0] * (limit - len(ids)))
        return torch.tensor(ids)

    def __getitem__(self, idx):
        """Return ``(x, y, phrases)`` index tensors for the pair at ``idx``."""
        source = self._encode(self.x[idx], self.ch2i)
        target = self._encode(self.y[idx], self.ch2i)
        tags = self._encode(self.phrases[idx], self.phrase2idx)
        return source, target, tags

In [None]:
# Collect every distinct character that occurs in the processed English and Hindi sentences
chars = set(''.join(en_proc + hi_proc))
# Character -> index table: '<pad>' takes index 0, the remaining characters follow in sorted order
ch2i = {symbol: position for position, symbol in enumerate(['<pad>', *sorted(chars)])}

In [None]:
# Wrap the processed sentence pairs, phrase tags and vocabularies in the training dataset
dataset = CharPhraseDataset(
    x=en_proc,
    y=hi_proc,
    phrases=phrase_tags,
    sequence_len=sequence_len,
    ch2i=ch2i,
    phrase2idx=phrase2idx,
)

In [None]:
# Import the model and configuration classes
from transformer import TransformerConfig, TransformerWithPhrase

# Seed the random number generators before the model is built so that a
# Restart & Run All reproduces the same run (presumably weight initialisation
# and any later shuffling draw from these generators - confirm in transformer/trainer).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set up the model configuration (adjust parameters as needed)
mconfig = TransformerConfig(
    vocab_size=len(ch2i),           # Number of unique characters (including <pad>)
    sequence_len=sequence_len,      # Maximum sequence length
    nblock=4,                       # Number of transformer blocks
    nhead=8,                        # Number of attention heads
    embed_dim=256,                  # Embedding dimension
    phrase_emb_dim=16,              # Phrase embedding dimension
)
# Create the transformer model; the phrase-tag vocabulary size is passed separately
model = TransformerWithPhrase(mconfig, phrase_vocab_size=len(phrase2idx))

In [None]:
# Import the training classes
from trainer import Trainer, TrainerConfig

# Training hyper-parameters (adjust as needed); use the GPU when one is available
trainer_config = TrainerConfig(
    max_epochs=10,
    batch_size=64,
    learning_rate=3e-4,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Build the trainer around the model and dataset, then run training
trainer = Trainer(model, dataset, trainer_config)
trainer.train()

Epoch 1, Loss: 3.7379
Epoch 2, Loss: 3.0042
Epoch 3, Loss: 2.7939
Epoch 3, Loss: 2.7939
Epoch 4, Loss: 2.6110
Epoch 4, Loss: 2.6110
Epoch 5, Loss: 2.3312
Epoch 6, Loss: 1.8875
Epoch 5, Loss: 2.3312
Epoch 6, Loss: 1.8875
Epoch 7, Loss: 1.2686
Epoch 8, Loss: 0.7959
Epoch 7, Loss: 1.2686
Epoch 8, Loss: 0.7959
Epoch 9, Loss: 0.4958
Epoch 10, Loss: 0.3126
Epoch 9, Loss: 0.4958
Epoch 10, Loss: 0.3126


In [None]:
# Test the model with new English sentences
from preprocessing import extract_7_phrases

# List of English sentences to translate
test_sents = ["This is a test.", "Translate this sentence."]

# Extract phrase tags for each test sentence
test_phrases = [extract_7_phrases(s) for s in test_sents]


def pad_or_trim(ids, length, pad_idx=0):
    """Return ``ids`` as a list of exactly ``length`` items.

    Longer inputs are cut to ``length``; shorter ones are right-padded with
    ``pad_idx``. This mirrors the pad/trim step in CharPhraseDataset.__getitem__
    so inference inputs have the same fixed length as training inputs.
    """
    ids = list(ids)[:length]
    return ids + [pad_idx] * (length - len(ids))


# Convert phrase tags and characters to indices (unknown -> 0), then pad/trim to
# sequence_len. Trimming matters: without it a sentence longer than sequence_len
# would produce a ragged batch or a row longer than the configured maximum length.
test_p_ids = [pad_or_trim([phrase2idx.get(tag, 0) for tag in tags], sequence_len) for tags in test_phrases]
test_x_ids = [pad_or_trim([ch2i.get(c, 0) for c in s], sequence_len) for s in test_sents]

# Move tensors to the correct device (CPU or GPU)
device = trainer_config.device
test_x = torch.tensor(test_x_ids, dtype=torch.long).to(device)
test_p = torch.tensor(test_p_ids, dtype=torch.long).to(device)

# Switch to inference mode so dropout and similar train-only layers are disabled
# (assumes model is a torch.nn.Module - confirm in transformer).
model.eval()

# Generate translations using the trained model
with torch.no_grad():
    translations = model.generate(test_x, test_p)

# Inverse vocabulary: index -> character
i2ch = dict((index, char) for char, index in ch2i.items())

def decode(indices):
    """Turn a sequence of character indices into text.

    Index 0 (padding) is skipped; indices missing from ``i2ch`` contribute
    nothing to the result.
    """
    pieces = []
    for idx in indices:
        if idx == 0:
            continue
        pieces.append(i2ch.get(idx, ''))
    return ''.join(pieces)

# Print one decoded Hindi sentence per generated row
for sent in translations.cpu().numpy():
    print(decode(sent))

दुनिे दुनिया।या।ुनिया।ुनिया।प 
दुनaसे हैं?ा।प दुनिया।aदुनिया
