# Importing and loading datasets

In [None]:
import gc
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Load the Banglish-to-Bengali transliteration dataset from the Hugging Face Hub
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Hold out 20% of the 'train' split for validation.
# A fixed seed keeps the split identical across runs, so evaluation numbers
# from different runs are measured on the same validation examples.
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_data = train_test_split['train']
val_data = train_test_split['test']


# Loading the tokenizer and model, then tokenizing the train and validation data

In [None]:

# Tokenizer and model are loaded from the same checkpoint so their
# vocabularies match.
MODEL_CHECKPOINT = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Preprocessing function for tokenizing Banglish and Bengali text
def preprocess_data(example):
    """Tokenize Banglish inputs ('rm') and Bengali targets ('bn') for seq2seq training.

    Both sides are truncated/padded to 30 tokens. Padding positions in the
    labels are replaced with -100 so that the loss ignores them; otherwise
    the model is also trained to predict pad tokens.

    Args:
        example: A single example or a batch (as passed by ``Dataset.map``
            with ``batched=True``) containing the keys 'rm' and 'bn'.

    Returns:
        The tokenized inputs with an added "labels" entry.
    """
    inputs = tokenizer(example['rm'], padding="max_length", truncation=True, max_length=30)  # Shorter max_length
    targets = tokenizer(example['bn'], padding="max_length", truncation=True, max_length=30)  # Shorter max_length

    pad_id = tokenizer.pad_token_id
    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of token ids per example.
        inputs["labels"] = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]
    else:
        # Single example: a flat list of token ids.
        inputs["labels"] = [-100 if token == pad_id else token for token in label_ids]
    return inputs

# Tokenize both splits in batched mode, training split first
train_data, val_data = (
    split.map(preprocess_data, batched=True) for split in (train_data, val_data)
)

# temporarily using fewer samples for slow pc
#train_data = train_data.select(range(500))


# setting up training args

In [None]:
# Collator that batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Mixed precision: fp16=True makes TrainingArguments raise on machines without
# a supported GPU, and T5-family checkpoints are widely reported to overflow
# (NaN / zero loss) in fp16. Use bf16 only where the hardware supports it and
# fall back to full precision everywhere else.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Training arguments with adjustments for a slow laptop
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",               # Checkpoints are written here
    evaluation_strategy="epoch",          # Evaluate after each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,        # Smaller batch size
    per_device_eval_batch_size=4,         # Smaller batch size
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    gradient_accumulation_steps=8,        # Effective train batch = 4 * 8 per device
    fp16=False,                           # See the mixed-precision note above
    bf16=use_bf16,                        # Memory savings when bf16 is available
)

# Training and saving model

In [None]:

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model. A failure is reported and then re-raised: silently
# continuing would save an untrained (or partially trained) model below and
# present it as the final result.
try:
    trainer.train()
except Exception as e:
    print(f"Training failed: {e}")
    raise

# Save the trained model and tokenizer (only reached when training succeeded)
model.save_pretrained("./banglish-to-bengali-transliteration")
tokenizer.save_pretrained("./banglish-to-bengali-transliteration")


# predicting the transliteration

In [None]:
def predict_banglish_to_bengali(text):
    """Transliterate Banglish (romanized Bengali) text into Bengali script.

    Args:
        text: The Banglish input string.

    Returns:
        The decoded model output with special tokens removed.
    """
    # Tokenize the input text with the same length settings used in preprocessing
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=30, return_tensors="pt")

    # After training the model may live on the GPU; the inputs must be on the
    # same device or generate() fails with a device-mismatch error.
    device = next(model.parameters()).device
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Make sure dropout is disabled during inference.
    model.eval()

    # Generate prediction; the attention mask keeps the padding positions
    # from being treated as real input tokens.
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=50,
            num_beams=4,
            early_stopping=True,
        )

    # Decode the output tokens to Bengali
    bengali_translation = tokenizer.decode(output[0], skip_special_tokens=True)
    return bengali_translation


# Demo: transliterate one sample Banglish sentence and show the result
sample_banglish = "ami banglay likhte chai"
predicted_bengali = predict_banglish_to_bengali(sample_banglish)

# Print the input and the prediction on separate lines
print(
    f"Input Banglish: {sample_banglish}",
    f"Predicted Bengali: {predicted_bengali}",
    sep="\n",
)
# Example output recorded by the author:
# Input Banglish : "ami banglay likhte chai"
# Predicted Bengali: "আমি বাংলায় লিখতে চাই"