# Banglish to Bengali Transliteration
This notebook fine-tunes a pretrained mBART sequence-to-sequence model to transliterate Banglish (Bengali written in Latin letters) into Bengali script.

In [None]:
# Install required packages
!pip install -q transformers datasets sacrebleu

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration,
    MBartTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

## Load and Preprocess Data

In [None]:
# Download the transliteration corpus from the Hugging Face Hub
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Convert the train split to a DataFrame and drop pairs whose romanized ('rm')
# or Bengali ('bn') text is shorter than 3 or longer than 199 characters.
# Rows with a missing value in either column are dropped as well.
df = (
    dataset['train']
    .to_pandas()
    .loc[lambda rows: rows['rm'].str.len().between(3, 199)]
    .loc[lambda rows: rows['bn'].str.len().between(3, 199)]
)

# Quick look at what survived the cleaning step
print("Dataset size:", len(df))
print("\nSample data:")
print(df.head())

In [None]:
# Hold out 10% of the pairs for validation; the fixed seed makes the split
# reproducible across runs.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)
print(f"\nTraining samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

## Create Dataset Class

In [None]:
class BanglishDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        banglish = str(self.df.iloc[idx]['rm'])    # Changed to 'rm' (Roman)
        bangla = str(self.df.iloc[idx]['bn'])      # Changed to 'bn' (Bengali)
        
        source = self.tokenizer(
            banglish,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        target = self.tokenizer(
            bangla,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': target['input_ids'].squeeze()
        }

## Initialize Model and Tokenizer

In [None]:
# Pretrained checkpoint shared by the tokenizer and the model
model_name = "facebook/mbart-large-cc25"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Wrap both splits in the tokenizing dataset (default max_length)
train_dataset, val_dataset = (
    BanglishDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

## Set up Training

In [None]:
# Training arguments
import dataclasses
import inspect

# Newer Transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The notebook installs an unpinned version, so pick whichever name the
# installed release accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in {f.name for f in dataclasses.fields(Seq2SeqTrainingArguments)}
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./banglish_bengali_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    # Evaluate every `eval_steps` optimizer steps
    **{_eval_strategy_key: "steps"},
)

# Data collator: batches examples and pads them for seq2seq training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Likewise, the trainer's `tokenizer` argument was renamed to
# `processing_class` in newer releases.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

## Train the Model

In [None]:
# Fine-tune the model with the trainer configured above. Periodic evaluation
# and checkpointing follow the schedule set in `training_args`.
trainer.train()

## Test the Model

In [None]:
def translate_text(text, max_length=128):
    """Transliterate Banglish text to Bengali with the fine-tuned model.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Banglish input string.
        max_length: Maximum number of input tokens (longer inputs are
            truncated) and maximum length of the generated sequence. The
            default matches the default ``max_length`` of ``BanglishDataset``,
            i.e. the length used during training.

    Returns:
        The decoded output string with special tokens removed.
    """
    # Prefer the GPU when one is available; otherwise stay wherever the model
    # currently lives.
    device = torch.device("cuda") if torch.cuda.is_available() else model.device
    model.to(device)
    # generate() does not switch modes itself; make sure dropout is disabled in
    # case the model is still in training mode.
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# A few Banglish phrases for a quick sanity check of the model
test_examples = [
    "ami tomake bhalobashi",
    "kemon acho",
    "bangladesh amar desh"
]

print("Test Results:")
# map() is lazy, so each phrase is transliterated right before it is printed
for text, translated in zip(test_examples, map(translate_text, test_examples)):
    print(f"Input: {text}")
    print(f"Output: {translated}\n")

## Save the Model (Optional)

In [None]:
# Persist the fine-tuned model through the trainer
trainer.save_model(output_dir="./banglish_bengali_model_final")
print("Model saved successfully!")