## Challenge 1

In [8]:
!pip install transformers datasets

# Import libraries
import os
import warnings
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)



In [9]:
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
dataset = load_dataset("SKNahin/bengali-transliteration-data")
dataset

train_data = dataset['train'].train_test_split(test_size=0.2)

In [10]:
print(dataset['train'].column_names)

['bn', 'rm']


In [11]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

def preprocess(example):
    source = tokenizer(example['rm'], truncation=True, padding='max_length', max_length=128)
    target = tokenizer(example['bn'], truncation=True, padding='max_length', max_length=128)
    return {
        'input_ids': source['input_ids'],
        'attention_mask': source['attention_mask'],
        'labels': target['input_ids']
    }


In [12]:
processed_dataset = train_data.map(preprocess, batched=True)

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [13]:
processed_dataset = train_data.map(preprocess, batched=True)

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Replaces deprecated evaluation_strategy
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,  # Use mixed precision for faster training
    run_name="Banglish-to-Bangla-Transliteration"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['test'],
    data_collator=data_collator
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,0.035394


In [None]:
model.save_pretrained("banglish_to_bangla_model")
tokenizer.save_pretrained("banglish_to_bangla_model")

print("Model and tokenizer saved successfully!")

In [None]:
def transliterate(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
test_text = "ami bhalo achi"
result = transliterate(test_text)
print(f"Input (Banglish): {test_text}")
print(f"Output (Bengali): {result}")