In [11]:
!pip install datasets transformers



In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split the dataset into training and validation sets
dataset = dataset["train"].train_test_split(test_size=0.2)
train_data = dataset["train"]
val_data = dataset["test"]

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")



Training examples: 4004
Validation examples: 1002


In [14]:
# Initialize the tokenizer
model_name = "t5-small"  # Replace with your chosen model
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [16]:
print(dataset["train"].column_names)

['bn', 'rm']


In [17]:
def preprocess_function(examples):
    # Use 'rm' for Banglish and 'bn' for Bengali
    inputs = examples["rm"]  # Banglish text (Romanized)
    targets = examples["bn"]  # Bengali text
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # Tokenize the targets (Bengali text)
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [18]:
# Tokenize the datasets
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_val = val_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [20]:
from transformers import AutoModelForSeq2SeqLM

# Load the model (ensure the model matches the tokenizer used earlier)
model_name = "t5-small"  # Replace with your chosen model name
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [21]:
from transformers import DataCollatorForSeq2Seq

# Create a data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [61]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",  # Evaluate after every epoch
    learning_rate=5e-5,  # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=11,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    save_total_limit=3,  # Limit the number of saved checkpoints
    save_steps=10,  # Save model every 10 steps
    logging_dir='./logs',  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    report_to="none",  # Disable W&B logging
)






In [63]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [64]:


trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0605,0.052949
2,0.0664,0.049737
3,0.049,0.039289
4,0.0459,0.042025
5,0.0527,0.038758
6,0.0385,0.058014
7,0.0601,0.035388
8,0.0445,0.038542
9,0.0377,0.044595
10,0.0371,0.038619


TrainOutput(global_step=2761, training_loss=0.050066601640894964, metrics={'train_runtime': 1777.807, 'train_samples_per_second': 24.774, 'train_steps_per_second': 1.553, 'total_flos': 551676444672000.0, 'train_loss': 0.050066601640894964, 'epoch': 11.0})

# New Section

In [65]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.038733068853616714, 'eval_runtime': 2.0516, 'eval_samples_per_second': 488.397, 'eval_steps_per_second': 30.708, 'epoch': 11.0}


In [67]:
import torch
model.eval()
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the selected device
model.to(device)

# Tokenize the input and move input tensors to the same device
banglish_text = "amar sonar bangla"
inputs = tokenizer(banglish_text, return_tensors="pt", max_length=128, truncation=True).input_ids.to(device)
tokens = tokenizer(banglish_text, return_tensors="pt", max_length=128, truncation=True)
print("Input tokens:", tokens)
# Decode the input tokens
input_text = tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)
print("Decoded input tokens:", input_text)


# Generate output (without decoding yet)
outputs = model.generate(inputs, max_length=256, num_beams=8, early_stopping=True)



# Decode the output and print
bengali_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Banglish:", banglish_text)
print("Bengali:", bengali_text)


Input tokens: {'input_ids': tensor([[   3,    9, 1635,  520,  291, 4514, 7002,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded input tokens: amar sonar bangla
Banglish: amar sonar bangla
Bengali:   


In [None]:
print(model.config)


In [68]:
# Create a function to generate predictions for the entire validation dataset
def evaluate_on_validation_data(val_data, model, tokenizer):
    all_predictions = []
    all_labels = []

    # Iterate through the validation data
    for example in val_data:
        # Tokenize the input
        input_ids = tokenizer(example["rm"], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

        # Generate predictions
        with torch.no_grad():
            output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

        # Decode the predictions
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        all_predictions.append(prediction)

        # Store the ground truth (target)
        target = example["bn"]
        all_labels.append(target)

    return all_predictions, all_labels

# Run the evaluation
predictions, labels = evaluate_on_validation_data(val_data, model, tokenizer)




In [55]:
!pip install evaluate
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.0.0 sacrebleu-2.4.3


In [69]:
import evaluate

# Initialize the BLEU metric
metric = evaluate.load("sacrebleu")

# Compute the BLEU score for the predictions and the ground truth labels
results = metric.compute(predictions=predictions, references=[[label] for label in labels])

print("Evaluation results:", results)


Evaluation results: {'score': 0.031152512586746464, 'counts': [691, 282, 117, 42], 'totals': [956, 598, 394, 264], 'precisions': [72.28033472803347, 47.15719063545151, 29.695431472081218, 15.909090909090908], 'bp': 0.0008745150934069176, 'sys_len': 956, 'ref_len': 7688}
