In [None]:
import gc
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Load the Banglish-to-Bengali dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split into training and validation sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_data = train_test_split['train']
val_data = train_test_split['test']


# tokenizing and selecting model to preprocess and separate train_test data

In [1]:

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# Preprocessing function for tokenizing Banglish and Bengali text
def preprocess_data(example):
    inputs = tokenizer(example['rm'], padding="max_length", truncation=True, max_length=30)  # Shorter max_length
    targets = tokenizer(example['bn'], padding="max_length", truncation=True, max_length=30)  # Shorter max_length
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply preprocessing to training and validation datasets
train_data = train_data.map(preprocess_data, batched=True)
val_data = val_data.map(preprocess_data, batched=True)

# temporarily using fewer samples for slow pc
train_data = train_data.select(range(500))



NameError: name 'AutoTokenizer' is not defined

# setting up training args

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training Arguments with adjustments for a less powerful machine
training_args = TrainingArguments(
    output_dir="./results",               # Output directory for results
    evaluation_strategy="epoch",          # Evaluate after each epoch
    learning_rate=5e-5,                   # Learning rate (adjusted)
    per_device_train_batch_size=2,        # Smaller batch size
    per_device_eval_batch_size=2,         # Smaller batch size
    num_train_epochs=1,                   # Number of epochs
    weight_decay=0.01,                    # Weight decay
    logging_dir="./logs",                 # Log directory
    logging_steps=10,                     # Log every 10 steps
    save_steps=500,                       # Save model every 500 steps
    gradient_accumulation_steps=8,        # Gradient accumulation for efficient training
    fp16=True,                            # Mixed precision training to save memory
)

# training and saving model

In [None]:

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
try:
    trainer.train()
except Exception as e:
    print(f"Training failed: {e}")

# Save the trained model and tokenizer
model.save_pretrained("./banglish-to-bengali-transliteration")
tokenizer.save_pretrained("./banglish-to-bengali-transliteration")
