In [22]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

dataset = load_dataset("SKNahin/bengali-transliteration-data")

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    model_inputs = tokenizer(examples['rm'], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(examples['bn'], padding="max_length", truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset = dataset["train"].map(tokenize_function, batched=True)

train_dataset = train_dataset.filter(lambda example: 5 <= len(example['rm'].split()) <= 50)

train_dataset = train_dataset.train_test_split(test_size=0.2)
train_data = train_dataset['train']
val_data = train_dataset['test']

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=200,
    save_steps=1000,
    load_best_model_at_end=True,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

trainer.train()

eval_results = trainer.evaluate()
print(eval_results)

model.save_pretrained("./model")
tokenizer.save_pretrained("./model")


DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,0.046952


KeyboardInterrupt: 

In [23]:
# Save the fine-tuned model and tokenizer after stopping the training
model.save_pretrained("./fine_tuned_mbart_banglish_to_bangla")
tokenizer.save_pretrained("./fine_tuned_mbart_banglish_to_bangla")


('./fine_tuned_mbart_banglish_to_bangla/tokenizer_config.json',
 './fine_tuned_mbart_banglish_to_bangla/special_tokens_map.json',
 './fine_tuned_mbart_banglish_to_bangla/spiece.model',
 './fine_tuned_mbart_banglish_to_bangla/added_tokens.json')

In [24]:
import wandb
wandb.finish()


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁█
train/global_step,▁█
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.04695
eval/runtime,234.9247
eval/samples_per_second,2.835
eval/steps_per_second,0.047
train/epoch,1.1976
train/global_step,200.0
train/grad_norm,0.17682
train/learning_rate,3e-05
train/loss,1.0647
