#### Fine Tuning a Tagalog <-> Cebuano Model

In [3]:
import importlib
import utils
import pandas as pd
importlib.reload(utils)
from utils import *

**Load Parallel Corpus**

In [4]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = pd.read_csv("Parallel_Corpora/Tagalog_Cebuano_parallel.csv")
df = df[["text_Tagalog", "text_Cebuano"]].rename(columns={
    "text_Tagalog": "source",
    "text_Cebuano": "target"
})

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df)
})
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['source', 'target', '__index_level_0__'],
        num_rows: 3244
    })
    test: Dataset({
        features: ['source', 'target', '__index_level_0__'],
        num_rows: 361
    })
})


**Load Base Model**

In [5]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-tl"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


**Tokenize the Tagalog-Cebuano Data**

In [6]:
def preprocess_function(examples):
    inputs = examples["source"]
    targets = examples["target"]
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True, max_length=128)
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3244/3244 [00:01<00:00, 2232.67 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 361/361 [00:00<00:00, 1861.26 examples/s]


In [7]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results_tl_ceb_finetune",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
    logging_dir='./logs',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss
1,No log,2.923749
2,3.739000,2.563377
3,2.760300,2.473841




TrainOutput(global_step=1218, training_loss=3.1337402043084204, metrics={'train_runtime': 2021.2916, 'train_samples_per_second': 4.815, 'train_steps_per_second': 0.603, 'total_flos': 272395607998464.0, 'train_loss': 3.1337402043084204, 'epoch': 3.0})

In [8]:
trainer.save_model("./opus-mt-tl-ceb-finetuned")
tokenizer.save_pretrained("./opus-mt-tl-ceb-finetuned")


('./opus-mt-tl-ceb-finetuned\\tokenizer_config.json',
 './opus-mt-tl-ceb-finetuned\\special_tokens_map.json',
 './opus-mt-tl-ceb-finetuned\\vocab.json',
 './opus-mt-tl-ceb-finetuned\\source.spm',
 './opus-mt-tl-ceb-finetuned\\target.spm',
 './opus-mt-tl-ceb-finetuned\\added_tokens.json')

#### BLEU Evaluation

In [10]:
from transformers import MarianMTModel, MarianTokenizer

model_dir = "./opus-mt-tl-ceb-finetuned"

model = MarianMTModel.from_pretrained(model_dir)
tokenizer = MarianTokenizer.from_pretrained(model_dir)

print("Model successfully loaded from opus-mt-tl-ceb-finetuned")



Model successfully loaded from opus-mt-tl-ceb-finetuned


In [11]:
def translate_tl_to_ceb(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [20]:
tagalog_text = "Magandang gabi gabby"
translation = translate_tl_to_ceb(tagalog_text)

print("Tagalog:", tagalog_text)
print("Cebuano translation:", translation)

Tagalog: Magandang gabi gabby
Cebuano translation: Ug gibutang nga nag-ubangan


In [19]:
import pandas as pd
import evaluate
from tqdm import tqdm

bleu = evaluate.load("sacrebleu")

# Load a manageable sample of your corpus
df = pd.read_csv("Parallel_Corpora/Tagalog_Cebuano_parallel.csv")
df = df.sample(200, random_state=42).reset_index(drop=True)

preds, refs = [], []

for _, row in tqdm(df.iterrows(), total=len(df)):
    pred = translate_tl_to_ceb(row["text_Tagalog"])  # uses your fine-tuned model
    preds.append(pred)
    refs.append(row["text_Cebuano"])

results = bleu.compute(predictions=preds, references=[[r] for r in refs])
print(f"ðŸŸ© Fine-tuned model BLEU: {results['score']:.2f}")


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [04:11<00:00,  1.26s/it]


ðŸŸ© Fine-tuned model BLEU: 15.08
