### Project 3: Machine Translation


In [13]:
import importlib
import utils
import pandas as pd
# Re-execute the utils module so edits made to it are picked up without
# restarting the kernel.
importlib.reload(utils)
# Star import: copies utils' public names into the notebook namespace
# (presumably including create_parallel_corpora, which a later cell calls —
# TODO confirm against utils).
from utils import *

#### Load Language Model

In [3]:
from transformers import MarianMTModel, MarianTokenizer

# Load English → Tagalog model
# NOTE(review): the last recorded run of this cell failed with a ValueError
# requiring torch >= 2.6 (CVE-2025-32434); upgrade torch before re-running.
en_tl_model_name = "Helsinki-NLP/opus-mt-en-tl"
# Tokenizer and model are loaded from the same checkpoint name so they match.
en_tl_tokenizer = MarianTokenizer.from_pretrained(en_tl_model_name)
en_tl_model = MarianMTModel.from_pretrained(en_tl_model_name)

def translate_en_to_tl(text):
    """Translate English ``text`` to Tagalog with the module-level en→tl model.

    The text is tokenized (with truncation) by ``en_tl_tokenizer``, passed to
    ``en_tl_model.generate``, and only the first generated sequence is decoded.

    Args:
        text: The English input handed to the tokenizer.

    Returns:
        The decoded translation with special tokens stripped.
    """
    encoded = en_tl_tokenizer(text, return_tensors="pt", truncation=True)
    generated = en_tl_model.generate(**encoded)
    first_sequence = generated[0]
    return en_tl_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Smoke test: translate one English sentence and print the result.
print(translate_en_to_tl("In the beginning God created the heaven and the earth."))


ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434

#### Creating Parallel Corpora by Reusing Function from MCO1

In [3]:
# Per-language CSV inputs: the Exodus, Genesis and Leviticus files for each language.
csv_files = {
    "Hiligaynon": [
        "Bible/CSV/Hiligaynon_Exodus.csv",
        "Bible/CSV/Hiligaynon_Genesis.csv",
        "Bible/CSV/Hiligaynon_Leviticus.csv",
    ],
    "Tagalog": [
        "Bible/CSV/Tagalog_Exodus.csv",
        "Bible/CSV/Tagalog_Genesis.csv",
        "Bible/CSV/Tagalog_Leviticus.csv",
    ],
    "Cebuano": [
        "Bible/CSV/Cebuano_Exodus.csv",
        "Bible/CSV/Cebuano_Genesis.csv",
        "Bible/CSV/Cebuano_Leviticus.csv",
    ],
}

# Tagalog is the first element of every pair; only the second language varies.
pairs = [("Tagalog", other_language) for other_language in ("Hiligaynon", "Cebuano")]

# create_parallel_corpora is not defined in this notebook (presumably it comes
# from the utils star import — TODO confirm). The third argument is the
# directory name that appears in the recorded "Saved: ..." output paths.
create_parallel_corpora(csv_files, pairs, "Parallel_Corpora")

Saved: Parallel_Corpora\Tagalog_Hiligaynon_parallel.csv
Saved: Parallel_Corpora\Tagalog_Cebuano_parallel.csv


#### Read Tagalog To Cebuano Parallel Corpora

In [4]:
# Load the Tagalog–Cebuano parallel CSV into `df`; later cells read
# df['text_Tagalog'] and df['text_Cebuano'] from it.
df = pd.read_csv("Parallel_Corpora/Tagalog_Cebuano_parallel.csv")
# Print the column index so the expected column names are visible.
print(df.columns)
# Last expression of the cell: rendered as a table of the first five rows.
df.head()

Index(['book', 'chapter', 'verse', 'text_Tagalog', 'text_Cebuano'], dtype='object')


Unnamed: 0,book,chapter,verse,text_Tagalog,text_Cebuano
0,Exodus,1,1,Ito ang mga pangalan ng mga anak ni Israel na ...,Mao kini ang mga ngalan sa mga anak nga lalaki...
1,Exodus,1,2,"sina Ruben, Simeon, Levi, Juda,","si Ruben, si Simeon, si Levi, si Juda,"
2,Exodus,1,3,"Isacar, Zebulon, Benjamin,","si Isacar, si Zabulon, ug si Benjamin,"
3,Exodus,1,4,"Dan, Neftali, Gad, at Aser.","si Dan, si Neftali, si Gad, ug si Aser."
4,Exodus,1,5,Lahat ng taong nagmula sa balakang ni Jacob ay...,"Ug ang tanang kaliwat ni Jacob, 70 ka buok. Si..."


### Loading the MarianMT Pivot Models (Tagalog → English → Cebuano)

In [12]:
from transformers import MarianMTModel, MarianTokenizer

# Load models
# Two MarianMT checkpoints are chained: Tagalog → English, then English → Cebuano.
# NOTE(review): this cell defines only the tl_to_en_* and en_to_ceb_* names; it
# does not assign the bare `model` / `tokenizer` names that later cells use.
tl_to_en_model_name = "Helsinki-NLP/opus-mt-tl-en"
en_to_ceb_model_name = "Helsinki-NLP/opus-mt-en-ceb"

# Tagalog → English tokenizer/model pair (same checkpoint name for both).
tl_to_en_tokenizer = MarianTokenizer.from_pretrained(tl_to_en_model_name)
tl_to_en_model = MarianMTModel.from_pretrained(tl_to_en_model_name)

# English → Cebuano tokenizer/model pair (same checkpoint name for both).
en_to_ceb_tokenizer = MarianTokenizer.from_pretrained(en_to_ceb_model_name)
en_to_ceb_model = MarianMTModel.from_pretrained(en_to_ceb_model_name)

# Pivot translation demo: Tagalog → English → Cebuano, using English as the
# intermediate language between the two MarianMT models loaded above.

# Step 1: Tagalog → English
text_tl = "Mahal kita pero kailangan kong umalis."
inputs = tl_to_en_tokenizer(text_tl, return_tensors="pt", padding=True, truncation=True)
translation_en = tl_to_en_model.generate(**inputs)
# Only the first generated sequence is decoded (a single sentence was passed in).
text_en = tl_to_en_tokenizer.decode(translation_en[0], skip_special_tokens=True)

# Step 2: English → Cebuano
inputs_ceb = en_to_ceb_tokenizer(text_en, return_tensors="pt", padding=True, truncation=True)
translation_ceb = en_to_ceb_model.generate(**inputs_ceb)
text_ceb = en_to_ceb_tokenizer.decode(translation_ceb[0], skip_special_tokens=True)

# The labels were mis-encoded (UTF-8 shown as cp1252), and the Cebuano label
# decoded to the Swiss flag; Cebuano is a Philippine language, so it now uses
# the Philippine flag like Tagalog.
print("🇵🇭 Tagalog:", text_tl)
print("🇬🇧 English:", text_en)
print("🇵🇭 Cebuano:", text_ceb)


ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434

In [5]:
from datasets import Dataset

# Keep only the two text columns, drop rows where either side is missing, and
# rename them to the generic "source" (Tagalog) / "target" (Cebuano) names that
# the tokenization and BLEU cells read.
train_data = df[['text_Tagalog', 'text_Cebuano']].dropna().rename(
    columns={"text_Tagalog": "source", "text_Cebuano": "target"}
)

# preserve_index=False: after dropna() the pandas index may no longer be a plain
# range, and from_pandas would otherwise carry it along as an extra
# "__index_level_0__" column.
dataset = Dataset.from_pandas(train_data, preserve_index=False)
# Fixed seed so the 90/10 train/test split (and therefore every BLEU score
# computed on the test split) is reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [6]:
def preprocess_function(examples):
    """Tokenize one batch of source/target sentence pairs.

    Uses the module-level ``tokenizer`` (not defined in this cell).

    Args:
        examples: Mapping with "source" and "target" entries, each an iterable
            of strings for the current batch.

    Returns:
        Whatever ``tokenizer`` returns when called with the source texts, the
        targets passed as ``text_target``, truncation enabled and
        ``max_length=128``.
    """
    source_texts = list(examples["source"])
    target_texts = list(examples["target"])
    return tokenizer(
        source_texts,
        text_target=target_texts,
        truncation=True,
        max_length=128,
    )

# Apply preprocess_function to `dataset`; batched=True passes it a batch of
# examples per call rather than a single row.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 3244/3244 [00:01<00:00, 1665.70 examples/s]
Map: 100%|██████████| 361/361 [00:00<00:00, 2134.08 examples/s]


#### Computing the Baseline BLEU Score Before Fine-Tuning

In [7]:
import evaluate
import numpy as np

# Load BLEU metric
# `metric` is a module-level object that compute_bleu() below reads directly;
# this cell must run before compute_bleu is called.
metric = evaluate.load("sacrebleu")

def compute_bleu(model, tokenizer, dataset, num_samples=100, tgt_lang="ceb"):
    """Compute BLEU for a model on a small test subset.

    Each of the first ``num_samples`` rows of ``dataset["test"]`` is translated
    one at a time with ``model.generate`` and scored against the row's
    "target" text using the module-level ``metric``.

    Args:
        model: Object exposing ``generate(**inputs, forced_bos_token_id=...)``.
            If it has a ``device`` attribute, the tokenized inputs are moved to
            that device before generation.
        tokenizer: Callable tokenizer that also provides ``get_lang_id`` and
            ``decode``.
        dataset: Mapping with a "test" split supporting ``len()`` and
            ``select()``; rows must contain "source" and "target".
        num_samples: Upper bound on the number of test rows evaluated.
        tgt_lang: Language code passed to ``tokenizer.get_lang_id`` to choose
            the forced first generated token. Defaults to "ceb", the value
            that was previously hard-coded.

    Returns:
        The "score" entry of the dictionary returned by ``metric.compute``.

    NOTE(review): the source language is whatever the tokenizer is currently
    configured with; this function does not set it.
    """
    test_split = dataset["test"]
    samples = test_split.select(range(min(num_samples, len(test_split))))

    # Loop-invariant: look the target-language token id up once.
    target_lang_id = tokenizer.get_lang_id(tgt_lang)

    # The tokenizer creates its tensors on CPU. When the model lives elsewhere
    # (e.g. on a GPU after training) generate() fails with a device mismatch
    # unless the inputs are moved first. A "meta" device holds no real data,
    # so inputs are never moved there.
    device = getattr(model, "device", None)
    move_inputs = device is not None and getattr(device, "type", None) != "meta"

    preds, refs = [], []
    for row in samples:
        inputs = tokenizer(row["source"], return_tensors="pt", truncation=True)
        if move_inputs:
            inputs = inputs.to(device)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_lang_id
        )
        # Only the first generated sequence is decoded (one sentence per call).
        preds.append(tokenizer.decode(generated_tokens[0], skip_special_tokens=True))
        # Each prediction gets a list of references; here exactly one per row.
        refs.append([row["target"]])

    bleu = metric.compute(predictions=preds, references=refs)
    print("Sample predictions:")
    print(preds[:3])
    return bleu["score"]

# ✅ Baseline (before fine-tuning)
# NOTE(review): `model` and `tokenizer` are not assigned in any cell visible
# before this one, so this line depends on earlier kernel state. compute_bleu
# calls tokenizer.get_lang_id, so presumably an M2M100-style model/tokenizer
# pair is expected — confirm and load it explicitly before this cell.
bleu_before = compute_bleu(model, tokenizer, dataset)
print(f"Baseline BLEU (before fine-tuning): {bleu_before:.2f}")


RuntimeError: Tensor on device cpu is not on the expected device meta!

### Fine-Tuning the Model for Tagalog-Cebuano Translation

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

import inspect


def _pick_kwarg(target, preferred, fallback):
    """Return `preferred` if `target` accepts it as a parameter, otherwise `fallback`."""
    accepted = inspect.signature(target).parameters
    return preferred if preferred in accepted else fallback


batch_size = 8

# Newer transformers releases renamed two keywords used here:
#   Seq2SeqTrainingArguments: evaluation_strategy -> eval_strategy
#   Seq2SeqTrainer:           tokenizer           -> processing_class
# Choose whichever name the installed version accepts, so the cell runs on
# both older and current releases.
_eval_strategy_kwarg = _pick_kwarg(
    Seq2SeqTrainingArguments, "eval_strategy", "evaluation_strategy"
)
_tokenizer_kwarg = _pick_kwarg(Seq2SeqTrainer, "processing_class", "tokenizer")

training_args = Seq2SeqTrainingArguments(
    output_dir="./results_tl_ceb",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,  # set to True if you have GPU with mixed precision
    logging_dir='./logs',
    **{_eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

# NOTE(review): `model` and `tokenizer` are not assigned in this cell; they
# must already exist in the kernel (the same objects used for the baseline BLEU).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()
