### Project 3: Machine Translation


In [1]:
import importlib
import utils
import pandas as pd
importlib.reload(utils)
from utils import *

#### Load Language Model

In [58]:
from transformers import MarianMTModel, MarianTokenizer

# Load English → Tagalog model
# The tokenizer and the model are both fetched from the same checkpoint name,
# tokenizer first.
en_tl_model_name = "Helsinki-NLP/opus-mt-en-tl"
en_tl_tokenizer, en_tl_model = (
    MarianTokenizer.from_pretrained(en_tl_model_name),
    MarianMTModel.from_pretrained(en_tl_model_name),
)

def translate_en_to_tl(text):
    """Translate ``text`` with the module-level English → Tagalog model.

    The text is tokenized by ``en_tl_tokenizer`` (truncation enabled, tensors
    requested in "pt" format), passed to ``en_tl_model.generate``, and the
    first generated sequence is decoded with special tokens stripped.

    Args:
        text: Input handed directly to ``en_tl_tokenizer``.

    Returns:
        The decoded string for the first generated sequence.
    """
    encoded = en_tl_tokenizer(text, truncation=True, return_tensors="pt")
    generated = en_tl_model.generate(**encoded)
    # Only the first generated sequence is decoded and returned.
    first_sequence = generated[0]
    return en_tl_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Smoke test: translate one English sentence and print the result of the
# English → Tagalog helper defined above.
print(translate_en_to_tl("In the beginning God created the heaven and the earth."))


Nang pasimula ay nilalang ng Diyos ang langit at ang lupa.


In [59]:
from transformers import MarianMTModel, MarianTokenizer

# Tagalog ↔ English
tl_en_model_name = "Helsinki-NLP/opus-mt-tl-en"
en_tl_model_name = "Helsinki-NLP/opus-mt-en-tl"

# Each direction gets its own tokenizer/model pair, loaded tokenizer first.
tl_en_tokenizer, tl_en_model = (
    MarianTokenizer.from_pretrained(tl_en_model_name),
    MarianMTModel.from_pretrained(tl_en_model_name),
)

# NOTE: this is the same en→tl checkpoint name used in the earlier
# English → Tagalog cell, so these assignments rebind en_tl_tokenizer and
# en_tl_model; the rebinding lets this cell be run on its own.
en_tl_tokenizer, en_tl_model = (
    MarianTokenizer.from_pretrained(en_tl_model_name),
    MarianMTModel.from_pretrained(en_tl_model_name),
)

# Cebuano ↔ English
ceb_en_model_name = "Helsinki-NLP/opus-mt-ceb-en"
en_ceb_model_name = "Helsinki-NLP/opus-mt-en-ceb"

ceb_en_tokenizer, ceb_en_model = (
    MarianTokenizer.from_pretrained(ceb_en_model_name),
    MarianMTModel.from_pretrained(ceb_en_model_name),
)

en_ceb_tokenizer, en_ceb_model = (
    MarianTokenizer.from_pretrained(en_ceb_model_name),
    MarianMTModel.from_pretrained(en_ceb_model_name),
)

print("All pivot models loaded successfully!")


All pivot models loaded successfully!


In [None]:
def translate(text, tokenizer, model):
    """Run ``text`` through a tokenizer/model pair and return the decoded output.

    Args:
        text: Input handed directly to ``tokenizer``.
        tokenizer: Callable that encodes ``text`` (called with
            ``return_tensors="pt"``, ``padding=True``, ``truncation=True``)
            and exposes ``decode``.
        model: Object whose ``generate`` accepts the encoded inputs as
            keyword arguments.

    Returns:
        The result of decoding the first generated sequence with special
        tokens skipped.
    """
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    generated = model.generate(**encoded)
    # Only the first generated sequence is decoded; any others are ignored.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def tl_to_ceb(text):
    """Translate Tagalog → Cebuano by pivoting through English.

    ``text`` is first translated with the tl→en tokenizer/model pair, and the
    resulting English string is then translated with the en→ceb pair.

    Args:
        text: Tagalog input passed to ``translate``.

    Returns:
        The output of the second (en→ceb) translation step.
    """
    english_pivot = translate(text, tl_en_tokenizer, tl_en_model)
    return translate(english_pivot, en_ceb_tokenizer, en_ceb_model)

def ceb_to_tl(text):
    """Translate Cebuano → Tagalog by pivoting through English.

    ``text`` is first translated with the ceb→en tokenizer/model pair, and the
    resulting English string is then translated with the en→tl pair.

    Args:
        text: Cebuano input passed to ``translate``.

    Returns:
        The output of the second (en→tl) translation step.
    """
    english_pivot = translate(text, ceb_en_tokenizer, ceb_en_model)
    return translate(english_pivot, en_tl_tokenizer, en_tl_model)


In [60]:
# One sample sentence per language for a quick check of both pivot directions.
tagalog_text = "Mahal kita ngayon at magpakailanman"
cebuano_text = "Gihigugma tika karon  ug sa kahangturan"

# Each entry: (label to print, pivot function, sentence to translate).
for label, pivot_fn, sentence in (
    ("Tagalog → Cebuano:", tl_to_ceb, tagalog_text),
    ("Cebuano → Tagalog:", ceb_to_tl, cebuano_text),
):
    print(label, pivot_fn(sentence))


Tagalog → Cebuano: Gihigugma ko ikaw karon ug sa walay kataposan
Cebuano → Tagalog: Mahal kita ngayon at magpakailanman


#### Creating Parallel Corpora by Reusing Function from MCO1

In [3]:
# Source CSVs per language; each list holds that language's Exodus, Genesis
# and Leviticus files, in that order.
csv_files = {
    "Hiligaynon": [
        "Bible/CSV/Hiligaynon_Exodus.csv",
        "Bible/CSV/Hiligaynon_Genesis.csv",
        "Bible/CSV/Hiligaynon_Leviticus.csv",
    ],
    "Tagalog": [
        "Bible/CSV/Tagalog_Exodus.csv",
        "Bible/CSV/Tagalog_Genesis.csv",
        "Bible/CSV/Tagalog_Leviticus.csv",
    ],
    "Cebuano": [
        "Bible/CSV/Cebuano_Exodus.csv",
        "Bible/CSV/Cebuano_Genesis.csv",
        "Bible/CSV/Cebuano_Leviticus.csv",
    ],
}

# Tagalog is the first element of every language pair.
pairs = [("Tagalog", other) for other in ("Hiligaynon", "Cebuano")]

# create_parallel_corpora is not defined in this notebook — presumably it comes
# from `from utils import *`; confirm against utils.py.
create_parallel_corpora(csv_files, pairs, "Parallel_Corpora")

Saved: Parallel_Corpora\Tagalog_Hiligaynon_parallel.csv
Saved: Parallel_Corpora\Tagalog_Cebuano_parallel.csv


#### Computing BLEU Score of TGL <-> CEB Pivot Translations

In [25]:
import pandas as pd
import evaluate
from tqdm import tqdm


# Read the Tagalog–Cebuano parallel corpus, keep a fixed-seed sample of 1000
# rows, and renumber the sampled rows from 0.
df = (
    pd.read_csv("Parallel_Corpora/Tagalog_Cebuano_parallel.csv")
    .sample(1000, random_state=42)
    .reset_index(drop=True)
)
print(df.head())

        book  chapter  verse  \
0  Leviticus       11     31   
1     Exodus       35      6   
2    Genesis       31     18   
3  Leviticus       14     31   
4    Genesis       10      3   

                                        text_Tagalog  \
0  Ang mga ito'y marumi sa inyo mula sa lahat ng ...   
1  lanang asul, kulay-ube at pula; hinabing pinon...   
2  At kanyang dinala ang lahat niyang mga hayop, ...   
3  Kung alin ang abot ng kanyang kaya, ang isa'y ...   
4  Ang mga anak ni Gomer: sina Askenaz, Rifat, at...   

                                        text_Cebuano  
0  Kini hugaw alang kaninyo taliwala sa tanang mg...  
1  asul ug tapol ug lutong pula nga delana ug pin...  
2  Ug gipauna niya ang tanan niyang kahayopan, ug...  
3  ang usa alang sa halad tungod sa sala ug ang u...  
4  Ang mga anak nga lalaki ni Gomer: si Askenas, ...  


In [26]:
# Metric object loaded under the "sacrebleu" name; reused by later cells.
bleu = evaluate.load("sacrebleu")

# Tagalog to Cebuano via pivot
preds, refs = [], []
verse_pairs = zip(df["text_Tagalog"], df["text_Cebuano"])
for tagalog_verse, cebuano_verse in tqdm(verse_pairs, total=len(df)):
    preds.append(tl_to_ceb(tagalog_verse))
    refs.append(cebuano_verse)

# Compute BLEU
# Each prediction is paired with a one-element list holding its reference.
wrapped_refs = [[reference] for reference in refs]
results = bleu.compute(predictions=preds, references=wrapped_refs)
print(f"\nBLEU (Tagalog→Cebuano Pivot): {results['score']:.2f}")

100%|██████████| 1000/1000 [22:54<00:00,  1.37s/it]



BLEU (Tagalog→Cebuano Pivot): 23.21


In [30]:
# Cebuano to Tagalog via pivot; preds/refs are rebuilt from scratch for this
# direction, with the Tagalog column now serving as the reference.
preds, refs = [], []

verse_pairs = zip(df["text_Cebuano"], df["text_Tagalog"])
for cebuano_verse, tagalog_verse in tqdm(verse_pairs, total=len(df)):
    preds.append(ceb_to_tl(cebuano_verse))
    refs.append(tagalog_verse)

# Each prediction is paired with a one-element list holding its reference.
wrapped_refs = [[reference] for reference in refs]
results_rev = bleu.compute(predictions=preds, references=wrapped_refs)
print(f"BLEU (Cebuano→Tagalog Pivot): {results_rev['score']:.2f}")

100%|██████████| 1000/1000 [24:38<00:00,  1.48s/it]


BLEU (Cebuano→Tagalog Pivot): 24.92


In [57]:
# Ad-hoc check: run one short sentence through the Cebuano → Tagalog pivot
# and print the result.
text = "Gikapoy nako aning Term"

print("Translation: ",ceb_to_tl(text))

Translation:  Sawa na ako sa Trem na Ito
