In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import re
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bertscore


In [2]:
import pandas as pd
import requests

def _load_lines_frame(url, timeout=60):
    """Download a plain-text file and return it as a one-column DataFrame.

    Args:
        url: HTTP(S) location of a text file with one sentence per line.
        timeout: seconds to wait for the server before giving up, so a stalled
            download cannot hang the notebook indefinitely.

    Returns:
        DataFrame with a single "text" column, one row per line of the file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently turning an error page
    # into rows of "sentences".
    response.raise_for_status()
    return pd.DataFrame({"text": response.text.splitlines()})


url_english = "https://drive.google.com/uc?export=download&id=1J5zhBA3qvKFPbADziKRv1xBQ9A_jMdRs"
url_indo = "https://drive.google.com/uc?export=download&id=1_jMC6ImrPz2KJzj4aWFdTiPS3_2-DcG6"

# Row k of df_indo is used downstream as the reference translation for row k of df_english.
df_english = _load_lines_frame(url_english)
df_indo = _load_lines_frame(url_indo)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hub id of the base checkpoint and the directory the LoRA adapter is loaded from.
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
LORA_DIR = "qwen1.5b-lora-translation/qwen1.5"   
# The tokenizer is read from the adapter directory, not from BASE_MODEL.
tokenizer = AutoTokenizer.from_pretrained(LORA_DIR, trust_remote_code=True)
# Base weights are loaded in float16; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Wrap the base model with the LoRA adapter stored in LORA_DIR.
model = PeftModel.from_pretrained(
    base_model,
    LORA_DIR,
    torch_dtype=torch.float16
)

# Switch to evaluation mode; the notebook only runs generation from here on.
model.eval()
print("Loaded Qwen LoRA model successfully!")

`torch_dtype` is deprecated! Use `dtype` instead!


Loaded Qwen LoRA model successfully!


In [5]:
# Two worked English -> Indonesian examples that prompt_few_shot inserts ahead of the query.
# The second example spans two physical lines for both languages.
# NOTE(review): the examples are unquoted, while the query line built by
# prompt_few_shot wraps the sentence in double quotes — confirm this is intended.
FEW_SHOT = """
English: In 2010, South Korea experienced a particularly cold winter.
Indonesian: Pada 2010, Korea Selatan mengalami musim dingin yang tidak biasa.

English: People couldn't activate their smartphones while wearing gloves, 
so they began wielding snack sausages—causing one company to see a 40% rise in sausage sales.
Indonesian: Orang-orang tidak dapat menyalakan ponsel pintar mereka saat mengenakan sarung tangan, 
jadi mereka mulai menggunakan camilan sosis, yang menyebabkan satu perusahaan mengalami kenaikan penjualan sosis sebesar 40%.
""".strip()

def prompt_zero_shot(text):
    """Build the bare zero-shot prompt: an instruction, the quoted sentence, and an open answer label."""
    prompt_lines = [
        "Translate to Indonesian:",
        "",
        f'English: "{text}"',
        "Indonesian:",
    ]
    return "\n".join(prompt_lines)

def prompt_few_shot(text):
    """Build the few-shot prompt: instruction, the FEW_SHOT examples, then the quoted sentence and an open answer label."""
    prompt_lines = [
        "Translate English to Indonesian following the examples.",
        "",
        f"{FEW_SHOT}",
        "",
        f'English: "{text}"',
        "Indonesian:",
    ]
    return "\n".join(prompt_lines).strip()

def prompt_role_based(text):
    """Build the role-based prompt: a persona plus output rules, then the quoted sentence and an open answer label."""
    prompt_lines = [
        "You are a professional bilingual educator who creates flashcards.",
        "Translate the English sentence into natural Indonesian.",
        "Use complete sentences. No explanation. Output ONLY Indonesian.",
        "",
        f'English: "{text}"',
        "Indonesian:",
    ]
    return "\n".join(prompt_lines)

def prompt_cot_guarded(text):
    """Build the chain-of-thought prompt; it ends on an open "Reasoning:" label with no trailing newline."""
    prompt_lines = [
        "Translate the English sentence into Indonesian.",
        "First think through the meaning step-by-step to ensure accuracy.",
        'Then provide the final Indonesian translation ONLY in the last line, prefixed with "Indonesian:".',
        "",
        f'English: "{text}"',
        "",
        "Reasoning:",
    ]
    return "\n".join(prompt_lines)

def prompt_back_translation(text):
    """Build the three-step translate / back-translate / final-answer prompt, ending on an open "Indonesian:" label."""
    prompt_lines = [
        "Step 1: Translate the English sentence to Indonesian.",
        "Step 2: Translate that Indonesian back to English to verify consistency.",
        'Step 3: Output ONLY the final Indonesian translation in the last line, prefixed with "Final Indonesian:".',
        "",
        f'English: "{text}"',
        "",
        "Indonesian:",
    ]
    return "\n".join(prompt_lines)


In [None]:
import re

def extract_back_translation(gen):
    """Pull the Indonesian answer out of a back-translation completion.

    Patterns are tried from most to least specific, and the first hit wins:
      1. a double-quoted string after "Final Indonesian:" (case-insensitive),
      2. a double-quoted string after "Indonesian:" (case-insensitive),
      3. the first double-quoted string anywhere in the text.
    If nothing is quoted, the first line of the text is returned.
    Non-string input yields "".
    """
    if not isinstance(gen, str):
        return ""

    candidates = (
        (r'Final Indonesian\s*:\s*"([^"]+)"', re.I),
        (r'Indonesian\s*:\s*"([^"]+)"', re.I),
        (r'"([^"]+)"', 0),
    )
    for pattern, flags in candidates:
        found = re.search(pattern, gen, flags=flags)
        if found:
            return found.group(1).strip()

    # Nothing quoted at all: fall back to whatever is on the first line.
    first_line, _, _ = gen.partition("\n")
    return first_line.strip()

In [None]:
def strip_wrappers(s: str):
    """Remove list-style and quote wrappers from around a model answer.

    Leading "[" / "'" and trailing "'" / "]" characters are stripped first,
    then surrounding double quotes, then surrounding single quotes, trimming
    whitespace along the way. Non-string input yields "".
    """
    if isinstance(s, str):
        cleaned = s.strip().lstrip("['").rstrip("']").strip()
        for quote_char in ('"', "'"):
            cleaned = cleaned.strip(quote_char)
        return cleaned.strip()
    return ""

In [8]:
def generate_text(prompt, max_new_tokens=60):
    """Greedy-decode a completion for one prompt and return the decoded sequence.

    Args:
        prompt: the full prompt string fed to the model.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded output of `model.generate` for the single input, with
        special tokens removed. Callers strip the prompt themselves
        (see extract_translation).
    """
    # Place the inputs on the model's own device, as generate_batch does,
    # rather than on the separately computed global `device` string.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Greedy decoding: no `temperature` is passed because sampling
            # flags are ignored (and reported as invalid) when do_sample=False.
            do_sample=False,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

def extract_translation(full_output, prompt, method):
    """Reduce a decoded model output to the translation text for the given method.

    The prompt text is removed from `full_output` and line endings are
    normalised to "\\n". What happens next depends on `method`:
      * "back_translation": delegate to extract_back_translation, then strip wrappers.
      * "zero_shot" / "few_shot" / "role_based": take the first non-empty line,
        drop a leading "Indonesian:"-style label if present, then strip wrappers.
      * anything else: return the first line, trimmed, with no wrapper stripping.
    """
    completion = full_output.replace(prompt, "").strip()
    completion = re.sub(r"\r\n?", "\n", completion).strip()

    if method == "back_translation":
        return strip_wrappers(extract_back_translation(completion))

    if method not in ("zero_shot", "few_shot", "role_based"):
        return completion.split("\n")[0].strip()

    non_empty = [piece.strip() for piece in completion.split("\n") if piece.strip()]
    if len(non_empty) == 0:
        return ""

    head = non_empty[0]
    labelled = re.match(r'^(Indonesian|Indonesia|Terjemahan|Translation)\s*:\s*(.*)$', head, re.I)
    if labelled:
        return strip_wrappers(labelled.group(2).strip())
    return strip_wrappers(head)

def generate_only_translation(prompt, method, max_new_tokens=160):
    """Generate for `prompt` and return only the translation extracted for `method`."""
    return extract_translation(
        generate_text(prompt, max_new_tokens=max_new_tokens),
        prompt,
        method,
    )

In [9]:
# NLTK smoothing function (method1) handed to sentence_bleu by compute_bleu.
smooth = SmoothingFunction().method1
# Shared ROUGE-L scorer with stemming enabled; reused by compute_rougeL.
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def compute_bleu(ref, hypo):
    """Smoothed sentence-level BLEU of `hypo` against the single reference `ref`, on whitespace tokens."""
    reference_tokens = ref.split()
    candidate_tokens = hypo.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smooth,
    )

def compute_rougeL(ref, hypo):
    """ROUGE-L F-measure of `hypo` scored against the reference `ref`."""
    rouge_l = rouge.score(ref, hypo)["rougeL"]
    return rouge_l.fmeasure

# Holds the lazily created scorer so the underlying model is loaded once per
# kernel session instead of once per scored sentence.
_BERTSCORER_CACHE = {}


def compute_bertscore(ref, hypo):
    """BERTScore F1 of `hypo` against `ref`, using the default model for lang="id".

    The functional `bert_score.score` API loads the scoring model on every
    call, which is very slow when this function is called once per sentence.
    A single `BERTScorer` with the same `lang="id"` setting is created on
    first use and reused afterwards.

    Args:
        ref: reference (ground-truth) sentence.
        hypo: candidate sentence to score.

    Returns:
        The F1 component of BERTScore as a Python float.
    """
    scorer = _BERTSCORER_CACHE.get("id")
    if scorer is None:
        # Local import: the notebook only imports `score` from bert_score at the top.
        from bert_score import BERTScorer

        scorer = BERTScorer(lang="id")
        _BERTSCORER_CACHE["id"] = scorer

    # Argument order is (candidates, references), matching the original call.
    _, _, f1 = scorer.score([hypo], [ref])
    return f1[0].item()

In [None]:
def run_single_pipeline(df_en, df_id, method_name, max_new_tokens=60):
    """Translate each English sentence one at a time and score it against its reference.

    Args:
        df_en: DataFrame with a "text" column of English source sentences.
        df_id: DataFrame with a "text" column of Indonesian references. Rows are
            paired with df_en by position, not by index label.
        method_name: one of "zero_shot", "few_shot", "role_based", "back_translation".
        max_new_tokens: generation budget per sentence. Defaults to 60, the
            value that was previously hard-coded.

    Returns:
        (all_outputs, scores): the extracted translations in input order, and a
        dict with the mean "bleu", "rougeL" and "bertscore" (NaN for empty input).

    Raises:
        ValueError: if method_name is unknown or df_id has fewer rows than df_en.
    """
    methods = {
        "zero_shot": prompt_zero_shot,
        "few_shot": prompt_few_shot,
        "role_based": prompt_role_based,
        "back_translation": prompt_back_translation,
    }

    if method_name not in methods:
        raise ValueError(f"Invalid method. Choose from: {list(methods.keys())}")

    method_func = methods[method_name]

    # Pair rows by position. Looking rows up with .loc[i] and a 0..n-1 counter
    # only works when both frames carry a default RangeIndex, and breaks on
    # any sampled or filtered subset.
    sources = df_en["text"].tolist()
    references = df_id["text"].tolist()
    if len(references) < len(sources):
        # Fail before any generation instead of part-way through the run.
        raise ValueError(
            f"df_id has {len(references)} rows but df_en has {len(sources)}; "
            "every source sentence needs a reference."
        )

    all_outputs = []
    bleu_list, rouge_list, bert_list = [], [], []

    print(f"\n=== RUNNING {method_name.upper()} (NO BATCHING) ===")

    # zip stops at the end of `sources`, so surplus reference rows are ignored.
    for en, gt in tqdm(zip(sources, references), total=len(sources)):
        prompt = method_func(en)

        full_output = generate_text(prompt, max_new_tokens=max_new_tokens)
        pred = extract_translation(full_output, prompt, method_name)

        all_outputs.append(pred)

        bleu_list.append(compute_bleu(gt, pred))
        rouge_list.append(compute_rougeL(gt, pred))
        bert_list.append(compute_bertscore(gt, pred))

        torch.cuda.empty_cache()

    # np.mean([]) warns and returns NaN; make the empty case explicit and silent.
    per_metric = {"bleu": bleu_list, "rougeL": rouge_list, "bertscore": bert_list}
    scores = {
        name: (float(np.mean(values)) if values else float("nan"))
        for name, values in per_metric.items()
    }

    return all_outputs, scores

In [None]:
# Pad on the left so that, in a padded batch, every prompt ends at the last input position.
tokenizer.padding_side = "left"
def generate_batch(prompts, max_new_tokens=60):
    """Greedy-decode completions for a list of prompts in one padded batch.

    Args:
        prompts: list of prompt strings.
        max_new_tokens: upper bound on generated tokens per prompt.

    Returns:
        One decoded string per prompt, with special tokens removed.
    """
    encoded = tokenizer(
        prompts, return_tensors="pt", padding=True, truncation=True
    )
    encoded = encoded.to(model.device)

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        # NOTE(review): the KV cache is switched off here; presumably a memory
        # trade-off, at the cost of slower decoding — confirm it is still needed.
        use_cache=False,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    return tokenizer.batch_decode(sequences, skip_special_tokens=True)

In [None]:
def run_batched_pipeline(df_en, df_id, method_name, batch_size=4, max_new_tokens=60):
    """Translate English sentences in batches and score them against the references.

    Args:
        df_en: DataFrame with a "text" column of English source sentences.
        df_id: DataFrame with a "text" column of Indonesian references. Rows are
            paired with df_en by position, not by index label.
        method_name: one of "zero_shot", "few_shot", "role_based", "back_translation".
        batch_size: number of prompts sent to the model per generate call.
        max_new_tokens: generation budget per prompt. Defaults to 60, the
            value that was previously hard-coded.

    Returns:
        (all_outputs, scores): the extracted translations in input order, and a
        dict with the mean "bleu", "rougeL" and "bertscore" (NaN for empty input).

    Raises:
        ValueError: if method_name is unknown, batch_size is below 1, or df_id
            has fewer rows than df_en.
    """
    methods = {
        "zero_shot": prompt_zero_shot,
        "few_shot": prompt_few_shot,
        "role_based": prompt_role_based,
        "back_translation": prompt_back_translation,
    }

    if method_name not in methods:
        raise ValueError(f"Invalid method. Choose from: {list(methods.keys())}")
    if batch_size < 1:
        # A negative step would silently produce an empty run.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    method_func = methods[method_name]

    # Pair rows by position. Looking rows up with .loc[i] and a 0..n-1 counter
    # only works when both frames carry a default RangeIndex.
    sources = df_en["text"].tolist()
    references = df_id["text"].tolist()
    if len(references) < len(sources):
        # Fail before any generation instead of part-way through the run.
        raise ValueError(
            f"df_id has {len(references)} rows but df_en has {len(sources)}; "
            "every source sentence needs a reference."
        )
    # Surplus reference rows are ignored, as before.
    references = references[:len(sources)]

    all_outputs = []
    bleu_list, rouge_list, bert_list = [], [], []

    print(f"\n=== RUNNING {method_name.upper()} | BATCH SIZE = {batch_size} ===")

    num_samples = len(sources)

    for start in tqdm(range(0, num_samples, batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        en_batch = sources[start:start + batch_size]
        gt_batch = references[start:start + batch_size]

        prompts = [method_func(en) for en in en_batch]

        raw_outputs = generate_batch(prompts, max_new_tokens=max_new_tokens)

        preds = [
            extract_translation(raw, prompt, method_name)
            for raw, prompt in zip(raw_outputs, prompts)
        ]

        all_outputs.extend(preds)

        for gt, pred in zip(gt_batch, preds):
            bleu_list.append(compute_bleu(gt, pred))
            rouge_list.append(compute_rougeL(gt, pred))
            bert_list.append(compute_bertscore(gt, pred))

        torch.cuda.empty_cache()

    # np.mean([]) warns and returns NaN; make the empty case explicit and silent.
    per_metric = {"bleu": bleu_list, "rougeL": rouge_list, "bertscore": bert_list}
    scores = {
        name: (float(np.mean(values)) if values else float("nan"))
        for name, values in per_metric.items()
    }

    return all_outputs, scores

In [13]:
def evaluate_all_methods(df_en, df_id, batch_size=4):
    """Run the batched pipeline for each prompting method and collect the mean scores.

    Returns a dict mapping method name to the score dict produced by
    run_batched_pipeline. The translations themselves are not kept.
    """
    results = {}

    for method_name in ("zero_shot", "few_shot", "role_based", "back_translation"):
        print(f"\n========== Evaluating {method_name.upper()} ==========")
        _, method_scores = run_batched_pipeline(
            df_en, df_id, method_name, batch_size=batch_size
        )
        results[method_name] = method_scores
        print(method_scores)

    return results

In [14]:
# Score all four prompting methods on the downloaded sentence pairs, six prompts per batch.
results = evaluate_all_methods(df_english, df_indo, batch_size=6)



=== RUNNING ZERO_SHOT | BATCH SIZE = 6 ===


  0%|          | 0/169 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

100%|██████████| 169/169 [22:08<00:00,  7.86s/it]


{'bleu': 0.138343503103525, 'rougeL': 0.5123305885823051, 'bertscore': 0.8228814369251606}


=== RUNNING FEW_SHOT | BATCH SIZE = 6 ===


100%|██████████| 169/169 [25:56<00:00,  9.21s/it]


{'bleu': 0.134361424339305, 'rougeL': 0.5139593469003865, 'bertscore': 0.8250882923014079}


=== RUNNING ROLE_BASED | BATCH SIZE = 6 ===


100%|██████████| 169/169 [26:12<00:00,  9.30s/it]


{'bleu': 0.14227376750453122, 'rougeL': 0.518044570907784, 'bertscore': 0.8240086426259029}


=== RUNNING BACK_TRANSLATION | BATCH SIZE = 6 ===


100%|██████████| 169/169 [27:41<00:00,  9.83s/it]

{'bleu': 0.09946334765193332, 'rougeL': 0.4125833164034409, 'bertscore': 0.7626790563932991}



