In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import re
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# !pip install rouge_score bert_score
from rouge_score import rouge_scorer
from bert_score import score as bertscore




In [2]:
# Google Drive direct-download links for the two text files used below:
# `url_english` feeds the translation inputs, `url_indo` the reference side.
DRIVE_DOWNLOAD_TEMPLATE = "https://drive.google.com/uc?export=download&id={file_id}"

url_english = DRIVE_DOWNLOAD_TEMPLATE.format(file_id="1J5zhBA3qvKFPbADziKRv1xBQ9A_jMdRs")
url_indo = DRIVE_DOWNLOAD_TEMPLATE.format(file_id="1_jMC6ImrPz2KJzj4aWFdTiPS3_2-DcG6")


In [3]:
import pandas as pd
import requests

def load_text_url(url, timeout=30):
    """Download a plain-text file and return its non-empty lines as a DataFrame.

    Args:
        url: HTTP(S) address of the text file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled download cannot hang the notebook indefinitely.

    Returns:
        pandas.DataFrame with a single ``"text"`` column holding one stripped,
        non-blank line per row, in file order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.

    Note:
        Blank lines are dropped. When two files are loaded as a parallel
        corpus, a blank line present in only one of them shifts the row
        alignment between the resulting frames.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an HTTP error page as corpus lines.
    response.raise_for_status()

    # requests falls back to ISO-8859-1 for text/* responses that declare no
    # charset, which garbles UTF-8 content; prefer UTF-8 in that case.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = "utf-8"

    stripped_lines = (raw.strip() for raw in response.text.splitlines())
    lines = [line for line in stripped_lines if line]
    return pd.DataFrame({"text": lines})

# Fetch each file into its own single-column ("text") frame.
df_english = load_text_url(url=url_english)
df_indo = load_text_url(url=url_indo)


In [4]:
import peft
# Print the installed PEFT version so the environment used for this run is
# recorded alongside the notebook output.
print(peft.__version__)

Skipping import of cpp extensions due to incompatible torch version 2.9.1+cu126 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info
W1126 19:59:51.244000 42428 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


0.14.0


In [5]:
# from transformers import MarianTokenizer, MarianMTModel
# import torch

# device = "cuda" if torch.cuda.is_available() else "cpu"

# model_name = "Helsinki-NLP/opus-mt-en-id"

# tokenizer = MarianTokenizer.from_pretrained(model_name)
# model = MarianMTModel.from_pretrained(model_name).to(device)


In [6]:
from transformers import MarianTokenizer, MarianMTModel
from peft import PeftModel
import torch

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ==== IMPORTANT: point this to your LoRA folder ====
lora_dir = "helsinki-lora-finetuned"

# The tokenizer is read from the LoRA checkpoint folder, not from the base
# Helsinki repository.
tokenizer = MarianTokenizer.from_pretrained(lora_dir)

# Load the base Helsinki weights, wrap them with the LoRA adapter stored in
# `lora_dir`, and move the combined model to the selected device.
base_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-id")
model = PeftModel.from_pretrained(base_model, lora_dir).to(device)

# Switch to evaluation mode; this notebook only runs inference.
model.eval()

print("Loaded LoRA fine-tuned Helsinki model successfully!")

  result[k] = f.get_tensor(k)


Loaded LoRA fine-tuned Helsinki model successfully!


In [7]:
def translate_batch(text_batch, max_new_tokens=128):
    """Translate a batch of sentences with the module-level model and tokenizer.

    Args:
        text_batch: A single string or an iterable of strings to translate.
        max_new_tokens: Upper bound on the number of tokens generated for
            each input.

    Returns:
        list[str]: One whitespace-stripped translation per input, in input
        order. An empty batch yields an empty list without touching the model.
    """
    # Normalise the input so a lone string, a tuple, or any other iterable of
    # strings are all handled the same way.
    if isinstance(text_batch, str):
        text_batch = [text_batch]
    else:
        text_batch = list(text_batch)

    # Nothing to translate: skip the tokenizer, which rejects empty batches.
    if not text_batch:
        return []

    inputs = tokenizer(
        text_batch, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    with torch.no_grad():
        # Pass the budget under the name the parameter advertises; it was
        # previously forwarded as `max_length`, which is a different limit.
        out = model.generate(**inputs, max_new_tokens=max_new_tokens)

    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)

    return [translation.strip() for translation in decoded]


In [8]:
import numpy as np
# !pip install sacrebleu
import sacrebleu
from bert_score import score as bert_score
from rouge_score import rouge_scorer

# Shared ROUGE-L scorer (stemming enabled), reused by compute_rougeL.
rouge = rouge_scorer.RougeScorer(rouge_types=["rougeL"], use_stemmer=True)

def compute_bleu(ref, pred):
    """Return the sacreBLEU score of one prediction against one reference.

    The pair is scored as a one-sentence corpus with a single reference
    stream via ``sacrebleu.corpus_bleu``.

    Args:
        ref: Reference translation (string).
        pred: Predicted translation (string).

    Returns:
        The ``score`` attribute of the sacreBLEU result.
    """
    hypotheses = [pred]
    reference_streams = [[ref]]
    result = sacrebleu.corpus_bleu(hypotheses, reference_streams)
    return result.score

def compute_rougeL(ref, pred):
    """Return the ROUGE-L F-measure of ``pred`` against ``ref``.

    Uses the module-level ``rouge`` scorer.

    Args:
        ref: Reference translation (string).
        pred: Predicted translation (string).

    Returns:
        The ``fmeasure`` field of the ``rougeL`` result.
    """
    all_scores = rouge.score(ref, pred)
    rouge_l = all_scores['rougeL']
    return rouge_l.fmeasure

def compute_bertscore(ref, pred):
    """Return the BERTScore F1 of one prediction against one reference.

    ``bert_score.score`` builds its model on every call, which makes
    per-sentence scoring extremely slow. This version creates one
    ``BERTScorer`` on first use, keeps it on the function object, and reuses
    it for all later calls.

    Args:
        ref: Reference translation (string).
        pred: Predicted translation (string).

    Returns:
        float: F1 component of the BERTScore for the pair, computed with
        ``lang="id"``.
    """
    scorer = getattr(compute_bertscore, "_scorer", None)
    if scorer is None:
        # Local import: the file only imports the `score` function at the top.
        from bert_score import BERTScorer

        scorer = BERTScorer(lang="id")
        compute_bertscore._scorer = scorer

    # The scorer takes candidates first, then references.
    _, _, f1 = scorer.score([pred], [ref])
    return float(f1[0])


In [13]:
from tqdm import tqdm

def run_batched_helsinki(df_en, df_id, batch_size=16, verbose=True):
    """Translate ``df_en["text"]`` in batches and score against ``df_id["text"]``.

    Rows are paired by position. BLEU and ROUGE-L are computed per sentence
    pair and averaged. BERTScore is computed once over all pairs after
    translation finishes, because ``bert_score`` builds its model on every
    call and scoring sentence by sentence is prohibitively slow.

    Args:
        df_en: DataFrame with a ``"text"`` column of source sentences.
        df_id: DataFrame with a ``"text"`` column of reference translations.
        batch_size: Number of sentences translated per model call (>= 1).
        verbose: When true, print every reference/prediction pair.

    Returns:
        tuple: ``(outputs, scores)`` where ``outputs`` is the list of
        predicted translations and ``scores`` is a dict with the mean
        ``"bleu"``, ``"rougeL"`` and ``"bertscore"`` values. All three are
        NaN when there is nothing to score.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if len(df_en) != len(df_id):
        # Pairing is positional, so differing lengths mean at least part of
        # the corpus is misaligned; only pairs present on both sides are scored.
        import warnings

        warnings.warn(
            f"Source has {len(df_en)} rows but reference has {len(df_id)}; "
            "unpaired rows are not scored and the alignment may be off.",
            stacklevel=2,
        )

    outputs = []
    scored_preds, scored_refs = [], []
    bleu_list, rouge_list = [], []

    print(f"\n=== HELSINKI TRANSLATION — BATCH SIZE {batch_size} ===\n")

    for start in tqdm(range(0, len(df_en), batch_size)):
        end = min(start + batch_size, len(df_en))

        batch_in = df_en["text"].iloc[start:end].tolist()
        batch_gt = df_id["text"].iloc[start:end].tolist()

        # --- translate ---
        batch_pred = translate_batch(batch_in)
        outputs.extend(batch_pred)

        # --- per-sentence metrics ---
        for pred, gt in zip(batch_pred, batch_gt):
            if verbose:
                print(gt)
                print(pred)
            bleu_list.append(compute_bleu(gt, pred))
            rouge_list.append(compute_rougeL(gt, pred))
            # Collected for the single batched BERTScore pass below.
            scored_preds.append(pred)
            scored_refs.append(gt)

        torch.cuda.empty_cache()

    if not scored_preds:
        # Nothing was paired: report NaN without calling the scorer.
        nan = float("nan")
        return outputs, {"bleu": nan, "rougeL": nan, "bertscore": nan}

    # --- BERTScore over all pairs at once (candidates first, then references) ---
    _, _, f1 = bert_score(scored_preds, scored_refs, lang="id")

    scores = {
        "bleu": float(np.mean(bleu_list)),
        "rougeL": float(np.mean(rouge_list)),
        "bertscore": float(np.mean(f1.tolist())),
    }

    return outputs, scores


# Note: the output of the results cell below was (accidentally) cleared.

In [None]:
# Translate the English frame and score it against the Indonesian frame.
outputs, scores = run_batched_helsinki(
    df_en=df_english,
    df_id=df_indo,
    batch_size=16,
)
print(scores)