In [None]:
import pandas as pd
from google.genai import types
from google import genai
import re, os, json
from jiwer import wer, cer
from dotenv import load_dotenv
from rapidfuzz import process
from scipy.stats import kendalltau

# Load variables from the local .env file and read the Gemini API key.
load_dotenv(".env")
API_KEY = os.getenv("GEMINI_API_KEY")
# Never echo the secret itself: cell outputs are stored inside the notebook
# file and are easily committed or shared. Report only whether a key was found.
print("GEMINI_API_KEY loaded:", bool(API_KEY))

# Gemini client used for the text-generation requests in this notebook.
client = genai.Client(
    api_key=API_KEY
)

In [None]:
# Range markers; nothing in the visible cells reads them.
start = 424
n = 425

# Both path lists share the same sample ids and differ only in the folder,
# so they are produced together: first the OCR outputs, then the references.
# NOTE(review): the generation loop in this notebook opens
# ../../data/raw/ground_truth/gt_<id>.txt, whereas these paths use ../data and
# an ocr_<id>.txt name for the ground truth too -- confirm which layout is right.
text_list, gt_list = (
    [f'../data/raw/{folder}/ocr_{x}.txt' for x in [140, 266, 339, 423, 424, 425, 428, 445, 454, 480]]
    for folder in ('ocr_result', 'ground_truth')
)
print(len(text_list), text_list[0], text_list[-1])
print(len(gt_list), gt_list[0], gt_list[-1])

In [None]:
def return_prompt(row):
    """Build the Indonesian-language correction prompt for one OCR text.

    The prompt has four sections separated by blank lines: a header line,
    the OCR text itself (``row`` formatted with default formatting), the
    numbered instruction list, and the bulleted "PERINTAH PENTING" list.
    There is no trailing newline.

    Args:
        row: The OCR text to embed in the prompt.

    Returns:
        The complete prompt as a single string.
    """
    rules = (
        "1. Susun ulang teks agar urut dan mudah dibaca.",
        "2. Perbaiki typo dan kesalahan pemenggalan kata.",
        "3. Jangan menambah, mengurangi, atau mengubah informasi apa pun.",
        "4. Jangan menambah komentar, penjelasan, atau catatan.",
        "5. Output harus berupa teks final saja, tanpa markdown, tanpa format tambahan.",
        "6. Jangan melakukan asumsi atau mengisi bagian teks yang hilang.",
        "7. Teks output **harus berisi kata-kata yang sama dengan input**, kecuali kata yang memang diperbaiki karena typo.",
        "8. Jangan mengubah struktur kalimat secara berlebihan—hanya rapikan urutan dan perbaiki ejaan.",
    )
    hard_constraints = (
        "• Jangan membuat kalimat baru.",
        "• Jangan menghilangkan kata.",
        "• Jangan melakukan halusinasi.",
        "• Kembalikan hanya teks yang sudah diperbaiki.",
    )
    sections = (
        "Teks berikut adalah hasil OCR:",
        f"{row}",
        "Instruksi:\n" + "\n".join(rules),
        "PERINTAH PENTING:\n" + "\n".join(hard_constraints),
    )
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)

In [None]:
from rapidfuzz import process
from scipy.stats import kendalltau
import re

import re

def split_into_units(text):
    """Split ``text`` into sentence-like units.

    A boundary is a period followed by whitespace and an uppercase ASCII
    letter. The period at a boundary is consumed, so only the final unit can
    keep its trailing period. Dots in e-mail-like tokens and in a fixed list
    of abbreviations are shielded so they never end a unit.

    Line breaks are treated as ordinary whitespace. Previously they were
    swapped for a non-whitespace placeholder before splitting, which meant a
    sentence ending at the end of a line was never separated from the next
    line and both were returned as one unit.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, stripped units in their original order; an
        empty list for empty or whitespace-only input.
    """
    text = text.replace("\r", "").strip()
    # Shield the last dot of tokens containing "@" (e-mail-like strings).
    text = re.sub(r"(\S+@\S+)\.(\S+)", r"\1<dot>\2", text)

    # Abbreviations whose dot must not be read as a sentence end.
    protected = ("Vol.", "No.", "hlm.", "eISSN.", "p-ISSN.", "etc.")
    for p in protected:
        text = text.replace(p, p.replace(".", "<dot>"))

    # Newlines become spaces BEFORE splitting so that ".\nNext" is a boundary.
    text = text.replace("\n", " ")

    parts = re.split(r"\.\s+(?=[A-Z])", text)
    # Restore shielded dots, trim, and drop empty fragments.
    restored = (p.replace("<dot>", ".").strip() for p in parts)
    return [unit for unit in restored if unit]


def kendall_tau_sentence_level(gt_text, pred_text):
    """Score how well ``pred_text`` preserves the unit order of ``gt_text``.

    Both texts are split with ``split_into_units`` and both unit lists are
    printed. Each predicted unit is fuzzy-matched against the ground-truth
    units with ``process.extractOne`` and the position of the best match is
    recorded. Repeated positions are dropped (first occurrence wins) and
    Kendall's tau is computed between the resulting sequence and the
    ascending positions 0..k-1.

    Args:
        gt_text: Reference text in the correct order.
        pred_text: Text whose ordering is being scored.

    Returns:
        The tau statistic from ``kendalltau``, or 0 when fewer than two
        positions are available to compare.
    """
    reference_units = split_into_units(gt_text)
    candidate_units = split_into_units(pred_text)
    print(reference_units)
    print(candidate_units)

    # extractOne may return None (e.g. nothing to match against); skip those.
    # Element [2] of a hit is the position of the chosen ground-truth unit.
    best_hits = (process.extractOne(unit, reference_units) for unit in candidate_units)
    hit_positions = [hit[2] for hit in best_hits if hit is not None]

    # dict.fromkeys removes repeats while keeping first-seen order.
    pred_ranks = list(dict.fromkeys(hit_positions))

    min_len = min(len(reference_units), len(pred_ranks))
    if min_len < 2:
        return 0

    tau, _ = kendalltau(list(range(min_len)), pred_ranks[:min_len])
    return tau


In [None]:
# Metric records collected by evaluate(): one dict per call with the keys
# "wer", "cer", "pmr" and "tau". Calls with isBaseline true append to
# `baseline`; all other calls append to `trained`.
baseline = []
trained = []

def pmr(gt, pred):
    """Return the positional match rate between two texts.

    Both texts are split on whitespace and compared word by word at the same
    position, up to the length of the shorter text. The result is the
    fraction of compared positions where the words are identical.

    Args:
        gt: Reference text.
        pred: Text being scored.

    Returns:
        A float in [0, 1]. When either text contains no words there is
        nothing to compare and 0.0 is returned (this case used to raise
        ZeroDivisionError).
    """
    # zip stops at the shorter word list, i.e. the positions both texts share.
    pairs = list(zip(gt.split(), pred.split()))
    if not pairs:
        return 0.0
    return sum(1 for gt_word, pred_word in pairs if gt_word == pred_word) / len(pairs)


def evaluate(gt, res, isBaseline):
    """Compute all metrics for one text pair and store them.

    The record holds word error rate, character error rate, positional
    match rate and the sentence-level Kendall tau of ``res`` against ``gt``.

    Args:
        gt: Reference text.
        res: Text being scored.
        isBaseline: When truthy the record is appended to ``baseline``,
            otherwise to ``trained``.

    Returns:
        None. The result is recorded in the module-level lists.
    """
    scores = {
        "wer": wer(gt, res),
        "cer": cer(gt, res),
        "pmr": pmr(gt, res),
        "tau": kendall_tau_sentence_level(gt, res),
    }
    bucket = baseline if isBaseline else trained
    bucket.append(scores)

In [None]:
import time

# Create the output folder up front; without it every write below fails only
# after the API request has already been made.
os.makedirs("./LLM_res", exist_ok=True)

# Request settings that do not change between documents.
model = "gemini-2.0-flash"
generate_content_config = types.GenerateContentConfig(
    response_mime_type="text/plain",
)

for index in [140, 266, 339, 423, 424, 425, 428, 445, 454, 480]:
    # Fixed pause between requests (presumably to stay under the API rate
    # limit -- confirm the required interval).
    time.sleep(2)

    # Raw OCR text for this document; undecodable bytes are dropped.
    with open(f"../../data/raw/ocr_result/ocr_{index}.txt", "r", encoding="utf-8", errors="ignore") as file:
        row = file.read()

    # Matching ground truth. Nothing inside this loop reads it.
    with open(f"../../data/raw/ground_truth/gt_{index}.txt", "r", encoding="utf-8", errors="ignore") as file:
        gt = file.read()

    try:
        contents = [
            types.Content(
                role="user",
                parts=[
                    types.Part.from_text(text=return_prompt(row)),
                ],
            ),
        ]

        response_text = ""
        for chunk in client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        ):
            # A streamed chunk can carry no text (its .text is then None);
            # concatenating None would raise and discard the whole response.
            response_text += chunk.text or ""

        print(index)
        print(response_text)

        with open(f"./LLM_res/res_{index}.txt", "w", encoding="utf-8", errors="ignore") as file:
            file.write(response_text)

    except Exception as e:
        # Best effort: report which document failed and continue with the next.
        print(f"Row {index} - Error processing column :", e)

In [None]:
# evaluate(gt, row, True)
# evaluate(gt, response_text, False)

In [None]:
# with open("eval_baseline.txt", "a") as file:
#     file.write(str(baseline))
# with open("eval_trained.txt", "a") as file:
#     file.write(str(trained))

In [None]:
# print(baseline)

In [None]:
# print(trained)