In [10]:
from jiwer import wer, cer
import pandas as pd
import re
from tqdm import tqdm
from bert_score import score
import torch
from transformers import AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
def pmr(gt, pred):
    gt_words = gt.split()
    pred_words = pred.split()
    length = min(len(gt_words), len(pred_words))
    matches = sum(1 for i in range(length) if gt_words[i] == pred_words[i])
    
    if(length == 0): 
        print('length 0')
        return 0
    return matches / length 

In [12]:
def compute_bleu(reference, prediction):
    ref_tokens = [reference.split()]
    pred_tokens = prediction.split()
    smoothie = SmoothingFunction().method4
    return sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)

def compute_meteor(reference, prediction):
    ref_tokens = reference.split()
    pred_tokens = prediction.split()
    return meteor_score([ref_tokens], pred_tokens)

def compute_cosine_similarity(reference, prediction):
    vectorizer = TfidfVectorizer().fit([reference, prediction])
    vectors = vectorizer.transform([reference, prediction])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

In [13]:
tokenizer_id = "indobenchmark/indobert-large-p1"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

def truncate_to_512(text):
    tokens = tokenizer.encode(text, add_special_tokens=True)
    if len(tokens) > 512:
        tokens = tokens[:511] + [tokenizer.sep_token_id]  
    return tokenizer.decode(tokens, skip_special_tokens=True)

def compute_indobert_large_score(refs, preds, batch_size=8):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    refs = [truncate_to_512(r) for r in refs]
    preds = [truncate_to_512(p) for p in preds]

    P, R, F1 = score(
        preds,
        refs,
        model_type=tokenizer_id,
        num_layers=24,
        lang="id",
        device=device,
        batch_size=batch_size,
        verbose=False,
    )
    return float(P.mean()), float(R.mean()), float(F1.mean())

In [14]:
FINAL_SYMSPELL_DIR = '../text-processing/algorithm/final_symspell_res'
FINAL_LLM_DIR = '../text-processing/LLM-test/final_LLM_res'
GT_DIR = '../data/raw/ground_truth'
BASELINE_DIR = '../data/raw/ocr_result'

In [15]:
with open('eval_list.txt', 'r') as file:
    content = file.read()

test_files = []
for file in content.split('\n'):
    test_files.append(file.split('.')[0])
len(test_files)

100

In [16]:
baseline_cer, baseline_wer, baseline_pmr, baseline_bleu, baseline_cosine, baseline_indobert  = [], [], [], [], [], []
final_symspell_cer, final_symspell_wer, final_symspell_pmr, final_symspell_bleu, final_symspell_cosine, final_symspell_indobert  = [], [], [], [], [], []
final_llm_cer, final_llm_wer, final_llm_pmr, final_llm_bleu, final_llm_cosine, final_llm_indobert  = [], [], [], [], [], []

In [17]:
def read_file(path):
    try:
        return open(path, 'r', encoding='utf-8').read()
    except UnicodeDecodeError:
        return open(path, 'r', encoding='utf-8', errors='ignore').read()

In [18]:
for filename in tqdm(test_files):
    baseline = read_file(f'{BASELINE_DIR}/ocr_{filename}.txt')
    gt = read_file(f'{GT_DIR}/gt_{filename}.txt')
    final_symspell_str = read_file(f'{FINAL_SYMSPELL_DIR}/res_{filename}.txt')
    final_llm_str = read_file(f'{FINAL_LLM_DIR}/res_{filename}.txt')

    baseline = re.sub(r"\s+", " ", baseline.replace("\n", " ")).strip().lower()
    gt = re.sub(r"\s+", " ", gt.replace("\n", " ")).strip().lower()
    final_symspell_str = re.sub(r"\s+", " ", final_symspell_str.replace("\n", " ")).strip().lower()
    final_llm_str = re.sub(r"\s+", " ", final_llm_str.replace("\n", " ")).strip().lower()

    if(len(gt) == 0): print(filename)

    baseline_wer.append(wer(gt, baseline))
    baseline_cer.append(cer(gt, baseline))
    baseline_pmr.append(pmr(gt, baseline))
    baseline_bleu.append(compute_bleu(gt, baseline))
    baseline_cosine.append(compute_cosine_similarity(gt, baseline))
    baseline_indobert.append(compute_indobert_large_score([gt],[baseline]))

    final_symspell_wer.append(wer(gt, final_symspell_str))
    final_symspell_cer.append(cer(gt, final_symspell_str))
    final_symspell_pmr.append(pmr(gt, final_symspell_str))
    final_symspell_bleu.append(compute_bleu(gt, final_symspell_str))
    final_symspell_cosine.append(compute_cosine_similarity(gt, final_symspell_str))
    final_symspell_indobert.append(compute_indobert_large_score([gt], [final_symspell_str]))

    final_llm_wer.append(wer(gt, final_llm_str))
    final_llm_cer.append(cer(gt, final_llm_str))
    final_llm_pmr.append(pmr(gt, final_llm_str))
    final_llm_bleu.append(compute_bleu(gt, final_llm_str))
    final_llm_cosine.append(compute_cosine_similarity(gt, final_llm_str))
    final_llm_indobert.append(compute_indobert_large_score([gt], [final_llm_str]))
    

100%|██████████| 100/100 [56:58<00:00, 34.18s/it]


In [19]:
data = {
  "name": test_files,
  "baseline_wer": baseline_wer,
  "baseline_cer" : baseline_cer,
  "baseline_pmr" : baseline_pmr,
  "baseline_bleu": baseline_bleu,
  "baseline_cosine" : baseline_cosine,
  "baseline_indobert" : baseline_indobert,

  "final_symspell_wer": final_symspell_wer,
  "final_symspell_cer" : final_symspell_cer,
  "final_symspell_pmr" : final_symspell_pmr,
  "final_symspell_bleu": final_symspell_bleu,
  "final_symspell_cosine" : final_symspell_cosine,
  "final_symspell_indobert" : final_symspell_indobert,

  "final_llm_wer": final_llm_wer,
  "final_llm_cer" : final_llm_cer,
  "final_llm_pmr" : final_llm_pmr,
  "final_llm_bleu": final_llm_bleu,
  "final_llm_cosine" : final_llm_cosine,
  "final_llm_indobert" : final_llm_indobert
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,baseline_indobert,final_symspell_wer,final_symspell_cer,final_symspell_pmr,final_symspell_bleu,final_symspell_cosine,final_symspell_indobert,final_llm_wer,final_llm_cer,final_llm_pmr,final_llm_bleu,final_llm_cosine,final_llm_indobert
0,522,0.16129,0.113295,0.290323,0.838709,0.928364,"(0.9131332635879517, 0.9442889094352722, 0.928...",0.064516,0.033526,0.297521,0.899011,0.972525,"(0.9586562514305115, 0.9712709188461304, 0.964...",0.120968,0.067052,0.310924,0.813136,0.979888,"(0.9763405919075012, 0.9691017270088196, 0.972..."
1,479,21.230769,16.631868,0.0,0.001806,0.012009,"(0.3306380808353424, 0.42230814695358276, 0.37...",1.0,0.802198,0.0,0.0,0.0,"(0.3634084463119507, 0.37184950709342957, 0.36...",1.038462,0.78022,0.0,0.0,0.0,"(0.3678537607192993, 0.38872015476226807, 0.37..."
2,528,1.166667,0.885895,0.362745,0.407504,0.709366,"(0.5526434183120728, 0.788463294506073, 0.6498...",0.127451,0.123613,0.677083,0.839401,0.938608,"(0.9472967386245728, 0.9506863355636597, 0.948...",0.245098,0.156894,0.27451,0.677893,0.915647,"(0.8783180117607117, 0.8957652449607849, 0.886..."
3,365,0.362694,0.294331,0.005181,0.710469,0.9312,"(0.8656865954399109, 0.9295729398727417, 0.896...",0.056995,0.059593,0.005181,0.928273,0.987054,"(0.9405235052108765, 0.9671791791915894, 0.953...",0.103627,0.057413,0.010363,0.836883,0.978935,"(0.9326664805412292, 0.9558390378952026, 0.944..."
4,478,1.12782,1.413115,0.0,0.448918,0.62955,"(0.6963430047035217, 0.7169560790061951, 0.706...",0.007519,0.001093,0.992481,0.980877,0.995194,"(0.9955625534057617, 0.9964922666549683, 0.996...",0.015038,0.00765,0.984962,0.963638,0.995211,"(0.9834587574005127, 0.9842321872711182, 0.983..."


In [20]:
df.describe()

Unnamed: 0,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,final_symspell_wer,final_symspell_cer,final_symspell_pmr,final_symspell_bleu,final_symspell_cosine,final_llm_wer,final_llm_cer,final_llm_pmr,final_llm_bleu,final_llm_cosine
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.432745,0.319161,0.172997,0.796237,0.883098,0.172637,0.121424,0.391989,0.804493,0.863786,0.187251,0.130597,0.300028,0.802165,0.881203
std,2.128334,1.668447,0.28395,0.26613,0.263253,0.30558,0.247709,0.403037,0.305491,0.302619,0.30925,0.243745,0.36767,0.294379,0.29811
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.042914,0.023549,0.007589,0.800474,0.954217,0.019637,0.003315,0.01591,0.841662,0.950848,0.020367,0.004999,0.010655,0.829099,0.978509
50%,0.078184,0.041788,0.017484,0.895241,0.982927,0.046567,0.016357,0.229559,0.927495,0.987325,0.063949,0.025256,0.085221,0.912719,0.991215
75%,0.21019,0.163096,0.241223,0.94625,0.992754,0.123326,0.071141,0.872601,0.960054,0.995291,0.121799,0.071116,0.503433,0.960907,0.997171
max,21.230769,16.631868,1.0,1.0,1.0,1.0,0.990772,1.0,1.0,1.0,1.038462,0.98573,1.0,1.0,1.0


In [22]:
df.to_csv('final_with_indobert_result.csv',index=False)