In [1]:
from jiwer import wer, cer
import pandas as pd
import re
from tqdm import tqdm
from bert_score import score
import torch
from transformers import AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import textdistance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def pmr(gt, pred):
    gt_words = gt.split()
    pred_words = pred.split()
    length = min(len(gt_words), len(pred_words))
    matches = sum(1 for i in range(length) if gt_words[i] == pred_words[i])
    
    if(length == 0): 
        print('length 0')
        return 0
    return matches / length 

def compute_jaro_winkler(reference, prediction):
    return textdistance.jaro_winkler(reference, prediction)

In [3]:
def compute_bleu(reference, prediction):
    ref_tokens = [reference.split()]
    pred_tokens = prediction.split()
    smoothie = SmoothingFunction().method4
    return sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)

def compute_cosine_similarity(reference, prediction):
    vectorizer = TfidfVectorizer().fit([reference, prediction])
    vectors = vectorizer.transform([reference, prediction])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

In [4]:
tokenizer_id = "indobenchmark/indobert-large-p1"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

def truncate_to_512(text):
    tokens = tokenizer.encode(text, add_special_tokens=True)
    if len(tokens) > 512:
        tokens = tokens[:511] + [tokenizer.sep_token_id]  
    return tokenizer.decode(tokens, skip_special_tokens=True)

def compute_indobert_large_score(refs, preds, batch_size=8):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    refs = [truncate_to_512(r) for r in refs]
    preds = [truncate_to_512(p) for p in preds]

    P, R, F1 = score(
        preds,
        refs,
        model_type=tokenizer_id,
        num_layers=24,
        lang="id",
        device=device,
        batch_size=batch_size,
        verbose=False,
    )
    return float(P.mean()), float(R.mean()), float(F1.mean())

In [5]:
LLM_DIR = '../../text-processing/LLM-test/LLM_res'
SYMSPELL_DIR = '../../text-processing/algorithm/symspell_res'
GT_DIR = '../../data/raw/ground_truth'
BASELINE_DIR = '../../data/raw/ocr_result'

In [6]:
with open('../eval_list.txt', 'r') as file:
    content = file.read()

test_files = []
for file in content.split('\n'):
    test_files.append(file.split('.')[0])
len(test_files)

100

In [7]:
baseline_cer, baseline_wer, baseline_pmr, baseline_bleu, baseline_cosine, baseline_indobert, baseline_jw  = [], [], [], [], [], [], []
llm_cer, llm_wer, llm_pmr, llm_bleu, llm_cosine, llm_indobert, llm_jw  = [], [], [], [], [], [], []
symspell_cer, symspell_wer, symspell_pmr, symspell_bleu, symspell_cosine, symspell_indobert, symspell_jw  = [], [], [], [], [], [], []

In [8]:
def read_file(path):
    try:
        return open(path, 'r', encoding='utf-8').read()
    except UnicodeDecodeError:
        return open(path, 'r', encoding='utf-8', errors='ignore').read()

def clean_text(x):
    return re.sub(r"\s+", " ", x.replace("\n", " ")).strip().lower()

In [9]:
for filename in tqdm(test_files):
    baseline = clean_text(read_file(f'{BASELINE_DIR}/ocr_{filename}.txt'))
    gt = clean_text(read_file(f'{GT_DIR}/gt_{filename}.txt'))
    llm = clean_text(read_file(f'{LLM_DIR}/res_{filename}.txt'))
    symspell = clean_text(read_file(f'{SYMSPELL_DIR}/res_{filename}.txt'))

    if(len(gt) == 0): print(filename)

    baseline_wer.append(wer(gt, baseline))
    baseline_cer.append(cer(gt, baseline))
    baseline_pmr.append(pmr(gt, baseline))
    baseline_bleu.append(compute_bleu(gt, baseline))
    baseline_cosine.append(compute_cosine_similarity(gt, baseline))
    baseline_indobert.append(compute_indobert_large_score([gt],[baseline]))
    baseline_jw.append(compute_jaro_winkler(gt, baseline))

    llm_wer.append(wer(gt, llm))
    llm_cer.append(cer(gt, llm))
    llm_pmr.append(pmr(gt, llm))
    llm_bleu.append(compute_bleu(gt, llm))
    llm_cosine.append(compute_cosine_similarity(gt, llm))
    llm_indobert.append(compute_indobert_large_score([gt],[llm]))
    llm_jw.append(compute_jaro_winkler(gt, llm))

    symspell_wer.append(wer(gt, symspell))
    symspell_cer.append(cer(gt, symspell))
    symspell_pmr.append(pmr(gt, symspell))
    symspell_bleu.append(compute_bleu(gt, symspell))
    symspell_cosine.append(compute_cosine_similarity(gt, symspell))
    symspell_indobert.append(compute_indobert_large_score([gt],[symspell]))
    symspell_jw.append(compute_jaro_winkler(gt, symspell))
    

100%|██████████| 100/100 [37:05<00:00, 22.25s/it]


In [10]:
data = {
  "name": test_files,
  "baseline_wer": baseline_wer,
  "baseline_cer" : baseline_cer,
  "baseline_pmr" : baseline_pmr,
  "baseline_bleu": baseline_bleu,
  "baseline_cosine" : baseline_cosine,
  "baseline_indobert" : baseline_indobert,
  "baseline_jw" : baseline_jw,

  "llm_wer": llm_wer,
  "llm_cer" : llm_cer,
  "llm_pmr" : llm_pmr,
  "llm_bleu": llm_bleu,
  "llm_cosine" : llm_cosine,
  "llm_indobert" : llm_indobert,
  "llm_jw" : llm_jw,

  "symspell_wer": symspell_wer,
  "symspell_cer" : symspell_cer,
  "symspell_pmr" : symspell_pmr,
  "symspell_bleu": symspell_bleu,
  "symspell_cosine" : symspell_cosine,
  "symspell_indobert" : symspell_indobert,
  "symspell_jw" : symspell_jw,
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,baseline_indobert,baseline_jw,llm_wer,llm_cer,...,llm_cosine,llm_indobert,llm_jw,symspell_wer,symspell_cer,symspell_pmr,symspell_bleu,symspell_cosine,symspell_indobert,symspell_jw
0,522,0.16129,0.113295,0.290323,0.838709,0.928364,"(0.9131332635879517, 0.9442889094352722, 0.928...",0.920078,0.201613,0.139884,...,0.933411,"(0.9452977180480957, 0.9663629531860352, 0.955...",0.920401,0.169355,0.116763,0.290323,0.819364,0.923033,"(0.9092438817024231, 0.9430822134017944, 0.925...",0.920454
1,479,21.230769,16.631868,0.0,0.001806,0.012009,"(0.3306380808353424, 0.42230814695358276, 0.37...",0.513429,17.692308,13.78022,...,0.013164,"(0.3175305128097534, 0.42699506878852844, 0.36...",0.515238,21.230769,16.538462,0.0,0.001806,0.012002,"(0.3305588960647583, 0.4300723969936371, 0.373...",0.512467
2,528,1.166667,0.885895,0.362745,0.407504,0.709366,"(0.5526434183120728, 0.788463294506073, 0.6498...",0.841161,0.166667,0.156894,...,0.947693,"(0.9757590293884277, 0.9448020458221436, 0.960...",0.888836,1.166667,0.881141,0.362745,0.407504,0.709366,"(0.552596926689148, 0.791944682598114, 0.65096...",0.842073
3,365,0.362694,0.294331,0.005181,0.710469,0.9312,"(0.8656865954399109, 0.9295729398727417, 0.896...",0.771601,0.73057,0.361919,...,0.932442,"(0.7799885272979736, 0.8684403300285339, 0.821...",0.755412,0.362694,0.294331,0.005181,0.710469,0.9312,"(0.8636263608932495, 0.9272060394287109, 0.894...",0.77257
4,478,1.12782,1.413115,0.0,0.448918,0.62955,"(0.6963430047035217, 0.7169560790061951, 0.706...",0.642682,1.315789,1.540984,...,0.624073,"(0.7156298160552979, 0.7455645799636841, 0.730...",0.643751,1.12782,1.414208,0.0,0.448918,0.62872,"(0.6963430047035217, 0.7169560790061951, 0.706...",0.64262


In [11]:
df.describe()

Unnamed: 0,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,baseline_jw,llm_wer,llm_cer,llm_pmr,llm_bleu,llm_cosine,llm_jw,symspell_wer,symspell_cer,symspell_pmr,symspell_bleu,symspell_cosine,symspell_jw
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.432745,0.319161,0.172997,0.796237,0.883098,0.846417,0.378587,0.275368,0.193985,0.804651,0.894556,0.846949,0.437868,0.319356,0.172238,0.784495,0.874625,0.845028
std,2.128334,1.668447,0.28395,0.26613,0.263253,0.091307,1.77902,1.387446,0.314907,0.254102,0.256943,0.085407,2.127372,1.659033,0.282862,0.260687,0.260587,0.08957
min,0.0,0.0,0.0,0.0,0.0,0.513429,0.007168,0.00094,0.0,0.0,0.0,0.515238,0.003448,0.000449,0.0,0.0,0.0,0.512467
25%,0.042914,0.023549,0.007589,0.800474,0.954217,0.811311,0.040914,0.023354,0.00682,0.814565,0.954387,0.818335,0.049335,0.024243,0.007589,0.775863,0.934588,0.812607
50%,0.078184,0.041788,0.017484,0.895241,0.982927,0.838156,0.076715,0.042482,0.015235,0.89391,0.983639,0.838453,0.085179,0.044333,0.017484,0.881792,0.974648,0.837772
75%,0.21019,0.163096,0.241223,0.94625,0.992754,0.91611,0.191581,0.139656,0.299633,0.947853,0.993369,0.908461,0.216763,0.163512,0.236121,0.930848,0.98898,0.914459
max,21.230769,16.631868,1.0,1.0,1.0,1.0,17.692308,13.78022,1.0,0.987408,0.999396,0.997658,21.230769,16.538462,1.0,0.991312,0.998967,0.997517


In [12]:
df[['baseline_indobert_P', 
    'baseline_indobert_R', 
    'baseline_indobert_F1']] = pd.DataFrame(df['baseline_indobert'].tolist(), index=df.index)

df[['llm_indobert_P', 
    'llm_indobert_R', 
    'llm_indobert_F1']] = pd.DataFrame(df['llm_indobert'].tolist(), index=df.index)

df[['symspell_indobert_P', 
    'symspell_indobert_R', 
    'symspell_indobert_F1']] = pd.DataFrame(df['symspell_indobert'].tolist(), index=df.index)

df.head()

Unnamed: 0,name,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,baseline_indobert,baseline_jw,llm_wer,llm_cer,...,symspell_jw,baseline_indobert_P,baseline_indobert_R,baseline_indobert_F1,llm_indobert_P,llm_indobert_R,llm_indobert_F1,symspell_indobert_P,symspell_indobert_R,symspell_indobert_F1
0,522,0.16129,0.113295,0.290323,0.838709,0.928364,"(0.9131332635879517, 0.9442889094352722, 0.928...",0.920078,0.201613,0.139884,...,0.920454,0.913133,0.944289,0.92845,0.945298,0.966363,0.955714,0.909244,0.943082,0.925854
1,479,21.230769,16.631868,0.0,0.001806,0.012009,"(0.3306380808353424, 0.42230814695358276, 0.37...",0.513429,17.692308,13.78022,...,0.512467,0.330638,0.422308,0.370893,0.317531,0.426995,0.364216,0.330559,0.430072,0.373806
2,528,1.166667,0.885895,0.362745,0.407504,0.709366,"(0.5526434183120728, 0.788463294506073, 0.6498...",0.841161,0.166667,0.156894,...,0.842073,0.552643,0.788463,0.64982,0.975759,0.944802,0.960031,0.552597,0.791945,0.650967
3,365,0.362694,0.294331,0.005181,0.710469,0.9312,"(0.8656865954399109, 0.9295729398727417, 0.896...",0.771601,0.73057,0.361919,...,0.77257,0.865687,0.929573,0.896493,0.779989,0.86844,0.821841,0.863626,0.927206,0.894288
4,478,1.12782,1.413115,0.0,0.448918,0.62955,"(0.6963430047035217, 0.7169560790061951, 0.706...",0.642682,1.315789,1.540984,...,0.64262,0.696343,0.716956,0.706499,0.71563,0.745565,0.730291,0.696343,0.716956,0.706499


In [14]:
df.to_csv('error_correction_result.csv',index=False)