In [1]:
from jiwer import wer, cer
import pandas as pd
import re
from tqdm import tqdm
from bert_score import score
import torch
from transformers import AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def pmr(gt, pred):
    gt_words = gt.split()
    pred_words = pred.split()
    length = min(len(gt_words), len(pred_words))
    matches = sum(1 for i in range(length) if gt_words[i] == pred_words[i])
    
    if(length == 0): 
        print('length 0')
        return 0
    return matches / length 

In [3]:
def compute_bleu(reference, prediction):
    ref_tokens = [reference.split()]
    pred_tokens = prediction.split()
    smoothie = SmoothingFunction().method4
    return sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)

def compute_meteor(reference, prediction):
    ref_tokens = reference.split()
    pred_tokens = prediction.split()
    return meteor_score([ref_tokens], pred_tokens)

def compute_cosine_similarity(reference, prediction):
    vectorizer = TfidfVectorizer().fit([reference, prediction])
    vectors = vectorizer.transform([reference, prediction])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

In [4]:
tokenizer_id = "indobenchmark/indobert-large-p1"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

def truncate_to_512(text):
    tokens = tokenizer.encode(text, add_special_tokens=True)
    if len(tokens) > 512:
        tokens = tokens[:511] + [tokenizer.sep_token_id]  
    return tokenizer.decode(tokens, skip_special_tokens=True)

def compute_indobert_large_score(refs, preds, batch_size=8):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    refs = [truncate_to_512(r) for r in refs]
    preds = [truncate_to_512(p) for p in preds]

    P, R, F1 = score(
        preds,
        refs,
        model_type=tokenizer_id,
        num_layers=24,
        lang="id",
        device=device,
        batch_size=batch_size,
        verbose=False,
    )
    return float(P.mean()), float(R.mean()), float(F1.mean())

In [5]:
FINAL_SYMSPELL_DIR = '../text-processing/algorithm/final_symspell_res'
FINAL_LLM_DIR = '../text-processing/LLM-test/final_LLM_res'
GT_DIR = '../data/raw/ground_truth'
BASELINE_DIR = '../data/raw/ocr_result'

In [6]:
with open('eval_list.txt', 'r') as file:
    content = file.read()

test_files = []
for file in content.split('\n'):
    test_files.append(file.split('.')[0])
len(test_files)

100

In [7]:
baseline_cer, baseline_wer, baseline_pmr, baseline_bleu, baseline_cosine, baseline_indobert  = [], [], [], [], [], []
final_symspell_cer, final_symspell_wer, final_symspell_pmr, final_symspell_bleu, final_symspell_cosine, final_symspell_indobert  = [], [], [], [], [], []
final_llm_cer, final_llm_wer, final_llm_pmr, final_llm_bleu, final_llm_cosine, final_llm_indobert  = [], [], [], [], [], []

In [8]:
def read_file(path):
    try:
        return open(path, 'r', encoding='utf-8').read()
    except UnicodeDecodeError:
        return open(path, 'r', encoding='utf-8', errors='ignore').read()

In [9]:
for filename in tqdm(test_files):
    baseline = read_file(f'{BASELINE_DIR}/ocr_{filename}.txt')
    gt = read_file(f'{GT_DIR}/gt_{filename}.txt')
    final_symspell_str = read_file(f'{FINAL_SYMSPELL_DIR}/res_{filename}.txt')
    final_llm_str = read_file(f'{FINAL_LLM_DIR}/res_{filename}.txt')

    baseline = re.sub(r"\s+", " ", baseline.replace("\n", " ")).strip().lower()
    gt = re.sub(r"\s+", " ", gt.replace("\n", " ")).strip().lower()
    final_symspell_str = re.sub(r"\s+", " ", final_symspell_str.replace("\n", " ")).strip().lower()
    final_llm_str = re.sub(r"\s+", " ", final_llm_str.replace("\n", " ")).strip().lower()

    if(len(gt) == 0): print(filename)

    baseline_wer.append(wer(gt, baseline))
    baseline_cer.append(cer(gt, baseline))
    baseline_pmr.append(pmr(gt, baseline))
    baseline_bleu.append(compute_bleu(gt, baseline))
    baseline_cosine.append(compute_cosine_similarity(gt, baseline))
    baseline_indobert.append(compute_indobert_large_score([gt],[baseline]))

    final_symspell_wer.append(wer(gt, final_symspell_str))
    final_symspell_cer.append(cer(gt, final_symspell_str))
    final_symspell_pmr.append(pmr(gt, final_symspell_str))
    final_symspell_bleu.append(compute_bleu(gt, final_symspell_str))
    final_symspell_cosine.append(compute_cosine_similarity(gt, final_symspell_str))
    final_symspell_indobert.append(compute_indobert_large_score([gt], [final_symspell_str]))

    final_llm_wer.append(wer(gt, final_llm_str))
    final_llm_cer.append(cer(gt, final_llm_str))
    final_llm_pmr.append(pmr(gt, final_llm_str))
    final_llm_bleu.append(compute_bleu(gt, final_llm_str))
    final_llm_cosine.append(compute_cosine_similarity(gt, final_llm_str))
    final_llm_indobert.append(compute_indobert_large_score([gt], [final_llm_str]))
    

100%|██████████| 100/100 [2:53:30<00:00, 104.10s/it]   


In [10]:
data = {
  "name": test_files,
  "baseline_wer": baseline_wer,
  "baseline_cer" : baseline_cer,
  "baseline_pmr" : baseline_pmr,
  "baseline_bleu": baseline_bleu,
  "baseline_cosine" : baseline_cosine,
  "baseline_indobert" : baseline_indobert,

  "final_symspell_wer": final_symspell_wer,
  "final_symspell_cer" : final_symspell_cer,
  "final_symspell_pmr" : final_symspell_pmr,
  "final_symspell_bleu": final_symspell_bleu,
  "final_symspell_cosine" : final_symspell_cosine,
  "final_symspell_indobert" : final_symspell_indobert,

  "final_llm_wer": final_llm_wer,
  "final_llm_cer" : final_llm_cer,
  "final_llm_pmr" : final_llm_pmr,
  "final_llm_bleu": final_llm_bleu,
  "final_llm_cosine" : final_llm_cosine,
  "final_llm_indobert" : final_llm_indobert
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,baseline_indobert,final_symspell_wer,final_symspell_cer,final_symspell_pmr,final_symspell_bleu,final_symspell_cosine,final_symspell_indobert,final_llm_wer,final_llm_cer,final_llm_pmr,final_llm_bleu,final_llm_cosine,final_llm_indobert
0,522,0.16129,0.113295,0.290323,0.838709,0.928364,"(0.9131332635879517, 0.9442889094352722, 0.928...",0.064516,0.033526,0.297521,0.899011,0.972525,"(0.9586562514305115, 0.9712709188461304, 0.964...",0.064516,0.031214,0.303279,0.89907,0.982733,"(0.9803768992424011, 0.9753679037094116, 0.977..."
1,479,21.230769,16.631868,0.0,0.001806,0.012009,"(0.3306380808353424, 0.42230814695358276, 0.37...",1.0,0.802198,0.0,0.0,0.0,"(0.3634084463119507, 0.37184950709342957, 0.36...",1.0,0.813187,0.0,0.0,0.0,"(0.36423352360725403, 0.3655880391597748, 0.36..."
2,528,1.166667,0.885895,0.362745,0.407504,0.709366,"(0.5526434183120728, 0.788463294506073, 0.6498...",0.127451,0.123613,0.677083,0.839401,0.938608,"(0.9472967386245728, 0.9506863355636597, 0.948...",0.117647,0.122029,0.6875,0.84935,0.938608,"(0.9514400362968445, 0.9529650807380676, 0.952..."
3,365,0.362694,0.294331,0.005181,0.710469,0.9312,"(0.8656865954399109, 0.9295729398727417, 0.896...",0.056995,0.059593,0.005181,0.928273,0.987054,"(0.9405235052108765, 0.9671791791915894, 0.953...",0.108808,0.06468,0.010363,0.827419,0.954383,"(0.9351310133934021, 0.9569826126098633, 0.945..."
4,478,1.12782,1.413115,0.0,0.448918,0.62955,"(0.6963430047035217, 0.7169560790061951, 0.706...",0.007519,0.001093,0.992481,0.980877,0.995194,"(0.9955625534057617, 0.9964922666549683, 0.996...",0.015038,0.004372,0.984962,0.961674,0.974829,"(0.9950908422470093, 0.9946269989013672, 0.994..."


In [11]:
df.describe()

Unnamed: 0,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,final_symspell_wer,final_symspell_cer,final_symspell_pmr,final_symspell_bleu,final_symspell_cosine,final_llm_wer,final_llm_cer,final_llm_pmr,final_llm_bleu,final_llm_cosine
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.432745,0.319161,0.172997,0.796237,0.883098,0.172637,0.121424,0.391989,0.804493,0.863786,0.168569,0.120471,0.398404,0.813179,0.876732
std,2.128334,1.668447,0.28395,0.26613,0.263253,0.30558,0.247709,0.403037,0.305491,0.302619,0.304101,0.247899,0.40343,0.305572,0.301925
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.042914,0.023549,0.007589,0.800474,0.954217,0.019637,0.003315,0.01591,0.841662,0.950848,0.016207,0.002633,0.016303,0.861124,0.974723
50%,0.078184,0.041788,0.017484,0.895241,0.982927,0.046567,0.016357,0.229559,0.927495,0.987325,0.047643,0.016587,0.222458,0.931398,0.990916
75%,0.21019,0.163096,0.241223,0.94625,0.992754,0.123326,0.071141,0.872601,0.960054,0.995291,0.110192,0.071016,0.856043,0.967522,0.997037
max,21.230769,16.631868,1.0,1.0,1.0,1.0,0.990772,1.0,1.0,1.0,1.0,0.990772,1.0,1.0,1.0


In [13]:
df[['baseline_indobert_P', 
    'baseline_indobert_R', 
    'baseline_indobert_F1']] = pd.DataFrame(df['baseline_indobert'].tolist(), index=df.index)

df[['final_symspell_indobert_P', 
    'final_symspell_indobert_R', 
    'final_symspell_indobert_F1']] = pd.DataFrame(df['final_symspell_indobert'].tolist(), index=df.index)

df[['final_llm_indobert_P', 
    'final_llm_indobert_R', 
    'final_llm_indobert_F1']] = pd.DataFrame(df['final_llm_indobert'].tolist(), index=df.index)

df.head()

Unnamed: 0,name,baseline_wer,baseline_cer,baseline_pmr,baseline_bleu,baseline_cosine,baseline_indobert,final_symspell_wer,final_symspell_cer,final_symspell_pmr,...,final_llm_indobert,baseline_indobert_P,baseline_indobert_R,baseline_indobert_F1,final_symspell_indobert_P,final_symspell_indobert_R,final_symspell_indobert_F1,final_llm_indobert_P,final_llm_indobert_R,final_llm_indobert_F1
0,522,0.16129,0.113295,0.290323,0.838709,0.928364,"(0.9131332635879517, 0.9442889094352722, 0.928...",0.064516,0.033526,0.297521,...,"(0.9803768992424011, 0.9753679037094116, 0.977...",0.913133,0.944289,0.92845,0.958656,0.971271,0.964922,0.980377,0.975368,0.977866
1,479,21.230769,16.631868,0.0,0.001806,0.012009,"(0.3306380808353424, 0.42230814695358276, 0.37...",1.0,0.802198,0.0,...,"(0.36423352360725403, 0.3655880391597748, 0.36...",0.330638,0.422308,0.370893,0.363408,0.37185,0.367581,0.364234,0.365588,0.364909
2,528,1.166667,0.885895,0.362745,0.407504,0.709366,"(0.5526434183120728, 0.788463294506073, 0.6498...",0.127451,0.123613,0.677083,...,"(0.9514400362968445, 0.9529650807380676, 0.952...",0.552643,0.788463,0.64982,0.947297,0.950686,0.948988,0.95144,0.952965,0.952202
3,365,0.362694,0.294331,0.005181,0.710469,0.9312,"(0.8656865954399109, 0.9295729398727417, 0.896...",0.056995,0.059593,0.005181,...,"(0.9351310133934021, 0.9569826126098633, 0.945...",0.865687,0.929573,0.896493,0.940524,0.967179,0.953665,0.935131,0.956983,0.945931
4,478,1.12782,1.413115,0.0,0.448918,0.62955,"(0.6963430047035217, 0.7169560790061951, 0.706...",0.007519,0.001093,0.992481,...,"(0.9950908422470093, 0.9946269989013672, 0.994...",0.696343,0.716956,0.706499,0.995563,0.996492,0.996027,0.995091,0.994627,0.994859


In [25]:
import textdistance

def compute_jaro_winkler(reference, prediction):
    return textdistance.jaro_winkler(reference, prediction)

def clean_text(x):
    import re
    return re.sub(r"\s+", " ", x.replace("\n", " ")).strip().lower()

In [26]:
baseline_jw, symspell_jw, llm_jw = [], [], []

for name in tqdm(df["name"]):
    # Load cleaned text
    gt  = clean_text(read_file(f"{GT_DIR}/gt_{name}.txt"))
    base = clean_text(read_file(f"{BASELINE_DIR}/ocr_{name}.txt"))
    sym  = clean_text(read_file(f"{FINAL_SYMSPELL_DIR}/res_{name}.txt"))
    llm  = clean_text(read_file(f"{FINAL_LLM_DIR}/res_{name}.txt"))

    baseline_jw.append(compute_jaro_winkler(gt, base))
    symspell_jw.append(compute_jaro_winkler(gt, sym))
    llm_jw.append(compute_jaro_winkler(gt, llm))


100%|██████████| 100/100 [00:00<00:00, 368.78it/s]


In [27]:
df["baseline_jw"] = baseline_jw
df["final_symspell_jw"] = symspell_jw
df["final_llm_jw"] = llm_jw

In [31]:
df.to_csv("final_with_indobert_jw.csv", index=False)

In [30]:
df.describe().to_csv('mean.csv', index=False)