In [1]:
import os
import re
from rapidfuzz import distance
import pytesseract
from txt_accuracy import *
from pathlib import Path

# Project root, taken as two directory levels above the current working
# directory. NOTE(review): this presumably expects the notebook to be launched
# from a folder two levels below the repository root — confirm before running
# from elsewhere, since every data/results path below is built from root_dir.
root_dir = Path.cwd().parent.parent

# Example: perform OCR using pytesseract

In [None]:
# Load the ground-truth transcription for page 100. The context manager closes
# the file handle as soon as the text has been read (it was previously left open).
with open(root_dir / 'data' / 'ground-truth' / 'txt' / 'gt_kbaa-p100.txt') as ground_truth:
    ground_truth_text = ground_truth.read()
ground_truth_text_clean = clean_text_normalized(ground_truth_text, index_numbers=False)

# OCR the matching page image. image_to_string() already returns the recognized
# text as a str, so no further call is chained onto it: a trailing
# .format('tiff') would run str.format() on the OCR text itself and raise (or
# silently rewrite the text) whenever the page contains '{' or '}'.
# The image path is anchored on root_dir, like the ground-truth path, instead of
# a separate '../../' string; str() is used because pytesseract takes a str
# path or an image object.
ocr_text = pytesseract.image_to_string(str(root_dir / 'data' / 'tiffs' / 'kbaa-p100.tif'))
ocr_text_clean = clean_text_normalized(ocr_text, index_numbers=False)

In [13]:
# Show the raw pytesseract output; print() is used so the line breaks in the
# text are rendered rather than shown as escaped '\n' in a string repr.
print(ocr_text)

ENTRIES 1920-1938 100

Field, Joseph E. Three years in [1920]
Texas...Boston: Abel Tompkins,
1836. 47 p. Auto., p. 41-47. DLC.

His experiences in the War with
Mexico,

Field, Richard, b. 1843. Richard [1921]
Field, Lexington, Mo.: 1930. 38 p.
MoK. Missouri lawyer and judge
tells also of his youth on a farm.

Field, Stephen Johnson, 1816—1899,[1922 ]
Personal reminiscences of early
days in California, with other
sketches. San Francisco? 1880.

248 p. WHi. By a judge and mem-
ber of the state legislature.

Fifer, Joseph Wilson, b. 1840.
“Private Joe” Fifer. Memories
of war & peace. Bloomington, I11,:
Pantagraph pr. co., 1936. 104 p.

WHi. Civil War soldier and
governor of Illinois.

Finch, Edwin Ward, b. 1831. The [1924]
frontier, army and professional
life of Edwin W. Finch...N.Y.:
Press of Simmonds, Manning &
Dawson, 1909. 119 p. Auto., p. 1-64.
DLC. A New York physician tells
of his boyhood on a Michigan
homestead and of his service
with the Union forces as a surgeon.

Finck, Henry T

# Example: Compare ground-truth and LLM cleaned text

In [None]:
# Read the ground-truth transcription for page 101 and derive both cleaned
# variants. The with-block closes the file handle once the text is read
# (it was previously left open).
with open(root_dir / 'data' / 'ground-truth' / 'txt' / 'gt_kbaa-p101.txt') as ground_truth:
    ground_truth_text = ground_truth.read()
ground_truth_text_clean = clean_text_normalized(ground_truth_text)
ground_truth_text_clean_nonorm = clean_text_nonorm(ground_truth_text)

# Read the stored LLM transcription of the same page (gemini-2.5-flash, no
# thinking) and clean it with the same two functions so the pairs are comparable.
with open(
    root_dir / 'results' / 'llm-img2txt' / 'gemini-2.5-flash-no-thinking'
    / 'gemini-2.5-flash-no-thinking_img_kbaa-p101.txt'
) as llm:
    llm_text = llm.read()
llm_text_clean = clean_text_normalized(llm_text)
llm_text_clean_nonorm = clean_text_nonorm(llm_text)

In [3]:
# Print each text under its banner, in order: both originals first, then both
# cleaned (normalized) versions.
for _heading, _body in (
    ('========== Ground truth original ==========', ground_truth_text),
    ('========== LLM original ==========', llm_text),
    ('========== Ground truth cleaned ==========', ground_truth_text_clean),
    ('========== LLM cleaned ==========', llm_text_clean),
):
    print(_heading)
    print(_body)

Adams, John Quincy-Akeley 5 ENTRIES 39-57

1850-56. 10 vols. Auto., vol. 2, p. 503-517; vol. 3, p. 3-88. WHi. This account closes with 1776, and stresses his accomplishments in the Congress.
Adams, John Quincy, b. 1845. Narrative...Harrisburg: Sieg, printer, 1872. 64 p. MH. Slave in Virginia; free laborer in Pa. [39]
Adams, John Quincy, 1849-1940. An old boy remembers. Boston: Ruth Hill, 1935. 125 p. NN. The writer, a Presbyterian minister, devotes the greater portion of his book to his early life in western New York state and to his schooling including his years at the University of Rochester and the Auburn Theological Seminary. [40]
Adams, Juliette (Graves), b. 1858. Chapters from a musical life. A short autobiographical narrative, by Mrs. Crosby Adams. Chicago: C. Adams 1903. 138 p. DLC. Music teacher in New York and Chicago. [41]
Adams, Mary Still, b. 1839. Autobiography...Los Angeles: Buckingham bros., printers, 1893. 288 p. CLSU. Kansas school-teacher marries clergyman and devote

In [8]:
# Length of the cleaned, normalized ground-truth text (presumably a character
# count, assuming clean_text_normalized returns a str — confirm in txt_accuracy).
len(ground_truth_text_clean)

3304

# Example: compute metrics

In [13]:
# Score the LLM transcription against the ground truth. The raw (uncleaned)
# texts are passed in; the returned metrics are displayed as the cell output.
compute_metrics(
    ground_truth_text,
    llm_text,
    doc_format='txt',
    normalized=True,
    index_numbers=True,
)

{'dist_char': 27,
 'cer': 0.007696693272519955,
 'dist_word': 27,
 'wer': 0.0430622009569378,
 'token_sort_ratio': 99.50135346915515}

# Example: build a dataframe of results (using synthetic data)

In [12]:
# Synthetic inputs for build_dataframe: two documents plus the '__ALL__'
# aggregate entry, all carrying the same made-up metrics for a single model.
doc_names = ['a', 'b']
results_data = {
    'pytesseract': {
        # The dict literal is evaluated per key, so each entry gets its own dict.
        entry: {
            'dist_char': 1,
            'cer': 0.01,
            'wer': 0.1,
        }
        for entry in (*doc_names, '__ALL__')
    },
}
doc_lengths = dict.fromkeys(doc_names, 100)
total_doc_len = 200
build_dataframe(
    'title',
    doc_names,
    results_data,
    doc_lengths,
    total_doc_len,
)

Unnamed: 0,a:dist_char,a:doc_len,a:cer_pct,a:wer_pct,b:dist_char,b:doc_len,b:cer_pct,b:wer_pct,__ALL__:dist_char,__ALL__:doc_len,__ALL__:cer_pct,__ALL__:wer_pct
pytesseract,1,100,1.0,10.0,1,100,1.0,10.0,1,200,1.0,10.0


# Example: run txt_accuracy's main function

In [15]:
# Run the benchmarking entry point made available by `from txt_accuracy import *`
# in the first cell; its progress is reported through the log lines shown in
# this cell's output.
main()

2025-06-11 17:22:27 [INFO] Script directory: /Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/src/benchmarking
2025-06-11 17:22:27 [INFO] Project root: /Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking
2025-06-11 17:22:27 [INFO] Found ground-truth txt files: ['/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/txt/kbaa-p#100.txt']
2025-06-11 17:22:27 [INFO] Found file names: ['kbaa-p#100']
2025-06-11 17:22:27 [INFO] Models found: [('ocr_img2txt', 'pytesseract')]
2025-06-11 17:22:27 [INFO] Collecting results for model: pytesseract
2025-06-11 17:22:27 [INFO] Collected results for model: ['kbaa-p#100', '__ALL__']
2025-06-11 17:22:27 [INFO] Computing metrics for model: pytesseract
2025-06-11 17:22:27 [INFO] Computing metrics for document: kbaa-p#100
