# OCR-mLLM Pipeline

# 1. Run pytesseract

In [3]:
import os
from pathlib import Path
import pytesseract
from PIL import Image

DEBUG = True

# The project root is two levels above the notebook's working directory.
root_dir = Path.cwd().parent.parent

# Folder holding the page images; all images are assumed to be stored in .png format.
source_dir = root_dir / "images"
# Suffixes treated as images. Anything else in the folder (e.g. `.DS_Store`)
# is ignored instead of being handed to PIL, which cannot open it.
IMAGE_SUFFIXES = {".png"}

# Output folder for all transcriptions; created if it doesn't exist.
target_dir = root_dir / "results"
target_dir.mkdir(parents=True, exist_ok=True)

# Full paths of every image to process. `iterdir()` yields entries in
# arbitrary order, so sort to make repeated runs process pages identically.
img_filepaths = sorted(
    path
    for path in source_dir.iterdir()
    if path.is_file() and path.suffix.lower() in IMAGE_SUFFIXES
)

In [4]:
# OCR every image in `img_filepaths` with Tesseract and write the raw text to
# `results/ocr_img2txt/<image stem>.txt`.
ocr_output_dir = target_dir / "ocr_img2txt"
# Only `results/` itself has been created so far; without this the first
# write fails with FileNotFoundError on a fresh checkout.
ocr_output_dir.mkdir(parents=True, exist_ok=True)

for path in img_filepaths:
    # Run OCR before touching the output file so a failed OCR does not leave
    # an empty .txt behind, and close the image handle as soon as we are done.
    with Image.open(path) as image:
        ocr_text = pytesseract.image_to_string(image)  # TODO: Change config as needed

    output_path = ocr_output_dir / f"{path.stem}.txt"
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    output_path.write_text(ocr_text, encoding="utf-8")

# 2. Send to OpenAI

## (i) Prepare the prompt

In [5]:
# Prompt for the OCR+LLM pipeline. `{input}` is filled with the raw Tesseract
# text of the page further down.
prompt_template_ocr_llm = """
You are a text correction assistant. Your task is to clean up and correct errors from raw OCR output.
The text may contain misrecognized characters, broken words, or incorrect formatting.
Carefully read the provided OCR output and produce a corrected version that is grammatically accurate 
and as faithful to the original content as possible. Because this is a historical document, try to 
preserve archaic spelling or formatting where clearly intended. Only correct obvious OCR errors.
Put the dates associated with each entry at the end of the line.

Input (Raw OCR Text):
{input}
"""

# Raw Tesseract output for the page sent to the model. The path is built from
# `target_dir` rather than hard-coded to one user's home directory, and uses
# the name the OCR cell writes for `kbaa-p096.png` (`<image stem>.txt`).
# NOTE(review): the earlier hard-coded path read "kbaa-p 096.txt" (with a
# space); confirm no separate file of that name was intended.
raw_ocr_path = target_dir / "ocr_img2txt" / "kbaa-p096.txt"
# Named `raw_ocr_text` rather than `input` so the builtin `input()` is not shadowed.
raw_ocr_text = raw_ocr_path.read_text(encoding="utf-8")

prompt_ocr_llm = prompt_template_ocr_llm.format(input=raw_ocr_text).strip()

# Prompt for the LLM-only pipeline: the model transcribes the image directly.
prompt_llm = """
You are an expert historian. Your task is to transcribe the provided image into text. The image
is a 20th century bibliographic entry. Because this is a historical document, try to preserve 
archaic spelling or formatting where clearly intended. Put the dates associated with each entry at the end of the line.
Return the text only, nothing else.
"""

# Alternative minimal prompt, kept for reference:
# prompt = """
# From the provided image, give me the first word and nothing else
# """

## (ii) API Call

### a. OCR-LLM call

In [6]:
from openai import OpenAI
import base64
from dotenv import load_dotenv

# Load variables from a local `.env` file (if any) into the process
# environment, then look up the OpenAI key. The key is None when it is unset.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

def encode_image(image_path):
    """Return the contents of the file at `image_path` as a base64 text string.

    The file is read in binary mode, so any file type works; the result is
    what gets embedded in the `data:image/png;base64,...` URL sent to the API.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')
    
# OCR+LLM run: send the page image together with the prompt that embeds the
# raw Tesseract text, and save the corrected transcription.
ocr_llm_model = "gpt-4o"  # used for the request and as the output folder name
base64_image = encode_image(source_dir / "kbaa-p096.png")

# Create the output folder *before* the (paid) API call: nothing else in the
# notebook creates it, and failing on the write would throw the response away.
ocr_llm_output_dir = target_dir / "ocr_llm_img2txt" / ocr_llm_model
ocr_llm_output_dir.mkdir(parents=True, exist_ok=True)

client = OpenAI(api_key=openai_api_key)
response = client.chat.completions.create(
    model=ocr_llm_model,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt_ocr_llm
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            ]
        }
    ]
)

# `content` can be None (e.g. when the model refuses); fail with a clear
# message instead of a TypeError from writing None to the file.
ocr_llm_transcription = response.choices[0].message.content
if ocr_llm_transcription is None:
    raise RuntimeError("OpenAI returned no text content for kbaa-p096.png")

# Explicit encoding: the platform default is not UTF-8 everywhere.
(ocr_llm_output_dir / "kbaa-p096.txt").write_text(ocr_llm_transcription, encoding="utf-8")

### b. LLM call (without OCR)

In [7]:
from openai import OpenAI
import base64
from dotenv import load_dotenv

# Same setup as in the OCR-LLM cell, repeated so this cell can be run on its
# own: load a local `.env` file (if any) and look up the OpenAI key.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

def encode_image(image_path):
    """Return the contents of the file at `image_path` as a base64 text string.

    Same helper as in the OCR-LLM cell; it is defined again here so this cell
    can be run without executing that one first.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')
    
# LLM-only run: the model transcribes the page image directly, without any
# Tesseract text in the prompt.
llm_model = "gpt-4o"  # used for the request and as the output folder name
base64_image = encode_image(source_dir / "kbaa-p096.png")

# Create the output folder *before* the (paid) API call: nothing else in the
# notebook creates it, and failing on the write would throw the response away.
llm_output_dir = target_dir / "llm_img2txt" / llm_model
llm_output_dir.mkdir(parents=True, exist_ok=True)

client = OpenAI(api_key=openai_api_key)
response = client.chat.completions.create(
    model=llm_model,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt_llm
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            ]
        }
    ]
)

# `content` can be None (e.g. when the model refuses); fail with a clear
# message instead of a TypeError from writing None to the file.
llm_transcription = response.choices[0].message.content
if llm_transcription is None:
    raise RuntimeError("OpenAI returned no text content for kbaa-p096.png")

# Explicit encoding: the platform default is not UTF-8 everywhere.
(llm_output_dir / "kbaa-p096.txt").write_text(llm_transcription, encoding="utf-8")

# 3. Benchmark results

In [None]:
import sys
sys.path.append(str(Path.cwd().parent))
from benchmarking.txt_accuracy import clean_text_normalized, clean_text_nonorm, compute_metrics, build_dataframe, get_doc_names, get_all_models, get_docs
from venv import logger
from datetime import datetime

def main():
    """
    Benchmark every transcription under `results/` against the ground truth.

    Prerequisites:
    - `root_dir` (the project root) has been defined by the setup cell
    - Ground truth text files located at `project_root/ground-truth/kbaa-p#xyz.txt`
    - LLM/OCR transcribed files located at:
        - for LLM transcriptions: `project_root/results/llm_img2txt/<MODEL-NAME>/kbaa-p#xyz.txt`
        - for OCR transcriptions: `project_root/results/ocr_img2txt/<MODEL-NAME>/kbaa-p#xyz.txt`
        - for OCR+LLM transcriptions: `project_root/results/ocr_llm_img2txt/<MODEL-NAME>/kbaa-p#xyz.txt`

    The main function will:
    - Gather all ground truth text files
    - For each ground truth text file and for each model, gather the corresponding transcription
    - Clean all the text files (normalized and not normalized)
    - Compute metrics for each file and model, plus one entry for all documents
      together (stored under the `"__ALL__"` key)
    - Save results in two CSV files per model type (one for normalized, one for non-normalized)
        - Results are saved in `project_root/benchmarking-results/txt-accuracy/<MODEL-TYPE>`
    """
    import logging

    # Log through a logger of our own. The module-level `logger` imported from
    # `venv` belongs to the standard-library venv module and only works here
    # by accident.
    logger = logging.getLogger(__name__)

    # =============
    # Preliminaries
    # =============

    project_root = str(root_dir)
    logger.info("Script directory: %s", str(Path.cwd()))
    logger.info("Project root: %s", project_root)

    # Ground truth
    ground_truth_dir = root_dir / "ground-truth"
    doc_names = get_doc_names(ground_truth_dir)

    # results/ paths: one sub-folder per model type, each scanned for models.
    results_root = os.path.join(project_root, "results")
    all_models = get_all_models(
        os.path.join(results_root, "llm_img2txt"),
        os.path.join(results_root, "ocr_img2txt"),
        os.path.join(results_root, "ocr_llm_img2txt"),
    )
    logger.info("Models found: %s", all_models)

    # ============
    # Gather files
    # ============

    # -> Gather ground truths and put into dict. `get_docs` returns a pair;
    #    the second item is stored under "__ALL__" next to the per-document
    #    entries, so the length dicts below also contain an "__ALL__" key.
    ground_truths, ground_truth_all = get_docs(ground_truth_dir, doc_names)
    ground_truths["__ALL__"] = ground_truth_all

    doc_lengths_normalized = {
        doc: len(clean_text_normalized(text)) for doc, text in ground_truths.items()
    }
    doc_lengths_nonorm = {
        doc: len(clean_text_nonorm(text)) for doc, text in ground_truths.items()
    }
    total_doc_len_normalized = len(clean_text_normalized(ground_truth_all))
    total_doc_len_nonorm = len(clean_text_nonorm(ground_truth_all))

    # -> Gather each transcribed document and put into dict.
    #    Structure: results[model_type][model][doc]
    results = {}

    for model_type, model in all_models:
        logger.info("Collecting results for model: %s", model)
        model_path = os.path.join(results_root, model_type, model)
        transcriptions, transcriptions_all = get_docs(model_path, doc_names)
        transcriptions["__ALL__"] = transcriptions_all
        results.setdefault(model_type, {})[model] = transcriptions
        logger.info("Collected results for model_type: %s, model: %s", model_type, model)

    # ===============
    # Compute metrics
    # ===============

    # Structure of both dicts: data[model_type][model][doc] -> metrics
    normalized_results_data = {}
    nonorm_results_data = {}

    for model_type, model in all_models:
        normalized_metrics = normalized_results_data.setdefault(model_type, {}).setdefault(model, {})
        nonorm_metrics = nonorm_results_data.setdefault(model_type, {}).setdefault(model, {})
        transcriptions = results[model_type][model]

        logger.info("Computing metrics for model_type: %s, model: %s", model_type, model)
        for doc in doc_names:
            logger.info("Computing metrics for document: %s", doc)
            normalized_metrics[doc] = compute_metrics(
                ground_truths[doc], transcriptions[doc], normalized=True
            )
            nonorm_metrics[doc] = compute_metrics(
                ground_truths[doc], transcriptions[doc], normalized=False
            )

        # Metrics over all documents together.
        normalized_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], transcriptions["__ALL__"], normalized=True
        )
        nonorm_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], transcriptions["__ALL__"], normalized=False
        )

    # ====================
    # Put metrics in table
    # ====================

    # NOTE(review): the colons in this timestamp end up in the CSV file names,
    # which is not valid on Windows. Format left unchanged so file names stay
    # comparable with earlier runs.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    results_base_dir = root_dir / "benchmarking-results" / "txt-accuracy"

    # One results directory and one pair of CSVs per model type. Iterate over
    # the distinct model types (in first-seen order) rather than over every
    # (model_type, model) pair, which would rebuild and rewrite the same files
    # once per model.
    for model_type in normalized_results_data:
        results_dir = results_base_dir / model_type
        results_dir.mkdir(parents=True, exist_ok=True)

        normalized_df = build_dataframe(
            f"normalized_{timestamp}",
            doc_names,
            normalized_results_data[model_type],
            doc_lengths_normalized,
            total_doc_len_normalized,
        )
        nonorm_df = build_dataframe(
            f"nonorm_{timestamp}",
            doc_names,
            nonorm_results_data[model_type],
            doc_lengths_nonorm,
            total_doc_len_nonorm,
        )

        # ============
        # Save results
        # ============

        normalized_df.to_csv(results_dir / f"normalized_{timestamp}.csv")
        nonorm_df.to_csv(results_dir / f"nonorm_{timestamp}.csv")


# In a notebook `__name__` is "__main__", so running this cell runs the benchmark.
if __name__ == "__main__":
    main()

2025-06-20 10:19:01 [INFO] Script directory: /Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking/src/workflow
2025-06-20 10:19:01 [INFO] Project root: /Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking
2025-06-20 10:19:01 [INFO] Found ground-truth txt files: ['/Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking/ground-truth/kbaa-p096.txt']
2025-06-20 10:19:01 [INFO] Found file names: ['kbaa-p096']
2025-06-20 10:19:01 [INFO] Models found: [('llm_img2txt', 'gpt-4o'), ('ocr_llm_img2txt', 'gpt-4o')]
2025-06-20 10:19:01 [INFO] Collecting results for model: gpt-4o
2025-06-20 10:19:01 [INFO] Collected results for model_type: llm_img2txt, model: gpt-4o
2025-06-20 10:19:01 [INFO] Collecting results for model: gpt-4o
2025-06-20 10:19:01 [INFO] Collected results for model_type: ocr_llm_img2txt, model: gpt-4o
2025-06-20 10:19:01 [INFO] Computing metrics for model_type: llm_img2txt, model: gpt-4o
2025-06-20 10:19:01 [INFO] Computing metrics for document: kbaa-p096
2025-06-20 10:19:01 [INF

In [33]:
import sys
sys.path.append("..")
from benchmarking.txt_accuracy import clean_text_normalized

gt_dir = root_dir / "ground-truth"
ocr_dir = root_dir / "results" / "ocr_img2txt"
llm_dir = root_dir / "results" / "llm_img2txt" / "gpt-4o"
ocr_llm_dir = root_dir / "results" / "ocr_llm_img2txt" / "gpt-4o"


def list_files(directory, skip_if_stem_contains=None):
    """Return the regular files directly inside `directory`, sorted by path.

    Files whose stem contains `skip_if_stem_contains` are left out; pass None
    (the default) to keep every file.
    """
    return sorted(
        path
        for path in directory.iterdir()
        if path.is_file()
        and (skip_if_stem_contains is None or skip_if_stem_contains not in path.stem)
    )


def read_normalized(path):
    """Read the text file at `path` and return it cleaned with `clean_text_normalized`."""
    return clean_text_normalized(path.read_text(encoding="utf-8"))


def first_mismatch(candidate, reference):
    """Return the index of the first position where the two strings differ.

    If one string is a strict prefix of the other, the index is the length of
    the shorter one. Returns None when the strings are identical.
    """
    for index, (candidate_char, reference_char) in enumerate(zip(candidate, reference)):
        if candidate_char != reference_char:
            return index
    if len(candidate) != len(reference):
        return min(len(candidate), len(reference))
    return None


# NOTE(review): model outputs whose stem contains "3" are skipped, while
# ground-truth files are not filtered; the reason is not recorded - confirm.
ocr_llm_filepaths = list_files(ocr_llm_dir, skip_if_stem_contains="3")
ocr_filepaths = list_files(ocr_dir, skip_if_stem_contains="3")
llm_filepaths = list_files(llm_dir, skip_if_stem_contains="3")
gt_filepaths = list_files(gt_dir)

# Pair files by document name (stem) instead of by list position: the lists
# are filtered differently, so positions are not guaranteed to line up.
llm_by_stem = {path.stem: path for path in llm_filepaths}
gt_by_stem = {path.stem: path for path in gt_filepaths}

for ocr_llm_path in ocr_llm_filepaths:
    doc_name = ocr_llm_path.stem
    if doc_name not in llm_by_stem or doc_name not in gt_by_stem:
        print(f"Skipping {doc_name}: no matching LLM transcription or ground truth")
        continue

    ocr_llm_text = read_normalized(ocr_llm_path)
    llm_text = read_normalized(llm_by_stem[doc_name])
    gt_text = read_normalized(gt_by_stem[doc_name])

    # Inspect each document inside the loop; comparing after the loop would
    # only ever look at the last one (and fail if nothing was paired).
    print(llm_text)
    print(gt_text)

    # First character at which the OCR+LLM text departs from the ground truth.
    mismatch_index = first_mismatch(ocr_llm_text, gt_text)
    if mismatch_index is not None:
        print(f"Mismatch at index {mismatch_index}")
        # Slicing (not indexing) so a text that simply ends early is reported
        # instead of raising IndexError.
        print(f"OCR: {ocr_llm_text[mismatch_index:mismatch_index + 1] or '<end of text>'}")
        print(f"GT: {gt_text[mismatch_index:mismatch_index + 1] or '<end of text>'}")
        

evans robley dunglison 18461912 a sailors log recollections of forty years of naval life ny d appleton 1901 467 p whi naval officer who saw action in the civil war and spanishamerican war 1846 evans robley dunglison 18461912 an admirals log being continued recollections of naval life ny d appleton 1911 467 p nn the authors career from the close of the spanishamerican war until his retirement in 1909 1847 evans warren f autobiography of a shakermount lebanon ny the author 1869 162 p mbat the story of his conversion 1848 evarts john w b 1837 light of lifeoklahoma city the author 1909 485 p okhi itinerant printer and apostle of scientific religion 1849 everett syble byrd adventures with life an autobiography of a distinguished negro citizen boston meador 1945 182 p whi school teacher and director of physical education in kansas and oklahoma 1850 everton walter marion b 1876 autobiography in his everton knowles book logan 1942 p 147 usic utah teacher and merchant including his missions for