# OCR-mLLM Pipeline

Before running this code you will need to set up your OpenAI, Anthropic, and Google (Gemini) API keys. Here's how I did it:

1. Create a new file in your root directory called `.env` (the file name is exactly `.env`, with nothing before the dot)
2. Store your API keys with the following names: OPENAI_API_KEY, ANTHROPIC_API_KEY, and GOOGLE_API_KEY
3. Create a virtual environment by typing the following commands into your terminal:
    - ```python3 -m venv .venv```
    - ```source .venv/bin/activate```
    - ```pip install -r requirements.txt```
4. After running the pipeline, type ```deactivate``` in your terminal to leave the virtual environment and return to your normal shell

## 1. Setup

### a. Run this cell to ensure you have all the necessary directories

Before running the cell, make sure you have an `images` folder in your root directory; the pipeline reads its input images (in `.png` format) from that folder.

In [54]:
from pathlib import Path
import os
import pytesseract
from PIL import Image

# Project root: two levels above the current working directory
root_dir = Path.cwd().parent.parent

# Input images are expected in <root>/images, all stored in .png format
source_dir = root_dir.joinpath("images")

# Transcriptions are written to <root>/results; create the folder if it is missing
txt_output_dir = root_dir.joinpath("results")
txt_output_dir.mkdir(parents=True, exist_ok=True)

# bm_output_dir = root_dir / "benchmarking-results"/ "txt-accuracy"
# bm_output_dir.mkdir(parents=True, exist_ok=True)

# Models to run; each one gets its own sub-folder inside the results tree
# llm_array = ["gpt-4o", "gemini-2.5-flash", "claude-4-sonnet"]
llm_array = ["gpt-4o", "gemini-2.0-flash"]

def make_llm_dirs(llm_array, target_dir):
    """Create the output directory tree for the three transcription pipelines.

    The resulting layout is::

        target_dir/ocr_img2txt/
        target_dir/llm_img2txt/<llm>/
        target_dir/ocr_llm_img2txt/<llm>/

    Args:
        llm_array: Iterable of model names; each one gets its own sub-folder
            under ``llm_img2txt`` and ``ocr_llm_img2txt``.
        target_dir: Base directory (``Path`` or ``str``) under which the tree
            is created.

    Existing directories are left untouched, so the call is safe to repeat.
    """
    target_dir = Path(target_dir)

    # The OCR output folder does not depend on any model, so create it once,
    # even when no LLM is configured.
    (target_dir / "ocr_img2txt").mkdir(parents=True, exist_ok=True)

    for llm in llm_array:
        for pipeline in ("llm_img2txt", "ocr_llm_img2txt"):
            (target_dir / pipeline / llm).mkdir(parents=True, exist_ok=True)

# Build the results directory tree for every model listed in llm_array
make_llm_dirs(llm_array, txt_output_dir)
# make_llm_dirs(llm_array, bm_output_dir)

### b. Setup API keys & image encoding function

In [55]:
from openai import OpenAI
from anthropic import Anthropic
from google import genai
import base64
from dotenv import load_dotenv

# Load the variables stored in the .env file into the process environment
load_dotenv()

# Look up the OpenAI and Anthropic keys (None when a variable is not set)
openai_api_key = os.environ.get("OPENAI_API_KEY")
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")


# One API client per provider, each authenticated with its key from the environment
claude_client = Anthropic(api_key=anthropic_api_key)
gemini_client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
gpt_client = OpenAI(api_key=openai_api_key)


### c. Get image file paths

In [56]:
# Collect the full path of every .png file located directly inside the images directory
img_filepaths = [
    entry
    for entry in source_dir.iterdir()
    if entry.suffix.lower() == ".png" and entry.is_file()
]

## 2. Run pytesseract

In [57]:
# Run Tesseract on every image from the images folder and write the raw OCR text
# to results/ocr_img2txt/<image stem>.txt
for path in img_filepaths:
    ocr_out_path = txt_output_dir / "ocr_img2txt" / f"{path.stem}.txt"

    # Open the image in a context manager so its file handle is released after OCR.
    with Image.open(path) as page_image:
        ocr_text = pytesseract.image_to_string(page_image)  # TODO: Change config as needed

    # Write only after OCR succeeded (a failed OCR run no longer leaves an empty
    # file behind) and use UTF-8 explicitly so non-ASCII characters are written
    # the same way on every platform.
    ocr_out_path.write_text(ocr_text, encoding="utf-8")

## 3. Prepare the prompt

In [58]:
# Prompt template for the OCR + LLM pipeline; `{input}` is replaced with raw OCR text.
prompt_template_ocr_llm = """
You are a text correction assistant. Your task is to clean up and correct errors from raw OCR output.
The text may contain misrecognized characters, broken words, or incorrect formatting.
Carefully read the provided OCR output and produce a corrected version that is grammatically accurate 
and as faithful to the original content as possible. Because this is a historical document, try to 
preserve archaic spelling or formatting where clearly intended. Only correct obvious OCR errors.
Put the dates associated with each entry at the end of the line.

Input (Raw OCR Text):
{input}
"""

# NOTE: `prompt_ocr_llm` embeds the OCR output of one fixed page (kbaa-p096).
# The file is resolved from the project's results folder instead of a
# machine-specific absolute path, and is held in `ocr_text` rather than a
# variable named `input`, which would shadow the builtin for the whole session.
ocr_sample_path = txt_output_dir / "ocr_img2txt" / "kbaa-p096.txt"
ocr_text = ocr_sample_path.read_text(encoding="utf-8")

prompt_ocr_llm = prompt_template_ocr_llm.format(input=ocr_text).strip()

# Prompt for the LLM-only pipeline: the model transcribes the page image directly,
# so no OCR text is embedded in this prompt.
prompt_llm = """
You are an expert historian. Your task is to transcribe the provided image into text. The image
is a 20th century bibliographic entry. Because this is a historical document, try to preserve 
archaic spelling or formatting where clearly intended. Put the dates associated with each entry at the end of the line.
Return the text only, nothing else.
"""

# Minimal prompt kept for quick smoke tests (disabled):
# prompt = """
# From the provided image, give me the first word and nothing else
# """

## 4. Send to OpenAI

### a. OCR-LLM call

In [59]:
# OCR + LLM: send each page image to GPT-4o together with that page's own OCR text.
for path in img_filepaths:
    base64_image = encode_image(path)

    # Build the prompt from THIS image's Tesseract output. The shared
    # `prompt_ocr_llm` embeds the OCR text of one fixed page, so reusing it here
    # would pair every other image with the wrong OCR text.
    ocr_txt_path = txt_output_dir / "ocr_img2txt" / f"{path.stem}.txt"
    page_prompt = prompt_template_ocr_llm.format(
        input=ocr_txt_path.read_text(encoding="utf-8")
    ).strip()

    response = gpt_client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": page_prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )

    # Explicit UTF-8 so non-ASCII characters in the transcription are written
    # the same way on every platform.
    out_path = txt_output_dir / "ocr_llm_img2txt" / "gpt-4o" / f"{path.stem}.txt"
    out_path.write_text(response.choices[0].message.content, encoding="utf-8")

[file retrieval] 2025-06-23 18:04:42 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


### b. LLM call (without OCR)

In [60]:
# LLM only: GPT-4o transcribes each page image directly, with no OCR text in the prompt.
for path in img_filepaths:
    base64_image = encode_image(path)

    response = gpt_client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_llm},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )

    # Explicit UTF-8 so non-ASCII characters in the transcription are written
    # the same way on every platform.
    out_path = txt_output_dir / "llm_img2txt" / "gpt-4o" / f"{path.stem}.txt"
    out_path.write_text(response.choices[0].message.content, encoding="utf-8")

[file retrieval] 2025-06-23 18:05:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


## 5. Send to Gemini


### a. OCR-LLM call

In [61]:
# OCR + LLM: send each page image to Gemini together with that page's own OCR text.
for path in img_filepaths:
    my_file = gemini_client.files.upload(file=path)

    # Build the prompt from THIS image's Tesseract output. The shared
    # `prompt_ocr_llm` embeds the OCR text of one fixed page, so reusing it here
    # would pair every other image with the wrong OCR text.
    ocr_txt_path = txt_output_dir / "ocr_img2txt" / f"{path.stem}.txt"
    page_prompt = prompt_template_ocr_llm.format(
        input=ocr_txt_path.read_text(encoding="utf-8")
    ).strip()

    response = gemini_client.models.generate_content(
        model='gemini-2.0-flash',
        contents=[
            page_prompt,
            my_file,
        ],
    )

    # Explicit UTF-8 so non-ASCII characters in the transcription are written
    # the same way on every platform.
    out_path = txt_output_dir / "ocr_llm_img2txt" / "gemini-2.0-flash" / f"{path.stem}.txt"
    out_path.write_text(response.text, encoding="utf-8")

[file retrieval] 2025-06-23 18:05:08 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:05:08 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH89bHeN0-vA-xzs2K_EGUYmGBXEGUqZ-TzID8lIRuuIZ5uv_yAciVJ4QlyDuV6hzZ8cjQZLSwSf8qPnMUXfMtLoxSxcspz6tzUhrZBAqjyo&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:05:12 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH89bHeN0-vA-xzs2K_EGUYmGBXEGUqZ-TzID8lIRuuIZ5uv_yAciVJ4QlyDuV6hzZ8cjQZLSwSf8qPnMUXfMtLoxSxcspz6tzUhrZBAqjyo&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:05:12 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-06-23 18:05:22 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 20

### b. LLM call (without OCR)

In [66]:
# LLM only: Gemini transcribes each page image directly, with no OCR text in the prompt.
for path in img_filepaths:
    my_file = gemini_client.files.upload(file=path)

    response = gemini_client.models.generate_content(
        model='gemini-2.0-flash',
        contents=[
            prompt_llm,
            my_file,
        ],
    )

    # Explicit UTF-8 so non-ASCII characters in the transcription are written
    # the same way on every platform.
    out_path = txt_output_dir / "llm_img2txt" / "gemini-2.0-flash" / f"{path.stem}.txt"
    out_path.write_text(response.text, encoding="utf-8")

[file retrieval] 2025-06-23 18:09:12 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:09:12 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH8-3KU37dFDba1mFKIvOfc5RYRqFx9d62ZmHwTuLZYDqHsclMmASOT5Yrxi_PKCGIThqeKhARRBqdZIO2RFRET1a82VvUpf2BZRgTsaFYHE&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:09:15 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH8-3KU37dFDba1mFKIvOfc5RYRqFx9d62ZmHwTuLZYDqHsclMmASOT5Yrxi_PKCGIThqeKhARRBqdZIO2RFRET1a82VvUpf2BZRgTsaFYHE&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:09:15 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-06-23 18:09:24 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 20

## 6. Send to Claude

### a. OCR-LLM call

In [67]:
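# NOTE(review): this cell is disabled and needs fixes before it is re-enabled:
#   - the Anthropic Messages API returns the text in `response.content[0].text`;
#     `response.choices[0].message.content` is the OpenAI response shape
#   - `max_tokens=10` would cut the transcription off after a few words
#   - the model is `claude-opus-4-20250514`, but the output folder is named
#     "claude-4-sonnet" (and that folder is only created if it is listed in `llm_array`)
# for path in img_filepaths: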
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         max_tokens=10,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_ocr_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )
#     print(response)

#     with open(txt_output_dir / "ocr_llm_img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.choices[0].message.content)

### b. LLM call (without OCR)

In [68]:
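# NOTE(review): this cell is disabled and needs fixes before it is re-enabled:
#   - `messages.create` is called without `max_tokens`, which the Anthropic
#     Messages API requires
#   - the Anthropic Messages API returns the text in `response.content[0].text`;
#     `response.choices[0].message.content` is the OpenAI response shape
#   - the model is `claude-opus-4-20250514`, but the output folder is named
#     "claude-4-sonnet" (and that folder is only created if it is listed in `llm_array`)
# for path in img_filepaths: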
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )

#     with open(txt_output_dir / "llm_img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.choices[0].message.content)

## 7. Benchmark results

In [69]:
import sys
sys.path.append(str(Path.cwd().parent))
from benchmarking.txt_accuracy import clean_text_normalized, clean_text_nonorm, compute_metrics, build_dataframe, get_doc_names, get_all_models, get_docs
from venv import logger
from datetime import datetime

def main():
    """Benchmark every transcription under ``results/`` against the ground truth.

    Relies on the module-level ``root_dir`` (the project root) defined in the
    setup cell.

    Prerequisites:
    - Ground truth text files located in `project_root/ground-truth/`
      (e.g. `project_root/ground-truth/kbaa-p096.txt`)
    - Transcribed files located at
      `project_root/results/<MODEL-TYPE>/<MODEL-NAME>/kbaa-p#xyz.txt`, where
      `<MODEL-TYPE>` is `llm_img2txt`, `ocr_img2txt` or `ocr_llm_img2txt`

    The main function will:
    - Gather all ground truth text files
    - For each (model type, model) pair reported by `get_all_models`, gather
      the corresponding transcriptions
    - Compute metrics for each document and for the aggregate "__ALL__" entry,
      once with `normalized=True` and once with `normalized=False`
    - Save the results as two CSV files (one normalized, one non-normalized)
      per model type in
      `project_root/benchmarking-results/txt-accuracy/<MODEL-TYPE>/`
    """

    # =============
    # Preliminaries
    # =============

    script_dir = str(Path.cwd())
    project_root = str(root_dir)
    logger.info("Script directory: %s", script_dir)
    logger.info("Project root: %s", project_root)

    # Ground truth
    ground_truth_dir = root_dir / "ground-truth"
    doc_names = get_doc_names(ground_truth_dir)

    # results/ paths
    results_root = os.path.join(project_root, "results")
    all_models = get_all_models(
        os.path.join(results_root, "llm_img2txt"),
        os.path.join(results_root, "ocr_img2txt"),
        os.path.join(results_root, "ocr_llm_img2txt"),
    )
    logger.info("Models found: %s", all_models)

    # ============
    # Gather files
    # ============

    # -> Gather ground truths and put into dict.
    # get_docs returns a pair; the second element is stored under "__ALL__"
    # (presumably the text of all documents combined -- confirm in txt_accuracy).
    ground_truths, all_ground_truth_text = get_docs(ground_truth_dir, doc_names)
    ground_truths["__ALL__"] = all_ground_truth_text

    doc_lengths_normalized = {
        doc: len(clean_text_normalized(text)) for doc, text in ground_truths.items()
    }
    doc_lengths_nonorm = {
        doc: len(clean_text_nonorm(text)) for doc, text in ground_truths.items()
    }
    total_doc_len_normalized = len(clean_text_normalized(ground_truths["__ALL__"]))
    total_doc_len_nonorm = len(clean_text_nonorm(ground_truths["__ALL__"]))

    # -> Gather each transcribed document and put into dict.
    # Structure: results[model_type][model][doc]
    results = {}

    for model_type, model in all_models:
        logger.info("Collecting results for model: %s", model)
        model_path = os.path.join(results_root, model_type, model)
        model_docs, model_all_text = get_docs(model_path, doc_names)
        model_docs["__ALL__"] = model_all_text
        results.setdefault(model_type, {})[model] = model_docs
        logger.info("Collected results for model_type: %s, model: %s", model_type, model)

    # ===============
    # Compute metrics
    # ===============

    # Structure of both dicts: <dict>[model_type][model][doc] -> metrics
    normalized_results_data = {}
    nonorm_results_data = {}

    for model_type, model in all_models:
        normalized_metrics = normalized_results_data.setdefault(model_type, {}).setdefault(model, {})
        nonorm_metrics = nonorm_results_data.setdefault(model_type, {}).setdefault(model, {})
        model_docs = results[model_type][model]

        logger.info("Computing metrics for model_type: %s, model: %s", model_type, model)
        for doc in doc_names:
            logger.info("Computing metrics for document: %s", doc)
            normalized_metrics[doc] = compute_metrics(
                ground_truths[doc], model_docs[doc], normalized=True
            )
            nonorm_metrics[doc] = compute_metrics(
                ground_truths[doc], model_docs[doc], normalized=False
            )
            print(normalized_metrics[doc])
            print(nonorm_metrics[doc])

        # Metrics over the aggregate "__ALL__" entry
        normalized_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], model_docs["__ALL__"], normalized=True
        )
        nonorm_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], model_docs["__ALL__"], normalized=False
        )

    # ====================
    # Put metrics in table
    # ====================

    # NOTE(review): the ":" characters in this timestamp end up in the CSV file
    # names, and ":" is not allowed in file names on Windows -- consider "-".
    timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    results_base_dir = root_dir / "benchmarking-results" / "txt-accuracy"

    # all_models holds one entry per (model_type, model) pair, so a model type
    # can appear several times. Visit each type once (first-seen order) so its
    # tables are built and written a single time instead of once per model.
    model_types = list(dict.fromkeys(model_type for model_type, _ in all_models))

    # Create a separate results directory for each model type
    for model_type in model_types:
        results_dir = results_base_dir / model_type
        results_dir.mkdir(parents=True, exist_ok=True)

        normalized_df = build_dataframe(
            f"normalized_{timestamp}",
            doc_names,
            normalized_results_data[model_type],
            doc_lengths_normalized,
            total_doc_len_normalized,
        )
        nonorm_df = build_dataframe(
            f"nonorm_{timestamp}",
            doc_names,
            nonorm_results_data[model_type],
            doc_lengths_nonorm,
            total_doc_len_nonorm,
        )

        # ============
        # Save results
        # ============

        normalized_df.to_csv(os.path.join(str(results_dir), f"normalized_{timestamp}.csv"))
        nonorm_df.to_csv(os.path.join(str(results_dir), f"nonorm_{timestamp}.csv"))


# Run the benchmark when this cell (or the exported script) is executed directly
if __name__ == "__main__":
    main()

[file retrieval] 2025-06-23 18:09:24 [INFO] Script directory: /Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking/src/workflow


[file retrieval] 2025-06-23 18:09:24 [INFO] Project root: /Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking
[file retrieval] 2025-06-23 18:09:24 [INFO] Found ground-truth txt files: ['/Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking/ground-truth/kbaa-p096.txt']
[file retrieval] 2025-06-23 18:09:24 [INFO] Found file names: ['kbaa-p096']
[file retrieval] 2025-06-23 18:09:24 [INFO] Models found: [('llm_img2txt', 'gemini-2.0-flash'), ('ocr_llm_img2txt', 'gemini-2.0-flash'), ('llm_img2txt', 'gpt-4o'), ('ocr_llm_img2txt', 'gpt-4o')]
[file retrieval] 2025-06-23 18:09:24 [INFO] Collecting results for model: gemini-2.0-flash
[file retrieval] 2025-06-23 18:09:24 [INFO] Collected results for model_type: llm_img2txt, model: gemini-2.0-flash
[file retrieval] 2025-06-23 18:09:24 [INFO] Collecting results for model: gemini-2.0-flash
[file retrieval] 2025-06-23 18:09:24 [INFO] Collected results for model_type: ocr_llm_img2txt, model: gemini-2.0-flash
[file retrieval] 2025-06-23 18:09:24 [IN

{'dist_char': 223, 'cer': 0.06630984240261671, 'wer': 0.37478991596638656, 'token_sort_ratio': 99.18239928645755}
{'dist_char': 317, 'cer': 0.08635249250885317, 'wer': 0.5097402597402597, 'token_sort_ratio': 97.13155291790306}
{'dist_char': 219, 'cer': 0.06512042818911686, 'wer': 0.3680672268907563, 'token_sort_ratio': 99.24231169217056}
{'dist_char': 312, 'cer': 0.08499046581312994, 'wer': 0.5048701298701299, 'token_sort_ratio': 97.18886848424918}
{'dist_char': 63, 'cer': 0.01873327386262266, 'wer': 0.10588235294117647, 'token_sort_ratio': 98.68302903322359}
{'dist_char': 77, 'cer': 0.02097521111413784, 'wer': 0.12012987012987013, 'token_sort_ratio': 98.27807030027039}
{'dist_char': 37, 'cer': 0.011002081474873625, 'wer': 0.06218487394957983, 'token_sort_ratio': 99.35945180992105}
{'dist_char': 58, 'cer': 0.01579950967038954, 'wer': 0.09415584415584416, 'token_sort_ratio': 98.65610411656527}


## Don't use the cell below, this is just for my use

In [70]:
# import sys
# sys.path.append("..")
# from benchmarking.txt_accuracy import clean_text_normalized

# gt_dir = root_dir / "ground-truth"
# ocr_dir = root_dir / "results" / "ocr_img2txt"
# llm_dir = root_dir / "results" / "llm_img2txt" / "gpt-4o"
# ocr_llm_dir = root_dir / "results" / "ocr_llm_img2txt" / "gpt-4o"

# ocr_llm_filepaths = []
# for path in ocr_llm_dir.iterdir():
#   if path.is_file():
#     if "3" in path.stem:
#         continue
#     ocr_llm_filepaths.append(path)

# ocr_filepaths = []
# for path in ocr_dir.iterdir():
#   if path.is_file():
#     if "3" in path.stem:
#         continue
#     ocr_filepaths.append(path)

# llm_filepaths = []
# for path in llm_dir.iterdir():
#   if path.is_file():
#     if "3" in path.stem:
#         continue
#     llm_filepaths.append(path)

# gt_filepaths = []
# for path in gt_dir.iterdir():
#   if path.is_file():
#     gt_filepaths.append(path)

# # if len(ocr_filepaths) != len(gt_filepaths):
# #     raise ValueError("Number of OCR files and GT files do not match")

# for ocr_llm_path, llm_path, gt_path in zip(ocr_llm_filepaths, llm_filepaths, gt_filepaths):
#     with open(ocr_llm_path, 'r') as file:
#         ocr_llm_text = clean_text_normalized(file.read())
#     with open(llm_path, 'r') as file:
#         llm_text = clean_text_normalized(file.read())
#     with open(gt_path, 'r') as file:
#         gt_text = clean_text_normalized(file.read())
# print(llm_text)
# print(gt_text)

# for i in range(len(ocr_llm_text)):
#     if ocr_llm_text[i] != gt_text[i]:
#         print(f"Mismatch at index {i}")
#         print(f"OCR: {ocr_llm_text[i]}")
#         print(f"GT: {gt_text[i]}")
#         break
        