# OCR-mLLM Pipeline

Before running this code you will need to set up your OpenAI, Anthropic & Gemini (Google) API keys. Here's how I did it:

1. Create a new file in your root directory named exactly `.env` (nothing before the dot)
2. Store your API keys with the following names: OPENAI_API_KEY, ANTHROPIC_API_KEY, and GOOGLE_API_KEY
3. Create a virtual environment by typing the following commands into your terminal:
    - ```python3 -m venv .venv```
    - ```source .venv/bin/activate```
    - ```pip install -r requirements.txt```
4. After running the pipeline, type ```deactivate``` in your terminal to exit the virtual environment

## 1. Setup

### a. Run this cell to ensure you have all the necessary directories

Before running the cell, make sure your images are stored as `.png` files in the `data/pngs` folder under your root directory; that is where the pipeline reads them from

In [None]:
from pathlib import Path
import os
import pytesseract
from PIL import Image

# Project root: two levels above the notebook's current working directory
root_dir = Path.cwd().parent.parent

# Output format of the LLM stages; it is used in directory and file names below
doc_format = "txt"

# Input images are read from here; only .png files are picked up later on
source_dir = root_dir / "data" / "pngs"

# Transcription outputs and benchmarking outputs, both keyed by doc_format
output_dir = root_dir / "results" / doc_format
bm_output_dir = root_dir / "benchmarking-results" / f"{doc_format}-accuracy"

# Create both output trees up front (no error if they already exist)
output_dir.mkdir(parents=True, exist_ok=True)
bm_output_dir.mkdir(parents=True, exist_ok=True)

# llm_array = ["gpt-4o", "gemini-2.5-flash", "claude-4-sonnet"]
llm_array = ["gpt-4o", "gemini-2.0-flash"]

def make_llm_dirs(llm_array, target_dir, doc_format):
    """Create the output directory tree for the OCR and LLM stages.

    Args:
        llm_array: Iterable of model names; each one gets its own
            sub-directory under the two LLM stage folders.
        target_dir: ``pathlib.Path`` under which the tree is created.
        doc_format: Format suffix used in the stage folder names
            (``llm-img2<doc_format>`` and ``ocr-llm-img2<doc_format>``).
    """
    # The OCR folder is shared by every model, so create it exactly once
    # (and also when no models are listed) rather than once per iteration.
    (target_dir / "ocr-img2txt").mkdir(parents=True, exist_ok=True)

    for llm in llm_array:
        for stage in (f"llm-img2{doc_format}", f"ocr-llm-img2{doc_format}"):
            (target_dir / stage / llm).mkdir(parents=True, exist_ok=True)

# Build the per-stage / per-model output folders under output_dir
make_llm_dirs(llm_array, output_dir, doc_format)

### b. Setup API keys & image encoding function

In [11]:
from openai import OpenAI
from anthropic import Anthropic
from google import genai
import base64
from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading the keys
load_dotenv()

# Each lookup yields None when the variable is not set
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Read an image from disk and return it as base64 text
def encode_image(image_path):
    """Return the contents of ``image_path`` as a base64-encoded UTF-8 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


# One API client per provider; the Gemini key is read from the environment here
gpt_client = OpenAI(
    api_key=openai_api_key,
)
gemini_client = genai.Client(
    api_key=os.environ.get("GOOGLE_API_KEY"),
)
claude_client = Anthropic(
    api_key=anthropic_api_key,
)


### c. Get image file paths

In [2]:
# Collect the full paths of at most MAX_IMAGES .png files from source_dir
# into `img_filepaths`.
MAX_IMAGES = 10

# Sort before truncating: iterdir() yields entries in arbitrary order, so
# without sorting the selected subset could differ from run to run.
img_filepaths = sorted(
    path
    for path in source_dir.iterdir()
    if path.is_file() and path.suffix.lower() == ".png"
)[:MAX_IMAGES]

# Kept for backward compatibility: number of images selected
count = len(img_filepaths)

## 2. Run pytesseract

In [3]:
# Run Tesseract on every selected image and write the raw OCR text to
# <output_dir>/ocr-img2txt/<image stem>.txt
for path in img_filepaths:
    ocr_output_path = output_dir / "ocr-img2txt" / f"{path.stem}.txt"

    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes, and run OCR before creating the output file so a
    # failed OCR call does not leave an empty .txt behind.
    with Image.open(path) as image:
        ocr_text = pytesseract.image_to_string(image)  # TODO: Change config as needed

    with open(ocr_output_path, 'w') as file:
        file.write(ocr_text)

## 3. Prepare the prompt

In [13]:
# Template for the OCR -> LLM correction stage; `{input}` is filled with the
# raw OCR text of one page via str.format.
prompt_template_ocr_llm = """
You are a text correction assistant. Your task is to clean up and correct errors from raw OCR output.
The text may contain misrecognized characters, broken words, or incorrect formatting.
Carefully read the provided OCR output and produce a corrected version that is grammatically accurate 
and as faithful to the original content as possible. Because this is a historical document, try to 
preserve archaic spelling or formatting where clearly intended. Only correct obvious OCR errors.
Put the dates associated with each entry at the end of the line.

Input (Raw OCR Text):
{input}
"""

# Prompt for the image-only (no OCR) transcription stage.
prompt_llm = """
You are an expert historian. Your task is to transcribe the provided image into text. The image
is a 20th century bibliographic entry. Because this is a historical document, try to preserve 
archaic spelling or formatting where clearly intended. Put the dates associated with each entry at the end of the line.
Return the text only, nothing else.
"""

# Short prompt for quick checks. It used to be assigned to `prompt_llm` as
# well, which silently replaced the transcription prompt above for every
# later LLM call; it now has its own name and must be selected explicitly.
prompt_llm_first_word = """
From the provided image, give me the first word and nothing else
"""

## 4. Send to OpenAI

### a. OCR-LLM call

In [None]:
# For each image: read its raw OCR text, send it to gpt-4o together with the
# page image, and save the model's reply.
for path in img_filepaths:
    base64_image = encode_image(path)

    # THIS REMAINS THE SAME b/c we're reading the OCR output (always .txt)
    ocr_text_path = output_dir / "ocr-img2txt" / f"{path.stem}.txt"
    # Stored as `ocr_text` rather than `input` so the built-in input() is not
    # shadowed for the rest of the notebook session.
    with open(ocr_text_path, 'r') as file:
        ocr_text = file.read()
    prompt_ocr_llm = prompt_template_ocr_llm.format(input=ocr_text).strip()

    response = gpt_client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt_ocr_llm
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
    )

    result_path = output_dir / f"ocr-llm-img2{doc_format}" / "gpt-4o" / f"{path.stem}.{doc_format}"
    with open(result_path, 'w') as file:
        file.write(response.choices[0].message.content)

### b. LLM call (without OCR)

In [16]:
# Transcribe each image with gpt-4o directly from the picture (no OCR text)
for path in img_filepaths:
    base64_image = encode_image(path)

    # The user message carries two parts: the instructions and the inline image
    text_part = {"type": "text", "text": prompt_llm}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
    }

    response = gpt_client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )

    transcript_path = output_dir / f"llm-img2{doc_format}" / "gpt-4o" / Path(path.stem + f".{doc_format}")
    with open(transcript_path, 'w') as file:
        file.write(response.choices[0].message.content)

[file retrieval] 2025-06-30 15:56:55 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


## 5. Send to Gemini


### a. OCR-LLM call

In [None]:
# For each image: read its raw OCR text, send it to gemini-2.0-flash together
# with the uploaded page image, and save the model's reply.
for path in img_filepaths:
    # THIS REMAINS THE SAME b/c we're reading the OCR output (always .txt).
    # Read it before uploading so a missing OCR file fails before any upload.
    ocr_text_path = output_dir / "ocr-img2txt" / f"{path.stem}.txt"
    # Stored as `ocr_text` rather than `input` so the built-in input() is not
    # shadowed for the rest of the notebook session.
    with open(ocr_text_path, 'r') as file:
        ocr_text = file.read()
    prompt_ocr_llm = prompt_template_ocr_llm.format(input=ocr_text).strip()

    my_file = gemini_client.files.upload(file=path)

    response = gemini_client.models.generate_content(
        model='gemini-2.0-flash',
        contents=[
            prompt_ocr_llm,
            my_file
        ]
    )

    result_path = output_dir / f"ocr-llm-img2{doc_format}" / "gemini-2.0-flash" / f"{path.stem}.{doc_format}"
    with open(result_path, 'w') as file:
        file.write(response.text)

[file retrieval] 2025-06-23 18:05:08 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:05:08 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH89bHeN0-vA-xzs2K_EGUYmGBXEGUqZ-TzID8lIRuuIZ5uv_yAciVJ4QlyDuV6hzZ8cjQZLSwSf8qPnMUXfMtLoxSxcspz6tzUhrZBAqjyo&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:05:12 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH89bHeN0-vA-xzs2K_EGUYmGBXEGUqZ-TzID8lIRuuIZ5uv_yAciVJ4QlyDuV6hzZ8cjQZLSwSf8qPnMUXfMtLoxSxcspz6tzUhrZBAqjyo&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-23 18:05:12 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-06-23 18:05:22 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 20

### b. LLM call (without OCR)

In [17]:
# Transcribe each image with gemini-2.0-flash directly from the picture (no OCR text)
for path in img_filepaths:
    my_file = gemini_client.files.upload(file=path)

    # Instructions first, then the uploaded image
    request_contents = [prompt_llm, my_file]
    response = gemini_client.models.generate_content(
        model='gemini-2.0-flash',
        contents=request_contents,
    )

    transcript_path = output_dir / f"llm-img2{doc_format}" / "gemini-2.0-flash" / Path(path.stem + f".{doc_format}")
    with open(transcript_path, 'w') as file:
        file.write(response.text)

[file retrieval] 2025-06-30 15:57:11 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
[file retrieval] 2025-06-30 15:57:11 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH8-acDCIOvz27fq2eb2vZioJCF8BN-044zRuKKM1P7IZM9f8JT8PaT8jpwpouYDaJ1Mn8kXyGt_zkX_1P0JMEH1MWeAdikdPHQ1A_MEGnfk&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-30 15:57:14 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH8-acDCIOvz27fq2eb2vZioJCF8BN-044zRuKKM1P7IZM9f8JT8PaT8jpwpouYDaJ1Mn8kXyGt_zkX_1P0JMEH1MWeAdikdPHQ1A_MEGnfk&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-06-30 15:57:14 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-06-30 15:57:18 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 20

## 6. Send to Claude

### a. OCR-LLM call

In [None]:
# NOTE(review): this cell is disabled. Before re-enabling it, check the following:
#   - `prompt_ocr_llm` is not rebuilt per image here, so it would reuse whatever
#     value an earlier cell left behind instead of this image's OCR text.
#   - `max_tokens=10` caps the reply at 10 tokens.
#   - `txt_output_dir` is not defined in any visible cell (the other cells
#     write under `output_dir`).
#   - `response.choices[0].message.content` is the accessor used for the OpenAI
#     responses in this notebook; presumably the Anthropic Messages response
#     needs `response.content[0].text` instead — TODO confirm against the SDK.
#   - The model is 'claude-opus-4-20250514' while the output folder is
#     "claude-4-sonnet".
# for path in img_filepaths:
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         max_tokens=10,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_ocr_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )
#     print(response)

#     with open(txt_output_dir / "ocr-llm-img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.choices[0].message.content)

### b. LLM call (without OCR)

In [None]:
# NOTE(review): this cell is disabled. Before re-enabling it, check the following:
#   - No `max_tokens` is passed; presumably the Anthropic Messages API requires
#     it — TODO confirm against the SDK.
#   - `txt_output_dir` is not defined in any visible cell (the other cells
#     write under `output_dir`).
#   - `response.choices[0].message.content` is the accessor used for the OpenAI
#     responses in this notebook; presumably the Anthropic Messages response
#     needs `response.content[0].text` instead — TODO confirm against the SDK.
#   - The model is 'claude-opus-4-20250514' while the output folder is
#     "claude-4-sonnet".
# for path in img_filepaths:
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )

#     with open(txt_output_dir / "llm-img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.choices[0].message.content)

## 7. Benchmark results

In [None]:
import glob
import json
import sys
sys.path.append(str(Path.cwd().
parent))
from benchmarking.txt_accuracy import clean_text_normalized, clean_text_nonorm, compute_metrics, build_dataframe
from tools.file_retrieval import get_doc_names, get_docs, get_all_models
from venv import logger
from datetime import datetime

def main():
    """Benchmark every transcription under ``output_dir`` against the ground truth.

    Relies on the notebook globals ``root_dir``, ``output_dir`` and
    ``doc_format`` defined in the setup cell.

    Prerequisites:
    - Ground truth files located in `root_dir/data/ground-truth/<doc_format>/`
    - Transcribed files located under the directories handed to
      `get_all_models`:
        - `output_dir/llm-img2<doc_format>/<MODEL-NAME>/`
        - `output_dir/ocr-img2txt/`
        - `output_dir/ocr-llm-img2<doc_format>/<MODEL-NAME>/`

    The main function will:
    - Gather all ground truth files
    - For each (model type, model) pair, gather the corresponding transcriptions
    - Compute metrics for each document and for the "__ALL__" aggregate,
      both normalized and non-normalized
    - Save two CSV files per model type (one normalized, one non-normalized)
      in `root_dir/benchmarking-results/<doc_format>-accuracy/<model type>/`

    Raises:
        ValueError: If ``doc_format`` is neither "txt" nor "json".
    """

    # =============
    # Preliminaries
    # =============

    script_dir = str(Path.cwd())
    project_root = str(root_dir)
    logger.info("Script directory: %s", script_dir)
    logger.info("Project root: %s", project_root)

    # Ground truth
    ground_truth_dir = root_dir / "data" / "ground-truth" / doc_format
    doc_names = get_doc_names(ground_truth_dir, doc_format, keep_prefix=False)

    # results/ paths
    all_models = get_all_models(
        os.path.join(output_dir, f"llm-img2{doc_format}"),
        os.path.join(output_dir, "ocr-img2txt"),
        os.path.join(output_dir, f"ocr-llm-img2{doc_format}"),
        doc_format
    )
    logger.info("Models found: %s", all_models)

    # ============
    # Gather files
    # ============

    # -> Gather ground truths and put into dict:
    ground_truths, all_texts = get_docs(ground_truth_dir, doc_names, doc_format, name_has_prefix=True)
    ground_truths["__ALL__"] = all_texts

    if doc_format == "txt":
        doc_lengths_normalized = {
            doc: len(clean_text_normalized(text)) for doc, text in ground_truths.items()
        }
        doc_lengths_nonorm = {
            doc: len(clean_text_nonorm(text)) for doc, text in ground_truths.items()
        }
        total_doc_len_normalized = len(clean_text_normalized(ground_truths["__ALL__"]))
        total_doc_len_nonorm = len(clean_text_nonorm(ground_truths["__ALL__"]))
    elif doc_format == "json":
        doc_lengths_normalized, doc_lengths_nonorm = {}, {}
        for doc, json_data in ground_truths.items():
            # Flatten every field value of every entry to text. Values may be
            # numbers (e.g. a year), hence str().
            field_texts = [
                str(value)
                for entry in json_data["entries"]
                for value in entry.values()
            ]
            # Measure the text of each field (mirroring the txt branch), not
            # the number of fields in the entry. sum() also yields 0 for a
            # document without entries instead of leaving the key missing.
            doc_lengths_normalized[doc] = sum(
                len(clean_text_normalized(text)) for text in field_texts
            )
            doc_lengths_nonorm[doc] = sum(
                len(clean_text_nonorm(text)) for text in field_texts
            )

        # NOTE(review): assumes "__ALL__" aggregates the individual documents,
        # so it is left out of the totals to avoid counting them twice —
        # confirm against get_docs.
        total_doc_len_normalized = sum(
            length for doc, length in doc_lengths_normalized.items() if doc != "__ALL__"
        )
        total_doc_len_nonorm = sum(
            length for doc, length in doc_lengths_nonorm.items() if doc != "__ALL__"
        )
    else:
        # Fail here with a clear message rather than with a NameError on the
        # length variables further down.
        raise ValueError(f"Unsupported doc_format: {doc_format!r} (expected 'txt' or 'json')")

    # -> Gather each transcribed document and put into dict:

    # Structure: results[model_type][model][doc]
    results = {}

    for model_type, model in all_models:
        logger.info("Collecting results for model: %s", model)
        model_path = os.path.join(output_dir, model_type, model)
        model_docs, model_all_texts = get_docs(model_path, doc_names, doc_format, name_has_prefix=False)
        model_docs["__ALL__"] = model_all_texts
        results.setdefault(model_type, {})[model] = model_docs
        logger.info("Collected results for model_type: %s, model: %s", model_type, model)

    # ===============
    # Compute metrics
    # ===============

    # Structure: <...>_results_data[model_type][model][doc]
    normalized_results_data = {}
    nonorm_results_data = {}

    for model_type, model in all_models:
        normalized_metrics = normalized_results_data.setdefault(model_type, {}).setdefault(model, {})
        nonorm_metrics = nonorm_results_data.setdefault(model_type, {}).setdefault(model, {})
        model_results = results[model_type][model]

        logger.info("Computing metrics for model_type: %s, model: %s", model_type, model)
        for doc in doc_names:
            logger.info("Computing metrics for document: %s", doc)
            normalized_metrics[doc] = compute_metrics(
                ground_truths[doc], model_results[doc], doc_format, normalized=True
            )
            nonorm_metrics[doc] = compute_metrics(
                ground_truths[doc], model_results[doc], doc_format, normalized=False
            )

        # Metrics over the aggregate of all documents
        normalized_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], model_results["__ALL__"], doc_format, normalized=True
        )
        nonorm_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], model_results["__ALL__"], doc_format, normalized=False
        )

    # ====================
    # Put metrics in table
    # ====================

    # NOTE(review): the ':' characters in this timestamp end up in file names,
    # which Windows does not allow — consider another separator if portability matters.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    results_base_dir = root_dir / "benchmarking-results" / f"{doc_format}-accuracy"

    # One results directory per model type. The dict holds each model type
    # exactly once, so every pair of CSV files is built and written a single
    # time instead of once per (model_type, model) pair.
    for model_type in normalized_results_data:
        results_dir = results_base_dir / model_type
        results_dir.mkdir(parents=True, exist_ok=True)

        normalized_df = build_dataframe(
            f"normalized_{timestamp}",
            doc_names,
            normalized_results_data[model_type],
            doc_lengths_normalized,
            total_doc_len_normalized,
        )
        nonorm_df = build_dataframe(
            f"nonorm_{timestamp}",
            doc_names,
            nonorm_results_data[model_type],
            doc_lengths_nonorm,
            total_doc_len_nonorm,
        )

        # ============
        # Save results
        # ============

        normalized_df.to_csv(os.path.join(str(results_dir), f"normalized_{timestamp}.csv"))
        nonorm_df.to_csv(os.path.join(str(results_dir), f"nonorm_{timestamp}.csv"))


if __name__ == "__main__":
    main()

[file retrieval] 2025-07-05 18:53:45 [INFO] Script directory: /Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking/src/workflow
[file retrieval] 2025-07-05 18:53:45 [INFO] Project root: /Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking
[file retrieval] 2025-07-05 18:53:45 [INFO] Found ground-truth txt files: ['/Users/muhammadkhalid/Desktop/map2025/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p003.json']
[file retrieval] 2025-07-05 18:53:45 [INFO] Found file names: ['gt_kbaa-p003']
[file retrieval] 2025-07-05 18:53:45 [INFO] Models found: [('llm-img2json', 'gemini-2.0-flash'), ('ocr-llm-img2json', 'gemini-2.0-flash'), ('llm-img2json', 'gpt-4o'), ('ocr-llm-img2json', 'gpt-4o')]
[file retrieval] 2025-07-05 18:53:45 [INFO] Collecting results for model: gemini-2.0-flash
[file retrieval] 2025-07-05 18:53:45 [INFO] Collected results for model_type: llm-img2json, model: gemini-2.0-flash
[file retrieval] 2025-07-05 18:53:45 [INFO] Collecting results for model: gemini-2.0-flash
[file r

{'entries': [{'city': 'Boston', 'description': 'Written by the patient after her recovery.', 'index': 1, 'first-name': 'B.C.', 'last-name': 'A.', 'library': 'DLC', 'page-count': 47, 'publish-year': 1909, 'publisher': 'R. G. Badger', 'title': 'My life as a dissociated personality...with an introduction by Morton Prince, M.D.'}, {'birth-year': 1854, 'city': 'Kristiansand', 'description': 'Minnesota violinist.', 'first-name': 'Eivind P.', 'index': 2, 'last-name': 'Aakus', 'library': 'MnHi', 'page-count': 112, 'publish-year': 1932, 'publisher': 'Johanssen & Tangens', 'title': 'Minne...'}, {'birth-year': 1863, 'city': 'Boston', 'death-year': 1934, 'description': 'Reporter in Chicago and N.Y.', 'first-name': 'Willis John', 'index': 3, 'last-name': 'Abbot', 'library': 'WU', 'page-count': 358, 'publish-year': 1934, 'publisher': 'Little, Brown', 'title': 'Watching the world go by.'}, {'description': 'Story of youth in California about 1850.', 'first-name': 'Augustus', 'index': 4, 'last-name': '

In [8]:
import sys
import json
from pathlib import Path
sys.path.append(str(Path.cwd().
parent))
from benchmarking.txt_accuracy import clean_text_normalized, clean_text_nonorm, compute_metrics, build_dataframe
# Scratch cell: dump and print the first entry of one model's JSON output.
# Paths are derived from the working directory and the home directory instead
# of being hard-coded to one user's machine; like the setup cell, this assumes
# the notebook runs two levels below the project root.
project_root = Path.cwd().parent.parent
model_output_path = (
    project_root / "results" / "json" / "llm-img2json" / "gemini-2.0-flash" / "kbaa-p003.json"
)
ground_truth_path = project_root / "data" / "ground-truth" / "json" / "gt_kbaa-p003.json"

with open(model_output_path, "r") as file:
    raw_json_data = json.load(file)
text_data = json.dumps(raw_json_data['entries'][0])
with open(Path.home() / "Desktop" / "test.txt", "w") as file:
    file.write(text_data)
with open(ground_truth_path, "r") as file:
    gt_json_data = json.load(file)
# for i in range(len(raw_json_data['entries'])):

print(text_data)

{"city": "Boston", "description": "Written by the patient after her recovery.", "index": 1, "first-name": "B.C.", "last-name": "A.", "library": "DLC", "page-count": 47, "publish-year": 1909, "publisher": "R. G. Badger", "title": "My life as a dissociated personality...with an introduction by Morton Prince, M.D."}
