# OCR-mLLM Pipeline

Before running this code you will need to set up your OpenAI and Gemini (Google) API keys, plus an Anthropic API key if you enable the Claude cells. Here's how I did it:

1. Create a new file in your root directory named exactly `.env` (nothing before the dot)
2. Store your API keys in it, one `NAME=value` pair per line, using the following names: OPENAI_API_KEY, ANTHROPIC_API_KEY, and GOOGLE_API_KEY
3. Create a virtual environment by typing the following commands into your terminal:
    - ```python3 -m venv .venv```
    - ```source .venv/bin/activate```
    - ```pip install -r requirements.txt```
4. After running the pipeline, type ```deactivate``` in your terminal to exit the virtual environment

## 1. Setup

### a. Run this cell to ensure you have all the necessary directories

Before running the cell, make sure the page images are stored as `.png` files in `data/pngs` under the project root; the pipeline reads its input images from that folder.

In [38]:
from pathlib import Path
import os
import pytesseract
from PIL import Image
from json_creation import *
from google.genai import types


# Project root: two levels above the current working directory
root_dir = Path.cwd().parent.parent

# Output format of this run; it selects the results folder and the benchmark folder name
doc_format = "txt"

# Input images: every page is expected as a .png file in <root>/data/pngs
source_dir = root_dir / "data" / "pngs"

# Where transcriptions and benchmark tables are written for the chosen format
output_dir = root_dir / "results" / doc_format
bm_output_dir = root_dir / "benchmarking-results" / f"{doc_format}-accuracy"

# Create both folders (and any missing parents) when they don't exist yet
for _needed_dir in (output_dir, bm_output_dir):
    _needed_dir.mkdir(parents=True, exist_ok=True)

# Models to run; the Claude entry is currently disabled
# llm_array = ["gpt-4o", "gemini-2.5-flash", "claude-4-sonnet"]
llm_array = ["gpt-4o", "gemini-2.5-flash"]

def make_llm_dirs(llm_array, target_dir, doc_format):
    """Create the per-model output folders under ``target_dir``.

    For ``doc_format == "txt"`` the following folders are created::

        target_dir/ocr-img2txt/pytesseract
        target_dir/llm-img2txt/<llm>
        target_dir/ocr-llm-img2txt/<llm>

    For any other format (e.g. ``"json"``)::

        target_dir/llm-img2<doc_format>/<llm>
        target_dir/llm-txt2<doc_format>/<llm>

    Existing folders are left untouched and missing parents are created.

    Args:
        llm_array: Iterable of model names; each one gets its own sub-folder.
        target_dir: ``pathlib.Path`` of the results folder to populate.
        doc_format: Output format; used as the suffix of the folder names.
    """
    if doc_format == "txt":
        # The OCR folder does not depend on the model, so it is created once
        # instead of once per LLM (and also when llm_array is empty).
        (target_dir / "ocr-img2txt" / "pytesseract").mkdir(parents=True, exist_ok=True)
        per_model_parents = (f"llm-img2{doc_format}", f"ocr-llm-img2{doc_format}")
    else:
        per_model_parents = (f"llm-img2{doc_format}", f"llm-txt2{doc_format}")

    for llm in llm_array:
        for parent_name in per_model_parents:
            (target_dir / parent_name / llm).mkdir(parents=True, exist_ok=True)
# Build the folder tree for the configured models and output format
make_llm_dirs(llm_array, output_dir, doc_format)

### b. Setup API keys & image encoding function

In [8]:
from openai import OpenAI
from anthropic import Anthropic
from google import genai
import base64
#from dotenv import load_dotenv

#load_dotenv()

# API keys come from the process environment; a missing variable yields None.
# NOTE(review): load_dotenv() is commented out above, so a `.env` file is not
# loaded by this cell -- confirm the keys are exported before starting the kernel.
openai_api_key = os.environ.get("OPENAI_API_KEY")
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")

def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return str(base64.b64encode(raw_bytes), 'utf-8')


# One API client per provider. The Google key is looked up here because, unlike
# the other two, it has not been read into a variable earlier in this cell.
google_api_key = os.getenv("GOOGLE_API_KEY")

gpt_client = OpenAI(api_key=openai_api_key)
gemini_client = genai.Client(api_key=google_api_key)
claude_client = Anthropic(api_key=anthropic_api_key)


### c. Get image file paths

In [9]:
# Collect the full paths of at most MAX_IMAGES .png files from `source_dir`
# into `img_filepaths`.
# Path.iterdir() yields entries in arbitrary order, so the candidates are sorted
# first: the same pages are then selected on every run and on every platform.
MAX_IMAGES = 10
img_filepaths = sorted(
    candidate
    for candidate in source_dir.iterdir()
    if candidate.is_file() and candidate.suffix.lower() == ".png"
)[:MAX_IMAGES]

# Number of images selected (kept because the original cell exposed `count`)
count = len(img_filepaths)

## 2. Run pytesseract

In [29]:
# Windows only: point pytesseract at the Tesseract executable (edit the path so
# it matches your installation). On other platforms the assignment is skipped,
# so running every cell in order does not overwrite the default command there.
if os.name == "nt":
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

In [30]:
# Run Tesseract on every selected image and save the raw OCR text as
# <output_dir>/ocr-img2txt/<image stem>.txt
# NOTE(review): the files land directly in ocr-img2txt/, not in the
# ocr-img2txt/pytesseract/ folder created during setup, and the OCR-LLM cells
# read them back from this flat location. The recorded benchmark log lists these
# page files as "models" of ocr-img2txt -- confirm which layout is intended.
ocr_output_dir = output_dir / "ocr-img2txt"
ocr_output_dir.mkdir(parents=True, exist_ok=True)  # not created by setup in non-txt mode

for path in img_filepaths:
    file_name = ocr_output_dir / f"{path.stem}.txt"

    # Open the image in a context manager so its file handle is released as
    # soon as OCR is done, and run OCR before creating the output file so a
    # failure does not leave an empty .txt behind.
    with Image.open(path) as page_image:
        ocr_text = pytesseract.image_to_string(page_image)  # TODO: Change config as needed

    with open(file_name, 'w') as file:
        file.write(ocr_text)

## 3. Prepare the prompt

In [24]:
# Prompt template for the OCR + LLM pipeline: `{input}` is replaced with the raw
# OCR text through str.format(), and the result is stripped before it is sent.
prompt_template_ocr_llm = """
You are a text correction assistant. Your task is to clean up and correct errors from raw OCR output.
The text may contain misrecognized characters, broken words, or incorrect formatting.
Carefully read the provided OCR output, compare it to the original image, and produce a corrected version that is  
as faithful to the original content as possible. Only correct obvious OCR errors, and do not attempt to complete
cut-off entries or predict missing information. Put each entry on a separate line.
When an entry has an index number in square brackets, place it at the end of the entry.
Input (Raw OCR Text):
{input}
"""

# Prompt for the LLM-only pipeline (image in, transcription out); it has no
# placeholders and is sent exactly as written here.
prompt_llm = """
Your task is to transcribe this image of a historical bibliography page as faithfully as possible.
Only transcribe typed text that appears on the page and do not attempt to predict missing information or complete cut off entries. 
Put each entry on a separate line. When an entry has an index number in square brackets, place it at the end of the entry. 
"""



## 4. OpenAI

### (i) Text

#### a. OCR-LLM call

In [25]:
# OCR-LLM (text mode): send each page image together with its raw OCR text to
# gpt-4o and save the corrected transcription as
# <output_dir>/ocr-llm-img2<doc_format>/gpt-4o/<image stem>.<doc_format>
# Only runs in txt mode, mirroring the guard in the Gemini OCR-LLM cell: the
# raw OCR text read below is only produced under the txt results folder.
if doc_format == "txt":
    gpt_ocr_llm_dir = output_dir / f"ocr-llm-img2{doc_format}" / "gpt-4o"

    for path in img_filepaths:
        base64_image = encode_image(path)

        # Raw Tesseract output for this page (always a .txt file)
        ocr_text_path = str(output_dir / "ocr-img2txt" / path.stem) + ".txt"
        with open(ocr_text_path, 'r') as file:
            # Named `ocr_text` rather than `input` so the builtin input() is
            # not shadowed for the rest of the notebook session.
            ocr_text = file.read()
        prompt_ocr_llm = prompt_template_ocr_llm.format(input=ocr_text).strip()

        response = gpt_client.chat.completions.create(
            model='gpt-4o',
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_ocr_llm
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        )

        with open(gpt_ocr_llm_dir / f"{path.stem}.{doc_format}", 'w') as file:
            file.write(response.choices[0].message.content)

[file retrieval] 2025-07-08 14:23:07 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:23:50 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:24:28 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:25:16 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:25:57 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:26:32 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:27:24 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:28:13 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file re

#### b. LLM call (without OCR)

In [26]:
# LLM-only transcription with gpt-4o: each page image is sent with `prompt_llm`
# and the reply is saved as
# <output_dir>/llm-img2<doc_format>/gpt-4o/<image stem>.<doc_format>
gpt_llm_dir = output_dir / f"llm-img2{doc_format}" / "gpt-4o"

for path in img_filepaths:
    base64_image = encode_image(path)

    # A single user message carrying the instructions and the inline PNG
    text_part = {"type": "text", "text": prompt_llm}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
    }
    response = gpt_client.chat.completions.create(
        model='gpt-4o',
        temperature=0,
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )

    transcription_path = gpt_llm_dir / Path(path.stem + f".{doc_format}")
    with open(transcription_path, 'w') as file:
        file.write(response.choices[0].message.content)

[file retrieval] 2025-07-08 14:36:06 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:36:42 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:37:30 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:38:15 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:38:55 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:39:44 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:40:29 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 14:41:10 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file re

### (ii) JSON

#### a. Image to JSON

In [36]:
# Image -> JSON with gpt-4o (json mode only). Each result is written to
# <output_dir>/llm-img2<doc_format>/gpt-4o/<image stem>.<doc_format>
if doc_format == "json":
    gpt_img2json_dir = output_dir / f"llm-img2{doc_format}" / "gpt-4o"

    # `count` tracks how many images were processed; add an early `break` on it
    # to limit the run while debugging.
    count = 0
    for path in img_filepaths:
        print("Image path", path)
        count += 1
        response = openai_img2json(path)

        json_path = gpt_img2json_dir / Path(path.stem + f".{doc_format}")
        with open(json_path, 'w') as file:
            print("Output path:", json_path)
            file.write(response)

Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p003.png


[file retrieval] 2025-07-08 10:55:41 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p003.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p004.png


[file retrieval] 2025-07-08 10:56:40 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p004.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p005.png


[file retrieval] 2025-07-08 10:57:48 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p005.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p006.png


[file retrieval] 2025-07-08 10:59:12 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p006.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p007.png


[file retrieval] 2025-07-08 11:00:31 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p007.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p008.png


[file retrieval] 2025-07-08 11:02:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p008.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p009.png


[file retrieval] 2025-07-08 11:04:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p009.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p010.png


[file retrieval] 2025-07-08 11:05:16 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p010.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p011.png


[file retrieval] 2025-07-08 11:06:25 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p011.json
Image path c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\data\pngs\kbaa-p012.png


[file retrieval] 2025-07-08 11:07:50 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Output path: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-img2json\gpt-4o\kbaa-p012.json


#### b. Text to JSON

In [39]:
# Text -> JSON with gpt-4o (json mode only). The input is the gpt-4o OCR-LLM
# text transcription stored under results/txt, whatever the current doc_format;
# the output goes to <output_dir>/llm-txt2<doc_format>/gpt-4o/.
if doc_format == "json":
    gpt_txt_source_dir = root_dir / "results" / "txt" / "ocr-llm-img2txt" / "gpt-4o"
    gpt_txt2json_dir = output_dir / f"llm-txt2{doc_format}" / "gpt-4o"

    # `count` tracks how many pages were processed; add an early `break` on it
    # to limit the run while debugging.
    count = 0
    for path in img_filepaths:
        ocr_text_path = str(gpt_txt_source_dir / path.stem) + ".txt"
        count += 1
        response = openai_txt2json(ocr_text_path)

        json_path = gpt_txt2json_dir / Path(path.stem + f".{doc_format}")
        with open(json_path, 'w') as file:
            print("Writing to", json_path)
            file.write(response)

[file retrieval] 2025-07-08 11:12:11 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p003.json


[file retrieval] 2025-07-08 11:12:53 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p004.json


[file retrieval] 2025-07-08 11:13:23 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p005.json


[file retrieval] 2025-07-08 11:13:39 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p006.json


[file retrieval] 2025-07-08 11:14:13 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p007.json


[file retrieval] 2025-07-08 11:14:41 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p008.json


[file retrieval] 2025-07-08 11:15:11 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p009.json


[file retrieval] 2025-07-08 11:15:43 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p010.json


[file retrieval] 2025-07-08 11:15:51 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p011.json


[file retrieval] 2025-07-08 11:16:28 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Writing to c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\json\llm-txt2json\gpt-4o\kbaa-p012.json


## 5. Gemini


### (i) Text

#### a. OCR-LLM call

In [33]:
# OCR-LLM (text mode): send each page image together with its raw OCR text to
# gemini-2.5-flash and save the corrected transcription as
# <output_dir>/ocr-llm-img2<doc_format>/gemini-2.5-flash/<image stem>.<doc_format>
# The format check wraps the whole loop so that no image is uploaded when the
# cell has nothing to generate.
if doc_format == "txt":
    gemini_ocr_llm_dir = output_dir / f"ocr-llm-img2{doc_format}" / "gemini-2.5-flash"

    for path in img_filepaths:
        # Raw Tesseract output for this page (always a .txt file). It is read
        # before the upload so a missing OCR file fails without spending an upload.
        ocr_text_path = str(output_dir / "ocr-img2txt" / path.stem) + ".txt"
        with open(ocr_text_path, 'r') as file:
            # Named `ocr_text` rather than `input` so the builtin input() is
            # not shadowed for the rest of the notebook session.
            ocr_text = file.read()
        prompt_ocr_llm = prompt_template_ocr_llm.format(input=ocr_text).strip()

        my_file = gemini_client.files.upload(file=path)

        response = gemini_client.models.generate_content(
            model='gemini-2.5-flash',
            config=types.GenerateContentConfig(
                temperature=0
            ),
            contents=[
                prompt_ocr_llm,
                my_file
            ]
        )

        with open(gemini_ocr_llm_dir / f"{path.stem}.{doc_format}", 'w') as file:
            file.write(response.text)

[file retrieval] 2025-07-08 15:02:17 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 15:02:18 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH8_VoC2Um0I9t19fNNApfbOVpDMJLhi8ha-yUlkSA9H4-CrZrV_VvU_O2lPmw8ZxP-HAxoi5RakhRUFxQ2YCaNm99SjTc8qMtQM4Bfpa6w&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 15:02:21 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH8_VoC2Um0I9t19fNNApfbOVpDMJLhi8ha-yUlkSA9H4-CrZrV_VvU_O2lPmw8ZxP-HAxoi5RakhRUFxQ2YCaNm99SjTc8qMtQM4Bfpa6w&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 15:02:21 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-08 15:02:36 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 2025

#### b. LLM call (without OCR)

In [34]:
# LLM-only transcription with gemini-2.5-flash: each page image is uploaded,
# sent with `prompt_llm`, and the reply is saved as
# <output_dir>/llm-img2<doc_format>/gemini-2.5-flash/<image stem>.<doc_format>
gemini_llm_dir = output_dir / f"llm-img2{doc_format}" / "gemini-2.5-flash"

for path in img_filepaths:
    my_file = gemini_client.files.upload(file=path)

    request_contents = [prompt_llm, my_file]
    response = gemini_client.models.generate_content(
        model='gemini-2.5-flash',
        config=types.GenerateContentConfig(temperature=0),
        contents=request_contents,
    )

    transcription_path = gemini_llm_dir / Path(path.stem + f".{doc_format}")
    with open(transcription_path, 'w') as file:
        file.write(response.text)

[file retrieval] 2025-07-08 15:09:11 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 15:09:13 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH88UFd5YznxNEbxsKTmzlpaAf4TIy4biEg_KmDWGFP0dIfdEtZnpHmApBUAU4wboCjGRsEVi6SQmSBuQaHAAjUy_v-Dhk8kzNUKJG9Xq0Js&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 15:09:16 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH88UFd5YznxNEbxsKTmzlpaAf4TIy4biEg_KmDWGFP0dIfdEtZnpHmApBUAU4wboCjGRsEVi6SQmSBuQaHAAjUy_v-Dhk8kzNUKJG9Xq0Js&upload_protocol=resumable "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 15:09:16 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-08 15:09:27 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 20

### (ii) JSON

#### a. Image to JSON

In [40]:
# Image -> JSON with gemini-2.5-flash (json mode only). Each result is written to
# <output_dir>/llm-img2<doc_format>/gemini-2.5-flash/<image stem>.<doc_format>
if doc_format == "json":
    gemini_img2json_dir = output_dir / f"llm-img2{doc_format}" / "gemini-2.5-flash"

    # `count` tracks how many images were processed; add an early `break` on it
    # to limit the run while debugging.
    count = 0
    for path in img_filepaths:
        count += 1
        response = gemini_img2json(path)

        json_path = gemini_img2json_dir / Path(path.stem + f".{doc_format}")
        with open(json_path, 'w') as file:
            file.write(response)

[file retrieval] 2025-07-08 11:17:25 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-08 11:18:19 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 11:18:19 [INFO] AFC remote call 1 is done.
[file retrieval] 2025-07-08 11:18:20 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-08 11:19:09 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 11:19:09 [INFO] AFC remote call 1 is done.
[file retrieval] 2025-07-08 11:19:09 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-08 11:19:55 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 11:19:55 [INFO] AFC remote call 1 is done.
[file retrieval] 20

#### b. Text to JSON

In [45]:
# Text -> JSON with gemini-2.5-flash (json mode only). The input is the Gemini
# OCR-LLM text transcription stored under results/txt, whatever the current
# doc_format; the output goes to <output_dir>/llm-txt2<doc_format>/gemini-2.5-flash/.
if doc_format == "json":
    gemini_txt_source_dir = root_dir / "results" / "txt" / "ocr-llm-img2txt" / "gemini-2.5-flash"
    gemini_txt2json_dir = output_dir / f"llm-txt2{doc_format}" / "gemini-2.5-flash"

    # `count` tracks how many pages were processed; add an early `break` on it
    # to limit the run while debugging.
    count = 0
    for path in img_filepaths:
        ocr_text_path = str(gemini_txt_source_dir / path.stem) + ".txt"
        count += 1
        response = gemini_txt2json(ocr_text_path)

        json_path = gemini_txt2json_dir / Path(path.stem + f".{doc_format}")
        with open(json_path, 'w') as file:
            file.write(response)

[file retrieval] 2025-07-08 11:28:39 [INFO] AFC is enabled with max remote calls: 10.


[file retrieval] 2025-07-08 11:29:14 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 11:29:14 [INFO] AFC remote call 1 is done.
[file retrieval] 2025-07-08 11:29:14 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-08 11:29:57 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 11:29:57 [INFO] AFC remote call 1 is done.
[file retrieval] 2025-07-08 11:29:57 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-08 11:30:47 [INFO] HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
[file retrieval] 2025-07-08 11:30:48 [INFO] AFC remote call 1 is done.
[file retrieval] 2025-07-08 11:30:48 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 20

## 6. Send to Claude

### a. OCR-LLM call

In [None]:
# for path in img_filepaths:
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         max_tokens=10,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_ocr_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )
#     print(response)

#     with open(output_dir / "ocr-llm-img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.content[0].text)

### b. LLM call (without OCR)

In [None]:
# for path in img_filepaths:
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )

#     with open(output_dir / "llm-img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.content[0].text)

## 7. Benchmark results

In [37]:
import glob
import json
import sys
sys.path.append(str(Path.cwd().
parent))
from benchmarking.txt_accuracy import clean_text_normalized, clean_text_nonorm, compute_metrics, build_dataframe
from tools.file_retrieval import get_doc_names, get_docs, get_all_models
from venv import logger
from datetime import datetime

def _json_doc_length(json_data):
    """Return the summed length of every field value in ``json_data["entries"]``.

    String values contribute their character count. ``None`` contributes
    nothing and any other value is measured through ``str()``.
    NOTE(review): the ground-truth JSON schema is not visible here -- confirm
    that this is the right measure for non-string fields.
    """
    total = 0
    for entry in json_data["entries"]:
        for value in entry.values():
            if value is None:
                continue
            total += len(value) if isinstance(value, str) else len(str(value))
    return total


def main():
    """Benchmark every transcription under ``output_dir`` against the ground truth.

    Uses the notebook-level ``root_dir``, ``output_dir`` and ``doc_format``.

    Prerequisites:
    - Ground truth files located in `<root_dir>/data/ground-truth/<doc_format>/`
    - Transcriptions located in `<output_dir>/<model_type>/<model>/`, where the
      model types searched are:
        - txt: `llm-img2txt`, `ocr-img2txt`, `ocr-llm-img2txt`
        - any other format: `llm-img2<doc_format>`, `llm-txt2<doc_format>`

    The main function will:
    - Gather all ground truth documents (plus the combined `__ALL__` entry)
    - For each (model type, model) pair, gather the corresponding transcriptions
    - Compute normalized and non-normalized metrics per document and for `__ALL__`
    - Save, for each model type, two CSV files named `normalized_<timestamp>.csv`
      and `nonorm_<timestamp>.csv` in
      `<root_dir>/benchmarking-results/<doc_format>-accuracy/<model_type>/`
    """

    # =============
    # Preliminaries
    # =============

    script_dir = str(Path.cwd())
    project_root = str(root_dir)
    logger.info("Script directory: %s", script_dir)
    logger.info("Project root: %s", project_root)

    # Ground truth
    ground_truth_dir = root_dir / "data" / "ground-truth" / doc_format
    doc_names = get_doc_names(ground_truth_dir, doc_format, keep_prefix=False)

    # results/ paths: all_models is a sequence of (model_type, model) pairs
    if doc_format == "txt":
        all_models = get_all_models(
            doc_format,
            os.path.join(output_dir, f"llm-img2{doc_format}"),
            os.path.join(output_dir, "ocr-img2txt"),
            os.path.join(output_dir, f"ocr-llm-img2{doc_format}"),
        )
    else:
        all_models = get_all_models(
            doc_format,
            os.path.join(output_dir, f"llm-img2{doc_format}"),
            os.path.join(output_dir, f"llm-txt2{doc_format}"),
        )
    logger.info("Models found: %s", all_models)

    # ============
    # Gather files
    # ============

    # -> Gather ground truths and put into dict:
    ground_truths, all_texts = get_docs(ground_truth_dir, doc_names, doc_format, name_has_prefix=True)
    ground_truths["__ALL__"] = all_texts
    if doc_format == "txt":
        doc_lengths_normalized = {
            doc: len(clean_text_normalized(text)) for doc, text in ground_truths.items()
        }
        doc_lengths_nonorm = {
            doc: len(clean_text_nonorm(text)) for doc, text in ground_truths.items()
        }
        total_doc_len_normalized = len(clean_text_normalized(ground_truths["__ALL__"]))
        total_doc_len_nonorm = len(clean_text_nonorm(ground_truths["__ALL__"]))
    elif doc_format == "json":
        doc_lengths_normalized, doc_lengths_nonorm = {}, {}
        total_doc_len_normalized = total_doc_len_nonorm = 0
        for doc, json_data in ground_truths.items():
            # Length of a document = summed length of its field values (the
            # previous code added the number of fields per entry instead).
            # NOTE(review): no text cleaning is applied here, so the normalized
            # and non-normalized lengths are identical, as they were before.
            doc_len = _json_doc_length(json_data)
            doc_lengths_normalized[doc] = doc_len
            doc_lengths_nonorm[doc] = doc_len

            # "__ALL__" aggregates the other documents; leaving it out of the
            # running total avoids counting every document twice.
            if doc != "__ALL__":
                total_doc_len_normalized += doc_len
                total_doc_len_nonorm += doc_len

    # -> Gather each transcribed document and put into dict:

    # Structure: results[model_type][model][doc]
    results = {}

    for model_type, model in all_models:
        logger.info("Collecting results for model: %s", model)
        model_path = os.path.join(output_dir, model_type, model)
        model_docs, model_all_texts = get_docs(model_path, doc_names, doc_format, name_has_prefix=False)
        model_docs["__ALL__"] = model_all_texts
        results.setdefault(model_type, {})[model] = model_docs
        logger.info("Collected results for model_type: %s, model: %s", model_type, model)

    # ===============
    # Compute metrics
    # ===============

    # Structure of both dicts: data[model_type][model][doc]
    normalized_results_data = {}
    nonorm_results_data = {}

    for model_type, model in all_models:
        normalized_metrics = normalized_results_data.setdefault(model_type, {}).setdefault(model, {})
        nonorm_metrics = nonorm_results_data.setdefault(model_type, {}).setdefault(model, {})
        transcriptions = results[model_type][model]

        logger.info("Computing metrics for model_type: %s, model: %s", model_type, model)
        for doc in doc_names:
            logger.info("Computing metrics for document: %s", doc)
            normalized_metrics[doc] = compute_metrics(
                ground_truths[doc], transcriptions[doc], doc_format, normalized=True
            )
            nonorm_metrics[doc] = compute_metrics(
                ground_truths[doc], transcriptions[doc], doc_format, normalized=False
            )

        # Metrics over the combined text of all documents
        normalized_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], transcriptions["__ALL__"], doc_format, normalized=True
        )
        nonorm_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], transcriptions["__ALL__"], doc_format, normalized=False
        )

    # ====================
    # Put metrics in table
    # ====================

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    results_base_dir = root_dir / "benchmarking-results" / f"{doc_format}-accuracy"

    # One results directory per model type. all_models holds one pair per model,
    # so the types are de-duplicated (keeping first-seen order) to build and
    # write each pair of tables once instead of once per model.
    for model_type in dict.fromkeys(kind for kind, _ in all_models):
        results_dir = results_base_dir / model_type
        results_dir.mkdir(parents=True, exist_ok=True)

        normalized_df = build_dataframe(
            f"normalized_{timestamp}",
            doc_names,
            normalized_results_data[model_type],
            doc_lengths_normalized,
            total_doc_len_normalized,
        )
        nonorm_df = build_dataframe(
            f"nonorm_{timestamp}",
            doc_names,
            nonorm_results_data[model_type],
            doc_lengths_nonorm,
            total_doc_len_nonorm,
        )

        # ============
        # Save results
        # ============

        normalized_df.to_csv(os.path.join(str(results_dir), f"normalized_{timestamp}.csv"))
        nonorm_df.to_csv(os.path.join(str(results_dir), f"nonorm_{timestamp}.csv"))


if __name__ == "__main__":
    main()

[file retrieval] 2025-07-08 15:19:24 [INFO] Script directory: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\src\workflow
[file retrieval] 2025-07-08 15:19:25 [INFO] Project root: c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1
[file retrieval] 2025-07-08 15:19:59 [INFO] Found ground-truth txt files: ['c:\\Users\\vriez\\OneDrive\\Desktop\\Summer MAP\\ocr-benchmarking-1\\data\\ground-truth\\txt\\gt_kbaa-p003.txt', 'c:\\Users\\vriez\\OneDrive\\Desktop\\Summer MAP\\ocr-benchmarking-1\\data\\ground-truth\\txt\\gt_kbaa-p004.txt', 'c:\\Users\\vriez\\OneDrive\\Desktop\\Summer MAP\\ocr-benchmarking-1\\data\\ground-truth\\txt\\gt_kbaa-p005.txt', 'c:\\Users\\vriez\\OneDrive\\Desktop\\Summer MAP\\ocr-benchmarking-1\\data\\ground-truth\\txt\\gt_kbaa-p006.txt', 'c:\\Users\\vriez\\OneDrive\\Desktop\\Summer MAP\\ocr-benchmarking-1\\data\\ground-truth\\txt\\gt_kbaa-p007.txt', 'c:\\Users\\vriez\\OneDrive\\Desktop\\Summer MAP\\ocr-benchmarking-1\\data\\ground-truth\\txt\\gt_

c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\txt\llm-img2txt
c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\txt\ocr-img2txt
c:\Users\vriez\OneDrive\Desktop\Summer MAP\ocr-benchmarking-1\results\txt\ocr-llm-img2txt


[file retrieval] 2025-07-08 15:21:46 [INFO] Models found: [('llm-img2txt', 'gemini-2.5-flash'), ('ocr-llm-img2txt', 'gemini-2.5-flash'), ('llm-img2txt', 'gpt-4o'), ('ocr-llm-img2txt', 'gpt-4o'), ('ocr-img2txt', 'kbaa-p003.txt'), ('ocr-img2txt', 'kbaa-p004.txt'), ('ocr-img2txt', 'kbaa-p005.txt'), ('ocr-img2txt', 'kbaa-p006.txt'), ('ocr-img2txt', 'kbaa-p007.txt'), ('ocr-img2txt', 'kbaa-p008.txt'), ('ocr-img2txt', 'kbaa-p009.txt'), ('ocr-img2txt', 'kbaa-p010.txt'), ('ocr-img2txt', 'kbaa-p011.txt'), ('ocr-img2txt', 'kbaa-p012.txt')]


KeyboardInterrupt: 