# OCR-mLLM Pipeline

Before running this code you will need to set up your OpenAI, Anthropic, and Google (Gemini) API keys. Here's how I did it:

1. Create a new file in your root directory called `.env` (the filename is just `.env`, with nothing before the dot)
2. Store your API keys with the following names: OPENAI_API_KEY, ANTHROPIC_API_KEY, and GOOGLE_API_KEY
3. Create a virtual environment by typing the following commands into your terminal:
    - ```python3 -m venv .venv```
    - ```source .venv/bin/activate```
    - ```pip install -r requirements.txt```
4. After running the pipeline, type ```deactivate``` in your terminal to exit the virtual environment

## 1. Setup

### a. Run this cell to ensure you have all the necessary directories

Before running the cell, make sure your `.png` images are in the `data/pngs` folder under the project root; the pipeline reads its input images from there

In [None]:
from pathlib import Path
import os
import sys
import pytesseract
from PIL import Image
import asyncio
from venv import logger
from json_creation import *
from google.genai import types
sys.path.append('../')

# Get the root directory of the project (two levels above the current working directory)
root_dir = Path.cwd().parent.parent

# sys.path entries must be plain strings: the import system ignores any other
# type, so the tools directory is converted to str before being appended.
sys.path.append(str(root_dir / "tools"))
from tools.file_retrieval import *
                
# Input folders: the page images (assumed to all be .png files) and the
# OCR+LLM transcriptions that the text-to-JSON steps read back in
source_dir = root_dir.joinpath("data", "pngs")
txt_source_dir = root_dir.joinpath("results", "txt", "ocr-llm-img2txt")

# Output folders for transcriptions and JSON; each is created if it doesn't exist
txt_output_dir = root_dir.joinpath("results", "txt")
txt_output_dir.mkdir(parents=True, exist_ok=True)

json_output_dir = root_dir.joinpath("results", "json")
json_output_dir.mkdir(parents=True, exist_ok=True)

# Output folders for the benchmarking results; each is created if it doesn't exist
bm_txt_output_dir = root_dir.joinpath("benchmarking-results", "txt-accuracy")
bm_txt_output_dir.mkdir(parents=True, exist_ok=True)

bm_json_output_dir = root_dir.joinpath("benchmarking-results", "json-accuracy")
bm_json_output_dir.mkdir(parents=True, exist_ok=True)

# Provider -> model name; the model names are also used as directory names
# llm_array = ["gpt-4o", "gemini-2.5-flash", "claude-4-sonnet"]
llms = dict(openai="gpt-4o", google="gemini-2.5-flash")
# llms = {"openai": "gpt-4o", "google": "gemini-2.5-flash", "alibaba": "qwen2.5-vl-72b-instruct", "meta": "llama-4-maverick"}

def make_llm_dirs(llms, target_dir, doc_format):
    """Create the per-model output directories under ``target_dir``.

    Args:
        llms: Mapping whose values are model names (e.g. ``"gpt-4o"``). Only
            the values are used; each one becomes a directory name.
        target_dir: Base output directory, given as a ``Path`` or a ``str``.
        doc_format: ``"txt"`` creates ``ocr-img2txt/pytesseract`` plus
            ``llm-img2txt/<model>`` and ``ocr-llm-img2txt/<model>``; any other
            value creates ``llm-img2json/<model>`` and ``llm-txt2json/<model>``.

    Existing directories are left untouched (``exist_ok=True``).
    """
    base = Path(target_dir)
    if doc_format == "txt":
        # The pytesseract folder does not depend on the model, so it is
        # created once here rather than once per loop iteration.
        (base / "ocr-img2txt" / "pytesseract").mkdir(parents=True, exist_ok=True)
        stages = ("llm-img2txt", "ocr-llm-img2txt")
    else:
        stages = ("llm-img2json", "llm-txt2json")

    for model in llms.values():
        for stage in stages:
            (base / stage / model).mkdir(parents=True, exist_ok=True)
# Create the per-model folders for the text results and for the JSON results
make_llm_dirs(llms, txt_output_dir, "txt")
make_llm_dirs(llms, json_output_dir, "json")

### b. Setup API keys & image encoding function

In [None]:
# Optional: load the variables stored in the local .env file (see the setup
# steps at the top). Skip this cell if the API keys are already set in your
# environment.
from dotenv import load_dotenv

load_dotenv()

In [2]:
from openai import OpenAI
from anthropic import Anthropic
from google import genai
import base64
from json_creation import *
from txt_creation import *

# Read the API keys from the environment; os.getenv returns None when a variable is not set
openai_api_key = os.getenv("OPENAI_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")

# Base64 helper used when an image is embedded directly in a request payload
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string.

    The file is opened in binary mode and the base64 output is decoded as
    UTF-8, so the result is a ``str``.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


# One API client per provider. The OpenAI and Anthropic keys come from the
# variables read above; the Google key is read from the environment inline.
gpt_client = OpenAI(api_key=openai_api_key)
gemini_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
claude_client = Anthropic(api_key=anthropic_api_key)


### c. Get image file paths

In [3]:
# Collect the full path of every .png file in the images directory.
# iterdir() yields entries in arbitrary order, so the list is sorted to give a
# deterministic processing order from run to run.
# NOTE(review): process_double_async receives this list together with a
# separately fetched list of OCR text paths - confirm get_paths returns a
# matching order.
img_filepaths = sorted(
    path
    for path in source_dir.iterdir()
    if path.is_file() and path.suffix.lower() == ".png"
)

# Filled in later with the paths of the pytesseract output files
ocr_output_filepaths = []

## 2. Run pytesseract

In [None]:
# Windows users should run this cell, inserting their path to Tesseract.
# The raw string (r'...') keeps the backslashes in the path from being treated as escapes.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

In [None]:
# Run pytesseract on every image in img_filepaths and write the raw OCR text to
# <txt_output_dir>/ocr-img2txt/pytesseract/<image stem>.txt
for path in img_filepaths:
    file_name = txt_output_dir / "ocr-img2txt" / "pytesseract" / f"{path.stem}.txt"

    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes instead of staying open for the rest of the session
    with Image.open(path) as page_image:
        ocr_text = pytesseract.image_to_string(page_image)  # TODO: Change config as needed

    with open(file_name, 'w') as file:
        file.write(ocr_text)

## 3. Prepare the prompt

In [4]:
# Prompt template for the OCR-LLM correction step. The {input} placeholder is
# filled with the raw OCR text via str.format() before the request is sent.
prompt_template_ocr_llm = """
You are a text correction assistant. Your task is to clean up and correct errors from raw OCR output.
The text may contain misrecognized characters, broken words, or incorrect formatting.
Carefully read the provided OCR output, compare it to the original image, and produce a corrected version that is  
as faithful to the original content as possible. Only correct obvious OCR errors, and do not attempt to complete
cut-off entries or predict missing information. Put each entry on a separate line.
When an entry has an index number in square brackets, place it at the end of the entry.
Input (Raw OCR Text):
{input}
"""

# Prompt for the image-only transcription step (no OCR text is supplied); sent
# as-is together with the page image.
prompt_llm = """
Your task is to transcribe this image of a historical bibliography page as faithfully as possible.
Only transcribe typed text that appears on the page and do not attempt to predict missing information or complete cut off entries. 
Put each entry on a separate line. When an entry has an index number in square brackets, place it at the end of the entry. 
"""



## 4. OpenAI

### (i) Text

#### a. OCR-LLM call

In [None]:
# OCR-LLM (OpenAI): send each page image together with its raw pytesseract text
# to gpt-4o and save the corrected transcription.
for path in img_filepaths:
    base64_image = encode_image(path)

    # The raw OCR text always comes from the pytesseract folder, whichever LLM does the correction
    ocr_text_path = str(txt_output_dir / "ocr-img2txt" / "pytesseract" / path.stem) + ".txt"
    with open(ocr_text_path, 'r') as file:
        # Stored as ocr_text rather than `input` so the builtin input() is not shadowed
        ocr_text = file.read()
    prompt_ocr_llm = prompt_template_ocr_llm.format(input=ocr_text).strip()

    response = gpt_client.chat.completions.create(
        model='gpt-4o',
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_ocr_llm},
                    {
                        "type": "image_url",
                        # The PNG is embedded in the request as a base64 data URL
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )

    with open(txt_output_dir / "ocr-llm-img2txt" / "gpt-4o" / f"{path.stem}.txt", 'w') as file:
        file.write(response.choices[0].message.content)

#### b. LLM call (without OCR)

In [None]:
# LLM only (OpenAI): send each page image with the transcription prompt to
# gpt-4o and save the reply as <image stem>.txt
for path in img_filepaths:
    base64_image = encode_image(path)

    text_part = {"type": "text", "text": prompt_llm}
    image_part = {
        "type": "image_url",
        # The PNG is embedded in the request as a base64 data URL
        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
    }

    response = gpt_client.chat.completions.create(
        model='gpt-4o',
        temperature=0,
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )

    out_path = txt_output_dir / "llm-img2txt" / "gpt-4o" / f"{path.stem}.txt"
    with open(out_path, 'w') as file:
        file.write(response.choices[0].message.content)

#### c. OCR-LLM (Async)

In [5]:
# Fetch ocr output files
ocr_output_dir = txt_output_dir/"ocr-img2txt"/"pytesseract"
ocr_output_filepaths = get_paths(ocr_output_dir, "txt")

# Run the async processes
await process_double_async(img_filepaths, ocr_output_filepaths, txt_output_dir/"ocr-llm-img2txt", openai_img_txt2txt_async, llms['openai'])

[file retrieval] 2025-07-11 15:43:19 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:43:21 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:43:21 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:43:25 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:44:04 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:44:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:44:12 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:44:14 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file re

#### d. LLM (Async)

In [6]:
# Run the async image-to-text processing with the OpenAI model; results go under llm-img2txt
await process_single_async(img_filepaths, txt_output_dir/"llm-img2txt", openai_img2txt_async, llms['openai'])

[file retrieval] 2025-07-11 15:51:10 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:51:15 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:51:27 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:51:30 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:51:59 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:52:02 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:52:07 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:52:25 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file re

### (ii) JSON

#### a. Image to JSON

In [None]:
# Image -> JSON (OpenAI): one request per page image, saved as <image stem>.json
for path in img_filepaths:
    response = openai_img2json(path)
    out_path = json_output_dir / "llm-img2json" / "gpt-4o" / Path(path.stem + ".json")
    with open(out_path, 'w') as file:
        print("Output path:", out_path)
        file.write(response)

#### b. Text to JSON

In [None]:
# Text -> JSON (OpenAI): convert each OCR-LLM transcription into JSON.
# Named ocr_llm_dir rather than `dir` so the builtin dir() is not shadowed.
ocr_llm_dir = txt_source_dir / llms['openai']  # where to look for ocr-llm-img2txt output

# Get the text paths from the ocr-llm-img2txt/<OpenAI model> directory
txt_filepaths = get_paths(ocr_llm_dir, "txt")

for path in txt_filepaths:
    # Read the OCR-LLM transcription itself. The earlier version rebuilt a path
    # into ocr-img2txt/pytesseract here and so converted the raw OCR text,
    # unlike the Gemini and async variants of this step.
    ocr_llm_text_path = str(ocr_llm_dir / path.stem) + ".txt"
    response = openai_txt2json(ocr_llm_text_path)

    out_path = json_output_dir / "llm-txt2json" / "gpt-4o" / Path(path.stem + ".json")
    with open(out_path, 'w') as file:
        print("Writing to", out_path)
        file.write(response)

#### c. Image to JSON (Async)

In [7]:
# Run the async image-to-JSON processing with the OpenAI model; results go under llm-img2json
await process_json_async(img_filepaths, json_output_dir/"llm-img2json", openai_img2json_async, llms['openai'])

[file retrieval] 2025-07-11 16:02:14 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:02:19 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:02:23 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:02:31 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:03:20 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:03:24 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:03:26 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:03:37 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file re

#### d. Text to JSON (Async)

In [8]:
dir = txt_source_dir / llms['openai'] # where to look for ocr-llm-img2txt output
# Get the text paths from ocr-llm-img2txt/gpt-4o directory
txt_filepaths = get_paths(dir, "txt")

# Call the main function that concurrently runs relevant async function
await process_json_async(txt_filepaths, json_output_dir/"llm-txt2json", openai_txt2json_async, llms['openai'])

[file retrieval] 2025-07-11 16:12:56 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:13:07 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:13:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:13:20 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:13:23 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:13:29 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:13:31 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:13:36 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file re

Retrying in 2.63s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 30000, Requested 926. Please try again in 1.852s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Retrying in 2.28s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 30000, Requested 920. Please try again in 1.84s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:14:37 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:14:37 [INFO] Retrying request to /chat/completions in 1.860000 seconds
[file retrieval] 2025-07-11 16:14:38 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:14:38 [INFO] Retrying request to /chat/completions in 1.840000 seconds
[file retrieval] 2025-07-11 16:14:40 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:14:40 [INFO] Retrying request to /chat/completions in 1.840000 seconds
[file retrieval] 2025-07-11 16:14:42 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:14:42 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"

Retrying in 2.82s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 30000, Requested 911. Please try again in 1.822s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:15:05 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:05 [INFO] Retrying request to /chat/completions in 1.868000 seconds
[file retrieval] 2025-07-11 16:15:06 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:06 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:06 [INFO] Retrying request to /chat/completions in 1.972000 seconds
[file retrieval] 2025-07-11 16:15:06 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:06 [INFO] Retrying request to /chat/completions in 1.920000 seconds
[file retrieval] 2025-07-11 16:15:07 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"

Retrying in 2.01s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 30000, Requested 934. Please try again in 1.868s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:15:07 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:07 [INFO] Retrying request to /chat/completions in 1.822000 seconds
[file retrieval] 2025-07-11 16:15:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:08 [INFO] Retrying request to /chat/completions in 1.972000 seconds
[file retrieval] 2025-07-11 16:15:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:08 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:08 [INFO] Retrying request to /chat/completions in 1.920000 seconds
[file retrieval] 2025-07-11 16:15:09 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"

Retrying in 2.68s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 29616, Requested 986. Please try again in 1.204s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:15:11 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:11 [INFO] Retrying request to /chat/completions in 0.458000 seconds
[file retrieval] 2025-07-11 16:15:11 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:11 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:11 [INFO] Retrying request to /chat/completions in 0.226000 seconds
[file retrieval] 2025-07-11 16:15:13 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:13 [INFO] Retrying request to /chat/completions in 1.972000 seconds
[file retrieval] 2025-07-11 16:15:15 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"

Retrying in 4.79s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 29194, Requested 986. Please try again in 360ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:15:26 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:26 [INFO] Retrying request to /chat/completions in 1.654000 seconds
[file retrieval] 2025-07-11 16:15:37 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:15:37 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:15:37 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:37 [INFO] Retrying request to /chat/completions in 1.774000 seconds
[file retrieval] 2025-07-11 16:15:39 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:39 [INFO] Retrying request to /chat/completions in 1.774000 seconds
[file retrieval] 2025-07-11 1

Retrying in 2.58s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 29415, Requested 887. Please try again in 604ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:15:51 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:15:51 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:51 [INFO] Retrying request to /chat/completions in 1.882000 seconds
[file retrieval] 2025-07-11 16:15:52 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:52 [INFO] Retrying request to /chat/completions in 1.774000 seconds
[file retrieval] 2025-07-11 16:15:53 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:15:53 [INFO] Retrying request to /chat/completions in 1.882000 seconds
[file retrieval] 2025-07-11 16:15:53 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieva

Retrying in 2.80s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 30000, Requested 936. Please try again in 1.872s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Retrying in 2.55s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 30000, Requested 939. Please try again in 1.878s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:16:13 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:13 [INFO] Retrying request to /chat/completions in 1.790000 seconds
[file retrieval] 2025-07-11 16:16:14 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:14 [INFO] Retrying request to /chat/completions in 1.878000 seconds
[file retrieval] 2025-07-11 16:16:15 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:15 [INFO] Retrying request to /chat/completions in 1.872000 seconds
[file retrieval] 2025-07-11 16:16:15 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:15 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"

Retrying in 2.70s after error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-N8S8KUq02KYXhJXKPXPfRjJq on tokens per min (TPM): Limit 30000, Used 30000, Requested 895. Please try again in 1.79s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[file retrieval] 2025-07-11 16:16:19 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:19 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:19 [INFO] Retrying request to /chat/completions in 1.878000 seconds
[file retrieval] 2025-07-11 16:16:21 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:21 [INFO] Retrying request to /chat/completions in 1.872000 seconds
[file retrieval] 2025-07-11 16:16:21 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
[file retrieval] 2025-07-11 16:16:21 [INFO] Retrying request to /chat/completions in 1.242000 seconds
[file retrieval] 2025-07-11 16:16:22 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"

## 5. Gemini


### (i) Text

#### a. OCR-LLM call

In [None]:
# OCR-LLM (Gemini): upload each page image, send it with its raw pytesseract
# text to gemini-2.5-flash, and save the corrected transcription.
for path in img_filepaths:
    my_file = gemini_client.files.upload(file=path)

    # The raw OCR text always comes from the pytesseract folder, whichever LLM does the correction
    ocr_text_path = str(txt_output_dir / "ocr-img2txt" / "pytesseract" / path.stem) + ".txt"
    with open(ocr_text_path, 'r') as file:
        # Stored as ocr_text rather than `input` so the builtin input() is not shadowed
        ocr_text = file.read()
    prompt_ocr_llm = prompt_template_ocr_llm.format(input=ocr_text).strip()

    response = gemini_client.models.generate_content(
        model='gemini-2.5-flash',
        config=types.GenerateContentConfig(temperature=0),
        contents=[prompt_ocr_llm, my_file],
    )

    with open(txt_output_dir / "ocr-llm-img2txt" / "gemini-2.5-flash" / f"{path.stem}.txt", 'w') as file:
        file.write(response.text)

#### b. LLM call (without OCR)

In [None]:
# LLM only (Gemini): upload each page image, send it with the transcription
# prompt to gemini-2.5-flash, and save the reply as <image stem>.txt
for path in img_filepaths:
    my_file = gemini_client.files.upload(file=path)

    generation_config = types.GenerateContentConfig(temperature=0)
    response = gemini_client.models.generate_content(
        model='gemini-2.5-flash',
        config=generation_config,
        contents=[prompt_llm, my_file],
    )

    out_path = txt_output_dir / "llm-img2txt" / "gemini-2.5-flash" / f"{path.stem}.txt"
    with open(out_path, 'w') as file:
        file.write(response.text)

#### c. LLM call (Async)

In [None]:
# Run the async image-to-text processing with the Gemini model; results go under llm-img2txt
await process_single_async(img_filepaths, txt_output_dir/"llm-img2txt", gemini_img2txt_async, llms['google'])

#### d. OCR-LLM (Async)

In [None]:
# Fetch ocr output files
ocr_output_dir = txt_output_dir/"ocr-img2txt"/"pytesseract"
ocr_output_filepaths = get_paths(ocr_output_dir, "txt")

# Run the async processes
await process_double_async(img_filepaths, ocr_output_filepaths, txt_output_dir/"ocr-llm-img2txt", gemini_img_txt2txt_async, llms['google'])

### (ii) JSON

#### a. Image to JSON

In [None]:
# Image -> JSON (Gemini): one request per page image, saved as <image stem>.json
for path in img_filepaths:
    response = gemini_img2json(path)
    out_path = json_output_dir / "llm-img2json" / "gemini-2.5-flash" / f"{path.stem}.json"
    with open(out_path, 'w') as file:
        file.write(response)

#### b. Text to JSON

In [None]:

# Text -> JSON (Gemini): for each page image, convert the matching OCR-LLM
# transcription from ocr-llm-img2txt/gemini-2.5-flash into JSON
for path in img_filepaths:
    ocr_llm_txt_dir = root_dir / "results" / "txt" / "ocr-llm-img2txt" / "gemini-2.5-flash"
    ocr_text_path = f"{ocr_llm_txt_dir / path.stem}.txt"  # passed on as a plain string
    response = gemini_txt2json(ocr_text_path)

    out_path = json_output_dir / "llm-txt2json" / "gemini-2.5-flash" / f"{path.stem}.json"
    with open(out_path, 'w') as file:
        file.write(response)

#### c. Image to JSON (Async)

In [5]:
# Run the async image-to-JSON processing with the Gemini model; results go under llm-img2json
await process_json_async(img_filepaths, json_output_dir/"llm-img2json", gemini_img2json_async, llms['google'])

[file retrieval] 2025-07-11 10:52:58 [INFO] AFC is enabled with max remote calls: 10.
[file retrieval] 2025-07-11 10:52:58 [INFO] AFC is enabled with max remote calls: 10.


#### d. Text to JSON (Async)

In [None]:

dir = txt_source_dir / llms['google'] # where to look for ocr-llm-img2txt output

# Get the text paths from ocr-llm-img2txt/gpt-4o directory
txt_filepaths = get_paths(dir, "txt")

# Call the main function that concurrently runs relevant async function
await process_json_async(txt_filepaths, json_output_dir/"llm-txt2json", gemini_txt2json_async, llms['google'])


## 6. Claude

### a. OCR-LLM call

In [None]:
# for path in img_filepaths:
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         max_tokens=10,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_ocr_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )
#     print(response)

#     with open(txt_output_dir / "ocr-llm-img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.choices[0].message.content)

### b. LLM call (without OCR)

In [None]:
# for path in img_filepaths:
#     base64_image = encode_image(path)

#     response = claude_client.messages.create(
#         model='claude-opus-4-20250514',
#         temperature=0,
#         messages=[
#             {
#                 "role": "user", 
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt_llm
#                     },
#                     {
#                         "type": "image",
#                         "source": {
#                             "type": "base64",
#                             "media_type": "image/png",
#                             "data": base64_image
#                         }
#                     }
#                 ]
#             }
#             ]
#     )

#     with open(txt_output_dir / "llm-img2txt" / "claude-4-sonnet" / Path(path.stem + ".txt"), 'w') as file:
#         file.write(response.choices[0].message.content)

## 7. Qwen

### (i) Text

#### a. LLM (Async)

In [5]:
# Run the async image-to-text processing with the Qwen model via the OpenRouter helper.
# NOTE(review): this needs an 'alibaba' entry in llms. The llms dict that is
# active in the setup cell only defines 'openai' and 'google', so switch to the
# extended (commented-out) dict there first, otherwise this lookup raises KeyError.
await process_single_async(img_filepaths, txt_output_dir/"llm-img2txt", openrouter_img2txt_async, llms['alibaba'])

Here
Here
qwen2.5-vl-72b-instruct
qwen/qwen2.5-vl-72b-instruct
qwen2.5-vl-72b-instruct
qwen/qwen2.5-vl-72b-instruct


[file retrieval] 2025-07-11 15:44:05 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:44:07 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


#### b. OCR-LLM (Async)

In [5]:
# Fetch ocr output files
ocr_output_dir = txt_output_dir/"ocr-img2txt"/"pytesseract"
ocr_output_filepaths = get_paths(ocr_output_dir, "txt")

# Run the async processes
await process_double_async(img_filepaths, ocr_output_filepaths, txt_output_dir/"ocr-llm-img2txt", openrouter_img_txt2txt_async, llms['alibaba'])

[file retrieval] 2025-07-11 15:52:29 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:52:32 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


### (ii) JSON

## 8. Llama

### (i) Text

#### a. LLM (Async)

In [7]:
# Run the async image-to-text processing with the Llama model via the OpenRouter helper.
# NOTE(review): this needs a 'meta' entry in llms. The llms dict that is active
# in the setup cell only defines 'openai' and 'google', so switch to the
# extended (commented-out) dict there first, otherwise this lookup raises KeyError.
await process_single_async(img_filepaths, txt_output_dir/"llm-img2txt", openrouter_img2txt_async, llms['meta'])

Here
Here


[file retrieval] 2025-07-11 15:59:09 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 15:59:10 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


#### b. OCR-LLM (Async)

In [8]:
# Fetch ocr output files
ocr_output_dir = txt_output_dir/"ocr-img2txt"/"pytesseract"
ocr_output_filepaths = get_paths(ocr_output_dir, "txt")

# Run the async processes
await process_double_async(img_filepaths, ocr_output_filepaths, txt_output_dir/"ocr-llm-img2txt", openrouter_img_txt2txt_async, llms['meta'])

[file retrieval] 2025-07-11 16:02:48 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
[file retrieval] 2025-07-11 16:02:49 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


### (ii) JSON

## 9. Benchmark results

### a. Text accuracy benchmarking

In [None]:
import glob
import sys
sys.path.append(str(Path.cwd().
parent))
from benchmarking.txt_accuracy import clean_text_normalized, clean_text_nonorm, compute_metrics, build_dataframe
from tools.file_retrieval import get_doc_names, get_docs, get_all_models
from datetime import datetime
from venv import logger

def main():
    """
    Benchmark every model's text transcriptions against the ground truth.

    Relies on the notebook globals `root_dir` and `txt_output_dir` defined in
    the setup cell.

    Prerequisites:
    - Ground truth text files located in `<root_dir>/data/ground-truth/txt/`
    - Transcribed text files located in
      `<root_dir>/results/txt/<MODEL-TYPE>/<MODEL-NAME>/`, where `<MODEL-TYPE>`
      is one of:
        - `llm-img2txt` (LLM transcriptions)
        - `ocr-img2txt` (OCR transcriptions)
        - `ocr-llm-img2txt` (OCR + LLM transcriptions)

    The main function will:
    - Gather all ground truth text files
    - For each ground truth text file and for each model, gather the
      corresponding transcription
    - Compute metrics for each file and model, both normalized and
      non-normalized, plus an aggregate entry stored under the `__ALL__` key
    - Save, for each model type, two CSV files (one for normalized, one for
      non-normalized)
        - Results are saved in
          `<root_dir>/benchmarking-results/txt-accuracy/<MODEL-TYPE>/`
    """

    # =============
    # Preliminaries
    # =============

    logger.info("Script directory: %s", str(Path.cwd()))
    logger.info("Project root: %s", str(root_dir))

    # Ground truth
    ground_truth_dir = root_dir / "data" / "ground-truth" / "txt"
    doc_names = get_doc_names(ground_truth_dir, "txt", keep_prefix=False)

    # results/ paths; iterated below as (model_type, model) pairs
    all_models = get_all_models(
        "txt",
        os.path.join(txt_output_dir, "llm-img2txt"),
        os.path.join(txt_output_dir, "ocr-img2txt"),
        os.path.join(txt_output_dir, "ocr-llm-img2txt"),
    )
    logger.info("Models found: %s", all_models)

    # ============
    # Gather files
    # ============

    # -> Gather ground truths and put into dict:
    ground_truths, all_texts = get_docs(ground_truth_dir, doc_names, "txt", name_has_prefix=True)
    ground_truths["__ALL__"] = all_texts

    # The length dicts include the "__ALL__" entry because it was added above
    doc_lengths_normalized = {
        doc: len(clean_text_normalized(text)) for doc, text in ground_truths.items()
    }
    doc_lengths_nonorm = {
        doc: len(clean_text_nonorm(text)) for doc, text in ground_truths.items()
    }
    # Reuse the aggregate lengths instead of cleaning the whole corpus a second time
    total_doc_len_normalized = doc_lengths_normalized["__ALL__"]
    total_doc_len_nonorm = doc_lengths_nonorm["__ALL__"]

    # -> Gather each transcribed document and put into dict:

    # Structure: results[model_type][model][doc]
    results = {}

    for model_type, model in all_models:
        logger.info("Collecting results for model: %s", model)
        model_path = os.path.join(txt_output_dir, model_type, model)
        model_docs, model_all_texts = get_docs(model_path, doc_names, "txt", name_has_prefix=False)
        model_docs["__ALL__"] = model_all_texts
        results.setdefault(model_type, {})[model] = model_docs
        logger.info("Collected results for model_type: %s, model: %s", model_type, model)

    # ===============
    # Compute metrics
    # ===============

    # Structure: <...>_results_data[model_type][model][doc]
    normalized_results_data = {}
    nonorm_results_data = {}

    for model_type, model in all_models:
        normalized_metrics = normalized_results_data.setdefault(model_type, {}).setdefault(model, {})
        nonorm_metrics = nonorm_results_data.setdefault(model_type, {}).setdefault(model, {})
        model_results = results[model_type][model]

        logger.info("Computing metrics for model_type: %s, model: %s", model_type, model)
        for doc in doc_names:
            logger.info("Computing metrics for document: %s", doc)
            normalized_metrics[doc] = compute_metrics(
                ground_truths[doc], model_results[doc], "txt", normalized=True
            )
            nonorm_metrics[doc] = compute_metrics(
                ground_truths[doc], model_results[doc], "txt", normalized=False
            )

        # Aggregate metrics over the combined text of all documents
        normalized_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], model_results["__ALL__"], "txt", normalized=True
        )
        nonorm_metrics["__ALL__"] = compute_metrics(
            ground_truths["__ALL__"], model_results["__ALL__"], "txt", normalized=False
        )

    # =====================================
    # Put metrics in table and save results
    # =====================================

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    results_base_dir = root_dir / "benchmarking-results" / "txt-accuracy"

    # One results directory per model type. Iterate over the distinct model
    # types (not every (model_type, model) pair) so each pair of CSVs is built
    # and written exactly once.
    for model_type in normalized_results_data:
        results_dir = results_base_dir / model_type
        results_dir.mkdir(parents=True, exist_ok=True)

        normalized_df = build_dataframe(
            f"normalized_{timestamp}",
            doc_names,
            normalized_results_data[model_type],
            doc_lengths_normalized,
            total_doc_len_normalized,
        )
        nonorm_df = build_dataframe(
            f"nonorm_{timestamp}",
            doc_names,
            nonorm_results_data[model_type],
            doc_lengths_nonorm,
            total_doc_len_nonorm,
        )

        normalized_df.to_csv(results_dir / f"normalized_{timestamp}.csv")
        nonorm_df.to_csv(results_dir / f"nonorm_{timestamp}.csv")


# Run the text-accuracy benchmark defined above when this cell is executed
if __name__ == "__main__":
    main()

b. JSON accuracy benchmarking

In [18]:
import glob
import json
import sys
sys.path.append(str(Path.cwd().
parent))
from benchmarking.json_accuracy import filter_expected_columns, build_dataframe, compare_dataframes_normalized, compare_dataframes_exact, compare_dataframes_fuzzy
from tools.file_retrieval import get_doc_names, get_docs, get_all_models
from venv import logger
from datetime import datetime
import pandas as pd

def main():
    """
    Benchmark every model's JSON output against the ground truth.

    Relies on the notebook global `root_dir` defined in the setup cell.

    Prerequisites:
    - Ground truth JSON files located at
      `<root_dir>/data/ground-truth/json/gt_kbaa-pXYZ.json`
    - LLM-generated JSON files located at:
        - for image to JSON via LLM:
            - `<root_dir>/results/json/llm-img2json/<MODEL-NAME>/<MODEL-NAME>_img_kbaa-pXYZ.json`
        - for text to JSON via LLM:
            - `<root_dir>/results/json/llm-txt2json/<MODEL-NAME>/<MODEL-NAME>_img_kbaa-pXYZ.json`

    The main function will:
    - Gather all ground truth JSON files
    - For each ground truth JSON file and for each model, open the JSON file's
      `entries` object as a Pandas dataframe
    - Compare each model dataframe with the ground truth in three ways
      (normalized, exact, and fuzzy)
    - Save, for each model type, three CSV files (normalized, non-normalized,
      and fuzzy)
        - Results are saved in
          `<root_dir>/benchmarking-results/json-accuracy/<MODEL-TYPE>/`
    """

    def entries_to_dataframe(doc_json):
        # Load a document's "entries" array and keep only the expected columns
        return filter_expected_columns(pd.DataFrame(doc_json['entries']))

    # =============
    # Preliminaries
    # =============

    logger.info("Project root: %s", root_dir)

    # Ground truth
    ground_truth_dir = os.path.join(root_dir, "data", "ground-truth", "json")
    doc_names = get_doc_names(ground_truth_dir, "json", keep_prefix=False)

    # results/ paths; iterated below as (model_type, model) pairs.
    # NOTE: gt-txt2json and ocr-txt2json are not benchmarked here; pass their
    # result directories as well to include them.
    all_models = get_all_models(
        "json",
        os.path.join(root_dir, "results", "json", "llm-img2json"),
        os.path.join(root_dir, "results", "json", "llm-txt2json"),
    )
    logger.info("Models found: %s", all_models)

    # ============
    # Gather files
    # ============

    # -> Gather ground truths and put into dict:

    ground_truths_json, _ = get_docs(
        ground_truth_dir, doc_names, "json", name_has_prefix=True
    )

    logger.info("Collected ground truth results: %s", list(ground_truths_json.keys()))

    # Convert JSON to dataframe

    ground_truths_df = {
        doc_name: entries_to_dataframe(doc_json)
        for doc_name, doc_json in ground_truths_json.items()
    }

    logger.info("Converted ground truths to dataframes")

    # -> Gather each transcribed document and put into dict:

    # Structure: results[(model_type, model)][doc]
    results_json = {}  # Stores collected outputs as JSON
    results_df = {}  # Stores collected outputs as dataframes

    for model_type, model in all_models:
        logger.info("Collecting results for model: %s/%s", model_type, model)

        model_path = os.path.join(root_dir, "results", "json", model_type, model)
        # Report the directory through the logger rather than a bare print so
        # it stays in order with the rest of the log output
        logger.info("Reading results from: %s", model_path)
        results_json[(model_type, model)], _ = get_docs(
            model_path, doc_names, "json", name_has_prefix=True
        )

        logger.info("Collected results for model: %s", list(results_json[(model_type, model)].keys()))

        results_df[(model_type, model)] = {
            doc_name: entries_to_dataframe(doc_json)
            for doc_name, doc_json in results_json[(model_type, model)].items()
        }

        logger.info("Converted results to dataframes")

    # ===============
    # Compute metrics
    # ===============

    # Structure: <...>_results_data[model_type][model][doc]
    normalized_results_data = {}
    nonorm_results_data = {}
    fuzzy_results_data = {}

    for model_type, model in all_models:
        normalized_metrics = normalized_results_data.setdefault(model_type, {}).setdefault(model, {})
        nonorm_metrics = nonorm_results_data.setdefault(model_type, {}).setdefault(model, {})
        fuzzy_metrics = fuzzy_results_data.setdefault(model_type, {}).setdefault(model, {})
        model_frames = results_df[(model_type, model)]

        logger.info("Computing metrics for model: %s", model)

        for doc in doc_names:
            logger.info("Computing metrics for document: %s", doc)

            normalized_metrics[doc] = compare_dataframes_normalized(
                ground_truths_df[doc], model_frames[doc]
            )
            nonorm_metrics[doc] = compare_dataframes_exact(
                ground_truths_df[doc], model_frames[doc]
            )
            fuzzy_metrics[doc] = compare_dataframes_fuzzy(
                ground_truths_df[doc], model_frames[doc]
            )

    # =====================================
    # Put metrics in table and save results
    # =====================================

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Iterate over model types:
    for model_type in normalized_results_data:
        normalized_df = build_dataframe(f"{model_type}_normalized_{timestamp}", doc_names, normalized_results_data[model_type])
        nonorm_df = build_dataframe(f"{model_type}_nonorm_{timestamp}", doc_names, nonorm_results_data[model_type])
        fuzzy_df = build_dataframe(f"{model_type}_fuzzy_{timestamp}", doc_names, fuzzy_results_data[model_type])

        results_path = os.path.join(root_dir, "benchmarking-results", "json-accuracy", model_type)
        # exist_ok avoids the separate existence check before creating the directory
        os.makedirs(results_path, exist_ok=True)

        normalized_df.to_csv(os.path.join(results_path, f"{model_type}_normalized_{timestamp}.csv"))
        nonorm_df.to_csv(os.path.join(results_path, f"{model_type}_nonorm_{timestamp}.csv"))
        fuzzy_df.to_csv(os.path.join(results_path, f"{model_type}_fuzzy_{timestamp}.csv"))
    


# Run the JSON-accuracy benchmark defined above when this cell is executed
if __name__ == "__main__":
    main()

[file retrieval] 2025-07-14 15:28:05 [INFO] Project root: /Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking
[file retrieval] 2025-07-14 15:28:05 [INFO] Found ground-truth txt files: ['/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p114.json', '/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p006.json', '/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p010.json', '/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p067.json', '/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p071.json', '/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p070.json', '/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p066.json', '/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/data/ground-truth/json/gt_kbaa-p119.json', '/Users/timyu/projec

/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/results/json/llm-img2json/gemini-2.5-flash
/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/results/json/llm-txt2json/gemini-2.5-flash
/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/results/json/llm-img2json/gpt-4o


[file retrieval] 2025-07-14 15:28:05 [INFO] Collecting results for model: llm-txt2json/gpt-4o
[file retrieval] 2025-07-14 15:28:05 [INFO] Collected results for model: ['kbaa-p114', 'kbaa-p006', 'kbaa-p010', 'kbaa-p067', 'kbaa-p071', 'kbaa-p070', 'kbaa-p066', 'kbaa-p119', 'kbaa-p011', 'kbaa-p007', 'kbaa-p115', 'kbaa-p061', 'kbaa-p124', 'kbaa-p113', 'kbaa-p060', 'kbaa-p063', 'kbaa-p059', 'kbaa-p038', 'kbaa-p043', 'kbaa-p106', 'kbaa-p151', 'kbaa-p003', 'kbaa-p107', 'kbaa-p039', 'kbaa-p058', 'kbaa-p062', 'kbaa-p069', 'kbaa-p100', 'kbaa-p012', 'kbaa-p004', 'kbaa-p073', 'kbaa-p065', 'kbaa-p049', 'kbaa-p008', 'kbaa-p009', 'kbaa-p048', 'kbaa-p121', 'kbaa-p064', 'kbaa-p072', 'kbaa-p005', 'kbaa-p101', 'kbaa-p068']
[file retrieval] 2025-07-14 15:28:05 [INFO] Converted results to dataframes
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for model: gemini-2.5-flash
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p114
[file retrieval] 2025-07-14 15:28:

/Users/timyu/projects/GC-20C-Text-Lab/ocr-benchmarking/results/json/llm-txt2json/gpt-4o


[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p124
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p113
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p060
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p063
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p059
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p038
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p043
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p106
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p151
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p003
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing metrics for document: kbaa-p107
[file retrieval] 2025-07-14 15:28:05 [INFO] Computing 