# OCR-mLLM Pipeline

# 1. Run pytesseract

In [None]:
import os
from pathlib import Path
import pytesseract
from PIL import Image

DEBUG = True

# Project-level folders, resolved two levels above the current working directory.
# Input images are expected to live in `images/` (the pipeline assumes .png files).
source_dir = Path.cwd().parent.parent / "images"

# OCR text is written to `output/`; create the folder if it doesn't exist yet.
target_dir = Path.cwd().parent.parent / "output"
target_dir.mkdir(parents=True, exist_ok=True)

# Collect the full path of every regular file in the images folder.
# Hidden files (e.g. macOS `.DS_Store`) are skipped: they are not images and
# would make `Image.open` fail in the OCR step. Sorting keeps the processing
# order deterministic, since `iterdir()` yields entries in arbitrary order.
img_filepaths = sorted(
    path
    for path in source_dir.iterdir()
    if path.is_file() and not path.name.startswith(".")
)

In [None]:
# Run Tesseract on every collected image and write the recognized text to
# `<target_dir>/<image stem>.txt`.
for path in img_filepaths:
    file_name = target_dir / f"{path.stem}.txt"

    # Run OCR first and close the image handle right away: this avoids leaking
    # one open file per image and avoids leaving an empty .txt if OCR fails.
    with Image.open(path) as image:
        text = pytesseract.image_to_string(image)  # TODO: Change config as needed

    # OCR output may contain non-ASCII characters, so write UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text)

# 2. Send to OpenAI

## (i) Prepare the prompt

In [None]:
# Instruction template sent to the model; `{input}` is replaced by the raw OCR text.
prompt_template = """
You are a text correction assistant. Your task is to clean up and correct errors from raw OCR output.
The text may contain misrecognized characters, broken words, or incorrect formatting.
Carefully read the provided OCR output and produce a corrected version that is grammatically accurate 
and as faithful to the original content as possible. Because this is a historical document, try to 
preserve archaic spelling or formatting where clearly intended. Only correct obvious OCR errors.

Input (Raw OCR Text):
{input}
"""

# Load the raw OCR text for the sample page. The path is built from `target_dir`
# (the folder the OCR step writes to) instead of a hard-coded absolute path, so
# the notebook is not tied to one user's machine. The text is kept in `ocr_text`
# rather than a variable named `input`, which would shadow the builtin.
ocr_text = (target_dir / "kbaa-p 096.txt").read_text(encoding="utf-8")

# `str.format` only interprets braces in the template itself, so any braces
# inside the OCR text are inserted verbatim.
prompt = prompt_template.format(input=ocr_text).strip()

# Alternative prompt for a quick sanity check of the image input:
# prompt = """
# From the provided image, give me the first word and nothing else
# """

Author Ent


## (ii) API Call

In [None]:
from openai import OpenAI
import base64
from dotenv import load_dotenv

# Let python-dotenv populate the process environment, then look up the OpenAI
# key; the result is None when the variable is not set.
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")

def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded to ``str`` so they can be embedded in a data URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
    
# Sample page sent alongside the prompt, embedded as a base64 PNG data URL.
base64_image = encode_image(source_dir / "kbaa-p 096.png")
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
}
text_part = {"type": "text", "text": prompt}

# A single user message carrying the text prompt followed by the image.
client = OpenAI(api_key=openai_api_key)
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": [text_part, image_part]}],
)
# Trailing expression so the notebook displays the model's reply.
response.choices[0].message.content

'A-No.'

# 3. Benchmark results