# In [1]:
import io
import os
import shlex
import sys

import pytesseract
from PIL import Image


# Lower-case file extensions (leading dot included) accepted for path inputs.
SUPPORTED_IMAGE_EXTS = {
    '.bmp',
    '.gif',
    '.jpeg',
    '.jpg',
    '.png',
    '.tif',
    '.tiff',
}


def _tesseract_config():
    """Build the ``config`` string passed to ``pytesseract.image_to_string``.

    The string requests ``--oem 3 --psm 6`` and restricts recognition, via
    ``tessedit_char_whitelist``, to ASCII letters, digits, the punctuation
    ``.,!?;:``, the double quote and the space character.

    pytesseract tokenizes ``config`` with ``shlex.split`` (POSIX rules
    everywhere except Windows). Under POSIX rules the bare ``"`` in the
    whitelist opens a quotation that is never closed, so the unquoted string
    makes every OCR call fail with ``ValueError: No closing quotation``.
    The whitelist argument is therefore shell-quoted on POSIX platforms.

    Returns:
        str: the Tesseract command-line options.
    """
    whitelist_chars = (
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'abcdefghijklmnopqrstuvwxyz'
        '0123456789'
        '.,!?;:" '
    )
    whitelist_arg = f'tessedit_char_whitelist={whitelist_chars}'
    if sys.platform != 'win32':
        # Non-POSIX splitting (Windows) treats quotes inside a word as plain
        # characters, so quoting there would leak the quote marks into the
        # whitelist; only quote where POSIX rules apply.
        whitelist_arg = shlex.quote(whitelist_arg)
    return f'--oem 3 --psm 6 -c {whitelist_arg}'


def _open_pil_image(image_input, is_bytes=False):
    """Open ``image_input`` with ``Image.open``.

    Args:
        image_input: raw image data when ``is_bytes`` is true, otherwise a
            filesystem path.
        is_bytes: selects how ``image_input`` is interpreted.

    Returns:
        The object returned by ``Image.open``.

    Raises:
        FileNotFoundError: the path does not name an existing regular file.
        ValueError: the path's extension (compared lower-cased) is not in
            ``SUPPORTED_IMAGE_EXTS``.
    """
    if not is_bytes:
        # Path input: validate existence first, then the extension.
        if not os.path.isfile(image_input):
            raise FileNotFoundError("Image file not found")
        _, suffix = os.path.splitext(image_input)
        suffix = suffix.lower()
        if suffix not in SUPPORTED_IMAGE_EXTS:
            raise ValueError(f"Unsupported image format '{suffix}'")
        source = image_input
    else:
        # In-memory input: no path checks apply.
        source = io.BytesIO(image_input)
    return Image.open(source)


def _preprocess_for_ocr(pil_img: Image.Image) -> Image.Image:
    """Normalize an image before it is passed to OCR.

    The image is converted to RGB when it is in any other mode. If either
    side is shorter than 300 px, the image is upscaled with LANCZOS
    resampling by a single factor (aspect ratio preserved) chosen so that
    both sides reach at least 300 px.

    Args:
        pil_img: the image to prepare.

    Returns:
        The prepared image. When the input is already RGB and large enough,
        the input object itself is returned.
    """
    if pil_img.mode != 'RGB':
        pil_img = pil_img.convert('RGB')
    w, h = pil_img.size
    min_dim = 300
    if w < min_dim or h < min_dim:
        scale = max(min_dim / w, min_dim / h)
        # Round and clamp instead of truncating: floating-point error can
        # leave ``side * scale`` marginally below the target, and int() would
        # then produce a side one pixel short of ``min_dim``.
        new_size = (
            max(min_dim, round(w * scale)),
            max(min_dim, round(h * scale)),
        )
        pil_img = pil_img.resize(new_size, Image.Resampling.LANCZOS)
    return pil_img


def ocr_image(image_path_or_bytes, is_bytes=False) -> str:
    """Extract text from an image with Tesseract.

    Args:
        image_path_or_bytes: a filesystem path, or raw image data when
            ``is_bytes`` is true.
        is_bytes: selects how ``image_path_or_bytes`` is interpreted.

    Returns:
        The recognized text with every line stripped and empty lines
        removed, joined by ``"\\n"``. This function does not propagate
        ``Exception`` subclasses: on any such failure it returns the string
        ``"Error during OCR: <message>"`` instead.
    """
    try:
        # Use the image as a context manager so the underlying file handle
        # is released as soon as OCR finishes rather than whenever the
        # object happens to be garbage-collected.
        with _open_pil_image(image_path_or_bytes, is_bytes=is_bytes) as source_img:
            prepared = _preprocess_for_ocr(source_img)
            text = pytesseract.image_to_string(prepared, config=_tesseract_config())
        stripped_lines = (line.strip() for line in text.splitlines())
        return '\n'.join(line for line in stripped_lines if line)
    except Exception as exc:
        # Deliberate best-effort boundary: callers receive a string either way.
        return f"Error during OCR: {exc}"
