In [None]:
import os
import fitz  # PyMuPDF
from utils import ocr_image


def _ocr_embedded_image(doc, xref: int) -> str:
    """Run OCR on the embedded image ``xref`` of the open document ``doc``.

    The image is loaded as a pixmap, encoded as PNG and handed to
    ``ocr_image``. Whatever ``ocr_image`` returns is passed back unchanged;
    any exception raised along the way propagates to the caller.
    """
    pix = fitz.Pixmap(doc, xref)
    if pix.n - pix.alpha >= 4:
        # Four or more colour components (e.g. CMYK) cannot be written as
        # PNG directly; convert to RGB first instead of dropping the image.
        pix = fitz.Pixmap(fitz.csRGB, pix)
    return ocr_image(pix.tobytes("png"), is_bytes=True)


def extract_pdf_text_and_ocr(pdf_path: str) -> str:
    """Extract page text and OCR text of embedded images from a PDF.

    Every page contributes its text layer (when non-blank) followed by the
    OCR result of each embedded image. Each piece is preceded by a
    ``--- Page N (...) ---`` header line and all pieces are joined with
    blank lines.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The combined text, or one of these status strings (the function
        reports failures as strings rather than raising):

        * ``"Error: PDF not found"`` when ``pdf_path`` is not an existing
          regular file.
        * ``"Error reading PDF: <exc>"`` when opening or iterating the
          document raises.
        * ``"Warning: No extractable content found in PDF"`` when neither
          text nor usable OCR output was produced.
    """
    if not os.path.isfile(pdf_path):
        return "Error: PDF not found"
    blocks = []
    try:
        with fitz.open(pdf_path) as doc:
            for page_idx, page in enumerate(doc, start=1):
                page_text = page.get_text()
                if page_text and page_text.strip():
                    blocks.append(f"--- Page {page_idx} (Text) ---")
                    blocks.append(page_text)

                # Extract and OCR embedded images
                for img_idx, img in enumerate(page.get_images(full=True), start=1):
                    try:
                        # img[0] is the image's xref (reference id)
                        ocr_text = _ocr_embedded_image(doc, img[0])
                        # Keep only results that are not flagged as errors
                        # (strings starting with "Error") and that carry more
                        # than 10 non-padding characters.
                        # NOTE(review): assumes ocr_image returns a str - confirm
                        # against utils.ocr_image.
                        if (
                            ocr_text
                            and not ocr_text.startswith("Error")
                            and len(ocr_text.strip()) > 10
                        ):
                            blocks.append(f"--- Page {page_idx} (Image {img_idx} OCR) ---")
                            blocks.append(ocr_text)
                    except Exception:
                        # Deliberately best-effort: one unreadable image must
                        # not abort extraction of the rest of the document.
                        continue
        return '\n\n'.join(blocks).strip() or "Warning: No extractable content found in PDF"
    except Exception as exc:
        return f"Error reading PDF: {exc}"
