# In [None]:
import os
import docx
from utils import ocr_image


def _extract_docx_text(docx_path: str) -> str:
    """Gather paragraph and table-cell text from a DOCX file.

    All entries of ``doc.paragraphs`` are visited first, then every cell of
    every row of every entry in ``doc.tables``. Entries whose text is empty
    or whitespace-only are skipped.

    Args:
        docx_path: Path passed straight to ``docx.Document``.

    Returns:
        The kept text pieces joined with newlines beneath a
        ``--- Document Text ---`` header line, or ``""`` if nothing was kept.
    """

    def _nonblank(texts):
        # Keep only entries that still have characters once stripped.
        return [t for t in texts if t and t.strip()]

    document = docx.Document(docx_path)

    pieces = _nonblank(paragraph.text for paragraph in document.paragraphs)
    for table in document.tables:
        for row in table.rows:
            pieces.extend(_nonblank(cell.text for cell in row.cells))

    if not pieces:
        return ""
    return "--- Document Text ---\n" + "\n".join(pieces)


def _extract_docx_images_ocr(docx_path: str) -> str:
    try:
        doc = docx.Document(docx_path)
        ocr_blocks = []
        for rel in doc.part.rels.values():
            if "image" in getattr(rel, "target_ref", ""):
                try:
                    blob = rel.target_part.blob
                    ocr_text = ocr_image(blob, is_bytes=True)
                    if ocr_text and not ocr_text.startswith("Error") and len(ocr_text.strip()) > 10:
                        ocr_blocks.append("--- Embedded Image OCR ---")
                        ocr_blocks.append(ocr_text)
                except Exception:
                    continue
        return "\n\n".join(ocr_blocks)
    except Exception as exc:
        return f"Error scanning DOCX images: {exc}"


def extract_docx_text_and_ocr(docx_path: str) -> str:
    """Extract body text and embedded-image OCR text from a DOCX file.

    Failures are reported through the returned string rather than raised.

    Args:
        docx_path: Filesystem path of the DOCX file.

    Returns:
        The text section and the image-OCR section (whichever are non-empty)
        separated by a blank line. Note that the image section may itself be
        an ``"Error scanning DOCX images: ..."`` message. Otherwise one of:

        * ``"Error: DOCX not found"`` -- the path is not an existing regular
          file (this includes ``None`` and other non-path values).
        * ``"Warning: No extractable content found in DOCX"`` -- both
          sections came back empty.
        * ``"Error reading DOCX: <detail>"`` -- extraction raised.
    """
    try:
        is_file = os.path.isfile(docx_path)
    except TypeError:
        # os.path.isfile raises TypeError for None and other non-path
        # objects; report that like a missing file instead of crashing,
        # in line with this function's string-based error reporting.
        is_file = False
    if not is_file:
        return "Error: DOCX not found"

    try:
        text_part = _extract_docx_text(docx_path)
        image_part = _extract_docx_images_ocr(docx_path)
        combined = "\n\n".join(p for p in (text_part, image_part) if p).strip()
        return combined or "Warning: No extractable content found in DOCX"
    except Exception as exc:
        return f"Error reading DOCX: {exc}"
