In [30]:
import os
import re
from pathlib import Path
from typing import List, Optional, Dict, Any

import typer
from rich import print as rprint
from rich.table import Table
from pydantic_settings import BaseSettings

# LangChain & friends
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import Runnable
from langchain.chains import RetrievalQA

import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract

from dotenv import load_dotenv
load_dotenv()

# Fail fast: the rest of the notebook needs an OpenAI credential, which may come
# from the process environment or from the .env file loaded just above.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    rprint("[red]Error: OPENAI_API_KEY environment variable not set. Please set it in your environment or .env file.[/red]")
    raise typer.Exit(code=1)

In [31]:
def _append_to_path(directory: str) -> None:
    """Append ``directory`` to the process PATH unless it is already listed.

    Keeping this idempotent means re-running the cell does not grow PATH.
    """
    current = os.environ.get("PATH", "")
    if directory in current.split(os.pathsep):
        return
    os.environ["PATH"] = current + os.pathsep + directory if current else directory


# --- Poppler (needed by pdf2image.convert_from_path) ---
if "POPPLER_PATH" in os.environ:
    poppler_path = os.environ["POPPLER_PATH"]
    if not Path(poppler_path).exists():
        rprint(f"[red]Warning: POPPLER_PATH is set to '{poppler_path}' but that path does not exist.[/red]")
    else:
        _append_to_path(poppler_path)
else:
    # Fall back to the default Windows install location. This must EXTEND the
    # PATH: assigning it outright would hide every other executable from the
    # kernel process and its subprocesses.
    _append_to_path("C:/tools/poppler-25.07.0/Library/bin")

# --- Tesseract (needed by pytesseract) ---
if "TESSDATA_PREFIX" in os.environ:
    tessdata_prefix = os.environ["TESSDATA_PREFIX"]
    if not Path(tessdata_prefix).exists():
        rprint(f"[red]Warning: TESSDATA_PREFIX is set to '{tessdata_prefix}' but that path does not exist.[/red]")
    else:
        # NOTE(review): this treats TESSDATA_PREFIX as the directory holding
        # tesseract.exe; confirm that matches the local install layout.
        pytesseract.pytesseract.tesseract_cmd = str(Path(tessdata_prefix) / "tesseract.exe")
else:
    pytesseract.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"

# Tunable parameters for the parsing/indexing pipeline, declared as a
# pydantic-settings model so each field has a typed default.
class Settings(BaseSettings):
    # Embedding model name; passed as `model_name` to both HuggingFaceEmbeddings
    # (get_embeddings) and HuggingFaceEmbedding (get_llama_embeddings).
    embed_model: str = "sentence-transformers/all-MiniLM-L6-v2"

    # Chat model name handed to ChatOpenAI in get_llm.
    openai_model: str = "o4-mini"

    # Chunking: same default values as make_documents' chunk_size / chunk_overlap.
    chunk_size: int = 1200
    chunk_overlap: int = 150

    # OCR threshold - if extracted text is shorter than this, we try OCR.
    # NOTE(review): no visible code reads this field yet.
    ocr_trigger_chars: int = 800

    class Config:
        # NOTE(review): presumably lets each field be overridden through a
        # PARSER_-prefixed environment variable - confirm against pydantic-settings docs.
        env_prefix = "PARSER_"

In [32]:
def get_embeddings(settings: Settings):
    """Build the LangChain HuggingFace embedding wrapper for ``settings.embed_model``."""
    model_name = settings.embed_model
    return HuggingFaceEmbeddings(model_name=model_name)


def get_llm(settings: Settings):
    """Return a ``ChatOpenAI`` client for ``settings.openai_model``, or ``None``.

    ``None`` is returned when ``OPENAI_API_KEY`` is empty or when constructing
    the client raises; in the latter case the error is printed as a warning.
    There is no secondary provider: OpenAI is the only backend attempted.
    """
    llm = None
    try:
        if OPENAI_API_KEY:
            llm = ChatOpenAI(model=settings.openai_model)
    except Exception as e:
        rprint(f"[yellow]OpenAI init failed: {e}[/yellow]")
    return llm


# -----------------------------
# PDF Parsing & OCR
# -----------------------------


def ocr_pdf_to_text(pdf_path: Path, dpi: int = 300) -> str:
    """Rasterise every page of ``pdf_path`` at ``dpi`` and OCR it with Tesseract.

    Returns the per-page OCR results joined by newlines, with surrounding
    whitespace stripped.
    """
    page_images = convert_from_path(str(pdf_path), dpi=dpi)
    page_texts: List[str] = [pytesseract.image_to_string(image) for image in page_images]
    return "\n".join(page_texts).strip()


def extract_tables_with_camelot(pdf_path: Path, flavor: str = "lattice") -> List[str]:
    """Extract every table in ``pdf_path`` with Camelot and return them as CSV strings.

    Camelot is an optional dependency. It is imported lazily here so the
    function does not rely on module-level ``HAS_CAMELOT`` / ``camelot`` names
    (neither is defined in the visible notebook, so the previous version raised
    ``NameError`` on a fresh kernel).

    Args:
        pdf_path: PDF file to scan.
        flavor: Camelot parsing flavor, forwarded to ``camelot.read_pdf``.

    Returns:
        One CSV string per detected table; an empty list when Camelot is not
        installed or when extraction fails (the failure is printed as a warning).
    """
    try:
        import camelot
    except ImportError:
        return []

    try:
        tables = camelot.read_pdf(str(pdf_path), flavor=flavor, pages="all")
        return [table.df.to_csv(index=False) for table in tables]
    except Exception as e:
        # Best effort: table extraction must never abort the whole parse.
        rprint(f"[yellow]Camelot failed: {e}. Skipping tables for {pdf_path.name}[/yellow]")
        return []


# -----------------------------
# Pre‑processing & Chunking
# -----------------------------

def clean_text(text: str) -> str:
    """Lightly normalise extracted PDF text without touching its wording.

    Steps, in order:
      1. non-breaking spaces become regular spaces;
      2. horizontal whitespace directly before a newline is removed;
      3. runs of three or more newlines collapse to one blank line;
      4. leading/trailing whitespace is stripped.

    Step 2 deliberately excludes ``\\n`` from the stripped characters. The
    previous pattern ``\\s+\\n`` also consumed newlines, which collapsed every
    paragraph break to a single newline and made step 3 dead code.
    """
    text = text.replace("\u00a0", " ")  # non-breaking space
    text = re.sub(r"[^\S\n]+\n", "\n", text)  # trailing spaces/tabs/CR only
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def make_documents(
    raw_text: str,
    source: str,
    metadata: Optional[Dict[str, Any]] = None,
    chunk_size: int = 1200,
    chunk_overlap: int = 150,
) -> List[Document]:
    """Split ``raw_text`` into chunks and wrap each one in a ``Document``.

    Each document's metadata holds ``source``, then any caller-supplied
    ``metadata`` entries, then ``chunk`` (the chunk's 0-based position). A
    caller-supplied ``source`` overrides the argument; ``chunk`` always wins.
    """
    extra_metadata = metadata or {}
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
    ).split_text(raw_text)

    docs = []
    for position, piece in enumerate(pieces):
        chunk_metadata = {"source": source}
        chunk_metadata.update(extra_metadata)
        chunk_metadata["chunk"] = position
        docs.append(Document(page_content=piece, metadata=chunk_metadata))
    return docs


# -----------------------------
# Index Build & Persist
# -----------------------------

def build_or_update_faiss(
    docs: List[Document], index_dir: Path, settings: Settings
) -> FAISS:
    """Create a FAISS index in ``index_dir`` or extend the one already saved there.

    Args:
        docs: Chunked documents to embed and add.
        index_dir: Directory the index is loaded from and saved to.
        settings: Supplies the embedding model via ``get_embeddings``.

    Returns:
        The vector store after it has been written back to ``index_dir``.
    """
    embeddings = get_embeddings(settings)

    # Test for the saved index file rather than just the directory: an empty
    # or half-created directory would otherwise make load_local fail.
    # ("index.faiss" is the file name save_local writes with its default index_name.)
    if (index_dir / "index.faiss").exists():
        # SECURITY: allow_dangerous_deserialization unpickles the stored docstore.
        # Only point this at index directories produced by this notebook.
        vs = FAISS.load_local(str(index_dir), embeddings, allow_dangerous_deserialization=True)
        if docs:  # nothing to embed when the batch is empty
            vs.add_documents(docs)
    else:
        vs = FAISS.from_documents(docs, embeddings)
        index_dir.mkdir(parents=True, exist_ok=True)
    vs.save_local(str(index_dir))
    return vs


In [33]:
# Run configuration for this notebook session.
input_path = Path("docs")
index_path = Path("./index/faiss")
ocr = True
include_tables = True

all_docs: List[Document] = []

# A single PDF path is used as-is; anything else is searched recursively.
pdf_files = (
    [input_path]
    if input_path.is_file() and input_path.suffix.lower() == ".pdf"
    else list(input_path.rglob("*.pdf"))
)

if len(pdf_files) == 0:
    rprint(f"[red]No PDFs found under {input_path}[/red]")
    raise typer.Exit(code=1)

In [34]:
# PDF used by all the extraction experiments below.
# NOTE(review): picks the second entry of pdf_files by position. pdf_files comes
# from rglob, so this assumes at least two PDFs exist and that their order is
# stable - confirm, or select the file by name.
dec2024_file_pdf = pdf_files[1]

from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser
)
from llama_index.core.schema import BaseNode, Document
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

def get_llama_embeddings(settings: Settings):
    """Build the LlamaIndex HuggingFace embedding model for ``settings.embed_model``."""
    model_name = settings.embed_model
    return HuggingFaceEmbedding(model_name=model_name)

def semantic_spliter_chunker(pdf_path: Path) -> List[BaseNode]:
    """Load ``pdf_path`` with LlamaIndex and split it into semantic nodes.

    The PDF is read through ``SimpleDirectoryReader`` and chunked by
    ``SemanticSplitterNodeParser`` using the embedding model from a fresh
    ``Settings()``.

    The unused ``SentenceSplitter(chunk_size=512)`` that used to be built here
    was removed; it never took part in the result.
    """
    settings = Settings()
    embed_model = get_llama_embeddings(settings)
    splitter = SemanticSplitterNodeParser(
        buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
    )
    documents = SimpleDirectoryReader(input_files=[str(pdf_path)]).load_data()
    return splitter.get_nodes_from_documents(documents)



def extract_text_with_pymupdf(pdf_path: Path, verbose: bool = True) -> str:
    """Return the text of every page in ``pdf_path`` joined by newlines.

    Args:
        pdf_path: PDF file to read with PyMuPDF.
        verbose: When true (the default, matching the previous behaviour),
            print each page number followed by that page's text.

    Returns:
        The concatenated page texts with surrounding whitespace stripped.
    """
    parts: List[str] = []
    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Extract once per page; the result feeds both the debug print and the output.
            page_text = page.get_textpage().extractText()
            if verbose:
                print(page.number)
                print(page_text)
            parts.append(page_text)
    return "\n".join(parts).strip()

# Run the plain-text extraction on the selected PDF. The result is kept in the
# notebook-global `text`; the function itself prints each page as it goes.
rprint(f"[bold cyan]Parsing:[/bold cyan] {dec2024_file_pdf}")
text = extract_text_with_pymupdf(dec2024_file_pdf)
# print(text)

0
Q4 2024 Update
February 4th, 2025 

1
Table of
Contents
Key Highlights
Financial Summary
MAUs & Subscribers
Product & Platform
Outlook
Financial Statements
Executive Summary
2
p.03
p.04
p.06
p.13
p.16
p.21
p.24

2
Executive Summary
USER & FINANCIAL SUMMARY
Q4 2023
Q3 2024
Q4 2024
Y/Y
Q/Q
USERS (M)
Total Monthly Active Users ("MAUs")
602
640
675
12%
5%
Premium Subscribers
236
252
263
11%
4%
Ad-Supported MAUs
379
402
425
12%
6%
FINANCIALS (€M)
Premium
3,170
3,516
3,705
17%
5%
Ad-Supported
501
472
537
7%
14%
Total Revenue
3,671
3,988
4,242
16%
6%
Gross Profit
980
1,240
1,368
40%
10%
Gross Margin
26.7%
31.1%
32.2%
--
--
Operating (Loss)/Income
(75)
454
477
--
5%
Operating Margin
(2.0%)
11.4%
11.2%
--
--
Net Cash Flows From Operating Activities
397
715
883
122%
23%
Free Cash Flow*
396
711
877
121%
23%
* Constant Currency adjusted measures and Free Cash Flow are non-IFRS measures. See "Use of Non-IFRS Measures" and "Reconciliation of IFRS to Non-IFRS Results" for additional information.
We

In [35]:
# nodes = semantic_spliter_chunker(dec2024_file_pdf)

In [36]:
## Need to create MARKDOWN files from nodes and then use SimpleDirectoryReader to read them in and create Documents

In [37]:
# Exploratory pass over the PDF's rawdict blocks, collecting text and image
# blocks with their bounding boxes into `items`.
#
# Changes from the earlier draft of this cell:
#   * `items` and `IMG_DIR` are defined here, so the cell runs on a fresh kernel
#     (they were previously undefined, or defined only in a later cell).
#   * Span text is rebuilt from span["chars"], the same way collect_page_items
#     reads rawdict spans; the draft read span["text"] instead.
#   * Image blocks whose extraction yields no bytes are skipped instead of
#     crashing on f.write(None).
#   * The block text lives in `block_text`, so the notebook-global `text` from
#     the PyMuPDF extraction cell is no longer overwritten.
IMG_DIR = Path("extracted_images")
IMG_DIR.mkdir(parents=True, exist_ok=True)

items: List[Dict[str, Any]] = []
parts: List[str] = []
with fitz.open(dec2024_file_pdf) as doc:
    for page in doc:
        rd = page.get_text("rawdict")
        for blk in rd.get("blocks", []):
            btype = blk.get("type", 0)
            bbox = tuple(blk.get("bbox", (0, 0, 0, 0)))
            x0, y0, x1, y1 = bbox
            if btype == 0:  # text block
                # reconstruct text from lines/spans in order
                lines_out: List[str] = []
                for line in blk.get("lines", []):
                    spans = line.get("spans", [])
                    # join the characters of all spans of one visual line
                    line_text = "".join(
                        ch.get("c", "") for span in spans for ch in span.get("chars", [])
                    )
                    if line_text.strip():
                        lines_out.append(line_text)
                block_text = "\n".join(lines_out).strip()
                if block_text:
                    items.append({"kind": "text", "bbox": bbox, "y": y0, "x": x0, "content": block_text})

            elif btype == 1:  # image block
                xref = blk.get("xref")
                if xref:
                    # extract image bytes
                    img_info = page.parent.extract_image(xref)
                    ext = img_info.get("ext", "png")
                    img_bytes = img_info.get("image")
                    if not img_bytes:
                        continue
                    img_name = f"page{page.number+1}_xref{xref}.{ext}"
                    img_path = IMG_DIR / img_name
                    with open(img_path, "wb") as f:
                        f.write(img_bytes)
                    items.append({
                        "kind": "image",
                        "bbox": bbox, "y": y0, "x": x0,
                        "path": str(img_path),
                        "content": f"[IMAGE: {img_name}]"
                    })

In [38]:
from pathlib import Path
from typing import List, Dict, Any

# Directory that save_block_image writes extracted images into (relative to the
# notebook's working directory); created up front so later writes cannot fail
# on a missing folder.
IMG_DIR = Path("extracted_images")
IMG_DIR.mkdir(parents=True, exist_ok=True)

def md_table(rows: List[List[str]]) -> str:
    """Render a matrix of cells as a GitHub-style Markdown table.

    The first row is the header. ``None`` cells become empty strings and every
    other value is converted with ``str``.

    To keep the output a valid table:
      * ``|`` inside a cell is escaped as ``\\|``;
      * line breaks inside a cell become ``<br>``;
      * short rows are padded with empty cells up to the widest row.

    Args:
        rows: Row-major cell matrix; may be empty or ragged.

    Returns:
        The Markdown table, or ``""`` when ``rows`` is empty.
    """
    if not rows:
        return ""

    def _cell(value) -> str:
        # One table cell must stay on one physical line with no bare pipes.
        if value is None:
            return ""
        text = str(value).replace("\r\n", "\n").replace("\r", "\n")
        return text.replace("|", "\\|").replace("\n", "<br>")

    def _line(cells: List[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    normalized = [[_cell(c) for c in r] for r in rows]
    width = max(len(r) for r in normalized)
    padded = [r + [""] * (width - len(r)) for r in normalized]

    lines = [_line(padded[0]), _line(["---"] * width)]
    lines += [_line(r) for r in padded[1:]]
    return "\n".join(lines)

def save_block_image(page, blk, page_idx: int) -> str | None:
    """
    Save an image for a type=1 block. Supports:
      - xref-based extraction (typical for rawdict)
      - inline 'image' bytes (dict/rawdict with TEXT_PRESERVE_IMAGES)
    Returns the relative file path, or None if skipped.
    """
    # Heuristic: skip tracking pixels / dots
    w = blk.get("width", 0)
    h = blk.get("height", 0)
    byte_size = blk.get("size", 0)
    if (w and h and w <= 8 and h <= 8) or (byte_size and byte_size < 500):
        return None  # skip tiny pixels

    # Prefer xref path if available
    xref = blk.get("xref")
    if xref:
        try:
            img_info = page.parent.extract_image(xref)
            ext = img_info.get("ext", "png")
            data = img_info.get("image")
            if data:
                fname = f"page{page_idx+1}_xref{xref}.{ext}"
                fpath = IMG_DIR / fname
                fpath.write_bytes(data)
                return str(fpath)
        except Exception:
            pass  # fall back to inline

    # Inline bytes path (your case)
    data = blk.get("image")
    if data:
        ext = blk.get("ext", "png")
        fname = f"page{page_idx+1}_img{blk.get('number', 0)}.{ext}"
        fpath = IMG_DIR / fname
        fpath.write_bytes(data)
        return str(fpath)

    return None


def collect_page_items(page) -> List[Dict[str, Any]]:
    """Gather a page's text, image and table items, ordered top-to-bottom.

    Every item is a dict with ``kind`` ("text", "image" or "table"), ``bbox``,
    ``y`` / ``x`` (the bbox's top-left corner) and ``content``; image items
    additionally carry ``path``. Items are sorted by ``(y, x)`` rounded to two
    decimals. Table discovery is best effort: any error from it is ignored.
    """
    collected: List[Dict[str, Any]] = []

    # --- text and image blocks from the rawdict layout ---
    for block in page.get_text("rawdict").get("blocks", []):
        block_type = block.get("type", 0)
        bbox = tuple(block.get("bbox", (0, 0, 0, 0)))
        left, top, _right, _bottom = bbox

        if block_type == 0:  # text
            visual_lines: List[str] = []
            for line in block.get("lines", []):
                # A visual line is the concatenation of every char of every span.
                glyphs = [
                    ch.get("c", "")
                    for span in line.get("spans", [])
                    for ch in span.get("chars", [])
                ]
                joined = "".join(glyphs)
                if joined.strip():
                    visual_lines.append(joined)
            block_text = "\n".join(visual_lines).strip()
            if block_text:
                collected.append(
                    {"kind": "text", "bbox": bbox, "y": top, "x": left, "content": block_text}
                )

        elif block_type == 1:  # image
            saved_path = save_block_image(page, block, page.number)
            if saved_path:
                collected.append({
                    "kind": "image",
                    "bbox": bbox, "y": top, "x": left,
                    "path": saved_path,
                    "content": f"![{Path(saved_path).name}]({saved_path})"
                })

    # --- tables, when the installed PyMuPDF offers find_tables ---
    try:
        finder = page.find_tables()  # TableFinder or plain list, depending on version
        for table in getattr(finder, "tables", finder) or []:
            table_bbox = tuple(getattr(table, "bbox", (0, 0, 0, 0)))
            left, top, _right, _bottom = table_bbox
            try:
                rendered = md_table(table.extract())  # list of row lists -> Markdown
            except Exception:
                rendered = "[TABLE: extraction failed]"
            collected.append({
                "kind": "table",
                "bbox": table_bbox, "y": top, "x": left,
                "content": rendered
            })
    except Exception:
        # No table support / no tables found; ignore
        pass

    # --- reading order ---
    return sorted(collected, key=lambda item: (round(item["y"], 2), round(item["x"], 2)))

def extract_pdf_stream(pdf_path: str) -> List[Dict[str, Any]]:
    """Collect the ordered items of every page of ``pdf_path`` into one list.

    Each item comes from ``collect_page_items`` and gains a 1-based ``page``
    key. Pages are appended in document order.

    Args:
        pdf_path: Path of the PDF to open (anything ``fitz.open`` accepts).

    Returns:
        All page items, page by page.
    """
    all_items: List[Dict[str, Any]] = []
    # Close the document deterministically; previously the handle was leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_items = collect_page_items(page)
            # add page number for reference
            for it in page_items:
                it["page"] = page.number + 1
            all_items.extend(page_items)
    return all_items

# ---- Run extraction ----
stream = extract_pdf_stream(dec2024_file_pdf)

# Flatten the ordered item stream into one markdown-ish document.
# Text and table items contribute their content verbatim; image items are
# rendered as Markdown image links. Any other kind is skipped.
output_lines: List[str] = []
for it in stream:
    print(it)
    if it["kind"] in ("text", "table"):
        output_lines.append(it["content"])
    elif it["kind"] == "image":
        output_lines.append(f'![{Path(it["path"]).name}]({it["path"]})')
final_markdown = "\n\n".join(output_lines)

# Persist the combined output next to the notebook.
Path("pdf_extracted.md").write_text(final_markdown, encoding="utf-8")
print("Wrote pdf_extracted.md and any images under extracted_images/")


{'kind': 'image', 'bbox': (0.0, 0.0, 720.0, 405.0), 'y': 0.0, 'x': 0.0, 'path': 'extracted_images\\page1_img0.png', 'content': '![page1_img0.png](extracted_images\\page1_img0.png)', 'page': 1}
{'kind': 'image', 'bbox': (91.11402130126953, 130.27365112304688, 153.9912567138672, 147.4980010986328), 'y': 130.27365112304688, 'x': 91.11402130126953, 'path': 'extracted_images\\page1_img3.jpeg', 'content': '![page1_img3.jpeg](extracted_images\\page1_img3.jpeg)', 'page': 1}
{'kind': 'text', 'bbox': (91.11417388916016, 172.87831115722656, 393.83416748046875, 212.87831115722656), 'y': 172.87831115722656, 'x': 91.11417388916016, 'content': 'Q4 2024 Update', 'page': 1}
{'kind': 'text', 'bbox': (91.11417388916016, 232.21202087402344, 247.31814575195312, 250.21202087402344), 'y': 232.21202087402344, 'x': 91.11417388916016, 'content': 'February 4th, 2025', 'page': 1}
{'kind': 'image', 'bbox': (0.0, 0.0, 720.0, 405.0), 'y': 0.0, 'x': 0.0, 'path': 'extracted_images\\page2_img0.png', 'content': '![page2

In [39]:
import base64
import io
import json
import math
from pathlib import Path
from typing import List, Tuple, Dict, Any

import fitz  # PyMuPDF
from PIL import Image

import layoutparser as lp
import torch
from openai import OpenAI

from transformers import (
    AutoImageProcessor,
    AutoModelForObjectDetection,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

# ---------- Config ----------
# Markdown file written by the __main__ block at the end of this cell.
OUT_MD = "extracted.md"
# Folder for page renders, block overlays and figure crops (created if missing).
IMG_DIR = Path("extracted_images"); IMG_DIR.mkdir(exist_ok=True, parents=True)

# PubLayNet category ids -> label names. extract_pdf_to_markdown routes on all
# five: Text/Title/List go to vector-text extraction, Table to TATR, Figure to
# the vision model.
LAYOUT_LABELS = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}

# Models
PUBLAYNET_CFG = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"  # Detectron2 via layoutparser
# Minimum detection score, passed as MODEL.ROI_HEADS.SCORE_THRESH_TEST below.
PUBLAYNET_THRESH = 0.5

# Table Transformer checkpoints: one for detection, one for structure recognition.
TATR_DET_MODEL = "microsoft/table-transformer-detection"
TATR_STR_MODEL = "microsoft/table-structure-recognition-v1.1-all"

# DePlot settings. The DePlot model load further down is commented out, so of
# these only DEPLOT_CONF_WORDS is used in this cell (by looks_like_tabular).
DEPLOT_MODEL = "google/deplot"
DEPLOT_MAX_NEW_TOKENS = 512
DEPLOT_CONF_WORDS = ("table", "|")  # crude heuristic: output looks table-ish

# Device the TATR models are moved to; the layout model is not given a device here.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- Load models ----------
print("Loading layout model...")
layout_model = lp.models.Detectron2LayoutModel(
    PUBLAYNET_CFG, extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", PUBLAYNET_THRESH],
    label_map=LAYOUT_LABELS
)

print("Loading TATR models...")
tatr_det_processor = AutoImageProcessor.from_pretrained(TATR_DET_MODEL)
tatr_det = AutoModelForObjectDetection.from_pretrained(TATR_DET_MODEL).to(DEVICE)

tatr_str_processor = AutoImageProcessor.from_pretrained(TATR_STR_MODEL)
tatr_str = AutoModelForObjectDetection.from_pretrained(TATR_STR_MODEL).to(DEVICE)

# DePlot is disabled; figures are handled by deplot_chart_to_text, which calls
# the OpenAI vision model instead.
# print("Loading DePlot...")
# deplot_tokenizer = AutoTokenizer.from_pretrained(DEPLOT_MODEL)
# deplot = AutoModelForSeq2SeqLM.from_pretrained(DEPLOT_MODEL).to(DEVICE)

# ---------- Utils ----------
def pil_from_page(page, dpi=200) -> Image.Image:
    """Render ``page`` to an RGB PIL image via a PNG-encoded pixmap.

    NOTE(review): ``dpi`` currently has no effect. ``mat`` is computed from it
    but never passed to ``page.get_pixmap()``, so the page is rendered with
    get_pixmap's defaults. Callers in this file pass the raster bounding boxes
    straight to ``page.get_textbox`` and ``fitz.Rect`` without rescaling, so
    simply adding ``matrix=mat`` here would misalign them; fix the callers'
    coordinate mapping in the same change.
    """
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)  # unused - see note in the docstring
    pix = page.get_pixmap()
    return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

def crop_pil(img: Image.Image, bbox: Tuple[float, float, float, float]) -> Image.Image:
    """Crop ``img`` to ``bbox`` (x0, y0, x1, y1 in image pixels), truncating each edge to int."""
    left, top, right, bottom = bbox
    pixel_box = (int(left), int(top), int(right), int(bottom))
    return img.crop(pixel_box)

def bbox_to_tuple(b) -> Tuple[float, float, float, float]:
    """Return the first four entries of ``b`` as a tuple of floats."""
    return tuple(float(b[index]) for index in range(4))

def round_sort_key(bbox: Tuple[float, float, float, float]):
    """Sort key ``(y0, x0)`` rounded to two decimals: top-to-bottom, then left-to-right."""
    left, top, *_rest = (bbox[index] for index in range(4))
    return (round(top, 2), round(left, 2))

# ---------- Text from PDF (vector text) ----------
def extract_pdf_text_in_bbox(page, bbox: Tuple[float, float, float, float]) -> str:
    """Return the vector text that ``page.get_textbox`` finds inside ``bbox``, stripped.

    ``bbox`` is handed to ``page.get_textbox`` unchanged, so it must already be
    in the coordinate system that method expects.

    The previous version also ran ``page.get_text()`` on the whole page and
    discarded the result; that dead (and per-block expensive) call was removed.
    """
    return page.get_textbox(bbox).strip()

# ---------- TATR: table detection+structure -> Markdown ----------
def tatr_tables_to_md(img: Image.Image) -> List[str]:
    """Detect tables in ``img`` and return one placeholder Markdown grid per table.

    Stage 1 runs the TATR detection model on ``img``. Stage 2 runs the TATR
    structure model on each detected table crop and turns the detected rows
    and columns into a grid. Cell text is NOT recognised here: body cells are
    empty and header cells are ``H1..Hn`` placeholders (plug OCR or vector-text
    lookup into the marked spot to fill them).

    Labels are resolved by name through each model's ``config.id2label``
    instead of hard-coded ids. The earlier ids did not match the checkpoints:
    in the detection model id 1 is the rotated-table class, so upright tables
    were skipped, and the structure model emits rows/columns/headers rather
    than individual cells.

    Returns:
        A list of Markdown strings, one per detected table. When nothing is
        detected the list holds the single marker
        ``"[TABLE (no structure detected)]"``.
    """
    det_label_names = tatr_det.config.id2label
    str_label_names = tatr_str.config.id2label

    # 1) Table detection
    inputs = tatr_det_processor(images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        det = tatr_det(**inputs)

    target_sizes = torch.tensor([img.size[::-1]], device=DEVICE)  # (h, w)
    results = tatr_det_processor.post_process_object_detection(det, threshold=0.6, target_sizes=target_sizes)[0]

    md_tables = []
    for label_t, box_t in zip(results["labels"], results["boxes"]):
        det_name = str(det_label_names.get(int(label_t.item()), "")).lower()
        if not det_name.startswith("table"):  # accepts upright and rotated tables
            continue
        box = box_t.tolist()  # x0,y0,x1,y1 in pixels
        table_img = img.crop((int(box[0]), int(box[1]), int(box[2]), int(box[3])))

        # 2) Structure recognition on the detected table crop
        inputs2 = tatr_str_processor(images=table_img, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            pred = tatr_str(**inputs2)

        r2 = tatr_str_processor.post_process_object_detection(
            pred, threshold=0.5, target_sizes=torch.tensor([table_img.size[::-1]], device=DEVICE)
        )[0]

        # Bucket the structure boxes by label name.
        row_boxes, col_boxes, header_boxes = [], [], []
        for lab_t, sbox_t in zip(r2["labels"], r2["boxes"]):
            str_name = str(str_label_names.get(int(lab_t.item()), "")).lower()
            sbox = tuple(sbox_t.tolist())
            if str_name == "table row":
                row_boxes.append(sbox)
            elif str_name == "table column":
                col_boxes.append(sbox)
            elif str_name == "table column header":
                header_boxes.append(sbox)

        row_boxes.sort(key=lambda b: b[1])  # top to bottom
        col_boxes.sort(key=lambda b: b[0])  # left to right
        n_cols = len(col_boxes)

        def _in_header(row_box) -> bool:
            # A row belongs to the header when its vertical centre lies in a header box.
            centre_y = (row_box[1] + row_box[3]) / 2.0
            return any(h[1] <= centre_y <= h[3] for h in header_boxes)

        md = []
        if n_cols and row_boxes:
            body_rows = [r for r in row_boxes if not _in_header(r)]
            if len(body_rows) != len(row_boxes):
                header = ["H{}".format(k + 1) for k in range(n_cols)]
                md.append("| " + " | ".join(header) + " |")
                md.append("| " + " | ".join(["---"] * n_cols) + " |")
            for _row in body_rows:
                # OCR / vector-text hook: each cell is the intersection of _row
                # with one entry of col_boxes. Left empty in this demo.
                md.append("| " + " | ".join([""] * n_cols) + " |")

        md_tables.append("\n".join(md) if md else "[TABLE (structure detected, text OCR not wired in this demo)]")
    return md_tables or ["[TABLE (no structure detected)]"]

def _pil_to_data_url(img: Image.Image) -> str:
    """Encode ``img`` as PNG and return it as a ``data:image/png;base64,...`` URL."""
    with io.BytesIO() as sink:
        img.save(sink, format="PNG")
        encoded = base64.b64encode(sink.getvalue())
    return "data:image/png;base64," + encoded.decode("utf-8")

# Vision-capable chat model used by deplot_chart_to_text; override with the
# OPENAI_VISION_MODEL environment variable. Relies on `os` from the first cell.
OPENAI_VISION_MODEL = os.getenv("OPENAI_VISION_MODEL", "gpt-4o-mini")

# ---------- DePlot: chart → table-like text ----------
def deplot_chart_to_text(img: Image.Image) -> str:
    """
    Uses OpenAI Vision to convert a chart/table image into plain text.
    No strict schema; just structured prose that's easy to parse by another agent.

    Despite the name, no DePlot model is involved: the image is sent as a
    base64 data URL to the chat-completions endpoint using OPENAI_VISION_MODEL.

    Args:
        img: The figure crop to describe.

    Returns:
        The model's reply with surrounding whitespace stripped, or "" when the
        reply has no content. Errors from the API call propagate to the caller.
    """
    # NOTE(review): presumably picks up credentials from the environment - confirm.
    client = OpenAI()
    data_url = _pil_to_data_url(img)

    system_prompt = (
        "You are a precise vision analyst. "
        "Describe the structural information visible in a chart or table image as plain text. "
        "Avoid any code fences or JSON. Be concise and factual—do not guess."
    )

    # Asks for a line-oriented "- KEY: value" layout, with one layout for
    # tables and one for charts.
    user_instruction = (
        "Extract the structure of this figure in plain text.\n\n"
        "If it's a TABLE:\n"
        "- TYPE: table\n"
        "- TITLE: <title if visible, else blank>\n"
        "- COLUMNS: <comma-separated column headers>\n"
        "- ROWS:\n"
        "  <row 1 cells comma-separated>\n"
        "  <row 2 cells comma-separated>\n"
        "  ... (only include what is clearly legible)\n\n"
        "If it's a CHART (bar/line/scatter/pie/etc.):\n"
        "- TYPE: <bar_chart|line_chart|scatter_plot|pie_chart|other>\n"
        "- TITLE: <title if visible, else blank>\n"
        "- X-AXIS: <label if visible> | <type if inferable: category/numeric/datetime> | <unit if visible>\n"
        "- Y-AXIS: <label if visible> | <type if inferable: numeric/percentage> | <unit if visible>\n"
        "- SERIES:\n"
        "  name=<series name or 'series 1'>\n"
        "  POINTS:\n"
        "    <x> -> <y>\n"
        "    <x> -> <y>\n"
        "    ... (only include clearly legible points; summarize if many)\n"
        "- LEGEND: <comma-separated labels if present>\n\n"
        "End with:\n"
        "- NOTES: <any caveats/uncertainties/annotations seen on the figure>\n\n"
        "Return plain text only. No markdown tables, no JSON, no code fences."
    )

    # temperature=0 is requested to keep repeated runs as stable as the API allows.
    resp = client.chat.completions.create(
        model=OPENAI_VISION_MODEL,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_instruction},
                    {"type": "image_url", "image_url": {"url": data_url, "detail": "high"}},
                ],
            },
        ],
    )

    # message.content may be None; normalise that to an empty string.
    return (resp.choices[0].message.content or "").strip()



def looks_like_tabular(text: str) -> bool:
    """Crude check for table-ish output.

    True when the lower-cased text contains any marker from DEPLOT_CONF_WORDS,
    or when the text contains both a newline and a pipe character.
    """
    lowered = text.lower()
    for marker in DEPLOT_CONF_WORDS:
        if marker in lowered:
            return True
    return "\n" in text and "|" in text

# Notebook-global log: extract_pdf_to_markdown appends one list of
# (label, bbox) pairs per processed page. It is never cleared, so re-running
# the extraction keeps appending to it.
block_positions = []

def save_pil_with_blocks(
        block_positions: List[Tuple[str, Tuple[float, float, float, float]]],
        pil_page: Image.Image,
        out_path: Path
):
    """Write a copy of ``pil_page`` to ``out_path`` with each block outlined and labelled.

    Every ``(label, bbox)`` pair is drawn as a red rectangle with the label in
    blue just inside its top-left corner. Arial 12pt is used when it can be
    loaded; otherwise PIL's default font.
    """
    import PIL.ImageDraw as ImageDraw
    import PIL.ImageFont as ImageFont

    canvas = pil_page.convert("RGB").copy()
    pen = ImageDraw.Draw(canvas)

    try:
        label_font = ImageFont.truetype("arial.ttf", 12)
    except Exception:
        label_font = ImageFont.load_default()

    for block_label, (left, top, right, bottom) in block_positions:
        pen.rectangle([left, top, right, bottom], outline="red", width=2)
        pen.text((left + 3, top + 3), block_label, fill="blue", font=label_font)

    canvas.save(out_path)

# ---------- Main extraction ----------
def extract_pdf_to_markdown(pdf_path: str) -> str:
    """Convert a PDF to Markdown by routing detected layout blocks to extractors.

    For every page: render it, detect layout blocks, save the page render and
    a block-overlay image under IMG_DIR, then handle each block by label:

      * Text / Title / List -> vector text from the PDF region;
      * Table               -> ``tatr_tables_to_md`` on the block crop;
      * Figure              -> ``deplot_chart_to_text``; if that fails or the
                               reply is not table-like, the crop is saved and
                               linked as an image.

    Each page's (label, bbox) list is also appended to the global
    ``block_positions``. Pages are separated by a horizontal rule.

    Fixes over the previous version:
      * the page render is saved with ``Image.save`` (the old code called an
        undefined ``save_pil`` helper);
      * figure files are named by the block's position on the page, so several
        figures on one page no longer overwrite each other when ``lobj.id`` is
        unset;
      * raster bounding boxes are scaled into page coordinates before being
        passed to ``page.get_textbox``, so text lookup stays correct whatever
        resolution ``pil_from_page`` renders at;
      * the document is closed when done.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The Markdown for the whole document.
    """
    md_lines: List[str] = []

    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        for pidx, page in enumerate(doc):
            print(f"Processing page {pidx+1}/{page_count}...")
            pil_page = pil_from_page(page, dpi=200)
            pil_page.save(IMG_DIR / f"page{pidx+1}_full.png")

            # 1) detect layout
            layout = layout_model.detect(pil_page)
            # 2) sort blocks by reading order
            layout = lp.Layout(sorted(layout, key=lambda l: (round(l.block.y_1, 2), round(l.block.x_1, 2))))

            page_block_positions = [
                (lobj.type, bbox_to_tuple((lobj.block.x_1, lobj.block.y_1, lobj.block.x_2, lobj.block.y_2)))
                for lobj in layout
            ]
            block_positions.append(page_block_positions)
            save_pil_with_blocks(
                page_block_positions,
                pil_page,
                IMG_DIR / f"page{pidx+1}_blocks.png"
            )

            # Raster pixels -> page coordinates (1.0 when rendered at 1:1).
            scale_x = page.rect.width / pil_page.width
            scale_y = page.rect.height / pil_page.height

            for bidx, lobj in enumerate(layout):
                label = lobj.type
                bbox = (lobj.block.x_1, lobj.block.y_1, lobj.block.x_2, lobj.block.y_2)
                crop = crop_pil(pil_page, bbox)

                if label in ("Text", "Title", "List"):
                    # Pull vector text from the PDF region if possible
                    page_bbox = (
                        bbox[0] * scale_x, bbox[1] * scale_y,
                        bbox[2] * scale_x, bbox[3] * scale_y,
                    )
                    text = extract_pdf_text_in_bbox(page, page_bbox)
                    if text:
                        md_lines.append(text)
                    else:
                        # fallback OCR placeholder (wire pytesseract/doctr if needed)
                        md_lines.append("[TEXT (no vector text in region)]")

                elif label == "Table":
                    # Try TATR on the block crop; append every table it returns
                    md_lines.extend(tatr_tables_to_md(crop))

                elif label == "Figure":
                    # Ask the vision model; if it fails or is not confident, save the image
                    try:
                        txt = deplot_chart_to_text(crop)
                        print(f"DePlot output: {txt}")
                    except Exception as e:
                        print("DePlot failed; saving image")
                        print(e)
                        txt = None

                    if txt is not None and looks_like_tabular(txt):
                        md_lines.append("```chart-table\n" + txt + "\n```")
                    else:
                        # save image and reference
                        fn = IMG_DIR / f"page{pidx+1}_fig{bidx}.png"
                        crop.save(fn)
                        md_lines.append(f"![figure]({fn.as_posix()})")

                else:
                    print(f"Skipping unknown layout label: {label}")

            # page break
            md_lines.append("\n---\n")

    return "\n\n".join(md_lines)

def create_pdf_with_block_boxes(pdf_path: str, output_pdf: str):
    """Write a copy of ``pdf_path`` with every detected layout block outlined.

    Each page is rendered and run through the layout model; every block is
    drawn on the PDF page as a red rectangle with its label in blue, and the
    annotated document is saved to ``output_pdf``.

    Block boxes come back in raster pixels, so they are scaled into page
    coordinates before drawing. The factor is 1.0 when the page is rendered at
    1:1, and stays correct if the render resolution changes. The document is
    closed once saved.
    """
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pil_page = pil_from_page(page, dpi=200)
            layout = layout_model.detect(pil_page)
            layout = lp.Layout(sorted(layout, key=lambda l: (round(l.block.y_1, 2), round(l.block.x_1, 2))))

            scale_x = page.rect.width / pil_page.width
            scale_y = page.rect.height / pil_page.height

            for lobj in layout:
                rect = fitz.Rect(
                    lobj.block.x_1 * scale_x, lobj.block.y_1 * scale_y,
                    lobj.block.x_2 * scale_x, lobj.block.y_2 * scale_y,
                )
                page.draw_rect(rect, color=(1, 0, 0), width=1)  # red box
                page.insert_text((rect.x0 + 2, rect.y0 + 2), lobj.type, fontsize=8, color=(0, 0, 1))  # blue label
        doc.save(output_pdf)
    print(f"Wrote annotated PDF with boxes to {output_pdf}")

# Entry point: build the Markdown, write the block-annotated PDF, then save the
# Markdown. Both calls open and process the PDF separately, so layout detection
# runs twice per page.
if __name__ == "__main__":
    md = extract_pdf_to_markdown(dec2024_file_pdf)
    create_pdf_with_block_boxes(dec2024_file_pdf, "annotated_blocks.pdf")
    Path(OUT_MD).write_text(md, encoding="utf-8")
    print(f"Wrote {OUT_MD} with images in {IMG_DIR}/")


Loading layout model...
Loading TATR models...


Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 7796.10it/s]
Some weights of the model checkpoint at microsoft/table-transformer-detection were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 1 files: 100%|██████████| 1/1 [00:00<0

Processing page 1/37...
Processing page 2/37...
DePlot output: - TYPE: table
- TITLE: Table of Contents
- COLUMNS: Section, Page Number
- ROWS:
  Executive Summary, p.00
  Key Highlights, p.04
  Financial Summary, p.06
  MAUs & Subscribers, p.13
  Product & Platform, p.16
  Outlook, p.21
  Financial Statements, p.24
- NOTES: None
Processing page 3/37...
DePlot output: - TYPE: table
- TITLE: USER & FINANCIAL SUMMARY
- COLUMNS: Q4 2023, Q3 2024, Q4 2024, Y/Y, Q/Q
- ROWS:
  USERS (M), 602, 640, 675, 12%, 5%
  Total Monthly Active Users "MAUs", 602, 640, 675, 12%, 5%
  Premium Subscribers, 236, 252, 265, 11%, 4%
  Ad-Supported MAUs, 379, 402, 425, 12%, 6%
  FINANCIALS (€M), , , , , 
  Premium, 3,170, 3,516, 3,705, 17%, 5%
  Ad-Supported, 501, 572, 532, 7%, 14%
  Total Revenue, 3,671, 4,088, 4,242, 16%, 4%
  Gross Profit, 980, 1,240, 1,288, 40%, 10%
  Gross Margin, 26.7%, 31.1%, 32.2%, , 
  Operating (Loss)/Income, (75), 454, 477, --, 5%
  Operating Margin, (2.0)%, 11.4%, , , 
  Free Cash F