In [12]:
# Cell 1: configuration, imports, basic helpers

from __future__ import annotations

import json
import re
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF

# Project root is taken as the parent of the kernel's current working
# directory, so every path below depends on where the notebook is started.
ROOT = Path.cwd().parent

# === Configure your paths here ===
# PDF_PATH: input PDF. MINERU_OUT_DIR: directory handed to the MinerU CLI.
# OUT_JSONL_PATH: has no file extension, but it is passed to write_jsonl()
# as the output *file* path, not as a directory.
PDF_PATH = ROOT / "data/UVM_Class_Reference_Manual_1.2.pdf"
MINERU_OUT_DIR = ROOT/ "work/mineru_out/"
OUT_JSONL_PATH = ROOT / "work/json_out"
STD_TAG = "UVM-1.2"  # you can change to exact standard name if you like

# Regex for numeric heading inside TOC titles (e.g., "5.3.11.2 do_unpack").
# group(1) = dotted numeric id, group(2) = remainder of the title.
# Whitespace after the id is mandatory, so a bare "5.3" does not match.
HDR_NUM_ID = re.compile(r"^\s*((?:\d+)(?:\.\d+)*)\s+(.*)$")

# Regex for numeric heading inside body text (e.g., "5.3.11.2 do_unpack").
# Same groups as HDR_NUM_ID, but the title part is optional: a bare "5.3"
# matches with group(2) = None.
NUM_HEAD_RE = re.compile(r"^\s*(\d+(?:\.\d+)*)(?:\s+(.*))?$")


@dataclass
class TocNode:
    """One PDF outline (TOC) entry together with the page range it covers."""

    level: int          # 1 = chapter, 2 = section, etc.
    id: Optional[str]   # "5.3.11.2" or None if not numeric
    title: str          # pure text title, no numeric prefix
    start: int          # 1-based start page
    end: int            # 1-based end page (inclusive)


def _normalize_text(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

# Echo the effective configuration so the resolved paths are visible in the
# cell output. One print call, one line per item (sep="\n").
print(
    "Config loaded.",
    *(
        f"{label} {value}"
        for label, value in (
            ("PDF_PATH:", PDF_PATH),
            ("MINERU_OUT_DIR:", MINERU_OUT_DIR),
            ("OUT_JSONL_PATH:", OUT_JSONL_PATH),
        )
    ),
    sep="\n",
)


Config loaded.
PDF_PATH: c:\Users\41v1r\NEU\NLP\UVM-RAG\data\UVM_Class_Reference_Manual_1.2.pdf
MINERU_OUT_DIR: c:\Users\41v1r\NEU\NLP\UVM-RAG\work\mineru_out
OUT_JSONL_PATH: c:\Users\41v1r\NEU\NLP\UVM-RAG\work\json_out


In [13]:
# Cell 2: TOC handling (numeric + text) and immediate run

def build_toc_intervals(pdf_path: Path) -> List[TocNode]:
    """
    Read the PDF outline and turn it into page intervals, one per TOC entry.

    Title handling:
    - If a title starts with a numeric id, e.g. "5.3.11.2 do_unpack",
      the node gets id="5.3.11.2" and title="do_unpack".
    - If there is no numeric prefix, id=None and title is the full
      (stripped) text.

    Interval rule: an entry runs from its own page up to the page before the
    next entry at the same or a shallower level, or up to the document's last
    page when no such entry follows. The end is clamped to the start page, so
    an entry whose successor begins on the same page still covers that page
    instead of receiving an empty (end < start) interval.

    Args:
        pdf_path: Path of the PDF whose outline is read.

    Returns:
        TocNode list sorted by start page ascending and, within one start
        page, deeper levels first. Empty when the outline yields no entries.
    """
    # The context manager releases the document handle even if reading fails.
    with fitz.open(pdf_path.as_posix()) as doc:
        raw = doc.get_toc(simple=True)  # [[level, title, page], ...]
        last_page = doc.page_count

    entries: List[Dict[str, Any]] = []
    for lvl, title, page in raw:
        title = str(title).strip()
        m = HDR_NUM_ID.match(title)
        if m:
            numeric_id = m.group(1)        # e.g. "5.3.11.2"
            pure_title = m.group(2).strip()
        else:
            numeric_id = None
            pure_title = title
        entries.append(
            {
                "level": int(lvl),
                "id": numeric_id,
                "title": pure_title,
                "page": int(page),
            }
        )

    intervals: List[TocNode] = []

    def _close(done: Dict[str, Any], end_page: int) -> None:
        # Clamp so that start <= end always holds for the emitted node.
        intervals.append(
            TocNode(
                level=done["level"],
                id=done["id"],
                title=done["title"],
                start=done["page"],
                end=max(done["page"], end_page),
            )
        )

    # Stack of still-open entries, shallowest at the bottom. A new entry
    # closes every open entry that is at its level or deeper.
    stack: List[Dict[str, Any]] = []
    for e in entries:
        while stack and stack[-1]["level"] >= e["level"]:
            _close(stack.pop(), e["page"] - 1)
        stack.append(e)

    # Whatever is still open extends to the end of the document.
    while stack:
        _close(stack.pop(), last_page)

    # Sort by (start asc, level desc) so deepest nodes come first per page
    intervals.sort(key=lambda x: (x.start, -x.level))
    print(f"[toc] intervals built: {len(intervals)} entries")
    return intervals


def section_fields_for_page(page: int, intervals: List[TocNode]) -> Dict[str, Any]:
    """
    Derive base section metadata for `page` from the deepest covering TOC node.

    - Returns {} when no node's [start, end] range contains `page`.
    - section_title always comes from the selected node.
    - When the node has a numeric id, header_path is that id split on dots,
      and chapter / section / subsection are its first one / two / three
      components (each present only if the id is that deep).
    """
    deepest = None
    for node in intervals:
        if not (node.start <= page <= node.end):
            continue
        # Strict '>' keeps the earliest listed node among equally deep ones.
        if deepest is None or node.level > deepest.level:
            deepest = node

    if deepest is None:
        return {}

    fields: Dict[str, Any] = {"section_title": deepest.title}
    if not deepest.id:
        return fields

    id_parts = deepest.id.split(".")
    fields["header_path"] = id_parts
    fields["chapter"] = id_parts[0]
    for key, depth in (("section", 2), ("subsection", 3)):
        if len(id_parts) >= depth:
            fields[key] = ".".join(id_parts[:depth])
    return fields


# === Build the TOC intervals as soon as this cell runs ===
toc_intervals = build_toc_intervals(PDF_PATH)
if toc_intervals:
    # Show one node as a quick sanity check of the parsed outline.
    print("[toc] example entry:", toc_intervals[0])
else:
    print("[toc] WARNING: no TOC entries found; section fields may be empty.")


[toc] intervals built: 169 entries
[toc] example entry: TocNode(level=1, id=None, title='UVM Class 1.2 Reference', start=1, end=3)


In [None]:
# Cell 3: MinerU runner and immediate run

def run_mineru(pdf_path: Path, out_dir: Path, rebuild: bool = False) -> None:
    """
    Invoke the MinerU CLI for `pdf_path`, writing its results under `out_dir`.

    - `out_dir` is created first if it does not exist.
    - With rebuild=False, the presence of any regular file anywhere below
      `out_dir` is treated as "already processed" and the CLI is not started.
    - With rebuild=True the CLI is always started.
    - A non-zero exit status of the CLI raises subprocess.CalledProcessError
      (check=True).
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    if not rebuild:
        found_files = [entry for entry in out_dir.rglob("*") if entry.is_file()]
        if found_files:
            print(f"[mineru] outputs detected in {out_dir}; skipping because rebuild=False.")
            return

    command = ["mineru", "-p", pdf_path.as_posix(), "-o", out_dir.as_posix()]
    print(f"[mineru] running: {' '.join(command)}")
    subprocess.run(command, check=True)
    print("[mineru] done.")


# === One-time MinerU pass over the configured PDF ===
# Leave rebuild=False to reuse existing outputs; switch it to True to force
# MinerU to run again.
run_mineru(pdf_path=PDF_PATH, out_dir=MINERU_OUT_DIR, rebuild=False)


In [14]:
# Cell 4: load MinerU blocks with page numbers

def load_mineru_blocks_with_pages(out_dir: Path, min_chars: int = 20) -> List[Tuple[str, int]]:
    """
    Load MinerU's structured output (content_list.json) and return
    a list of (text, page_no) pairs, where page_no is 1-based.

    - We search recursively for *content_list.json under out_dir. If several
      files match, the first one in sorted path order is used, so the choice
      is the same on every run and platform.
    - We only keep blocks where type is "text" or "title".
    - Text is whitespace-normalized; blocks that are empty or shorter than
      `min_chars` characters afterwards are dropped. This applies to "title"
      blocks too, so short headings are discarded under the default.
    - Blocks without page_idx are dropped.
    - page_idx from MinerU is 0-based; we convert to 1-based page numbers.

    Args:
        out_dir: Directory searched for the content list.
        min_chars: Minimum normalized text length for a block to be kept
            (default 20, the previously hard-coded cutoff).

    Returns:
        (text, page_no) tuples in file order.

    Raises:
        FileNotFoundError: no *content_list.json exists under out_dir.
        RuntimeError: the file yielded no usable text/title block.
    """
    # rglob order depends on the filesystem; sort to make the pick stable.
    candidates = sorted(out_dir.rglob("*content_list.json"))
    if not candidates:
        raise FileNotFoundError(
            f"No content_list.json found under {out_dir}. "
            "Please check MinerU output structure."
        )

    cl_path = candidates[0]
    if len(candidates) > 1:
        print(
            f"[mineru] WARNING: {len(candidates)} content lists found; "
            "using the first in sorted order."
        )
    print(f"[mineru] using content list: {cl_path.relative_to(out_dir)}")

    # errors="ignore" silently drops bytes that are not valid UTF-8.
    raw = json.loads(cl_path.read_text(encoding="utf-8", errors="ignore"))
    blocks_with_pages: List[Tuple[str, int]] = []

    for item in raw:
        if not isinstance(item, dict):
            continue
        if item.get("type") not in ("text", "title"):
            continue

        text = _normalize_text(str(item.get("text") or ""))
        if not text or len(text) < min_chars:
            continue

        page_idx = item.get("page_idx")
        if page_idx is None:
            continue
        page_no = int(page_idx) + 1  # convert 0-based to 1-based
        blocks_with_pages.append((text, page_no))

    print(f"[mineru] loaded {len(blocks_with_pages)} text/title blocks with page_idx")
    if not blocks_with_pages:
        raise RuntimeError(
            f"content_list.json at {cl_path} contained no usable text/title blocks."
        )

    return blocks_with_pages


# === Read the MinerU blocks into memory right away ===
blocks_with_pages = load_mineru_blocks_with_pages(MINERU_OUT_DIR)
# Print one sample so the (text, page) shape is visible in the cell output.
if blocks_with_pages:
    print("Example block:", blocks_with_pages[0])
else:
    print("Example block:", "None")


[mineru] using content list: UVM_Class_Reference_Manual_1.2\auto\UVM_Class_Reference_Manual_1.2_content_list.json
[mineru] loaded 8167 text/title blocks with page_idx
Example block: ('Universal Verification Methodology (UVM) 1.2 Class Reference', 1)


In [15]:
# Cell 5: numeric heading extraction + JSONL writing

def extract_numeric_heading(block: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Try to parse a leading numeric section id from the block's first line.

    Returns (sec_id, sec_title), e.g., ("5.3.11.2", "do_unpack").
    If the first line is only an id ("5.3"), sec_title is "".
    If not found, or if the block is empty / starts with a blank line,
    returns (None, None).

    NOTE(review): the match only requires "digits[.digits...]" followed by
    whitespace, so body text that merely starts with a number is also
    reported as a heading -- callers may want to validate sec_id.
    """
    # "".splitlines() is [], so guard before indexing the first line.
    lines = block.splitlines()
    first_line = lines[0].strip() if lines else ""
    if not first_line:
        return None, None

    m = NUM_HEAD_RE.match(first_line)
    if m is None:
        return None, None

    sec_id = m.group(1)
    title = (m.group(2) or "").strip()
    return sec_id, title


def _fields_from_section_id(sec_id: str) -> Dict[str, Any]:
    """Split a dotted id into header_path / chapter / section / subsection."""
    parts = sec_id.split(".")
    fields: Dict[str, Any] = {"header_path": parts, "chapter": parts[0]}
    if len(parts) >= 2:
        fields["section"] = ".".join(parts[:2])
    if len(parts) >= 3:
        fields["subsection"] = ".".join(parts[:3])
    return fields


def write_jsonl(
    out_path: Path,
    pdf_name: str,
    std: str,
    blocks_with_pages: List[Tuple[str, int]],
    toc_intervals: List[TocNode],
) -> None:
    """
    Emit a JSONL file where each line represents one text block with:

    - type="text"
    - page_from, page_to (both equal to the block's page)
    - std (e.g. "UVM-1.2")
    - uri="/pdf/<pdf_name>" (pdf_name is used verbatim, extension included)
    - anchor="#page=<page_from>"
    - section metadata:
        * section_title (from TOC, overridden by numeric heading if present)
        * chapter / section / subsection / header_path (from TOC or block)
    - content

    When the block itself starts with a numeric heading, the id-derived
    fields are rebuilt from that heading alone: fields the TOC supplied for
    a deeper id are removed first, so a record never mixes e.g. chapter "5"
    from the block with section "4.2" from the TOC.

    Args:
        out_path: Output file; its parent directory is created if missing and
            an existing file is overwritten.
        pdf_name: File name placed into the uri field.
        std: Standard tag stored in every record.
        blocks_with_pages: (text, 1-based page) pairs, written in order.
        toc_intervals: TOC nodes used to look up base section metadata.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    total = 0

    with out_path.open("w", encoding="utf-8") as f:
        for text, page in blocks_with_pages:
            # base metadata from TOC (text title and possible numeric id)
            meta = section_fields_for_page(page, toc_intervals)

            # refine with numeric heading from the block itself
            sec_id, sec_title = extract_numeric_heading(text)
            if sec_id:
                # Drop TOC-derived id fields so a shallower block id cannot
                # leave stale section/subsection values behind.
                for key in ("header_path", "chapter", "section", "subsection"):
                    meta.pop(key, None)
                meta.update(_fields_from_section_id(sec_id))
                if sec_title:
                    # prefer explicit block title over TOC title
                    meta["section_title"] = sec_title

            rec = {
                "type": "text",
                "page_from": page,
                "page_to": page,
                "std": std,
                "uri": f"/pdf/{pdf_name}",
                "anchor": f"#page={page}",
                **meta,
                "content": text,
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            total += 1

    size_kb = out_path.stat().st_size / 1024.0
    print(f"[emit] wrote {total} records to {out_path} ({size_kb:.1f} KB)")


# === Emit the JSONL file from the blocks and TOC built in earlier cells ===
write_jsonl(
    blocks_with_pages=blocks_with_pages,
    toc_intervals=toc_intervals,
    out_path=OUT_JSONL_PATH,
    pdf_name=PDF_PATH.name,
    std=STD_TAG,
)

print("Done. JSONL ready at:", OUT_JSONL_PATH)


[emit] wrote 8167 records to c:\Users\41v1r\NEU\NLP\UVM-RAG\work\json_out (2361.8 KB)
Done. JSONL ready at: c:\Users\41v1r\NEU\NLP\UVM-RAG\work\json_out
