<a href="https://colab.research.google.com/github/5heron/Compiler-Lab/blob/main/docmatismgrok.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ╔══════════════════════════════════════════════════════════════╗
# ║                    SMARTDOC V25 – COLAB EDITION             ║
# ║  PDF + PPTX + full DOCX + Hybrid OCR + Figure Dedup         ║
# ║  Cached embeddings • Syllabus ordering • Misc bucket        ║
# ╚══════════════════════════════════════════════════════════════╝

!pip install -q pdfplumber python-pptx python-docx rapidocr-onnxruntime sentence-transformers tqdm imagehash

import os
import re
import uuid
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional

from tqdm import tqdm
from google.colab import files

import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import imagehash

# ======================= CONFIG =======================

PAGE_DPI = 180           # DPI used when rasterizing a whole PDF page (input to page-level OCR)
FIG_DPI = 180            # DPI used when rasterizing a cropped image region (figures)
PARA_MIN_CHARS = 30      # Paragraphs shorter than this many characters are dropped
OCR_MIN_LINE_LEN = 20    # OCR lines shorter than this are ignored when adding extra text to a digital page
DEDUP_THRESHOLD = 0.85   # A paragraph is dropped when its cosine similarity to a kept one exceeds this
MISC_THRESHOLD = 0.40    # A paragraph whose best topic similarity is below this goes to the Misc bucket
PHASH_SIZE = 16          # hash_size passed to imagehash.phash
PHASH_DISTANCE = 10      # Two figures are duplicates when their hash difference is below this


# ======================= DATA STRUCTURES =======================

@dataclass
class FigureRecord:
    """One image extracted from a source document and saved to disk."""

    id: str  # unique identifier of this record
    img_path: str  # path of the saved image file
    source_file: str  # file name of the document the image came from
    source_page: int  # page/slide index (1-based for PDF/PPTX, 0 for DOCX)
    caption: str = ""  # caption text shown with the figure


@dataclass
class TableRecord:
    """One table extracted from a source document, already rendered as Markdown."""

    id: str  # unique identifier of this record
    md_table: str  # the table as Markdown text (header row, separator row, data rows)
    source_file: str  # file name of the document the table came from
    source_page: int  # page/slide index of the table; 0 when the source is a DOCX


@dataclass
class ParagraphRecord:
    """One paragraph of extracted text plus its syllabus-topic assignment."""

    id: str  # unique identifier of this record
    text: str  # cleaned paragraph text
    source_file: str  # file name of the document the paragraph came from
    source_page: int  # page/slide index of the paragraph; 0 when the source is a DOCX
    is_heading: bool = False  # True when the paragraph was recognised as a heading
    topic_index: Optional[int] = None  # index into the syllabus topics; None until topics are assigned
    topic_score: float = 0.0  # similarity score of the best-matching topic


# ======================= SMARTDOC V25 CORE =======================

class SmartDocV25:
    """
    SmartDoc V25:
      - pdfplumber for PDFs (text-layer + tables + images)
      - RapidOCR (PP-OCRv4) for scanned pages & diagram text
      - Hybrid OCR on digital pages to recover labels in figures/tables
      - PPTX: text, images, tables
      - DOCX: text, images, tables, heading styles
      - Cached paragraph embeddings for dedup + topic assignment
      - Perceptual figure deduplication (phash)
      - Syllabus-aware ordering + Misc bucket for low relevance
      - Markdown textbook output with per-item source citations
    """

    def __init__(
        self,
        syllabus_topics: Optional[List[str]] = None,
        hybrid_page_ocr: bool = True,
        misc_threshold: float = MISC_THRESHOLD,
    ):
        """Load the OCR and embedding models and prepare empty collections.

        Args:
            syllabus_topics: Topic phrases used to order the book. ``None``
                selects the built-in default list. A single string is treated
                as a one-topic syllabus. The sequence is copied, so changing
                the caller's list afterwards has no effect on this instance.
            hybrid_page_ocr: When True, pages with a usable text layer are
                also OCR'd to pick up text that only exists inside images.
            misc_threshold: Paragraphs whose best topic similarity is below
                this value are filed under the Misc bucket.

        Side effects:
            Creates the ``smartdoc_v25_assets`` directory and encodes the
            topic phrases once with the embedding model.
        """
        from rapidocr_onnxruntime import RapidOCR

        self.ocr = RapidOCR()
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

        self.assets_dir = "smartdoc_v25_assets"
        os.makedirs(self.assets_dir, exist_ok=True)

        self.hybrid_page_ocr = hybrid_page_ocr
        self.misc_threshold = misc_threshold

        # Default syllabus (from your AES course)
        if syllabus_topics is None:
            syllabus_topics = [
                # Module I – Earth as a system & equilibrium
                "basic concept of earth as a system",
                "earth system subsystems and interactions",
                "fundamental concepts of equilibrium in earth systems",
                "geomorphic agents and geomorphic processes",

                # Module II – Weathering & fluvial systems
                "weathering relevance and influence on earth systems",
                "types and controlling factors of weathering",
                "river as a system and hydrological cycle",
                "fluvial erosion transportation and deposition",
                "fluvial landforms and stages of stream development",
                "drainage patterns and their implications",

                # Module III – Soils & deserts
                "soil significance formation and controls",
                "soil profile and soil horizons",
                "soil erosion and soil conservation methods",
                "deserts distribution and controlling factors",

                # Module IV – Continental drift & plate tectonics
                "wegeners ideas of continental drift and limitations",
                "plate tectonics background and evidences",
                "plate boundaries and their features",
                "seismicity and volcanism vis a vis plate boundaries",
                "mechanisms of plate movements",

                # Module V – Marine and atmospheric systems
                "importance of marine environment",
                "ocean circulation surface and deep circulation",
                "coastal upwelling and downwelling",
                "ocean floor topography",
                "marine sediments and turbidity currents",
                "coral reefs types and formation",
                "structure and composition of the atmosphere",
                "heat budget and radiation balance of earth",
                "greenhouse effect and global warming causes and effects",
            ]
        elif isinstance(syllabus_topics, str):
            # A bare string would otherwise be encoded as one sentence while
            # len() counts its characters, breaking the topic indexing.
            syllabus_topics = [syllabus_topics]
        else:
            # Private copy: the topic names must stay aligned with the
            # embeddings cached below even if the caller mutates its list.
            syllabus_topics = list(syllabus_topics)

        self.syllabus_topics: List[str] = syllabus_topics

        # Cache topic embeddings as numpy
        self.topic_embeddings: np.ndarray = self.embedder.encode(
            syllabus_topics,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )

        # Runtime collections
        self.paragraphs: List[ParagraphRecord] = []
        self.figures: List[FigureRecord] = []
        self.tables: List[TableRecord] = []

        # Cached paragraph embeddings (dedup + topics)
        self.para_embs: Optional[np.ndarray] = None

    # ---------------- HELPERS ----------------

    def has_text_layer(self, page) -> bool:
        """Heuristic: does this PDF page have a decent text layer?"""
        try:
            text = page.extract_text()
        except Exception:
            return False
        return bool(text and len(text.strip()) > 80 and "�" not in text[:300])

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        if not text:
            return ""
        # Fix hyphenation across line breaks
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Collapse spaces / tabs
        text = re.sub(r"[ \t]+", " ", text)
        # Normalize 3+ newlines to 2
        text = re.sub(r"\n{3,}", "\n\n", text.strip())
        return text.strip()

    @staticmethod
    def is_heading_line(line: str) -> bool:
        s = line.strip()
        if not s:
            return False
        if len(s) > 80:
            return False
        # Numbered headings: "1.2 Title"
        if re.match(r"^(\d+(\.\d+)*)\s+.+", s):
            return True
        # "Chapter 1", "Module 2", etc.
        if re.match(r"^(chapter|unit|section|module)\s+\d+", s, re.IGNORECASE):
            return True
        # ALL CAPS short
        if s == s.upper() and len(s.split()) <= 10:
            return True
        # Title Case short
        if s.istitle() and 1 <= len(s.split()) <= 8:
            return True
        return False

    def ocr_image_text(self, pil_image: Image.Image) -> str:
        """Run RapidOCR on a PIL image and return the recognised text.

        The image is shrunk to fit within 2000x2000 before OCR to bound the
        work. The shrink is applied to a copy, because ``Image.thumbnail``
        resizes in place and would otherwise alter the caller's image.

        Args:
            pil_image: The image to read.

        Returns:
            The detected text lines joined with newlines, or "" when nothing
            was detected or any step failed (best-effort by design).
        """
        try:
            working = pil_image.copy()
            try:
                working.thumbnail((2000, 2000))
                result, _ = self.ocr(working)
            finally:
                working.close()
            if not result:
                return ""
            # Each result entry carries its text at index 1.
            return "\n".join([r[1] for r in result]).strip()
        except Exception:
            return ""

    # ---------------- PARAGRAPH EMBEDDING CACHE ----------------

    def compute_para_embeddings(self):
        """Compute paragraph embeddings once, cache for dedup + topic assignment."""
        if self.para_embs is not None:
            return

        if not self.paragraphs:
            self.para_embs = np.empty((0, 384), dtype=np.float32)
            return

        texts = [p.text for p in self.paragraphs]
        print(f"Computing embeddings for {len(texts)} paragraphs (cached)...")
        self.para_embs = self.embedder.encode(
            texts,
            normalize_embeddings=True,
            convert_to_numpy=True,
            batch_size=64,
            show_progress_bar=True,
        )

    # ---------------- FIGURE DEDUPLICATION (PHASH) ----------------

    def deduplicate_figures(
        self,
        hash_size: int = PHASH_SIZE,
        distance_threshold: int = PHASH_DISTANCE,
    ):
        """Drop figures that are perceptual-hash duplicates of an earlier one.

        Figures are visited in order. A figure whose hash differs from any
        already-kept hash by less than ``distance_threshold`` is removed from
        ``self.figures`` and its image file is deleted. A figure that cannot
        be opened or hashed is kept.

        Args:
            hash_size: ``hash_size`` passed to ``imagehash.phash``.
            distance_threshold: Hash differences below this count as duplicates.
        """
        if not self.figures:
            return
        print("Deduplicating figures (perceptual hash)...")

        seen_hashes = []
        unique_figs: List[FigureRecord] = []
        for fig in tqdm(self.figures, desc="Figure dedup"):
            try:
                with Image.open(fig.img_path) as img:
                    fig_hash = imagehash.phash(img, hash_size=hash_size)
            except Exception:
                # If hashing fails, keep the figure (safer)
                unique_figs.append(fig)
                continue

            if any((fig_hash - prev) < distance_threshold for prev in seen_hashes):
                # Failing to delete the file must not bring the duplicate
                # back into the kept list, so every OSError is absorbed here.
                try:
                    os.remove(fig.img_path)
                except OSError:
                    pass
                continue

            seen_hashes.append(fig_hash)
            unique_figs.append(fig)

        self.figures = unique_figs
        print(f"Figure deduplication: {len(self.figures)} unique figures kept.")

    # ---------------- PDF EXTRACTION (HYBRID) ----------------

    def extract_pdf(self, path: str):
        """Extract figures, tables and paragraphs from every page of a PDF.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs``. Extraction is best-effort: a PDF that cannot be
        opened is reported and skipped, and a failure in one figure, table or
        OCR pass does not stop the remaining pages.

        Args:
            path: Path of the PDF file.
        """
        file_name = Path(path).name
        print(f"  PDF: {file_name}")
        try:
            pdf = pdfplumber.open(path)
        except Exception as e:
            print(f"  !! Failed to open PDF '{file_name}': {e}")
            return

        with pdf:
            for page_index, page in enumerate(pdf.pages, start=1):
                self._pdf_collect_figures(page, file_name, page_index)
                self._pdf_collect_tables(page, file_name, page_index)
                self._pdf_collect_paragraphs(page, file_name, page_index)

    def _pdf_collect_figures(self, page, file_name: str, page_index: int):
        """Crop, save and caption each embedded image on one PDF page."""
        try:
            images = list(page.images)
        except Exception:
            return

        for img in images:
            try:
                bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
                pil_img = page.crop(bbox).to_image(resolution=FIG_DPI).original
            except Exception:
                # Cropping or rendering failed for this image; try the next.
                continue

            img_path = os.path.join(
                self.assets_dir,
                f"fig_{uuid.uuid4().hex[:8]}_{file_name}_p{page_index}.png",
            )
            try:
                # Save before OCR so the file on disk is the full-size crop.
                pil_img.save(img_path)
                ocr_caption = self.ocr_image_text(pil_img)
            except Exception as e:
                print(f"  !! Skipping a figure on page {page_index} of '{file_name}': {e}")
                continue
            finally:
                pil_img.close()

            caption = f"Figure from {file_name}, page {page_index}"
            if ocr_caption:
                caption += " — " + ocr_caption[:200]

            self.figures.append(
                FigureRecord(
                    id=uuid.uuid4().hex,
                    img_path=img_path,
                    source_file=file_name,
                    source_page=page_index,
                    caption=caption,
                )
            )

    @staticmethod
    def _pdf_table_to_markdown(table_data) -> str:
        """Render rows of cells as a Markdown table; "" when there is no header."""

        def cell_text(cell) -> str:
            if cell is None:
                return ""
            # A raw newline or pipe inside a cell would end the Markdown row.
            return " ".join(str(cell).split()).replace("|", "\\|")

        rows = [[cell_text(c) for c in row] for row in (table_data or []) if row]
        if not rows:
            return ""

        header, body = rows[0], rows[1:]
        md_lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * len(header)) + " |",
        ]
        md_lines.extend("| " + " | ".join(r) + " |" for r in body)
        return "\n".join(md_lines)

    def _pdf_collect_tables(self, page, file_name: str, page_index: int):
        """Record every table pdfplumber finds on one PDF page."""
        try:
            # extract_tables() returns all tables; extract_table() only one.
            tables = page.extract_tables() or []
        except Exception:
            tables = []

        for table_data in tables:
            md_table = self._pdf_table_to_markdown(table_data)
            if not md_table:
                continue
            self.tables.append(
                TableRecord(
                    id=uuid.uuid4().hex,
                    md_table=md_table,
                    source_file=file_name,
                    source_page=page_index,
                )
            )

    def _pdf_page_ocr_text(self, page) -> str:
        """Rasterize one PDF page and OCR it; "" if rendering or OCR fails."""
        try:
            page_img = page.to_image(resolution=PAGE_DPI).original
        except Exception:
            return ""
        try:
            result, _ = self.ocr(page_img)
        except Exception:
            return ""
        finally:
            page_img.close()
        return "\n".join([r[1] for r in result]) if result else ""

    def _pdf_collect_paragraphs(self, page, file_name: str, page_index: int):
        """Build paragraph records for one PDF page (text layer and/or OCR)."""
        text_layer_ok = self.has_text_layer(page)

        # Rasterize and OCR the page at most once, and only when the result
        # is used: as the main text of a scanned page, or as the hybrid pass.
        ocr_text = ""
        if not text_layer_ok or self.hybrid_page_ocr:
            ocr_text = self._pdf_page_ocr_text(page)

        if text_layer_ok:
            try:
                base_text = page.extract_text() or ""
            except Exception:
                base_text = ""
        else:
            base_text = ocr_text
        base_text = self.normalize_whitespace(base_text)

        # Hybrid OCR: even when text-layer is good, pull extra text from diagrams
        if text_layer_ok and self.hybrid_page_ocr and ocr_text:
            extra_lines = []
            for line in self.normalize_whitespace(ocr_text).splitlines():
                line = line.strip()
                if len(line) < OCR_MIN_LINE_LEN:
                    continue
                if line not in base_text:
                    extra_lines.append(line)
            if extra_lines:
                base_text = base_text + "\n\n" + "\n".join(extra_lines)

        text = self.normalize_whitespace(base_text)
        if not text:
            return

        # Split into paragraphs on blank lines
        for para in re.split(r"\n\s*\n", text):
            p_clean = self.normalize_whitespace(para)
            if len(p_clean) < PARA_MIN_CHARS:
                continue
            lines = p_clean.split("\n")
            is_heading = len(lines) == 1 and self.is_heading_line(lines[0])

            self.paragraphs.append(
                ParagraphRecord(
                    id=uuid.uuid4().hex,
                    text=p_clean,
                    source_file=file_name,
                    source_page=page_index,
                    is_heading=is_heading,
                )
            )

    # ---------------- PPTX EXTRACTION ----------------

    def extract_pptx(self, path: str):
        """Extract pictures, tables and text from every slide of a PPTX file.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs`` with the 1-based slide number as ``source_page``.
        Extraction is best-effort: a deck that cannot be opened is reported
        and skipped, and a shape that cannot be processed is ignored.

        Args:
            path: Path of the PPTX file.
        """
        from pptx import Presentation
        from pptx.enum.shapes import MSO_SHAPE_TYPE

        file_name = Path(path).name
        print(f"  PPTX: {file_name}")

        try:
            prs = Presentation(path)
        except Exception as e:
            print(f"  !! Failed to open PPTX '{file_name}': {e}")
            return

        def is_picture(shape) -> bool:
            # Reading shape_type can itself raise for shapes the library
            # does not classify, so the lookup is guarded.
            try:
                return shape.shape_type == MSO_SHAPE_TYPE.PICTURE
            except Exception:
                return False

        def md_row(cells) -> str:
            # A raw newline or pipe inside a cell would end the Markdown row.
            cleaned = [" ".join((c or "").split()).replace("|", "\\|") for c in cells]
            return "| " + " | ".join(cleaned) + " |"

        for slide_index, slide in enumerate(prs.slides, start=1):
            shapes = list(slide.shapes)

            # FIGURES
            for shape in shapes:
                if not is_picture(shape):
                    continue
                img_path = os.path.join(
                    self.assets_dir,
                    f"fig_{uuid.uuid4().hex[:8]}_{file_name}_s{slide_index}.png",
                )
                try:
                    img_bytes = shape.image.blob
                    with open(img_path, "wb") as f:
                        f.write(img_bytes)
                    with Image.open(img_path) as pil_img:
                        ocr_caption = self.ocr_image_text(pil_img)
                except Exception:
                    # Do not leave a file behind for a figure never recorded.
                    try:
                        os.remove(img_path)
                    except OSError:
                        pass
                    continue

                caption = f"Figure from {file_name}, slide {slide_index}"
                if ocr_caption:
                    caption += " — " + ocr_caption[:200]

                self.figures.append(
                    FigureRecord(
                        id=uuid.uuid4().hex,
                        img_path=img_path,
                        source_file=file_name,
                        source_page=slide_index,
                        caption=caption,
                    )
                )

            # TABLES
            for shape in shapes:
                if not getattr(shape, "has_table", False):
                    continue
                try:
                    rows = [[c.text for c in row.cells] for row in shape.table.rows]
                except Exception:
                    continue
                if not rows:
                    continue

                md_lines = [
                    md_row(rows[0]),
                    "| " + " | ".join(["---"] * len(rows[0])) + " |",
                ]
                md_lines.extend(md_row(r) for r in rows[1:])

                self.tables.append(
                    TableRecord(
                        id=uuid.uuid4().hex,
                        md_table="\n".join(md_lines),
                        source_file=file_name,
                        source_page=slide_index,
                    )
                )

            # TEXT
            text_parts = []
            for shape in shapes:
                if getattr(shape, "has_text_frame", False):
                    t = shape.text or ""
                    if t.strip():
                        text_parts.append(t.strip())

            if not text_parts:
                continue

            full_text = self.normalize_whitespace("\n".join(text_parts))
            if not full_text:
                continue

            for para in re.split(r"\n\s*\n", full_text):
                p_clean = self.normalize_whitespace(para)
                if len(p_clean) < PARA_MIN_CHARS:
                    continue
                lines = p_clean.split("\n")
                is_heading = len(lines) == 1 and self.is_heading_line(lines[0])

                self.paragraphs.append(
                    ParagraphRecord(
                        id=uuid.uuid4().hex,
                        text=p_clean,
                        source_file=file_name,
                        source_page=slide_index,
                        is_heading=is_heading,
                    )
                )

    # ---------------- DOCX EXTRACTION (FULL) ----------------

    def extract_docx(self, path: str):
        """Extract images, tables and paragraphs from a DOCX file.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs`` with ``source_page=0``, because no page number is
        tracked for DOCX content. Extraction is best-effort: a document that
        cannot be opened is reported and skipped, and an image or table that
        cannot be processed is ignored.

        Args:
            path: Path of the DOCX file.
        """
        from docx import Document

        file_name = Path(path).name
        print(f"  DOCX: {file_name}")

        try:
            doc = Document(path)
        except Exception as e:
            print(f"  !! Failed to open DOCX '{file_name}': {e}")
            return

        def md_row(cells) -> str:
            # A raw newline or pipe inside a cell would end the Markdown row.
            cleaned = [" ".join((c or "").split()).replace("|", "\\|") for c in cells]
            return "| " + " | ".join(cleaned) + " |"

        # IMAGES (from relationships)
        image_reltype = (
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
        )
        for rel in list(doc.part.rels.values()):
            if rel.reltype != image_reltype:
                continue
            img_path = os.path.join(
                self.assets_dir,
                f"fig_{uuid.uuid4().hex[:8]}_{file_name}_docx.png",
            )
            try:
                img_bytes = rel.target_part.blob
                with open(img_path, "wb") as f:
                    f.write(img_bytes)
                with Image.open(img_path) as pil_img:
                    ocr_caption = self.ocr_image_text(pil_img)
            except Exception:
                # Do not leave a file behind for a figure never recorded.
                try:
                    os.remove(img_path)
                except OSError:
                    pass
                continue

            caption = f"Figure from {file_name} (DOCX)"
            if ocr_caption:
                caption += " — " + ocr_caption[:200]

            self.figures.append(
                FigureRecord(
                    id=uuid.uuid4().hex,
                    img_path=img_path,
                    source_file=file_name,
                    source_page=0,  # DOCX has no strict page mapping
                    caption=caption,
                )
            )

        # TABLES
        for table in doc.tables:
            try:
                rows = [[c.text for c in row.cells] for row in table.rows]
            except Exception:
                continue
            if not rows:
                continue

            md_lines = [
                md_row(rows[0]),
                "| " + " | ".join(["---"] * len(rows[0])) + " |",
            ]
            md_lines.extend(md_row(r) for r in rows[1:])

            self.tables.append(
                TableRecord(
                    id=uuid.uuid4().hex,
                    md_table="\n".join(md_lines),
                    source_file=file_name,
                    source_page=0,
                )
            )

        # PARAGRAPHS (preserve heading styles)
        for para in doc.paragraphs:
            text = para.text or ""
            if not text.strip():
                continue
            p_clean = self.normalize_whitespace(text)

            # Heading if the style name says so or the text looks like one;
            # a missing style or style name is treated as "not a heading style".
            style = getattr(para, "style", None)
            style_name = (getattr(style, "name", None) or "").lower()
            is_heading = "heading" in style_name or self.is_heading_line(p_clean)

            # Short text is kept only when it is a heading.
            if len(p_clean) < PARA_MIN_CHARS and not is_heading:
                continue

            self.paragraphs.append(
                ParagraphRecord(
                    id=uuid.uuid4().hex,
                    text=p_clean,
                    source_file=file_name,
                    source_page=0,
                    is_heading=is_heading,
                )
            )

    # ---------------- MASTER EXTRACTION DISPATCH ----------------

    def extract_files(self, filepaths: List[str]):
        """Run the matching extractor on each path, chosen by file extension.

        ``.pdf``, ``.pptx`` and ``.docx`` (case-insensitive) are handled; any
        other extension is reported and skipped.
        """
        extractors = {
            ".pdf": self.extract_pdf,
            ".pptx": self.extract_pptx,
            ".docx": self.extract_docx,
        }
        for fp in tqdm(filepaths, desc="Extracting documents"):
            extractor = extractors.get(Path(fp).suffix.lower())
            if extractor is None:
                print(f"Skipping unsupported file: {fp}")
                continue
            extractor(fp)

    # ---------------- SEMANTIC PARAGRAPH DEDUP ----------------

    def deduplicate_paragraphs(self, similarity_threshold: float = DEDUP_THRESHOLD):
        """Remove paragraphs that are near-duplicates of an earlier kept one.

        Paragraphs are visited in order; one is dropped when its highest
        cosine similarity to any already-kept paragraph exceeds
        ``similarity_threshold``. ``self.paragraphs`` and ``self.para_embs``
        are both reduced to the kept entries so they stay aligned.
        """
        if not self.paragraphs:
            print("No paragraphs to deduplicate.")
            return

        self.compute_para_embeddings()
        all_embs = self.para_embs
        if all_embs is None or len(all_embs) == 0:
            print("No embeddings available; skipping deduplication.")
            return

        survivors: List[int] = []
        survivor_vecs: List[np.ndarray] = []
        for position, vector in enumerate(all_embs):
            is_duplicate = False
            if survivor_vecs:
                scores = util.cos_sim(
                    np.array([vector]), np.array(survivor_vecs)
                )[0].cpu().numpy()
                is_duplicate = np.max(scores) > similarity_threshold
            if not is_duplicate:
                survivors.append(position)
                survivor_vecs.append(vector)

        count_before = len(self.paragraphs)
        self.paragraphs = [self.paragraphs[i] for i in survivors]
        # Keep the cached embeddings aligned with the surviving paragraphs.
        self.para_embs = (
            np.array(survivor_vecs)
            if survivor_vecs
            else np.empty((0, all_embs.shape[1]), dtype=np.float32)
        )

        print(f"Deduplicated: {count_before} → {len(self.paragraphs)} paragraphs.")

    # ---------------- SYLLABUS-BASED TOPIC ASSIGNMENT ----------------

    def assign_topics(self):
        if not self.paragraphs:
            print("No paragraphs to assign topics to.")
            return

        self.compute_para_embeddings()
        if self.para_embs is None or self.para_embs.size == 0:
            print("No embeddings available; skipping topic assignment.")
            return

        sims = util.cos_sim(self.para_embs, self.topic_embeddings).cpu().numpy()
        misc_idx = len(self.syllabus_topics)

        for i, p in enumerate(self.paragraphs):
            row = sims[i]
            best_idx = int(np.argmax(row))
            best_score = float(row[best_idx])

            if best_score < self.misc_threshold:
                p.topic_index = misc_idx
            else:
                p.topic_index = best_idx
            p.topic_score = best_score

    # ---------------- BOOK BUILDER (MARKDOWN) ----------------

    def build_markdown_book(self, output_file: str = "SMARTDOC_V25_TEXTBOOK.md") -> str:
        topic_to_paras: Dict[int, List[ParagraphRecord]] = {}
        misc_idx = len(self.syllabus_topics)

        for p in self.paragraphs:
            idx = p.topic_index if p.topic_index is not None else misc_idx
            topic_to_paras.setdefault(idx, []).append(p)

        # Sort paragraphs within each topic: highest score first, then by source
        for idx in topic_to_paras:
            topic_to_paras[idx].sort(
                key=lambda pr: (-pr.topic_score, pr.source_file, pr.source_page)
            )

        with open(output_file, "w", encoding="utf-8") as f:
            # Title
            f.write("# SMARTDOC V25 TEXTBOOK\n\n---\n\n")

            # TOC
            f.write("## Table of Contents\n\n")
            toc_num = 1
            for i, topic in enumerate(self.syllabus_topics):
                if i in topic_to_paras:
                    f.write(f"{toc_num}. {topic.title()}\n")
                    toc_num += 1

            if misc_idx in topic_to_paras:
                f.write(f"{toc_num}. Miscellaneous / Low-Relevance\n")
                toc_num += 1
            if self.figures:
                f.write(f"{toc_num}. Extracted Figures\n")
                toc_num += 1
            if self.tables:
                f.write(f"{toc_num}. Extracted Tables\n")
            f.write("\n---\n\n")

            # MAIN CHAPTERS
            for topic_idx, topic_name in enumerate(self.syllabus_topics):
                paras = topic_to_paras.get(topic_idx, [])
                if not paras:
                    continue
                f.write(f"## {topic_name.title()}\n\n")
                for p in paras:
                    if p.is_heading:
                        f.write(f"### {p.text.strip()}\n\n")
                    else:
                        f.write(p.text + "\n\n")
                    source = f"Source: {p.source_file}"
                    if p.source_page and p.source_page > 0:
                        source += f", page {p.source_page}"
                    f.write(f"*{source}*\n\n")

            # MISCELLANEOUS
            misc_paras = topic_to_paras.get(misc_idx, [])
            if misc_paras:
                misc_paras.sort(key=lambda p: (p.source_file, p.source_page))
                f.write("## Miscellaneous / Low-Relevance\n\n")
                for p in misc_paras:
                    if p.is_heading:
                        f.write(f"### {p.text.strip()}\n\n")
                    else:
                        f.write(p.text + "\n\n")
                    source = f"Source: {p.source_file}"
                    if p.source_page and p.source_page > 0:
                        source += f", page {p.source_page}"
                    f.write(f"*{source}*\n\n")

            # FIGURES
            if self.figures:
                f.write("\n## Extracted Figures\n\n")
                for fig in self.figures:
                    rel_path = os.path.relpath(
                        fig.img_path, Path(output_file).parent
                    )
                    f.write(f"![{fig.caption}]({rel_path})\n\n")
                    source = f"Source: {fig.source_file}"
                    if fig.source_page and fig.source_page > 0:
                        source += f", page {fig.source_page}"
                    f.write(f"*{source}*\n\n")

            # TABLES
            if self.tables:
                f.write("\n## Extracted Tables\n\n")
                for tab in self.tables:
                    f.write(tab.md_table + "\n\n")
                    source = f"Table from {tab.source_file}"
                    if tab.source_page and tab.source_page > 0:
                        source += f", page {tab.source_page}"
                    f.write(f"*{source}*\n\n")

        print(f"\nTEXTBOOK BUILT: {output_file}")
        return output_file


# ======================= DRIVER / COLAB LOGIC =======================

print("SMARTDOC V25 – Hybrid Vision Edition (PDF + PPTX + full DOCX)")

# Ask the user for input documents through the Colab upload widget; the keys
# of the returned mapping are used as the file paths to process.
uploaded = files.upload()
filepaths = list(uploaded.keys())

if not filepaths:
    print("No files provided.")
else:
    engine = SmartDocV25(hybrid_page_ocr=True)

    # The stages must run in this order: both deduplication passes and the
    # topic assignment work on what extraction collected, and the book
    # builder reads the topic assignment.
    print("\n=== EXTRACTION STAGE ===")
    engine.extract_files(filepaths)

    print("\n=== FIGURE DEDUPLICATION STAGE ===")
    engine.deduplicate_figures()

    print("\n=== PARAGRAPH DEDUPLICATION STAGE ===")
    engine.deduplicate_paragraphs(similarity_threshold=DEDUP_THRESHOLD)

    print("\n=== TOPIC ASSIGNMENT STAGE ===")
    engine.assign_topics()

    print("\n=== BOOK BUILDING STAGE ===")
    out_md = engine.build_markdown_book("SMARTDOC_V25_TEXTBOOK.md")

    print("\n=== DOWNLOAD ===")
    files.download(out_md)

    # The Markdown links to image files in the assets directory. Downloading
    # only the .md leaves those links dead, so a bundle containing the book
    # and its figures (same relative paths) is offered as well.
    if engine.figures:
        import zipfile

        bundle_path = "SMARTDOC_V25_TEXTBOOK_WITH_FIGURES.zip"
        with zipfile.ZipFile(bundle_path, "w", zipfile.ZIP_DEFLATED) as bundle:
            bundle.write(out_md)
            for fig in engine.figures:
                if os.path.isfile(fig.img_path):
                    bundle.write(fig.img_path)
        files.download(bundle_path)

    print("\nDone. SmartDoc V25 textbook is ready.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saving AES_M4_class notes.pdf to AES_M4_class notes.pdf
Saving AES_Module 5_PPT 1.pdf to AES_Module 5_PPT 1.pdf
Saving AES_Module 5_PPT 2.pdf to AES_Module 5_PPT 2.pdf
Saving AES-module 5.pdf to AES-module 5.pdf
Saving AES-M3.pptx to AES-M3.pptx
Saving AES_Module 3.pdf to AES_Module 3.pdf
Saving AES_Module 1 PPT 3 (1) (1).pdf to AES_Module 1 PPT 3 (1) (1).pdf
Saving AES-M 2_class note.pdf to AES-M 2_class note.pdf
Saving Module 1 AES (1).pptx to Module 1 AES (1).pptx


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


=== EXTRACTION STAGE ===


Extracting documents:   0%|          | 0/9 [00:00<?, ?it/s]

  PDF: AES_M4_class notes.pdf


Extracting documents:  11%|█         | 1/9 [10:01<1:20:10, 601.33s/it]

  PDF: AES_Module 5_PPT 1.pdf


Extracting documents:  22%|██▏       | 2/9 [30:21<1:52:38, 965.49s/it]

  PDF: AES_Module 5_PPT 2.pdf


Extracting documents:  33%|███▎      | 3/9 [38:36<1:15:04, 750.72s/it]

  PDF: AES-module 5.pdf


Extracting documents:  44%|████▍     | 4/9 [40:36<41:47, 501.43s/it]  

  PPTX: AES-M3.pptx


Extracting documents:  56%|█████▌    | 5/9 [41:13<22:16, 334.01s/it]

  PDF: AES_Module 3.pdf


Extracting documents:  67%|██████▋   | 6/9 [1:00:34<30:45, 615.28s/it]

  PDF: AES_Module 1 PPT 3 (1) (1).pdf


Extracting documents:  78%|███████▊  | 7/9 [1:14:50<23:08, 694.06s/it]

  PDF: AES-M 2_class note.pdf


Extracting documents:  89%|████████▉ | 8/9 [1:21:08<09:53, 593.19s/it]

  PPTX: Module 1 AES (1).pptx


Extracting documents: 100%|██████████| 9/9 [1:24:14<00:00, 561.64s/it]



=== FIGURE DEDUPLICATION STAGE ===
Deduplicating figures (perceptual hash)...


Figure dedup: 100%|██████████| 722/722 [00:15<00:00, 45.87it/s]

Figure deduplication: 687 unique figures kept.

=== PARAGRAPH DEDUPLICATION STAGE ===
Computing embeddings for 1377 paragraphs (cached)...





Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Deduplicated: 1377 → 1033 paragraphs.

=== TOPIC ASSIGNMENT STAGE ===

=== BOOK BUILDING STAGE ===

TEXTBOOK BUILT: SMARTDOC_V25_TEXTBOOK.md

=== DOWNLOAD ===


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Done. SmartDoc V25 textbook is ready.


In [None]:
# ================================================================
#              SMARTDOC V26 — LAYER 1: TRUTH KERNEL
#      (Extraction engine + Truth kernel + Transformation bridge)
# ================================================================
!pip install -q pdfplumber python-pptx python-docx rapidocr-onnxruntime sentence-transformers tqdm imagehash

import os
import re
import uuid
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional

import numpy as np
import pdfplumber
from PIL import Image
import imagehash
from sentence_transformers import SentenceTransformer, util

# ================================================================
#                         CONFIG CONSTANTS
# ================================================================

PAGE_DPI = 180           # resolution used when rasterizing a whole PDF page for OCR
FIG_DPI = 180            # resolution used when rendering a cropped figure region
PARA_MIN_CHARS = 30      # paragraphs shorter than this are dropped
OCR_MIN_LINE_LEN = 20    # hybrid-OCR lines shorter than this are ignored
DEDUP_THRESHOLD = 0.85   # cosine similarity above which a paragraph counts as a duplicate
MISC_THRESHOLD = 0.40    # best topic similarity below this sends a paragraph to the Misc bucket
PHASH_SIZE = 16          # perceptual-hash size (not referenced by the extractor in this cell)
PHASH_DISTANCE = 10      # perceptual-hash Hamming distance for duplicate figures (likewise unused here)

# ================================================================
#             V25 DATA MODELS (Extraction Layer)
# ================================================================

@dataclass
class V25Paragraph:
    """One block of text produced by the extraction layer."""
    id: str  # unique identifier; ExtractorV25 fills it with uuid4().hex
    text: str  # whitespace-normalized paragraph text
    source_file: str  # base name of the document the text came from
    source_page: int  # 1-based page (PDF) or slide (PPTX) number; 0 for DOCX
    is_heading: bool  # True when the extractor classified the paragraph as a heading
    topic_index: Optional[int] = None  # set by ExtractorV25.assign_topics(); len(syllabus_topics) means "Misc"
    topic_score: float = 0.0  # best cosine similarity against the syllabus topics

@dataclass
class V25Figure:
    """One image saved to disk by the extraction layer."""
    id: str  # unique identifier; ExtractorV25 fills it with uuid4().hex
    img_path: str  # location of the saved image file inside the assets directory
    source_file: str  # base name of the document the image came from
    source_page: int  # 1-based page (PDF) or slide (PPTX) number; 0 for DOCX
    caption: str = ""  # "Figure from ..." text, optionally followed by OCR text found in the image

@dataclass
class V25Table:
    """One table captured by the extraction layer, already rendered as Markdown."""
    id: str  # unique identifier; ExtractorV25 fills it with uuid4().hex
    md_table: str  # pipe-delimited Markdown: header row, separator row, data rows
    source_file: str  # base name of the document the table came from
    source_page: int  # 1-based page (PDF) or slide (PPTX) number; 0 for DOCX

# ================================================================
#              TRUTH KERNEL V26 (Clean Representation)
# ================================================================

@dataclass
class TKParagraph:
    """Truth-kernel paragraph: same data as V25Paragraph, with no defaulted fields."""
    id: str  # copied from the V25Paragraph it was built from
    text: str  # paragraph text
    topic_index: Optional[int]  # index into TruthKernel.topics; len(topics) is the Misc bucket; None if topics were never assigned
    topic_score: float  # similarity score recorded during topic assignment
    source_file: str  # base name of the originating document
    source_page: int  # 1-based page/slide number; 0 for DOCX
    is_heading: bool  # heading flag carried over from extraction

@dataclass
class TKFigure:
    """Truth-kernel figure: a saved image plus its caption and provenance."""
    id: str  # copied from the V25Figure it was built from
    caption: str  # caption text generated during extraction
    img_path: str  # location of the image file on disk
    source_file: str  # base name of the originating document
    source_page: int  # 1-based page/slide number; 0 for DOCX

@dataclass
class TKTable:
    """Truth-kernel table: Markdown rendering plus provenance."""
    id: str  # copied from the V25Table it was built from
    markdown: str  # the table as Markdown text (V25Table.md_table)
    source_file: str  # base name of the originating document
    source_page: int  # 1-based page/slide number; 0 for DOCX
    topic_index: Optional[int] = None  # TruthKernelBuilder.from_extractor leaves this at None

@dataclass
class TruthKernel:
    """
    Container for everything the extraction layer produced, in V26 form.

    - paragraphs: all TKParagraph records
    - figures:    all TKFigure records
    - tables:     all TKTable records
    - topics:     topic names; a paragraph's ``topic_index`` points into this
                  list (an index equal to ``len(topics)`` is the Misc bucket)
    """
    paragraphs: List[TKParagraph]
    figures: List[TKFigure]
    tables: List[TKTable]
    topics: List[str]

    # ---------- Query Helpers ----------
    def get_paragraphs_by_topic(self, topic_index: int) -> List[TKParagraph]:
        """Return the paragraphs whose ``topic_index`` equals *topic_index*."""
        return [p for p in self.paragraphs if p.topic_index == topic_index]

    def get_paragraphs_by_source(self, file) -> List[TKParagraph]:
        """Return the paragraphs whose ``source_file`` equals *file*."""
        return [p for p in self.paragraphs if p.source_file == file]

    def get_topic_by_name(self, name: str) -> Optional[str]:
        """
        Return the entry of ``topics`` equal to *name*, or None if there is none.

        The loop variable must not reuse the parameter name: doing so made the
        lookup ignore its argument and return the first topic every time.
        """
        for topic in self.topics:
            if topic == name:
                return topic
        return None

# ================================================================
#               EXTRACTION ENGINE (Based on V25)
#        Only functional refactor, logic unchanged
# ================================================================

class ExtractorV25:
    """
    Identical logic to V25 extraction, but contained in a single class.
    Produces V25Paragraph, V25Figure, V25Table records.
    """

    def __init__(self, syllabus_topics: List[str], hybrid_page_ocr: bool = True):
        """
        Load the OCR engine and the sentence embedder, embed the syllabus
        topics, and create the assets directory.

        Args:
            syllabus_topics: Topic names that paragraphs are matched against
                in assign_topics().
            hybrid_page_ocr: When True, PDF pages that already have a text
                layer also get an OCR pass and any extra lines are appended.
        """
        # Imported here so the dependency is only needed once an extractor is built.
        from rapidocr_onnxruntime import RapidOCR
        self.ocr = RapidOCR()
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

        self.syllabus_topics = syllabus_topics
        # Normalized topic vectors, computed once and reused by assign_topics().
        self.topic_embeddings = self.embedder.encode(
            syllabus_topics,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )

        self.hybrid_page_ocr = hybrid_page_ocr
        # Extracted figure images are written into this directory (relative to the CWD).
        self.assets_dir = "smartdoc_v26_assets"
        os.makedirs(self.assets_dir, exist_ok=True)

        # Accumulated extracted content (V25 objects)
        self.paragraphs: List[V25Paragraph] = []
        self.figures: List[V25Figure] = []
        self.tables: List[V25Table] = []

        self.para_embs = None  # cached embeddings for dedup + topics

    # ---------- Utility methods copied from V25 ----------

    def normalize_whitespace(self, text: str) -> str:
        if not text:
            return ""
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text.strip())
        return text.strip()

    def is_heading_line(self, line: str) -> bool:
        s = line.strip()
        if not s:
            return False
        if len(s) > 80:
            return False
        if re.match(r"^(\d+(\.\d+)*)\s+.+", s):
            return True
        if re.match(r"^(chapter|unit|section|module)\s+\d+", s, re.IGNORECASE):
            return True
        if s == s.upper() and len(s.split()) <= 10:
            return True
        if s.istitle() and 1 <= len(s.split()) <= 8:
            return True
        return False

    def has_text_layer(self, page) -> bool:
        try:
            text = page.extract_text()
        except:
            return False
        return bool(text and len(text.strip()) > 80 and "�" not in text[:300])

    def ocr_image_text(self, img: Image.Image) -> str:
        try:
            img.thumbnail((2000, 2000))
            result, _ = self.ocr(img)
            if not result:
                return ""
            return "\n".join([r[1] for r in result]).strip()
        except:
            return ""

    # ============================================================
    #                    EXTRACTION DISPATCH
    # ============================================================

    def extract_files(self, paths: List[str]):
        for fp in paths:
            ext = fp.lower().split(".")[-1]
            if ext == "pdf":
                self.extract_pdf(fp)
            elif ext == "pptx":
                self.extract_pptx(fp)
            elif ext == "docx":
                self.extract_docx(fp)
            else:
                print(f"Skipping unsupported: {fp}")

    # ============================================================
    #                     PDF Extraction
    # ============================================================

    def extract_pdf(self, path: str):
        """
        Extract figures, tables and paragraphs from every page of a PDF.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs``; figure crops are written as PNG files into
        ``self.assets_dir``. A PDF that cannot be opened is reported on
        stdout and skipped.

        Per page:
          1. every embedded image region is cropped, saved and OCR-captioned;
          2. the table returned by ``page.extract_table()`` is stored as Markdown;
          3. text comes from the text layer when there is one, otherwise from
             OCR of the rasterized page; with ``hybrid_page_ocr`` enabled,
             pages with a text layer also get OCR lines appended when they
             are not already present.

        Args:
            path: Location of the PDF on disk.
        """
        file = Path(path).name
        print("PDF:", file)
        try:
            pdf = pdfplumber.open(path)
        except Exception as e:
            print("Could not open PDF:", e)
            return

        with pdf:
            for pageno, page in enumerate(pdf.pages, 1):
                # FIGURES -------------------------------------------------
                for img in page.images:
                    bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
                    try:
                        pil_img = page.crop(bbox).to_image(resolution=FIG_DPI).original
                    except Exception:
                        # Best effort: a region that cannot be cropped or
                        # rendered is skipped instead of aborting the file.
                        continue

                    img_path = os.path.join(
                        self.assets_dir,
                        f"fig_{uuid.uuid4().hex[:8]}_{file}_p{pageno}.png"
                    )
                    pil_img.save(img_path)

                    # OCR works on a fresh handle because ocr_image_text()
                    # shrinks the image it is given in place.
                    with Image.open(img_path) as pil:
                        ocr_cap = self.ocr_image_text(pil)

                    cap = f"Figure from {file}, page {pageno}"
                    if ocr_cap:
                        cap += " — " + ocr_cap[:200]

                    self.figures.append(
                        V25Figure(
                            id=uuid.uuid4().hex,
                            img_path=img_path,
                            source_file=file,
                            source_page=pageno,
                            caption=cap
                        )
                    )

                # TABLES --------------------------------------------------
                try:
                    tbl = page.extract_table()
                except Exception:
                    tbl = None

                if tbl:
                    # Empty cells come back as None; render them as blanks.
                    header = [c or "" for c in tbl[0]]
                    rows = [[c or "" for c in r] for r in tbl[1:]]
                    md = []
                    md.append("| " + " | ".join(header) + " |")
                    md.append("| " + " | ".join(["---"] * len(header)) + " |")
                    for r in rows:
                        md.append("| " + " | ".join(r) + " |")
                    self.tables.append(
                        V25Table(
                            id=uuid.uuid4().hex,
                            md_table="\n".join(md),
                            source_file=file,
                            source_page=pageno
                        )
                    )

                # TEXT ----------------------------------------------------
                page_img = None
                try:
                    page_img = page.to_image(PAGE_DPI).original
                except Exception:
                    # Without a raster the page simply gets no OCR pass.
                    pass

                # try/finally guarantees the page raster is released even
                # when the page yields no text and the loop moves on early.
                try:
                    # Evaluated once per page: the check itself runs a full
                    # text extraction.
                    has_text = self.has_text_layer(page)

                    if has_text:
                        base_text = page.extract_text() or ""
                    elif page_img is not None:
                        result, _ = self.ocr(page_img)
                        base_text = "\n".join(r[1] for r in result) if result else ""
                    else:
                        base_text = ""

                    base_text = self.normalize_whitespace(base_text)

                    # Hybrid OCR
                    if has_text and self.hybrid_page_ocr and page_img is not None:
                        result_h, _ = self.ocr(page_img)
                        ocr_full = (
                            "\n".join(r[1] for r in result_h) if result_h else ""
                        )
                        ocr_full = self.normalize_whitespace(ocr_full)
                        for line in ocr_full.splitlines():
                            line = line.strip()
                            if len(line) < OCR_MIN_LINE_LEN:
                                continue
                            # Only add OCR lines the text layer did not already contain.
                            if line not in base_text:
                                base_text += "\n" + line

                    if not base_text:
                        continue

                    # Blank lines separate paragraphs.
                    for para in re.split(r"\n\s*\n", base_text):
                        txt = self.normalize_whitespace(para)
                        if len(txt) < PARA_MIN_CHARS:
                            continue
                        ishead = self.is_heading_line(txt.split("\n")[0])
                        self.paragraphs.append(
                            V25Paragraph(
                                id=uuid.uuid4().hex,
                                text=txt,
                                source_file=file,
                                source_page=pageno,
                                is_heading=ishead
                            )
                        )
                finally:
                    if page_img is not None:
                        page_img.close()

    # ============================================================
    #                    PPTX Extraction
    # ============================================================
    def extract_pptx(self, path: str):
        """
        Extract pictures, tables and text from every slide of a PPTX file.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs``; picture bytes are written into
        ``self.assets_dir``. A presentation that cannot be opened is reported
        on stdout and skipped, matching extract_pdf().

        Args:
            path: Location of the PPTX on disk.
        """
        from pptx import Presentation
        from pptx.enum.shapes import MSO_SHAPE_TYPE

        file = Path(path).name
        print("PPTX:", file)

        try:
            prs = Presentation(path)
        except Exception as e:
            print("Could not open PPTX:", e)
            return

        for s_idx, slide in enumerate(prs.slides, 1):

            # FIGURES
            for shape in slide.shapes:
                if getattr(shape, "shape_type", None) != MSO_SHAPE_TYPE.PICTURE:
                    continue
                try:
                    img_bytes = shape.image.blob
                except ValueError:
                    # Linked (not embedded) pictures carry no image bytes.
                    continue

                # The embedded bytes are written unchanged; the name always ends in .png.
                img_path = os.path.join(
                    self.assets_dir,
                    f"fig_{uuid.uuid4().hex[:8]}_{file}_s{s_idx}.png"
                )
                with open(img_path, "wb") as f:
                    f.write(img_bytes)

                try:
                    with Image.open(img_path) as pil:
                        captxt = self.ocr_image_text(pil)
                except OSError:
                    # Image data the imaging library cannot read: keep the
                    # figure, just without OCR text in its caption.
                    captxt = ""

                cap = f"Figure from {file}, slide {s_idx}"
                if captxt:
                    cap += " — " + captxt[:200]

                self.figures.append(
                    V25Figure(
                        id=uuid.uuid4().hex,
                        img_path=img_path,
                        source_file=file,
                        source_page=s_idx,
                        caption=cap
                    )
                )

            # TABLES
            for shape in slide.shapes:
                if hasattr(shape, "has_table") and shape.has_table:
                    table = shape.table
                    rows = [[c.text.strip() for c in row.cells] for row in table.rows]
                    if not rows:
                        continue
                    # First row is treated as the header.
                    header = rows[0]
                    data = rows[1:]
                    md = []
                    md.append("| " + " | ".join(header) + " |")
                    md.append("| " + " | ".join(["---"] * len(header)) + " |")
                    for r in data:
                        md.append("| " + " | ".join(r) + " |")

                    self.tables.append(
                        V25Table(
                            id=uuid.uuid4().hex,
                            md_table="\n".join(md),
                            source_file=file,
                            source_page=s_idx
                        )
                    )

            # TEXT
            tparts = []
            for shape in slide.shapes:
                if hasattr(shape, "has_text_frame") and shape.has_text_frame:
                    if shape.text and shape.text.strip():
                        tparts.append(shape.text.strip())

            if not tparts:
                continue

            full = self.normalize_whitespace("\n".join(tparts))
            # Blank lines separate paragraphs.
            paras = re.split(r"\n\s*\n", full)
            for p in paras:
                clean = self.normalize_whitespace(p)
                if len(clean) < PARA_MIN_CHARS:
                    continue
                ishead = self.is_heading_line(clean.split("\n")[0])
                self.paragraphs.append(
                    V25Paragraph(
                        id=uuid.uuid4().hex,
                        text=clean,
                        source_file=file,
                        source_page=s_idx,
                        is_heading=ishead
                    )
                )

    # ============================================================
    #                    DOCX Extraction
    # ============================================================
    def extract_docx(self, path: str):
        """
        Extract images, tables and paragraphs from a DOCX file.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs`` with ``source_page`` set to 0; image bytes are
        written into ``self.assets_dir``. A document that cannot be opened
        is reported on stdout and skipped, matching extract_pdf().

        Args:
            path: Location of the DOCX on disk.
        """
        from docx import Document
        file = Path(path).name
        print("DOCX:", file)

        try:
            doc = Document(path)
        except Exception as e:
            print("Could not open DOCX:", e)
            return

        # IMAGES
        for rel in doc.part.rels.values():
            if not rel.reltype.endswith("image"):
                continue
            if getattr(rel, "is_external", False):
                # Externally linked images have no part inside the package.
                continue

            img_bytes = rel.target_part.blob
            # The embedded bytes are written unchanged; the name always ends in .png.
            img_path = os.path.join(
                self.assets_dir,
                f"fig_{uuid.uuid4().hex[:8]}_{file}_docx.png"
            )
            with open(img_path, "wb") as f:
                f.write(img_bytes)

            try:
                with Image.open(img_path) as pil:
                    captxt = self.ocr_image_text(pil)
            except OSError:
                # Image data the imaging library cannot read: keep the
                # figure, just without OCR text in its caption.
                captxt = ""

            cap = f"Figure from {file} (DOCX)"
            if captxt:
                cap += " — " + captxt[:200]

            self.figures.append(
                V25Figure(
                    id=uuid.uuid4().hex,
                    img_path=img_path,
                    source_file=file,
                    source_page=0,
                    caption=cap
                )
            )

        # TABLES
        for table in doc.tables:
            rows = [[c.text.strip() for c in row.cells] for row in table.rows]
            if not rows:
                continue
            # First row is treated as the header.
            header = rows[0]
            data = rows[1:]
            md = []
            md.append("| " + " | ".join(header) + " |")
            md.append("| " + " | ".join(["---"] * len(header)) + " |")
            for r in data:
                md.append("| " + " | ".join(r) + " |")

            self.tables.append(
                V25Table(
                    id=uuid.uuid4().hex,
                    md_table="\n".join(md),
                    source_file=file,
                    source_page=0
                )
            )

        # PARAGRAPHS
        for para in doc.paragraphs:
            txt = para.text or ""
            if not txt.strip():
                continue
            clean = self.normalize_whitespace(txt)
            # A style without a name must not break the heading check.
            style_name = getattr(para.style, "name", None) or ""
            ishead = ("heading" in style_name.lower()) or self.is_heading_line(clean)
            # Short text is kept only when it is a heading.
            if len(clean) < PARA_MIN_CHARS and not ishead:
                continue

            self.paragraphs.append(
                V25Paragraph(
                    id=uuid.uuid4().hex,
                    text=clean,
                    source_file=file,
                    source_page=0,
                    is_heading=ishead
                )
            )

    # ============================================================
    #               Deduplication & Topic Assignment
    # ============================================================

    def compute_para_embeddings(self):
        if self.para_embs is not None:
            return
        if not self.paragraphs:
            self.para_embs = np.empty((0,384))
            return
        texts = [p.text for p in self.paragraphs]
        self.para_embs = self.embedder.encode(
            texts, normalize_embeddings=True, convert_to_numpy=True
        )

    def deduplicate(self):
        if not self.paragraphs:
            return
        self.compute_para_embeddings()
        kept = []
        kept_embs = []
        for i, emb in enumerate(self.para_embs):
            if kept_embs:
                sims = util.cos_sim(np.array([emb]), np.array(kept_embs))[0].cpu().numpy()
                if np.max(sims) > DEDUP_THRESHOLD:
                    continue
            kept.append(i)
            kept_embs.append(emb)
        self.paragraphs = [self.paragraphs[i] for i in kept]
        self.para_embs = np.array(kept_embs)

    def assign_topics(self):
        self.compute_para_embeddings()
        if not self.paragraphs:
            return
        sims = util.cos_sim(self.para_embs, self.topic_embeddings).cpu().numpy()
        misc = len(self.syllabus_topics)
        for i,p in enumerate(self.paragraphs):
            row = sims[i]
            idx = int(np.argmax(row))
            score = float(row[idx])
            if score < MISC_THRESHOLD:
                p.topic_index = misc
            else:
                p.topic_index = idx
            p.topic_score = score

# ================================================================
#     TRUTH KERNEL BUILDER (Converts V25 objects → V26 format)
# ================================================================

class TruthKernelBuilder:
    """Turns the records held by an ExtractorV25 into a TruthKernel."""

    @staticmethod
    def from_extractor(ext: ExtractorV25) -> TruthKernel:
        """
        Copy the extractor's paragraphs, figures, tables and topic names
        into their truth-kernel counterparts.

        Field values are carried over one-to-one; tables keep the default
        ``topic_index`` of None.
        """

        def as_kernel_paragraph(src):
            return TKParagraph(
                id=src.id,
                text=src.text,
                topic_index=src.topic_index,
                topic_score=src.topic_score,
                source_file=src.source_file,
                source_page=src.source_page,
                is_heading=src.is_heading,
            )

        def as_kernel_figure(src):
            return TKFigure(
                id=src.id,
                caption=src.caption,
                img_path=src.img_path,
                source_file=src.source_file,
                source_page=src.source_page,
            )

        def as_kernel_table(src):
            return TKTable(
                id=src.id,
                markdown=src.md_table,
                source_file=src.source_file,
                source_page=src.source_page,
            )

        return TruthKernel(
            paragraphs=[as_kernel_paragraph(p) for p in ext.paragraphs],
            figures=[as_kernel_figure(f) for f in ext.figures],
            tables=[as_kernel_table(t) for t in ext.tables],
            topics=list(ext.syllabus_topics),
        )


In [None]:
# ================================================================
#              SMARTDOC V26 – LAYER 2: DOCUMENT DSL
#          (Declarative "document program" specification)
# ================================================================

from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Literal, Union

# We will try to support YAML; if not available, the user can pass dicts or JSON
try:
    import yaml  # PyYAML
    _HAS_YAML = True
except Exception:
    _HAS_YAML = False

SummarizationLevel = Literal["none", "low", "medium", "high"]


# ---------------------- DSL DATA STRUCTURES ----------------------


@dataclass
class DocumentMetaConfig:
    """
    High-level metadata about the desired document.

    Required (the parser rejects missing, non-string or blank values):
    - id:         unique identifier for this document spec
    - type:       semantic type (e.g., "project_report", "seminar_report", "textbook")
    - template:   name of the formatting template to use later (LaTeX/DOCX/Markdown)

    Optional:
    - title:      human-friendly title
    - author:     author name
    - extra:      free-form metadata (college name, course, etc.); defaults to
                  a fresh empty dict per instance
    """
    id: str
    type: str
    template: str
    title: Optional[str] = None
    author: Optional[str] = None
    extra: Dict[str, Any] = field(default_factory=dict)


@dataclass
class SectionConfig:
    """
    Describes a logical section in the output document.

    - id:              unique within this document (the parser rejects duplicates)
    - title:           section heading
    - from_topics:     list of topic names (strings) to pull content from
                       (these should match SmartDocV25.syllabus_topics entries;
                       in the V26 layer the same names live on
                       ExtractorV25.syllabus_topics / TruthKernel.topics)
    - summarization:   "none" | "low" | "medium" | "high"
                       (compiler will decide how aggressively to compress)
    - max_paragraphs:  optional cap on number of paragraphs to include;
                       None means no cap was given
    - include_headings: whether to include heading-style paragraphs
    - include_figures: whether to attach relevant figures
    - include_tables:  whether to attach relevant tables
    """
    id: str
    title: str
    from_topics: List[str] = field(default_factory=list)
    summarization: SummarizationLevel = "none"
    max_paragraphs: Optional[int] = None
    include_headings: bool = True
    include_figures: bool = True
    include_tables: bool = True


@dataclass
class DocumentSpec:
    """
    Full DSL configuration for a single desired document:
    meta + ordered list of sections.

    Instances are normally produced by DocumentDslParser, which keeps the
    sections in the order they appear in the input.
    """
    meta: DocumentMetaConfig  # document-level metadata
    sections: List[SectionConfig] = field(default_factory=list)  # may be empty


# ---------------------- DSL ERRORS ----------------------


class DslParseError(Exception):
    """The DSL text or file could not be read or parsed into a structure."""


class DslValidationError(Exception):
    """A required DSL field is absent, or a field holds an invalid value."""


# ---------------------- DSL PARSER ----------------------


class DocumentDslParser:
    """
    Parses DSL from:

    - Python dicts
    - YAML strings / files (if PyYAML is installed)
    - JSON-compatible dicts

    and produces a validated DocumentSpec object.
    """

    # ---- Public API ----

    @staticmethod
    def from_dict(raw: Dict[str, Any]) -> DocumentSpec:
        """
        Parse and validate a Python dict into DocumentSpec.

        Expected shape: a "document" mapping (id, type, template required;
        title, author, extra optional) and an optional "sections" list of
        mappings. Section order is preserved.

        Raises:
            DslValidationError: on a missing or invalid field, a duplicate
                section id, an unknown summarization level, or a
                'max_paragraphs' that is not a non-negative integer.
        """
        DocumentDslParser._validate_top_level(raw)

        # Parse document meta
        doc_meta = raw["document"]
        meta = DocumentMetaConfig(
            id=DocumentDslParser._require_str(doc_meta, "id"),
            type=DocumentDslParser._require_str(doc_meta, "type"),
            template=DocumentDslParser._require_str(doc_meta, "template"),
            title=doc_meta.get("title"),
            author=doc_meta.get("author"),
            # An explicit null in the input is treated like a missing value.
            extra=doc_meta.get("extra", {}) or {},
        )

        # Parse sections
        sections_raw = raw.get("sections", [])
        if not isinstance(sections_raw, list):
            raise DslValidationError("'sections' must be a list")

        sections: List[SectionConfig] = []
        seen_ids = set()
        for idx, s in enumerate(sections_raw):
            if not isinstance(s, dict):
                raise DslValidationError(f"Section at index {idx} must be an object/dict")

            sid = DocumentDslParser._require_str(s, "id")
            if sid in seen_ids:
                raise DslValidationError(f"Duplicate section id: '{sid}'")
            seen_ids.add(sid)

            title = DocumentDslParser._require_str(s, "title")
            from_topics = s.get("from_topics", [])
            if not isinstance(from_topics, list):
                raise DslValidationError(
                    f"Section '{sid}': 'from_topics' must be a list of strings"
                )
            from_topics = [str(t) for t in from_topics]

            summarization = s.get("summarization", "none")
            if summarization not in ("none", "low", "medium", "high"):
                raise DslValidationError(
                    f"Section '{sid}': invalid summarization '{summarization}'"
                )

            max_paragraphs = s.get("max_paragraphs", None)
            if max_paragraphs is not None:
                # bool is a subclass of int, so `max_paragraphs: true` would
                # otherwise slip through as 1.
                if isinstance(max_paragraphs, bool) or not isinstance(max_paragraphs, int):
                    raise DslValidationError(
                        f"Section '{sid}': 'max_paragraphs' must be an integer or null"
                    )
                if max_paragraphs < 0:
                    raise DslValidationError(
                        f"Section '{sid}': 'max_paragraphs' must not be negative"
                    )

            include_headings = bool(s.get("include_headings", True))
            include_figures = bool(s.get("include_figures", True))
            include_tables = bool(s.get("include_tables", True))

            sections.append(
                SectionConfig(
                    id=sid,
                    title=title,
                    from_topics=from_topics,
                    summarization=summarization,  # type: ignore[arg-type]
                    max_paragraphs=max_paragraphs,
                    include_headings=include_headings,
                    include_figures=include_figures,
                    include_tables=include_tables,
                )
            )

        return DocumentSpec(meta=meta, sections=sections)

    @staticmethod
    def from_yaml_string(yaml_str: str) -> DocumentSpec:
        """
        Parse DSL from a YAML string.
        Requires PyYAML to be available.

        Raises:
            DslParseError: PyYAML is missing or the text is not valid YAML
                (the underlying error is attached as ``__cause__``).
            DslValidationError: the YAML is valid but its top level is not a
                mapping, or the content fails validation.
        """
        if not _HAS_YAML:
            raise DslParseError("PyYAML is not installed; cannot parse YAML.")
        try:
            raw = yaml.safe_load(yaml_str)
        except Exception as e:
            # Explicit chaining keeps the original parser error in the traceback.
            raise DslParseError(f"Failed to parse YAML: {e}") from e
        if not isinstance(raw, dict):
            raise DslValidationError("Top-level YAML must be a mapping (dict).")
        return DocumentDslParser.from_dict(raw)

    @staticmethod
    def from_yaml_file(path: Union[str, Path]) -> DocumentSpec:
        """
        Parse DSL from a YAML file on disk.

        The file is read as UTF-8 and handed to from_yaml_string().

        Raises:
            DslParseError: PyYAML is missing, the file cannot be read (the
                underlying error is attached as ``__cause__``), or its
                content is not valid YAML.
            DslValidationError: the content fails validation.
        """
        if not _HAS_YAML:
            raise DslParseError("PyYAML is not installed; cannot parse YAML file.")
        path = Path(path)
        try:
            text = path.read_text(encoding="utf-8")
        except Exception as e:
            # Explicit chaining keeps the original I/O error in the traceback.
            raise DslParseError(f"Failed to read DSL file '{path}': {e}") from e
        return DocumentDslParser.from_yaml_string(text)

    # ---- Internal helpers ----

    @staticmethod
    def _validate_top_level(raw: Dict[str, Any]):
        if "document" not in raw:
            raise DslValidationError("Missing required top-level field: 'document'")
        if not isinstance(raw["document"], dict):
            raise DslValidationError("'document' must be an object/dict")

    @staticmethod
    def _require_str(d: Dict[str, Any], key: str) -> str:
        if key not in d:
            raise DslValidationError(f"Missing required field: '{key}'")
        val = d[key]
        if not isinstance(val, str):
            raise DslValidationError(f"Field '{key}' must be a string")
        if not val.strip():
            raise DslValidationError(f"Field '{key}' must not be empty")
        return val.strip()




In [None]:
# ================================================================
#          SMARTDOC V26 – LAYER 3: DOCUMENT COMPILER / AST
#    (Builds a logical document structure from V25 + DSL spec)
# ================================================================

from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Set

# We assume the following are already defined in earlier cells:
# - SmartDocV25 and V25Paragraph (from the V25 code)
# - DocumentSpec and SectionConfig (from the DSL layer above)


# ---------------------- AST DATA STRUCTURES ----------------------


@dataclass
class AstParagraph:
    """
    AST node holding one paragraph of text.

    DocumentCompilerV26 creates these from selected engine paragraphs and
    fills ``source_ref`` with the keys paragraph_id, source_file,
    source_page, topic_index, topic_score and is_heading.
    """
    type: str = "paragraph"  # node-kind tag
    text: str = ""  # paragraph text, copied verbatim by the compiler
    # Provenance mapping; default_factory gives every instance its own dict.
    source_ref: Dict[str, Any] = field(default_factory=dict)


@dataclass
class AstFigure:
    """
    AST node referencing one extracted figure image.

    DocumentCompilerV26 fills ``source_ref`` with the keys figure_id,
    source_file and source_page.
    """
    type: str = "figure"  # node-kind tag
    ref_id: str = ""  # id of the engine figure this node was built from
    caption: str = ""  # caption text; may be empty
    img_path: str = ""  # path of the image file, as stored on the engine figure
    # Provenance mapping; default_factory gives every instance its own dict.
    source_ref: Dict[str, Any] = field(default_factory=dict)


@dataclass
class AstTable:
    """
    AST node carrying one extracted table as Markdown text.

    DocumentCompilerV26 fills ``source_ref`` with the keys table_id,
    source_file and source_page.
    """
    type: str = "table"  # node-kind tag
    ref_id: str = ""  # id of the engine table this node was built from
    markdown: str = ""  # table body in Markdown form
    # Provenance mapping; default_factory gives every instance its own dict.
    source_ref: Dict[str, Any] = field(default_factory=dict)


@dataclass
class AstSection:
    """
    AST node for one document section: an id, a title and an ordered list
    of content blocks.
    """
    type: str = "section"  # node-kind tag
    id: str = ""  # section id, taken from the DSL section config
    title: str = ""  # section heading text
    # Ordered content nodes. The compiler appends paragraphs first, then
    # figures, then tables.
    blocks: List[Any] = field(default_factory=list)  # paragraphs, figures, tables


@dataclass
class AstDocument:
    """
    Root AST node: document metadata plus the ordered list of sections.
    """
    type: str = "document"  # node-kind tag
    # Metadata mapping. DocumentCompilerV26.compile stores the keys
    # id, type, template, title, author and extra here.
    meta: Dict[str, Any] = field(default_factory=dict)
    # Sections in the order they appear in the DSL spec.
    sections: List[AstSection] = field(default_factory=list)


# ---------------------- SUMMARIZER STUB ----------------------


class ParagraphSummarizer:
    """
    Simple, deterministic summarizer stub.

    The "summarization level" only controls how many paragraphs are kept
    in a section; no text is rewritten (no LLM). Paragraphs are always kept
    from the front of the list, so callers should pass them in priority
    order. Swap this class out for a more advanced summarizer later.
    """

    # Fraction of paragraphs kept for each reducing level. Any other level
    # (including "none") keeps everything.
    _KEEP_RATIO = {"low": 0.75, "medium": 0.5, "high": 0.3}

    def keep_count_for_level(self, total: int, level: str) -> int:
        """
        Return how many of ``total`` paragraphs survive at ``level``.

        Returns 0 when ``total`` is not positive. For "low" / "medium" /
        "high" the count is the truncated fraction of ``total`` but never
        less than 1. Unrecognised levels keep everything.
        """
        if total <= 0:
            return 0
        ratio = self._KEEP_RATIO.get(level)
        if ratio is None:
            # "none" and unknown levels: no reduction.
            return total
        return max(1, int(total * ratio))

    def select_paragraphs(
        self,
        paragraphs: List["V25Paragraph"],
        level: str,
        hard_max: Optional[int] = None,
    ) -> List["V25Paragraph"]:
        """
        Decide which paragraphs to keep based on summarization level
        and optional hard max_paragraphs from the DSL.

        Args:
            paragraphs: candidates in priority order (best first).
            level: summarization level, see ``keep_count_for_level``.
            hard_max: upper bound on the result length; ``None`` means no
                bound. Values below zero are treated as zero.

        Returns:
            A new list containing the leading paragraphs that are kept.
        """
        keep = self.keep_count_for_level(len(paragraphs), level)
        if hard_max is not None:
            # Clamp at 0: a negative count would make the slice below drop
            # items from the END of the list instead of returning nothing.
            keep = min(keep, max(0, hard_max))
        return paragraphs[:keep]


# ---------------------- DOCUMENT COMPILER ----------------------


class DocumentCompilerV26:
    """
    Format-neutral compiler: engine content + DocumentSpec (DSL) → AstDocument.

    For each section of the spec it
      - picks the engine paragraphs whose topic matches the section's
        ``from_topics`` (every topic when that list is empty),
      - trims them through the summarizer and ``max_paragraphs``,
      - appends figures / tables that sit on the same (source_file,
        source_page) as a kept paragraph.

    Nothing here knows about Markdown, LaTeX, DOCX or HTML.
    """

    def __init__(self, engine: "SmartDocV25", summarizer: Optional[ParagraphSummarizer] = None):
        self.engine = engine
        self.summarizer = summarizer if summarizer else ParagraphSummarizer()

    # ---------- PUBLIC API ----------

    def compile(self, spec: DocumentSpec) -> AstDocument:
        """
        Produce the AstDocument for ``spec`` from the current engine state.

        Expects the engine's paragraphs / figures / tables to be populated
        and its paragraphs to carry ``topic_index`` / ``topic_score``.
        """
        doc_meta = spec.meta
        return AstDocument(
            meta={
                "id": doc_meta.id,
                "type": doc_meta.type,
                "template": doc_meta.template,
                "title": doc_meta.title,
                "author": doc_meta.author,
                "extra": doc_meta.extra or {},
            },
            sections=[self._build_section(section_cfg) for section_cfg in spec.sections],
        )

    # ---------- INTERNAL HELPERS ----------

    def _build_section(self, cfg: SectionConfig) -> AstSection:
        """
        Assemble one AstSection: paragraphs first, then figures, then tables.
        """
        wanted = self._wanted_topic_indices(cfg)

        # Paragraphs without a topic, outside the wanted topics, or headings
        # (when headings are excluded) never become candidates.
        pool = [
            para
            for para in self.engine.paragraphs
            if para.topic_index is not None
            and para.topic_index in wanted
            and (cfg.include_headings or not para.is_heading)
        ]

        # Deterministic order: best topic score first, ties broken by
        # source file, page and paragraph id.
        def rank(para):
            return (-para.topic_score, para.source_file, para.source_page, para.id)

        kept = self.summarizer.select_paragraphs(
            sorted(pool, key=rank), cfg.summarization, hard_max=cfg.max_paragraphs
        )

        blocks: List[Any] = []
        pages_used: Set[tuple] = set()  # (source_file, source_page) of kept paragraphs
        for para in kept:
            blocks.append(self._paragraph_node(para))
            pages_used.add((para.source_file, para.source_page))

        # Figures / tables are attached purely by page co-location with a
        # kept paragraph — simple, but deterministic.
        if cfg.include_figures:
            blocks.extend(
                self._figure_node(fig)
                for fig in self.engine.figures
                if (fig.source_file, fig.source_page) in pages_used
            )

        if cfg.include_tables:
            blocks.extend(
                self._table_node(tab)
                for tab in self.engine.tables
                if (tab.source_file, tab.source_page) in pages_used
            )

        return AstSection(id=cfg.id, title=cfg.title, blocks=blocks)

    def _wanted_topic_indices(self, cfg: SectionConfig) -> Set[int]:
        """
        Map the section's ``from_topics`` names to indices in ``engine.topics``.

        Names are matched by exact equality; names that are not found are
        ignored. An empty ``from_topics`` selects every topic index.
        """
        if not cfg.from_topics:
            return {index for index, _ in enumerate(self.engine.topics)}

        found: Set[int] = set()
        for name in cfg.from_topics:
            try:
                found.add(self.engine.topics.index(name))
            except ValueError:
                # Unknown topic name – ignored.
                continue
        return found

    @staticmethod
    def _paragraph_node(para: "V25Paragraph") -> AstParagraph:
        """Wrap an engine paragraph in an AstParagraph with its provenance."""
        return AstParagraph(
            text=para.text,
            source_ref={
                "paragraph_id": para.id,
                "source_file": para.source_file,
                "source_page": para.source_page,
                "topic_index": para.topic_index,
                "topic_score": para.topic_score,
                "is_heading": para.is_heading,
            },
        )

    @staticmethod
    def _figure_node(fig) -> AstFigure:
        """Wrap an engine figure in an AstFigure with its provenance."""
        return AstFigure(
            ref_id=fig.id,
            caption=fig.caption,
            img_path=fig.img_path,
            source_ref={
                "figure_id": fig.id,
                "source_file": fig.source_file,
                "source_page": fig.source_page,
            },
        )

    @staticmethod
    def _table_node(tab) -> AstTable:
        """Wrap an engine table in an AstTable with its provenance."""
        return AstTable(
            ref_id=tab.id,
            markdown=tab.markdown,
            source_ref={
                "table_id": tab.id,
                "source_file": tab.source_file,
                "source_page": tab.source_page,
            },
        )




In [None]:
# ================================================================
#              SMARTDOC V26 – LAYER 4: TEMPLATE ENGINE
#     Takes AstDocument → Rendered Markdown (or future formats)
# ================================================================

from dataclasses import dataclass, field
from typing import Dict, Any, List, Optional

# We assume:
# - AstDocument, AstSection, AstParagraph, AstFigure, AstTable already exist
# - DocumentSpec exists (DSL)
# - compiler exists and produces AST
# - engine extraction untouched


# --------------------------------------------------------
#   TEMPLATE CONFIG DATA STRUCTURES
# --------------------------------------------------------

@dataclass
class TemplateStyle:
    """
    Rendering knobs read by MarkdownRendererV26.
    """
    heading_1_prefix: str = "# "  # prefix for the document title in the metadata header
    heading_2_prefix: str = "## "  # prefix for each section title
    heading_3_prefix: str = "### "  # not read by the renderer code in this cell
    paragraph_spacing: int = 1  # number of "\n" characters appended to every paragraph
    # Only "markdown" is rendered by MarkdownRendererV26; any other value
    # yields an "unsupported style" placeholder line.
    figure_style: str = "markdown"    # or latex, html, docx, etc.
    table_style: str = "markdown"     # more in Layer 5
    include_toc: bool = True  # emit the "Table of Contents" block
    include_metadata_header: bool = True  # emit the HTML-comment / title header
    # Free-form options. The renderer reads "title_page" and "title_center";
    # other keys (e.g. "toc_depth") are stored but not read in this cell.
    extra: Dict[str, Any] = field(default_factory=dict)


@dataclass
class TemplateConfig:
    """
    A named template: the name is the registry key, ``style`` holds the
    rendering options.
    """
    name: str  # key under which TemplateRegistryV26 stores this config
    style: TemplateStyle  # options consumed by the renderer


# --------------------------------------------------------
#   TEMPLATE REGISTRY (you can add new templates)
# --------------------------------------------------------

class TemplateRegistryV26:
    """
    Name → TemplateConfig lookup, pre-populated with the built-in templates
    ("markdown_default_v26" and "college_project_v1").
    """

    def __init__(self):
        self.templates: Dict[str, TemplateConfig] = {}
        for builtin in self._builtin_templates():
            self.register(builtin)

    @staticmethod
    def _builtin_templates():
        """Return the built-in templates in registration order."""
        return (
            # Default Markdown template (similar to V25 but modular)
            TemplateConfig(
                name="markdown_default_v26",
                style=TemplateStyle(
                    heading_1_prefix="# ",
                    heading_2_prefix="## ",
                    heading_3_prefix="### ",
                    paragraph_spacing=1,
                    figure_style="markdown",
                    table_style="markdown",
                    include_toc=True,
                    include_metadata_header=True,
                ),
            ),
            # Example: "college project report" formatting with a centred title page
            TemplateConfig(
                name="college_project_v1",
                style=TemplateStyle(
                    heading_1_prefix="# ",
                    heading_2_prefix="## ",
                    heading_3_prefix="### ",
                    paragraph_spacing=1,
                    include_toc=True,
                    include_metadata_header=True,
                    extra={
                        "title_page": True,
                        "title_center": True,
                        "toc_depth": 2,
                    },
                ),
            ),
        )

    def register(self, template_config: TemplateConfig):
        """Store ``template_config`` under its name, replacing any previous entry."""
        self.templates[template_config.name] = template_config

    def get(self, name: str) -> TemplateConfig:
        """
        Return the template registered as ``name``.

        Raises:
            ValueError: if no template with that name is registered.
        """
        if name in self.templates:
            return self.templates[name]
        raise ValueError(f"Unknown template: {name}")


# --------------------------------------------------------
#   MARKDOWN RENDERER
# --------------------------------------------------------

class MarkdownRendererV26:
    """
    Converts AstDocument into Markdown text using a TemplateConfig.

    Output order: optional metadata header, optional title page, optional
    table of contents, then every section. Blocks of an unknown type inside
    a section are skipped.
    """

    def __init__(self, template: "TemplateConfig"):
        self.template = template
        self.style = template.style

    # --------------------------- PUBLIC API ---------------------------

    def render(self, ast: "AstDocument") -> str:
        """Render ``ast`` to one Markdown string ending in a single newline."""
        out: List[str] = []

        # Metadata header / title block
        if self.style.include_metadata_header:
            out.extend(self._render_metadata(ast.meta))
            out.append("\n")

        # Title Page (optional)
        if self.style.extra.get("title_page", False):
            out.append(self._render_title_page(ast.meta))
            out.append("\n\n---\n\n")

        # TOC
        if self.style.include_toc:
            out.append(self._render_toc(ast.sections))
            out.append("\n\n---\n\n")

        # Sections
        for sec in ast.sections:
            out.append(self._render_section(sec))
            out.append("\n")

        return "\n".join(out).strip() + "\n"

    # --------------------------- RENDER HELPERS ---------------------------

    def _render_metadata(self, meta: Dict[str, Any]) -> List[str]:
        """Return the header lines: two HTML comments, then title/author/extras."""
        lines = [
            f"<!-- Document ID: {meta.get('id', '')} -->",
            f"<!-- Template: {meta.get('template', '')} -->",
            "",
        ]
        if meta.get("title"):
            lines.append(f"{self.style.heading_1_prefix}{meta['title']}")
        if meta.get("author"):
            lines.append(f"**Author:** {meta['author']}")
        if meta.get("extra"):
            for k, v in meta["extra"].items():
                # str() keeps non-string keys from crashing on .title().
                lines.append(f"**{str(k).title()}:** {v}")

        lines.append("\n")
        return lines

    def _render_title_page(self, meta: Dict[str, Any]) -> str:
        """
        Return the title-page block, wrapped in a centring <div> when the
        style's ``extra["title_center"]`` is truthy.
        """
        title = meta.get("title", "")
        author = meta.get("author", "")
        # "extra" may be present but None; treat that like an empty mapping.
        extra = meta.get("extra") or {}

        center = self.style.extra.get("title_center", False)

        lines = []
        if center:
            lines.append("<div align='center'>")

        if title:
            # Use the configured H1 prefix, as _render_metadata does, rather
            # than a hard-coded "# ".
            lines.append(f"{self.style.heading_1_prefix}{title}\n")
        if author:
            lines.append(f"**{author}**\n")
        for k, v in extra.items():
            lines.append(f"**{str(k).title()}:** {v}\n")

        if center:
            lines.append("</div>")

        return "\n".join(lines)

    def _render_toc(self, sections: List["AstSection"]) -> str:
        """Return a numbered list of section titles under a TOC heading."""
        lines = ["## Table of Contents\n"]
        for i, sec in enumerate(sections, start=1):
            lines.append(f"{i}. {sec.title}")
        return "\n".join(lines)

    def _render_section(self, sec: "AstSection") -> str:
        """Return the section heading followed by its rendered blocks."""
        lines = []
        lines.append(f"{self.style.heading_2_prefix}{sec.title}\n")

        for block in sec.blocks:
            if isinstance(block, AstParagraph):
                lines.extend(self._render_paragraph(block))
            elif isinstance(block, AstFigure):
                lines.extend(self._render_figure(block))
            elif isinstance(block, AstTable):
                lines.extend(self._render_table(block))

        return "\n".join(lines)

    def _render_paragraph(self, p: "AstParagraph") -> List[str]:
        """Return the paragraph text followed by ``paragraph_spacing`` newlines."""
        spacing = "\n" * self.style.paragraph_spacing
        return [p.text + spacing]

    def _render_figure(self, f: "AstFigure") -> List[str]:
        """Return an image line plus a source line, or a placeholder."""
        # Markdown figure
        if self.style.figure_style == "markdown":
            return [
                f"![{f.caption}]({f.img_path})\n",
                f"*Source: {f.source_ref.get('source_file', '')} "
                f"p.{f.source_ref.get('source_page', '')}*",
                "\n"
            ]
        # Extension point for future formats
        return [f"[FIGURE {f.ref_id} – unsupported style]\n"]

    def _render_table(self, t: "AstTable") -> List[str]:
        """Return the Markdown table plus a source line, or a placeholder."""
        if self.style.table_style == "markdown":
            return [
                t.markdown + "\n",
                f"*Source: {t.source_ref.get('source_file', '')} "
                f"p.{t.source_ref.get('source_page', '')}*",
                "\n"
            ]
        return [f"[TABLE {t.ref_id} – unsupported style]\n"]


# --------------------------------------------------------
#   TEMPLATE ENGINE WRAPPER
# --------------------------------------------------------

class TemplateEngineV26:
    """
    User-facing entry point: AST + template name → Markdown string.

    Owns a TemplateRegistryV26 (``self.registry``) that resolves names.
    """

    def __init__(self):
        self.registry = TemplateRegistryV26()

    def render(self, ast: AstDocument, template_name: str) -> str:
        """
        Render ``ast`` with the template registered as ``template_name``.
        A fresh MarkdownRendererV26 is built for every call.
        """
        return MarkdownRendererV26(self.registry.get(template_name)).render(ast)


In [None]:
# ================================================================
#          SMARTDOC V26 – LAYER 5: LaTeX & PDF RENDERERS
#     Takes AstDocument → LaTeX source (and optionally → PDF)
# ================================================================

import subprocess
import shutil
from pathlib import Path

# We assume:
#   - AstDocument, AstSection, AstParagraph, AstFigure, AstTable exist
#   - DocumentCompilerV26, DocumentDslParser, SmartDocV25, etc. exist above


# --------------------------------------------------------
#   LATEX ESCAPING UTIL
# --------------------------------------------------------

# Replacement text for each single character that needs escaping in LaTeX
# body text. Characters not listed here pass through unchanged.
_LATEX_SPECIALS = {
    "\\": r"\textbackslash{}",
    "{": r"\{",
    "}": r"\}",
    "$": r"\$",
    "&": r"\&",
    "#": r"\#",
    "_": r"\_",
    "%": r"\%",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
}


def latex_escape(text: str) -> str:
    """
    Return ``text`` with every LaTeX special character replaced by its
    escaped form; falsy input (``""`` or ``None``) yields ``""``.

    Replacement is per character, so text inserted for one character is
    never escaped again.
    """
    if not text:
        return ""
    return "".join(_LATEX_SPECIALS.get(char, char) for char in text)


# --------------------------------------------------------
#   LATEX RENDERER
# --------------------------------------------------------

class LaTeXRendererV26:
    """
    Renders an AstDocument to LaTeX source.

    Each section becomes a ``\\chapter``; paragraphs are escaped plain text;
    figures use ``\\includegraphics`` inside an ``[H]`` float; tables are
    embedded as verbatim Markdown.

    Args:
        docclass: LaTeX document class name.
        fontsize: class option passed to ``\\documentclass``.
        geometry_opts: argument passed to ``\\geometry{...}``.
    """

    def __init__(
        self,
        docclass: str = "report",
        fontsize: str = "12pt",
        geometry_opts: str = "margin=1in",
    ):
        self.docclass = docclass
        self.fontsize = fontsize
        self.geometry_opts = geometry_opts

    # ---------- PUBLIC API ----------

    def render(self, ast: "AstDocument") -> str:
        """Return the complete LaTeX source for ``ast``."""
        lines: List[str] = []

        # Preamble
        lines.append(self._preamble(ast.meta))

        # Begin document
        lines.append(r"\begin{document}")
        lines.append("")

        # Title
        lines.extend(self._render_title_block(ast.meta))

        # TOC
        lines.append(r"\tableofcontents")
        lines.append(r"\clearpage")
        lines.append("")

        # Sections
        for sec in ast.sections:
            lines.extend(self._render_section(sec))

        # End document
        lines.append(r"\end{document}")
        lines.append("")

        return "\n".join(lines)

    # ---------- INTERNAL HELPERS ----------

    def _preamble(self, meta: Dict[str, Any]) -> str:
        """Return the preamble (class, packages, title/author/date) as one string."""
        title = latex_escape(meta.get("title", "") or "")
        author = latex_escape(meta.get("author", "") or "")
        # _render_title_block emits \maketitle when EITHER title or author
        # is set, and \maketitle fails without a \title. Declare a (possibly
        # empty) title in exactly those cases.
        needs_title = bool(title or author)

        pre = [
            rf"\documentclass[{self.fontsize}]{{{self.docclass}}}",
            r"\usepackage[utf8]{inputenc}",
            r"\usepackage[T1]{fontenc}",
            r"\usepackage{graphicx}",
            r"\usepackage{geometry}",
            rf"\geometry{{{self.geometry_opts}}}",
            r"\usepackage{hyperref}",
            r"\usepackage{setspace}",
            r"\usepackage{caption}",
            r"\usepackage{float}",  # for [H]
            "",
            rf"\title{{{title}}}" if needs_title else "",
            rf"\author{{{author}}}" if author else "",
            r"\date{\today}",
            "",
        ]
        # Filter out empty lines
        return "\n".join([l for l in pre if l.strip() != ""])

    def _render_title_block(self, meta: Dict[str, Any]) -> List[str]:
        """Return ``\\maketitle`` (if title or author is set) and a centred extras block."""
        lines = []
        if meta.get("title") or meta.get("author"):
            lines.append(r"\maketitle")
            lines.append("")
        extra = meta.get("extra", {}) or {}
        if extra:
            lines.append(r"\begin{center}")
            for k, v in extra.items():
                key = latex_escape(str(k).title())
                val = latex_escape(str(v))
                lines.append(rf"\textbf{{{key}:}} {val}\\")
            lines.append(r"\end{center}")
            lines.append("")
        return lines

    def _render_section(self, sec: "AstSection") -> List[str]:
        """Return the lines for one section; unknown block types are skipped."""
        lines: List[str] = []

        # Use chapter as main level for now
        title = latex_escape(sec.title or "")
        if title:
            lines.append(rf"\chapter{{{title}}}")
            lines.append("")

        for block in sec.blocks:
            if isinstance(block, AstParagraph):
                lines.extend(self._render_paragraph(block))
            elif isinstance(block, AstFigure):
                lines.extend(self._render_figure(block))
            elif isinstance(block, AstTable):
                lines.extend(self._render_table(block))

        return lines

    def _render_paragraph(self, p: "AstParagraph") -> List[str]:
        """Return the escaped paragraph text followed by a blank line."""
        text = latex_escape(p.text)
        # Simple one-line paragraphs, blank line after
        return [text, ""]

    @staticmethod
    def _safe_label(raw: Any) -> str:
        """
        Reduce ``raw`` to characters that are safe inside ``\\label{...}``:
        ASCII letters/digits and ``- . :``; everything else becomes ``-``.
        """
        return "".join(
            ch if (ch.isascii() and ch.isalnum()) or ch in "-.:" else "-"
            for ch in str(raw or "")
        )

    def _render_figure(self, f: "AstFigure") -> List[str]:
        """Return a ``figure`` float with image, caption (incl. source) and label."""
        caption = latex_escape(f.caption or "")
        src_file = f.source_ref.get("source_file", "")
        src_page = f.source_ref.get("source_page", "")

        src_str = ""
        if src_file or src_page:
            src_str = f" (Source: {src_file} p.{src_page})"
        caption_full = caption + latex_escape(src_str)

        img_path = (f.img_path or "").replace("\\", "/")  # LaTeX likes forward slashes

        lines = [
            r"\begin{figure}[H]",
            r"  \centering",
            # A file name is not body text: escaping it (e.g. "_" -> "\_")
            # changes the path LaTeX looks up. \detokenize passes the raw
            # characters through instead.
            rf"  \includegraphics[width=0.8\textwidth]{{\detokenize{{{img_path}}}}}",
            rf"  \caption{{{caption_full}}}",
            # Labels are identifiers; sanitize rather than insert macros.
            rf"  \label{{fig:{self._safe_label(f.ref_id)}}}",
            r"\end{figure}",
            "",
        ]
        return lines

    def _render_table(self, t: "AstTable") -> List[str]:
        """
        For now, we embed the markdown table as verbatim text.
        You can later upgrade this to a real LaTeX tabular environment.
        """
        src_file = t.source_ref.get("source_file", "")
        src_page = t.source_ref.get("source_page", "")
        src_str = f"Source: {src_file} p.{src_page}" if (src_file or src_page) else ""

        lines = [
            r"\begin{table}[H]",
            r"  \centering",
            r"  \begin{minipage}{0.95\textwidth}",
            r"  \begin{verbatim}",
            t.markdown,
            r"  \end{verbatim}",
            r"  \end{minipage}",
        ]
        if src_str:
            lines.append(rf"  \caption*{{{latex_escape(src_str)}}}")
        lines.append(r"\end{table}")
        lines.append("")
        return lines


# --------------------------------------------------------
#   PDF RENDERER (LaTeX → PDF via pdflatex)
# --------------------------------------------------------

class PdfRendererViaLatexV26:
    """
    Helper that writes a LaTeX file to disk and calls pdflatex to build a PDF.

    This does NOT run automatically; you call it explicitly.

    Requirements:
      - pdflatex must be available on PATH (e.g., TeX Live, MiKTeX).
      - running in an environment where subprocess is allowed (e.g. Colab, local).
    """

    def __init__(self, tex_engine: str = "pdflatex"):
        self.tex_engine = tex_engine

    def render_to_pdf(
        self,
        latex_source: str,
        output_dir: str | Path = ".",
        basename: str = "smartdoc_output",
        runs: int = 1,
        clean_aux: bool = True,
    ) -> Path:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        tex_path = output_dir / f"{basename}.tex"
        pdf_path = output_dir / f"{basename}.pdf"

        tex_path.write_text(latex_source, encoding="utf-8")

        if shutil.which(self.tex_engine) is None:
            raise RuntimeError(
                f"LaTeX engine '{self.tex_engine}' not found on PATH. "
                f"Install TeX Live / MiKTeX or adjust tex_engine."
            )

        cmd = [
            self.tex_engine,
            "-interaction=nonstopmode",
            "-halt-on-error",
            str(tex_path),
        ]

        for i in range(runs):
            print(f"[LaTeX] Run {i+1}/{runs}: {' '.join(cmd)}")
            proc = subprocess.run(
                cmd,
                cwd=str(output_dir),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
            )
            if proc.returncode != 0:
                print(proc.stdout)
                raise RuntimeError(f"LaTeX compilation failed with code {proc.returncode}")

        if clean_aux:
            for ext in [".aux", ".log", ".out", ".toc"]:
                f = output_dir / f"{basename}{ext}"
                if f.exists():
                    f.unlink()

        if not pdf_path.exists():
            raise RuntimeError("LaTeX reported success but PDF not found.")

        print(f"[LaTeX] PDF generated at: {pdf_path}")
        return pdf_path



In [None]:
# ================================================================
#           SMARTDOC V26 — COMPLETE TEXTBOOK BUILDER
#        MCN 401 Industrial Safety Engineering + CET 425
#        Clean • Layered • TruthKernel-First • Ready-to-Run
# ================================================================

!pip install -q pdfplumber python-pptx python-docx rapidocr-onnxruntime \
                sentence-transformers tqdm imagehash PyYAML

import os
import json
import uuid
import re
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Any, Literal
from google.colab import files

import pdfplumber
from PIL import Image
import imagehash
from sentence_transformers import SentenceTransformer, util
import numpy as np
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("pdfminer.pdfinterp").setLevel(logging.ERROR)

# ================================================================
#                        CONFIG & CONSTANTS
# ================================================================

# These repeat, with the same values, constants already defined in the
# first cell of the notebook.
PAGE_DPI = 180  # DPI for full-page rasterization
FIG_DPI = 180  # DPI for region crops (figures)
PARA_MIN_CHARS = 30  # min characters for a paragraph to be kept
DEDUP_THRESHOLD = 0.85  # paragraph similarity threshold for dedup
MISC_THRESHOLD = 0.40  # topic similarity threshold below which a paragraph goes to Misc

# ================================================================
#                     SYLLABUS TOPICS (FULL COVERAGE)
# ================================================================

# Topic strings handed to ExtractorV25 as ``syllabus_topics`` below.
# NOTE(review): only MCN 401 topics are listed although the cell banner also
# mentions CET 425 — confirm whether that is intended.
SYLLABUS_TOPICS = [
    # MCN 401 - Industrial Safety Engineering
    "need for safety", "safety and productivity", "accident", "injury", "unsafe act",
    "unsafe condition", "dangerous occurrence", "reportable accidents",
    "theories of accident causation", "domino theory", "heinrich theory",
    "safety organization", "role of management", "supervisors", "workmen", "unions",
    "government and voluntary agencies", "safety policy", "safety officer",
    "responsibilities", "authority", "safety committee", "types", "advantages",
    "personal protective equipment", "ppe", "respiratory", "non-respiratory",
    "housekeeping", "5s", "work permit system", "hot work", "cold work", "confined spaces",
    "construction safety", "excavation", "scaffolds", "ladders", "blasting", "demolition",
    "ergonomics", "musculoskeletal disorders", "machine guarding", "point of operation",
    "welding safety", "material handling", "lifting techniques", "hearing conservation",
    "hazard identification", "hazop", "dow index", "fire explosion", "msds", "chemical hazards",
]

# ================================================================
#                    LAYER 1 & 2: TRUTH KERNEL
# ================================================================

# Ask the user for input files through the Colab upload widget; the keys of
# the returned mapping are used as the file paths to extract.
print("Please upload your lecture slides (PDF/PPTX/DOCX)...")
uploaded = files.upload()
filepaths = list(uploaded.keys())

# Layer 1: extract paragraphs / figures from every uploaded file.
extractor = ExtractorV25(syllabus_topics=SYLLABUS_TOPICS, hybrid_page_ocr=True)
print("\n=== LAYER 1: EXTRACTING FILES ===")
extractor.extract_files(filepaths)
print(f"Extracted {len(extractor.paragraphs)} paragraphs, {len(extractor.figures)} figures")

# Layer 2: deduplicate, assign topics, then wrap the extractor state in the
# truth kernel that the compiler consumes later.
print("\n=== LAYER 2: BUILDING TRUTH KERNEL ===")
extractor.deduplicate()
extractor.assign_topics()
truth_kernel = TruthKernelBuilder.from_extractor(extractor)

# Persist a small statistics file. "syllabus_matched" counts paragraphs whose
# topic_index is set and lies inside the topic list.
# NOTE(review): presumably an index >= len(topics) marks the Misc bucket — confirm.
Path("layer2_truthkernel.json").write_text(json.dumps({
    "total_paragraphs": len(truth_kernel.paragraphs),
    "syllabus_matched": sum(1 for p in truth_kernel.paragraphs if p.topic_index is not None and p.topic_index < len(truth_kernel.topics)),
    "figures": len(truth_kernel.figures),
    "tables": len(truth_kernel.tables)
}, indent=2))

print("TruthKernel built — clean, deduplicated, syllabus-aligned")

# ================================================================
#                    LAYER 3: DOCUMENT DSL
# ================================================================

# Document spec in dict form, parsed by DocumentDslParser.from_dict below.
#
# How DocumentCompilerV26 interprets each section's "from_topics":
#   - names are matched by exact string equality against the engine's topic list;
#   - names that are not in that list are ignored without any message;
#   - an empty list selects paragraphs of every topic.
# Figures and tables are only attached when they share a (file, page) with a
# paragraph kept for the section, so a section with "max_paragraphs": 0
# receives no figures or tables from the compiler.
dsl_input = {
    "document": {
        "id": "mcn401_official_textbook",
        "type": "textbook",
        "template": "markdown_default_v26",
        "title": "MCN 401: Industrial Safety Engineering",
        "subtitle": "Complete Reference Textbook • Department of Mechanical & Civil Engineering",
        "author": "SmartDoc V26 • Auto-Generated from Official Lecture Slides",
        "cover_image": "smartdoc_v26_assets/cover_mcn401.png"  # We'll make this the clean title slide
    },
    "sections": [
        # ==================== FRONT MATTER ====================
        {"id": "cover",           "title": "MCN 401 — Industrial Safety Engineering", "from_topics": [], "max_paragraphs": 1, "priority_figures": True},
        {"id": "syllabus",        "title": "Course Syllabus & Structure", "from_topics": ["introduction to industrial safety engineering"], "include_tables": True, "max_paragraphs": 5},

        # ==================== MODULE I ====================
        {"id": "module1_intro",   "title": "Module I: Foundations of Industrial Safety", "from_topics": [], "max_paragraphs": 2},
        {"id": "need_safety",     "title": "Need for Safety & Productivity Relationship", "from_topics": ["need for safety", "safety and productivity"], "summarization": "high", "max_paragraphs": 10},
        {"id": "definitions",     "title": "Key Definitions", "from_topics": ["accident", "injury", "unsafe act", "unsafe condition", "dangerous occurrence", "reportable accidents", "near miss"], "summarization": "high", "max_paragraphs": 15},
        {"id": "accident_theories","title": "Theories of Accident Causation", "from_topics": ["domino theory", "heinrich", "pyramid", "multiple causation", "bird", "accident causation"], "summarization": "high", "max_paragraphs": 20, "include_figures": True},

        # ==================== MODULE II ====================
        {"id": "safety_org",      "title": "Module II: Safety Organization", "from_topics": ["safety organization"], "max_paragraphs": 3},
        {"id": "roles",           "title": "Roles & Responsibilities", "from_topics": ["role of management", "supervisors", "workmen", "unions", "government", "voluntary agencies"], "summarization": "medium", "max_paragraphs": 15},
        {"id": "safety_policy",   "title": "Safety Policy", "from_topics": ["safety policy"], "max_paragraphs": 8},
        {"id": "safety_officer",  "title": "Safety Officer: Duties & Authority", "from_topics": ["safety officer", "responsibilities", "authority"], "summarization": "high", "max_paragraphs": 12},
        {"id": "safety_committee","title": "Safety Committee: Types & Advantages", "from_topics": ["safety committee", "types", "advantages"], "summarization": "high", "max_paragraphs": 10},

        # ==================== MODULE III ====================
        {"id": "module3_ppe",     "title": "Module III: Personal Protective Equipment", "from_topics": ["personal protective equipment", "ppe"], "max_paragraphs": 4},
        {"id": "ppe_types",       "title": "Types of PPE", "from_topics": ["respiratory", "non-respiratory", "head protection", "eye", "hearing", "hand", "foot"], "summarization": "medium", "max_paragraphs": 20, "include_figures": True},
        {"id": "housekeeping",    "title": "Housekeeping & 5S Principles", "from_topics": ["housekeeping", "5s"], "summarization": "high", "max_paragraphs": 12, "include_figures": True},
        {"id": "work_permits",    "title": "Work Permit Systems", "from_topics": ["work permit", "hot work", "cold work", "confined space", "height work"], "summarization": "high", "max_paragraphs": 18, "include_figures": True},

        # ==================== MODULE IV ====================
        {"id": "module4_const",   "title": "Module IV: Construction Safety", "from_topics": ["construction safety"], "max_paragraphs": 3},
        {"id": "const_hazards",   "title": "Construction Hazards & Controls", "from_topics": ["excavation", "scaffolding", "ladders", "blasting", "demolition", "fall protection"], "summarization": "medium", "max_paragraphs": 25, "include_figures": True},
        {"id": "ergonomics",      "title": "Ergonomics & Musculoskeletal Disorders", "from_topics": ["ergonomics", "musculoskeletal", "lifting", "repetitive strain"], "summarization": "high", "max_paragraphs": 15, "include_figures": True},
        {"id": "machine_guarding", "title": "Machine Guarding & Point of Operation", "from_topics": ["machine guarding", "point of operation", "guards"], "summarization": "high", "max_paragraphs": 18, "include_figures": True},
        {"id": "material_handling","title": "Safe Material Handling & Lifting", "from_topics": ["material handling", "lifting techniques", "manual handling"], "max_paragraphs": 15, "include_figures": True},

        # ==================== MODULE V ====================
        {"id": "module5_hazards", "title": "Module V: Hazard Identification & Control", "from_topics": ["hazard identification"], "max_paragraphs": 3},
        {"id": "hazop_dow",       "title": "HAZOP & Dow Fire/Explosion Index", "from_topics": ["hazop", "dow index", "fire explosion"], "summarization": "high", "max_paragraphs": 20, "include_figures": True},
        {"id": "msds",            "title": "Material Safety Data Sheets (MSDS)", "from_topics": ["msds", "material safety data sheet"], "summarization": "high", "max_paragraphs": 15, "include_figures": True},
        {"id": "chemical_hazards","title": "Chemical Hazards & GHS Classification", "from_topics": ["chemical hazards", "ghs", "flammable", "toxic", "corrosive"], "summarization": "high", "max_paragraphs": 20, "include_figures": True},
        {"id": "hearing",         "title": "Hearing Conservation Program", "from_topics": ["hearing conservation", "noise", "noise induced"], "max_paragraphs": 12, "include_figures": True},
        {"id": "electrical",      "title": "Electrical Safety", "from_topics": ["electrical safety", "lockout", "tagout", "loto"], "max_paragraphs": 10, "include_figures": True},

        # ==================== BACK MATTER ====================
        {"id": "figures",         "title": "List of Figures", "from_topics": [], "include_figures": True, "max_paragraphs": 0},
        {"id": "tables",          "title": "List of Tables", "from_topics": [], "include_tables": True, "max_paragraphs": 0}
    ]
}

print("\n=== LAYER 3: PARSING DSL ===")
spec = DocumentDslParser.from_dict(dsl_input)
print(f"Document spec loaded — {len(spec.sections)} sections defined")

# DocumentCompilerV26 matches "from_topics" by exact string equality against
# the engine's topic list and skips unknown names without any message, which
# can leave a section empty. Surface such names here so the DSL can be fixed.
known_topics = set(truth_kernel.topics)
for sec_cfg in spec.sections:
    unknown_topics = [t for t in (sec_cfg.from_topics or []) if t not in known_topics]
    if unknown_topics:
        print(f"[DSL warning] section '{sec_cfg.id}': topics not in topic list (ignored): {unknown_topics}")

# ================================================================
#                    LAYER 4: COMPILING AST
# ================================================================

# Compile the parsed spec against the truth kernel and record a few
# headline numbers about the resulting AST.
print("\n=== LAYER 4: COMPILING AST ===")
compiler = DocumentCompilerV26(truth_kernel)
ast_doc = compiler.compile(spec)

ast_summary = {
    "title": ast_doc.meta.get("title", "Untitled"),
    "sections": len(ast_doc.sections),
    "total_blocks": sum(map(len, (section.blocks for section in ast_doc.sections))),
}
Path("layer4_ast_summary.json").write_text(json.dumps(ast_summary, indent=2))

print(f"AST compiled — {len(ast_doc.sections)} sections ready")

# ================================================================
#                    LAYER 5A: MARKDOWN OUTPUT
# ================================================================

print("\n=== LAYER 5A: RENDERING MARKDOWN ===")
template_engine = TemplateEngineV26()
# Honour the template named in the DSL (carried in the AST metadata) instead
# of hard-coding it here; fall back to the default when none is given.
template_name = ast_doc.meta.get("template") or "markdown_default_v26"
md_output = template_engine.render(ast_doc, template_name)

output_md = "MCN401_CET425_SMARTDOC_TEXTBOOK.md"
Path(output_md).write_text(md_output, encoding="utf-8")
print(f"Markdown textbook saved → {output_md}")

# ================================================================
#                    LAYER 5B: LATEX OUTPUT
# ================================================================

print("\n=== LAYER 5B: RENDERING LATEX ===")
latex_renderer = LaTeXRendererV26()
tex_output = latex_renderer.render(ast_doc)

output_tex = "MCN401_CET425_SMARTDOC_TEXTBOOK.tex"
Path(output_tex).write_text(tex_output, encoding="utf-8")
print(f"LaTeX source saved → {output_tex}")

# ================================================================
#                    DOWNLOAD YOUR TEXTBOOK
# ================================================================

print("\nDownloading your complete textbook...")
files.download(output_md)
files.download(output_tex)

print("\n" + "="*60)
print("           TEXTBOOK GENERATION COMPLETE!")
print("   MCN 401 Industrial Safety + CET 425 Earth Systems")
print("        Built with SmartDoc V26 — Layered Intelligence")
print("="*60)

Please upload your lecture slides (PDF/PPTX/DOCX)...


Saving Mod1.pdf to Mod1 (4).pdf
Saving Mod2.pdf to Mod2 (4).pdf
Saving Mod3.pdf to Mod3 (4).pdf
Saving Mod4.pdf to Mod4 (4).pdf
Saving Mod5.pdf to Mod5 (4).pdf

=== LAYER 1: EXTRACTING FILES ===
PDF: Mod1 (4).pdf
PDF: Mod2 (4).pdf
PDF: Mod3 (4).pdf
PDF: Mod4 (4).pdf
PDF: Mod5 (4).pdf
Extracted 706 paragraphs, 652 figures

=== LAYER 2: BUILDING TRUTH KERNEL ===
TruthKernel built — clean, deduplicated, syllabus-aligned

=== LAYER 3: PARSING DSL ===
Document spec loaded — 28 sections defined

=== LAYER 4: COMPILING AST ===
AST compiled — 28 sections ready

=== LAYER 5A: RENDERING MARKDOWN ===
Markdown textbook saved → MCN401_CET425_SMARTDOC_TEXTBOOK.md

=== LAYER 5B: RENDERING LATEX ===
LaTeX source saved → MCN401_CET425_SMARTDOC_TEXTBOOK.tex

Downloading your complete textbook...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


           TEXTBOOK GENERATION COMPLETE!
   MCN 401 Industrial Safety + CET 425 Earth Systems
        Built with SmartDoc V26 — Layered Intelligence


In [None]:
# ================================================================
# SMARTDOC V26 — LAYER 1: TRUTH KERNEL (EXTRACTION ENGINE)
# - PDF + PPTX + DOCX
# - Hybrid OCR (text-layer + full-page OCR)
# - Figure extraction + perceptual dedup (phash)
# - Table extraction to Markdown
# - Paragraph extraction + semantic dedup + topic assignment
# - TruthKernel builder for downstream layers
# - NO ML cleanup here (Layer 2 will handle that)
# ================================================================

!pip install -q pdfplumber python-pptx python-docx rapidocr-onnxruntime \
               sentence-transformers imagehash

import os
import re
import uuid
import logging
from pathlib import Path
from dataclasses import dataclass
from typing import List, Optional, Dict

import numpy as np
import pdfplumber
from PIL import Image
import imagehash
from sentence_transformers import SentenceTransformer, util

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("Layer1_TruthKernel")

# ================================================================
# CONFIG CONSTANTS
# ================================================================

PAGE_DPI = 180           # resolution passed to page.to_image() when rasterizing a full page for OCR
FIG_DPI = 180            # resolution used when rendering cropped figure regions
PARA_MIN_CHARS = 30      # non-heading paragraphs shorter than this are dropped
OCR_MIN_LINE_LEN = 20    # OCR lines shorter than this are never appended in hybrid mode
DEDUP_THRESHOLD = 0.85   # cosine similarity strictly above this → paragraph dropped as duplicate
MISC_THRESHOLD = 0.40    # best topic score below this → paragraph goes to the Misc bucket
PHASH_SIZE = 16          # hash_size passed to imagehash.phash
PHASH_DISTANCE = 10      # phash Hamming distance strictly below this → figure dropped as duplicate

# ================================================================
# RAW EXTRACTION DATA MODELS (V25)
# ================================================================

@dataclass
class V25Paragraph:
    """Raw paragraph record produced by ExtractorV25 before it is sealed into a TruthKernel."""
    id: str                             # uuid4 hex assigned at extraction time
    text: str                           # whitespace-normalized paragraph text
    source_file: str                    # base name of the originating file
    source_page: int                    # 1-based page (PDF) or slide (PPTX) number; 0 for DOCX
    is_heading: bool                    # result of the heading heuristic (or a DOCX "heading" style)
    topic_index: Optional[int] = None   # set by assign_topics(); len(syllabus_topics) means Misc
    topic_score: float = 0.0            # best cosine similarity to a syllabus topic, set by assign_topics()

@dataclass
class V25Figure:
    """Raw figure record produced by ExtractorV25: one image file saved in the assets directory."""
    id: str             # uuid4 hex; its first 8 characters also appear in the asset file name
    img_path: str       # path of the saved image inside the extractor's assets directory
    source_file: str    # base name of the originating file
    source_page: int    # 1-based page (PDF) or slide (PPTX) number; 0 for DOCX
    caption: str = ""   # "Figure from <file>, ..." plus up to 200 characters of OCR text, if any

@dataclass
class V25Table:
    """Raw table record produced by ExtractorV25, already rendered as a Markdown pipe table."""
    id: str             # uuid4 hex assigned at extraction time
    md_table: str       # header row, "---" separator row, then data rows, joined by newlines
    source_file: str    # base name of the originating file
    source_page: int    # 1-based page (PDF) or slide (PPTX) number; 0 for DOCX

# ================================================================
# TRUTH KERNEL MODELS (V26)
# ================================================================

@dataclass
class TKParagraph:
    """Paragraph as stored in a TruthKernel; a field-for-field copy of a V25Paragraph."""
    id: str                      # copied from the V25 record
    text: str                    # paragraph text as extracted
    topic_index: Optional[int]   # syllabus topic index; None if topics were never assigned
    topic_score: float           # similarity score that produced topic_index
    source_file: str             # base name of the originating file
    source_page: int             # 1-based page/slide number; 0 for DOCX
    is_heading: bool             # True when the extractor classified the text as a heading

@dataclass
class TKFigure:
    """Figure as stored in a TruthKernel; a field-for-field copy of a V25Figure."""
    id: str             # copied from the V25 record
    caption: str        # auto-generated caption (source location plus optional OCR text)
    img_path: str       # path of the saved image asset
    source_file: str    # base name of the originating file
    source_page: int    # 1-based page/slide number; 0 for DOCX

@dataclass
class TKTable:
    """Table as stored in a TruthKernel, carrying the Markdown rendering of a V25Table."""
    id: str                             # copied from the V25 record
    markdown: str                       # Markdown pipe table (V25Table.md_table)
    source_file: str                    # base name of the originating file
    source_page: int                    # 1-based page/slide number; 0 for DOCX
    topic_index: Optional[int] = None   # not populated by TruthKernelBuilder; stays None unless set later

@dataclass
class TruthKernel:
    """
    Content contract handed to the downstream layers.

    Bundles the extracted paragraphs, figures and tables together with the
    ordered list of syllabus topic names that ``topic_index`` values refer to.
    The contract is meant to be read-only, but the dataclass is not frozen,
    so that is by convention only.
    """
    paragraphs: List[TKParagraph]
    figures: List[TKFigure]
    tables: List[TKTable]
    topics: List[str]

    # Lookup helpers (for later layers/testing)
    def get_paragraphs_by_topic(self, topic_index: int) -> List[TKParagraph]:
        """Return, in kernel order, the paragraphs assigned to ``topic_index``."""
        return list(
            filter(lambda para: para.topic_index == topic_index, self.paragraphs)
        )

    def get_paragraphs_by_source(self, file_name: str) -> List[TKParagraph]:
        """Return, in kernel order, the paragraphs extracted from ``file_name``."""
        return list(
            filter(lambda para: para.source_file == file_name, self.paragraphs)
        )

    def get_topic_index_by_name(self, name: str) -> Optional[int]:
        """Return the position of the first topic equal to ``name``, or None."""
        return next(
            (pos for pos, topic in enumerate(self.topics) if topic == name),
            None,
        )

# ================================================================
# EXTRACTOR V25 → SOURCE OF TRUTH FOR V26
# ================================================================

class ExtractorV25:
    """
    SMARTDOC V26 — Layer 1 Extraction Engine

    Responsibilities:
      - Ingest PDF / PPTX / DOCX
      - Extract paragraphs, figures, tables
      - Hybrid OCR on PDFs (text-layer + full-page OCR)
      - Perceptual figure dedup
      - Paragraph semantic dedup
      - Topic assignment from syllabus
      - Provide V25 objects that are sealed into a TruthKernel
    """

    def __init__(
        self,
        syllabus_topics: List[str],
        hybrid_page_ocr: bool = True,
    ):
        """
        Load the OCR and embedding models and set up empty result stores.

        Args:
            syllabus_topics: Topic names used for topic assignment. They are
                embedded once here; an empty list yields a (0, 384) matrix.
            hybrid_page_ocr: When True, PDF pages that have a usable text
                layer are additionally OCR'd and OCR-only lines are appended.
        """
        from rapidocr_onnxruntime import RapidOCR

        # models
        self.ocr_engine = RapidOCR()
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

        # syllabus topics and their (normalized) embeddings
        self.syllabus_topics = syllabus_topics
        if syllabus_topics:
            topic_vectors = self.embedder.encode(
                syllabus_topics,
                normalize_embeddings=True,
                convert_to_numpy=True,
            )
        else:
            topic_vectors = np.empty((0, 384))
        self.topic_embeddings: np.ndarray = topic_vectors

        # extraction options and the on-disk location for figure assets
        self.hybrid_page_ocr = hybrid_page_ocr
        self.assets_dir = "smartdoc_v26_assets"
        os.makedirs(self.assets_dir, exist_ok=True)

        # results accumulated by the extract_* methods
        self.paragraphs: List[V25Paragraph] = []
        self.figures: List[V25Figure] = []
        self.tables: List[V25Table] = []
        # paragraph embeddings, filled lazily by compute_para_embeddings()
        self.para_embs: Optional[np.ndarray] = None

    # ============================================================
    # UTILITIES
    # ============================================================

    def normalize_whitespace(self, text: str) -> str:
        """
        Tidy whitespace in extracted text.

        Re-joins words hyphenated across a line break, squeezes runs of
        spaces/tabs into one space, limits consecutive newlines to two and
        trims both ends. Falsy input yields an empty string.
        """
        if not text:
            return ""

        rewrites = (
            (r"(\w+)-\n(\w+)", r"\1\2"),  # "haz-\nard" -> "hazard"
            (r"[ \t]+", " "),             # runs of spaces/tabs -> single space
        )
        for pattern, replacement in rewrites:
            text = re.sub(pattern, replacement, text)

        # 3+ newlines -> exactly one blank line
        squeezed = re.sub(r"\n{3,}", "\n\n", text.strip())
        return squeezed.strip()

    def _clean_table_cell(self, text: str) -> str:
        return text.replace("\n", " ").strip() if text else ""

    def is_heading_line(self, s: str) -> bool:
        """
        Heuristically decide whether a line of text looks like a heading.

        A line of at most 80 characters counts as a heading when it is
        numbered ("2.3 Safety Policy"), starts with "chapter/unit/section/
        module <n>", is ALL CAPS with at most 10 words, or is Title Case with
        at most 8 words.

        The ALL CAPS rule requires at least one letter: strings made only of
        digits or punctuation (page numbers, rules) also equal their own
        ``upper()`` and must not be treated as headings, because headings
        bypass the minimum-length filter applied to paragraphs.
        """
        s = s.strip()
        if not s or len(s) > 80:
            return False
        # numbered headings
        if re.match(r"^(\d+(\.\d+)*)\s+.+", s):
            return True
        # "Chapter 1", "Module 2", etc.
        if re.match(r"^(chapter|unit|section|module)\s+\d+", s, re.IGNORECASE):
            return True
        word_count = len(s.split())
        # ALL CAPS short (must contain a letter, see docstring)
        if word_count <= 10 and s == s.upper() and any(ch.isalpha() for ch in s):
            return True
        # Title Case short
        if word_count <= 8 and s.istitle():
            return True
        return False

    def has_text_layer(self, page) -> bool:
        """
        Report whether a PDF page carries a usable embedded text layer.

        The layer is considered usable when the extracted text, once
        stripped, is longer than 80 characters and its first 300 characters
        contain no replacement character. Any error raised while checking is
        treated as "no text layer".
        """
        try:
            stripped = (page.extract_text() or "").strip()
            return len(stripped) > 80 and "�" not in stripped[:300]
        except Exception:
            return False

    def _run_ocr(self, pil_img: Image.Image) -> str:
        """
        Safe OCR wrapper using RapidOCR.
        """
        try:
            pil_img.thumbnail((2000, 2000))
            arr = np.array(pil_img)
            result, _ = self.ocr_engine(arr)
            if not result:
                return ""
            return "\n".join([r[1] for r in result if r[1]]).strip()
        except Exception as e:
            logger.error(f"OCR failure: {e}")
            return ""

    # ============================================================
    # FILE DISPATCH
    # ============================================================

    def extract_files(self, paths: List[str]):
        """
        Entry point: extract from all given file paths.
        """
        for fp in paths:
            if not os.path.exists(fp):
                print(f"File not found: {fp}")
                continue

            ext = Path(fp).suffix.lower()
            try:
                if ext == ".pdf":
                    self.extract_pdf(fp)
                elif ext == ".pptx":
                    self.extract_pptx(fp)
                elif ext == ".docx":
                    self.extract_docx(fp)
                else:
                    print(f"Skipping unsupported extension: {fp}")
            except Exception as e:
                print(f"Critical failure processing {fp}: {e}")

    # ============================================================
    # PDF EXTRACTION
    # ============================================================

    def extract_pdf(self, path: str):
        """
        Extract figures, tables and paragraphs from every page of a PDF.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs``; nothing is returned. A PDF that cannot be opened
        is reported on stdout and skipped. A failure in one stage of a page
        is logged and does not stop the other stages or the other pages.
        """
        file_name = Path(path).name
        print(f"[PDF] {file_name}")

        try:
            pdf = pdfplumber.open(path)
        except Exception as e:
            print(f"Failed to open PDF: {path} ({e})")
            return

        with pdf:
            for pageno, page in enumerate(pdf.pages, 1):
                # Decide once per page whether the embedded text is usable.
                text_layer_ok = self.has_text_layer(page)

                self._pdf_page_figures(page, file_name, pageno)
                self._pdf_page_tables(page, file_name, pageno)
                self._pdf_page_text(page, file_name, pageno, text_layer_ok)

    def _pdf_page_figures(self, page, file_name: str, pageno: int) -> None:
        """Render each embedded image region of a page to a PNG asset and record it."""
        for img in page.images:
            try:
                # An image box may extend past the page box, and crop()
                # rejects boxes that are not fully inside the page, so clamp
                # to the page first instead of losing the figure.
                page_x0, page_top, page_x1, page_bottom = page.bbox
                x0 = max(img["x0"], page_x0)
                top = max(img["top"], page_top)
                x1 = min(img["x1"], page_x1)
                bottom = min(img["bottom"], page_bottom)
                if x1 <= x0 or bottom <= top:
                    continue  # degenerate or entirely off-page

                cropped = page.crop((x0, top, x1, bottom))
                pil_img = cropped.to_image(resolution=FIG_DPI).original

                img_id = uuid.uuid4().hex
                out_name = f"fig_{img_id[:8]}_{file_name}_p{pageno}.png"
                out_path = os.path.join(self.assets_dir, out_name)
                pil_img.save(out_path)

                # Caption = source location, plus any text OCR finds in the figure.
                cap = f"Figure from {file_name}, page {pageno}"
                ocr_cap = self._run_ocr(pil_img)
                if ocr_cap:
                    cap += " — " + ocr_cap[:200]

                self.figures.append(V25Figure(
                    id=img_id,
                    img_path=out_path,
                    source_file=file_name,
                    source_page=pageno,
                    caption=cap
                ))

            except Exception as e:
                logger.warning(f"Image extract fail p{pageno}: {e}")

    def _pdf_page_tables(self, page, file_name: str, pageno: int) -> None:
        """Convert the table returned by ``page.extract_table()`` to Markdown and record it."""
        try:
            tbl = page.extract_table()
            if not tbl or len(tbl) <= 1:
                return

            cleaned = [[self._clean_table_cell(c or "") for c in r] for r in tbl]
            header = cleaned[0]
            rows = cleaned[1:]

            # Ragged tables (a row whose width differs from the header) are
            # skipped: no table record is created for them.
            if any(len(r) != len(header) for r in rows):
                return

            md = ["| " + " | ".join(header) + " |"]
            md.append("| " + " | ".join(["---"] * len(header)) + " |")
            for r in rows:
                md.append("| " + " | ".join(r) + " |")

            self.tables.append(V25Table(
                id=uuid.uuid4().hex,
                md_table="\n".join(md),
                source_file=file_name,
                source_page=pageno
            ))
        except Exception as e:
            logger.warning(f"Table extract fail p{pageno}: {e}")

    def _pdf_page_text(self, page, file_name: str, pageno: int, text_layer_ok: bool) -> None:
        """Extract page text (native layer, OCR fallback, optional hybrid OCR) and record paragraphs."""
        try:
            base = ""
            page_img = None

            # 1. Native text if layer is good
            if text_layer_ok:
                base = page.extract_text() or ""

            # 2. Rasterize the page when it is needed:
            #    - no base text (OCR fallback)
            #    - or hybrid OCR enabled for diagram labels
            if (not base) or (self.hybrid_page_ocr and text_layer_ok):
                try:
                    page_img = page.to_image(resolution=PAGE_DPI).original
                except Exception as e:
                    logger.warning(f"Page rasterize fail p{pageno}: {e}")
                    page_img = None

            # 3. Pure OCR fallback when no text layer / empty
            if not base and page_img is not None:
                base = self._run_ocr(page_img)

            base = self.normalize_whitespace(base)

            # 4. Hybrid augment: append OCR lines that the text layer lacks
            if self.hybrid_page_ocr and text_layer_ok and page_img is not None and base:
                ocr_full = self.normalize_whitespace(self._run_ocr(page_img))
                for line in ocr_full.splitlines():
                    line = line.strip()
                    if len(line) < OCR_MIN_LINE_LEN:
                        continue
                    if line not in base:
                        base += "\n" + line

            if not base:
                return

            # 5. Split into paragraphs on blank lines
            for para in re.split(r"\n\s*\n", base):
                clean = self.normalize_whitespace(para)
                if not clean:
                    continue

                # Only the first line decides whether the block is a heading.
                first_line = clean.split("\n")[0]
                ishead = self.is_heading_line(first_line)

                if len(clean) < PARA_MIN_CHARS and not ishead:
                    continue

                self.paragraphs.append(V25Paragraph(
                    id=uuid.uuid4().hex,
                    text=clean,
                    source_file=file_name,
                    source_page=pageno,
                    is_heading=ishead
                ))

        except Exception as e:
            logger.error(f"Text extraction fail p{pageno}: {e}")

    # ============================================================
    # PPTX EXTRACTION
    # ============================================================

    def extract_pptx(self, path: str):
        """
        Extract pictures, tables and text from every slide of a PPTX file.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs`` with the 1-based slide number as ``source_page``.
        A presentation that cannot be opened is reported on stdout and
        skipped; per-shape and per-slide failures are logged and skipped.

        Picture blobs are decoded and re-encoded as PNG, so the ``.png``
        asset really contains PNG data whatever format the slide embedded,
        and a blob that cannot be decoded leaves no file behind.
        """
        import io

        from pptx import Presentation

        file_name = Path(path).name
        print(f"[PPTX] {file_name}")

        try:
            prs = Presentation(path)
        except Exception as e:
            print(f"Failed to open PPTX: {e}")
            return

        # Modes saved to PNG as-is; any other mode is converted to RGB first.
        png_modes = {"1", "L", "LA", "P", "RGB", "RGBA"}

        for s_idx, slide in enumerate(prs.slides, 1):

            # 1. FIGURES
            for shape in slide.shapes:
                try:
                    if getattr(shape, "shape_type", None) != 13:  # 13 = PICTURE
                        continue

                    img_id = uuid.uuid4().hex
                    out_name = f"fig_{img_id[:8]}_{file_name}_s{s_idx}.png"
                    out_path = os.path.join(self.assets_dir, out_name)

                    # Decode first: nothing is written for an unreadable blob.
                    with Image.open(io.BytesIO(shape.image.blob)) as pil:
                        png = pil if pil.mode in png_modes else pil.convert("RGB")
                        png.save(out_path, format="PNG")
                        ocr_cap = self._run_ocr(png)

                    cap = f"Figure from {file_name}, slide {s_idx}"
                    if ocr_cap:
                        cap += " — " + ocr_cap[:200]

                    self.figures.append(V25Figure(
                        id=img_id,
                        img_path=out_path,
                        source_file=file_name,
                        source_page=s_idx,
                        caption=cap
                    ))
                except Exception as e:
                    logger.warning(f"PPTX image fail s{s_idx}: {e}")

            # 2. TABLES
            for shape in slide.shapes:
                try:
                    if hasattr(shape, "has_table") and shape.has_table:
                        rows = [
                            [self._clean_table_cell(getattr(c, "text", "")) for c in row.cells]
                            for row in shape.table.rows
                        ]
                        if rows:
                            header = rows[0]
                            data = rows[1:]

                            # Ragged tables (row width != header width) are skipped.
                            if all(len(r) == len(header) for r in data):
                                md = ["| " + " | ".join(header) + " |"]
                                md.append("| " + " | ".join(["---"] * len(header)) + " |")
                                for r in data:
                                    md.append("| " + " | ".join(r) + " |")

                                self.tables.append(V25Table(
                                    id=uuid.uuid4().hex,
                                    md_table="\n".join(md),
                                    source_file=file_name,
                                    source_page=s_idx
                                ))
                except Exception as e:
                    logger.warning(f"PPTX table fail s{s_idx}: {e}")

            # 3. TEXT
            try:
                # Collect the text of every shape that has a text frame.
                parts = []
                for shape in slide.shapes:
                    if hasattr(shape, "has_text_frame") and shape.has_text_frame:
                        if shape.text and shape.text.strip():
                            parts.append(shape.text.strip())

                if not parts:
                    continue

                full = self.normalize_whitespace("\n".join(parts))

                # Paragraphs are separated by blank lines.
                for p in re.split(r"\n\s*\n", full):
                    clean = self.normalize_whitespace(p)
                    if not clean:
                        continue

                    ishead = self.is_heading_line(clean)

                    if len(clean) < PARA_MIN_CHARS and not ishead:
                        continue

                    self.paragraphs.append(V25Paragraph(
                        id=uuid.uuid4().hex,
                        text=clean,
                        source_file=file_name,
                        source_page=s_idx,
                        is_heading=ishead
                    ))
            except Exception as e:
                logger.error(f"PPTX text fail s{s_idx}: {e}")

    # ============================================================
    # DOCX EXTRACTION
    # ============================================================

    def extract_docx(self, path: str):
        """
        Extract images, tables and paragraphs from a DOCX file.

        Results are appended to ``self.figures``, ``self.tables`` and
        ``self.paragraphs`` with ``source_page`` set to 0. A document that
        cannot be opened is reported on stdout and skipped; failures on
        individual images, tables or paragraphs are logged and skipped.

        Image blobs are decoded and re-encoded as PNG, so the ``.png`` asset
        really contains PNG data whatever format the document embedded, and
        a blob that cannot be decoded leaves no file behind.
        """
        import io

        from docx import Document

        file_name = Path(path).name
        print(f"[DOCX] {file_name}")

        try:
            doc = Document(path)
        except Exception as e:
            print(f"Failed to open DOCX: {e}")
            return

        # Modes saved to PNG as-is; any other mode is converted to RGB first.
        png_modes = {"1", "L", "LA", "P", "RGB", "RGBA"}

        # 1. IMAGES
        for rel in list(doc.part.rels.values()):
            if "image" not in rel.reltype:
                continue
            try:
                img_id = uuid.uuid4().hex
                out_name = f"fig_{img_id[:8]}_{file_name}_docx.png"
                out_path = os.path.join(self.assets_dir, out_name)

                # Decode first: nothing is written for an unreadable blob.
                with Image.open(io.BytesIO(rel.target_part.blob)) as pil:
                    png = pil if pil.mode in png_modes else pil.convert("RGB")
                    png.save(out_path, format="PNG")
                    ocr_cap = self._run_ocr(png)

                cap = f"Figure from {file_name} (DOCX)"
                if ocr_cap:
                    cap += " — " + ocr_cap[:200]

                self.figures.append(V25Figure(
                    id=img_id,
                    img_path=out_path,
                    source_file=file_name,
                    source_page=0,
                    caption=cap
                ))
            except Exception as e:
                logger.warning(f"DOCX image fail: {e}")

        # 2. TABLES
        for table in doc.tables:
            try:
                rows = [[self._clean_table_cell(c.text) for c in row.cells]
                        for row in table.rows]
                if rows:
                    header = rows[0]
                    data = rows[1:]

                    # Ragged tables (row width != header width) are skipped.
                    if all(len(r) == len(header) for r in data):
                        md = ["| " + " | ".join(header) + " |"]
                        md.append("| " + " | ".join(["---"] * len(header)) + " |")
                        for r in data:
                            md.append("| " + " | ".join(r) + " |")

                        self.tables.append(V25Table(
                            id=uuid.uuid4().hex,
                            md_table="\n".join(md),
                            source_file=file_name,
                            source_page=0
                        ))
            except Exception as e:
                logger.warning(f"DOCX table fail: {e}")

        # 3. PARAGRAPHS
        for para in doc.paragraphs:
            try:
                txt = para.text or ""
                if not txt.strip():
                    continue

                clean = self.normalize_whitespace(txt)
                if not clean:
                    continue

                # A style may be missing or unnamed; treat both as "no style"
                # rather than failing and dropping the paragraph.
                style_name = (getattr(para.style, "name", "") or "").lower()
                ishead = ("heading" in style_name) or self.is_heading_line(clean)

                if len(clean) < PARA_MIN_CHARS and not ishead:
                    continue

                self.paragraphs.append(V25Paragraph(
                    id=uuid.uuid4().hex,
                    text=clean,
                    source_file=file_name,
                    source_page=0,
                    is_heading=ishead
                ))
            except Exception as e:
                logger.error(f"DOCX paragraph fail: {e}")

    # ============================================================
    # DEDUPLICATION + TOPIC ASSIGNMENT + FIGURE DEDUP
    # ============================================================

    def compute_para_embeddings(self):
        if self.para_embs is not None:
            return
        if not self.paragraphs:
            self.para_embs = np.empty((0, 384))
            return

        texts = [p.text for p in self.paragraphs]
        try:
            self.para_embs = self.embedder.encode(
                texts,
                normalize_embeddings=True,
                convert_to_numpy=True,
            )
        except Exception as e:
            logger.error(f"Embedding fail: {e}")
            self.para_embs = np.zeros((len(texts), 384))

    def deduplicate_paragraphs(self, similarity_threshold: float = DEDUP_THRESHOLD):
        """
        Drop paragraphs that are near-duplicates of an earlier paragraph.

        Paragraphs are visited in order; one is kept unless its cosine
        similarity to any already-kept paragraph is strictly greater than
        ``similarity_threshold``. ``self.paragraphs`` and ``self.para_embs``
        are replaced by the surviving entries.
        """
        if not self.paragraphs:
            return

        self.compute_para_embeddings()
        survivors: List[int] = []
        survivor_vecs: List[np.ndarray] = []

        for pos, vec in enumerate(self.para_embs):
            is_duplicate = False
            if survivor_vecs:
                scores = util.cos_sim(
                    np.array([vec]), np.array(survivor_vecs)
                )[0].cpu().numpy()
                is_duplicate = np.max(scores) > similarity_threshold
            if not is_duplicate:
                survivors.append(pos)
                survivor_vecs.append(vec)

        self.paragraphs = [self.paragraphs[pos] for pos in survivors]
        if survivor_vecs:
            self.para_embs = np.array(survivor_vecs)
        else:
            self.para_embs = np.empty((0, 384))

    def deduplicate_figures(self):
        """
        Drop figures that are perceptual duplicates of an earlier figure.

        Figures are visited in order. A figure whose phash is within a
        Hamming distance strictly below ``PHASH_DISTANCE`` of an already-kept
        figure is removed and its image file deleted (deletion errors are
        ignored). A figure that cannot be opened or hashed is kept, and the
        failure is logged instead of being silently swallowed.
        """
        if not self.figures:
            return

        print("Deduplicating figures (phash)...")
        seen_hashes = []
        unique_figs: List[V25Figure] = []

        for fig in self.figures:
            try:
                with Image.open(fig.img_path) as im:
                    h = imagehash.phash(im, hash_size=PHASH_SIZE)
                if any((h - old) < PHASH_DISTANCE for old in seen_hashes):
                    try:
                        os.remove(fig.img_path)
                    except OSError:
                        pass
                    continue
                seen_hashes.append(h)
                unique_figs.append(fig)
            except Exception as e:
                # Best effort: never lose a figure just because it cannot be hashed.
                logger.warning(f"Figure hash fail {fig.img_path}: {e}")
                unique_figs.append(fig)

        self.figures = unique_figs

    def assign_topics(self):
        """
        Assign topic index + score to each paragraph using syllabus topics.
        """
        if not self.paragraphs:
            return

        self.compute_para_embeddings()

        if self.topic_embeddings.shape[0] == 0:
            # no syllabus → everything goes to topic 0 with score 0
            for p in self.paragraphs:
                p.topic_index = 0
                p.topic_score = 0.0
            return

        sims = util.cos_sim(self.para_embs, self.topic_embeddings).cpu().numpy()
        misc_idx = len(self.syllabus_topics)

        for i, p in enumerate(self.paragraphs):
            row = sims[i]
            best_idx = int(np.argmax(row))
            score = float(row[best_idx])

            if score >= MISC_THRESHOLD:
                p.topic_index = best_idx
            else:
                p.topic_index = misc_idx
            p.topic_score = score

# ================================================================
# TRUTH KERNEL BUILDER
# ================================================================

class TruthKernelBuilder:
    """
    Seals the records collected by an ExtractorV25 into a TruthKernel.
    """

    @staticmethod
    def from_extractor(ext: ExtractorV25) -> TruthKernel:
        """
        Build a TruthKernel from the extractor's current state.

        Every V25 paragraph, figure and table is copied field-for-field into
        its TK counterpart (``md_table`` becomes ``markdown``), and the
        syllabus topic list is copied into ``topics``.
        """

        def seal_paragraph(src) -> TKParagraph:
            return TKParagraph(
                id=src.id,
                text=src.text,
                topic_index=src.topic_index,
                topic_score=src.topic_score,
                source_file=src.source_file,
                source_page=src.source_page,
                is_heading=src.is_heading,
            )

        def seal_figure(src) -> TKFigure:
            return TKFigure(
                id=src.id,
                caption=src.caption,
                img_path=src.img_path,
                source_file=src.source_file,
                source_page=src.source_page,
            )

        def seal_table(src) -> TKTable:
            # topic_index is left at its default; tables are not topic-assigned here
            return TKTable(
                id=src.id,
                markdown=src.md_table,
                source_file=src.source_file,
                source_page=src.source_page,
            )

        return TruthKernel(
            paragraphs=list(map(seal_paragraph, ext.paragraphs)),
            figures=list(map(seal_figure, ext.figures)),
            tables=list(map(seal_table, ext.tables)),
            topics=list(ext.syllabus_topics),
        )


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 1. Define syllabus topics (like in your AES course)
#    Each entry is embedded by ExtractorV25 and used as a candidate topic label;
#    a paragraph's topic_index is the position of its best match in this list.
syllabus_topics = [
    # MCN 401 - Industrial Safety Engineering
    "need for safety", "safety and productivity", "accident", "injury", "unsafe act",
    "unsafe condition", "dangerous occurrence", "reportable accidents",
    "theories of accident causation", "domino theory", "heinrich theory",
    "safety organization", "role of management", "supervisors", "workmen", "unions",
    "government and voluntary agencies", "safety policy", "safety officer",
    "responsibilities", "authority", "safety committee", "types", "advantages",
    "personal protective equipment", "ppe", "respiratory", "non-respiratory",
    "housekeeping", "5s", "work permit system", "hot work", "cold work", "confined spaces",
    "construction safety", "excavation", "scaffolds", "ladders", "blasting", "demolition",
    "ergonomics", "musculoskeletal disorders", "machine guarding", "point of operation",
    "welding safety", "material handling", "lifting techniques", "hearing conservation",
    "hazard identification", "hazop", "dow index", "fire explosion", "msds", "chemical hazards",
]

# 2. Create extractor (loads the OCR and embedding models)
ext = ExtractorV25(
    syllabus_topics=syllabus_topics,
    hybrid_page_ocr=True,
)

# 3. Run extraction
#    Missing files and unsupported extensions are reported and skipped.
filepaths = ["ISE.pdf"]  # or any list of PDFs/PPTX/DOCX
ext.extract_files(filepaths)

# 4. Deduplicate figures + paragraphs + assign topics
#    Paragraph dedup runs before topic assignment, so topics are only
#    assigned to the paragraphs that survive deduplication.
ext.deduplicate_figures()
ext.deduplicate_paragraphs()
ext.assign_topics()

# 5. Build TruthKernel (this is the handoff to Layer 2)
tk_raw = TruthKernelBuilder.from_extractor(ext)

# Quick sanity check of what was extracted.
print(len(tk_raw.paragraphs), "paragraphs")
print(len(tk_raw.figures), "figures")
print(len(tk_raw.tables), "tables")


[PDF] ISE.pdf




Deduplicating figures (phash)...
702 paragraphs
642 figures
237 tables


In [None]:
# ================================================================
# SMARTDOC V26 — LAYER 2: ML CLEANUP LAYER
# Uses a local LLM (via Ollama or any compatible chat model)
# to repair, clean, normalize extracted content WITHOUT altering meaning.
# ================================================================

import json
from dataclasses import dataclass
from typing import List
import subprocess
import tempfile
import os

# ------------------------------------------------------------
# Helper: run local LLM via ollama (modify if using another backend)
# ------------------------------------------------------------

def run_local_llm(prompt: str, model: str = "llama3.1", timeout: float = 120) -> str:
    """
    Send ``prompt`` to a local model through ``ollama run MODEL``.

    The prompt is written to the process's stdin and whatever the model
    prints on stdout is returned with surrounding whitespace stripped.
    Nothing here constrains sampling or validates the reply, so the output
    is only as deterministic and as faithful as the model itself.

    Args:
        prompt: Full text handed to the model on stdin.
        model: Name of the Ollama model to run.
        timeout: Seconds to wait for the model before giving up.

    Returns:
        The model's stripped stdout. If the command cannot be started,
        times out, or fails in any other way, ``prompt`` is returned
        unchanged (best-effort fallback). Callers that wrap their payload
        in instructions must detect that case themselves, because the
        returned string then still contains those instructions.
    """
    try:
        # subprocess.run kills and reaps the child when the timeout expires;
        # a bare Popen.communicate(timeout=...) would leave it running.
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as e:  # deliberate best-effort boundary: never crash the pipeline
        print("[LLM error]", e)
        return prompt  # return raw content if LLM fails

    # Anything on stderr is surfaced as a warning but does not count as a
    # failure; stdout is still returned.
    if result.stderr:
        print("[LLM warning]", result.stderr)
    return result.stdout.strip()


# ------------------------------------------------------------
# Internal Cleaned Data Models
# ------------------------------------------------------------

@dataclass
class CleanedParagraph:
    """One paragraph as emitted by the Layer 2 cleanup pass.

    Instances are built by ``MLTruthCleaner.clean_paragraphs``: ``text`` is
    the value returned by the cleanup step, every other field is copied
    unchanged from the input paragraph.
    """
    id: str             # identifier carried over from the input paragraph
    text: str           # paragraph text returned by the cleanup step
    topic_index: int    # copied from input; presumably an index into the kernel's topics — confirm
    topic_score: float  # copied from input; presumably the topic-match similarity — confirm
    source_file: str    # file the paragraph was extracted from
    source_page: int    # page/slide number within source_file (numbering base set by Layer 1)
    is_heading: bool    # copied from input; heading flag decided upstream

@dataclass
class CleanedTable:
    """One table as emitted by the Layer 2 cleanup pass.

    Instances are built by ``MLTruthCleaner.clean_tables``: ``markdown`` is
    the value returned by the cleanup step, every other field is copied
    unchanged from the input table.
    """
    id: str           # identifier carried over from the input table
    markdown: str     # table markdown returned by the cleanup step
    source_file: str  # file the table was extracted from
    source_page: int  # page/slide number within source_file (numbering base set by Layer 1)
    topic_index: int  # copied from input; presumably an index into the kernel's topics — confirm

@dataclass
class CleanTruthKernel:
    """This is the output of Layer 2.

    Returned by ``MLTruthCleaner.clean_truth_kernel``. Paragraphs and tables
    are rebuilt as cleaned records; figures and topics are the same objects
    the input kernel held, passed through untouched.
    """
    paragraphs: List[CleanedParagraph]  # one entry per input paragraph, same order
    tables: List[CleanedTable]          # one entry per input table, same order
    figures: list                       # input kernel's figures, not modified by Layer 2
    topics: list                        # input kernel's topics, not modified by Layer 2


# ------------------------------------------------------------
# LLM cleaning prompt
# ------------------------------------------------------------

# Instruction preamble that MLTruthCleaner._clean_chunk prepends to every
# paragraph/table before sending it to the LLM. The chunk itself is appended
# after a blank line.
# NOTE: this is a normal (non-raw) string, so the "\n" inside the
# "geo-\nlogy" example is a real line break in the text the model receives.
CLEAN_PROMPT = """
You are a precision cleanup engine. You MUST:
- preserve all the content
- keep all wording the same
- do not paraphrase
- do not shorten
- do not merge or remove sentences
- do not add new content

You MAY:
- fix spacing
- fix punctuation
- fix OCR artefacts
- join broken words (ex: “geo-\nlogy” → “geology”)
- remove duplicated fragments inside the same paragraph
- unwrap broken lines into proper sentences
- fix markdown tables but keep all rows/columns identical

Always output ONLY the cleaned text, nothing else.
"""


# ------------------------------------------------------------
# SMARTDOC V26 — LAYER 2
# ------------------------------------------------------------

class MLTruthCleaner:
    """Layer 2: run every paragraph and table of a TruthKernel through a local LLM.

    Each chunk is sent on its own, prefixed with ``CLEAN_PROMPT``. When the
    LLM yields nothing usable the original text is kept, so the number and
    order of paragraphs/tables never changes.
    """

    def __init__(self, model_name: str = "llama3.1"):
        # Model name forwarded to run_local_llm for every chunk.
        self.model = model_name

    def _clean_chunk(self, raw_text: str) -> str:
        """
        Clean a paragraph/table chunk using the local LLM.

        Returns the LLM reply, or ``raw_text`` unchanged when the reply is
        empty or the backend failed.
        """
        prompt = CLEAN_PROMPT + "\n\n" + raw_text.strip()
        cleaned = run_local_llm(prompt, self.model)
        reply = cleaned.strip()
        # run_local_llm hands the prompt back unchanged when the backend
        # fails (missing binary, timeout, ...). Accepting that as a result
        # would store the instruction preamble inside the document, so an
        # echoed prompt is treated exactly like an empty reply.
        if not reply or reply == prompt.strip():
            return raw_text
        return cleaned

    # ------------------ CLEAN PARAGRAPHS ------------------

    def clean_paragraphs(self, tk) -> "List[CleanedParagraph]":
        """Return one CleanedParagraph per entry of ``tk.paragraphs``, in order.

        Only ``text`` goes through the LLM; all metadata is copied as-is.
        """
        return [
            CleanedParagraph(
                id=p.id,
                text=self._clean_chunk(p.text),
                topic_index=p.topic_index,
                topic_score=p.topic_score,
                source_file=p.source_file,
                source_page=p.source_page,
                is_heading=p.is_heading,
            )
            for p in tk.paragraphs
        ]

    # ------------------ CLEAN TABLES ------------------

    def clean_tables(self, tk) -> "List[CleanedTable]":
        """Return one CleanedTable per entry of ``tk.tables``, in order.

        Only ``markdown`` goes through the LLM; all metadata is copied as-is.
        """
        # NOTE(review): reads ``t.markdown``, while Layer 1's TableRecord
        # names this field ``md_table`` — confirm the kernel's table type
        # really exposes ``markdown``.
        return [
            CleanedTable(
                id=t.id,
                markdown=self._clean_chunk(t.markdown),
                source_file=t.source_file,
                source_page=t.source_page,
                topic_index=t.topic_index,
            )
            for t in tk.tables
        ]

    # ------------------ MAIN CLEANUP ------------------

    def clean_truth_kernel(self, tk) -> "CleanTruthKernel":
        """
        Returns a *new* CleanTruthKernel.

        Paragraphs and tables are rebuilt from cleaned text; ``tk.figures``
        and ``tk.topics`` are passed through by reference. ``tk`` itself is
        not modified.
        """

        print("[Layer 2] Cleaning paragraphs with LLM…")
        cleaned_paras = self.clean_paragraphs(tk)

        print("[Layer 2] Cleaning tables with LLM…")
        cleaned_tables = self.clean_tables(tk)

        return CleanTruthKernel(
            paragraphs=cleaned_paras,
            tables=cleaned_tables,
            figures=tk.figures,
            topics=tk.topics,
        )


In [None]:
# you already have tk_raw from Layer 1
# Requires the `ollama` executable on PATH in this runtime; without it every
# chunk goes down run_local_llm's error path and prints one "[LLM error]" line.
cleaner = MLTruthCleaner(model_name="llama3.1")  # any local model
tk_clean = cleaner.clean_truth_kernel(tk_raw)

# Counts should match tk_raw: Layer 2 rebuilds items one-for-one.
print(len(tk_clean.paragraphs))
print(len(tk_clean.tables))


[Layer 2] Cleaning paragraphs with LLM…
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such file or directory: 'ollama'
[LLM error] [Errno 2] No such fi