In [31]:
import sys
import torch

# Report the interpreter and the PyTorch/CUDA build this kernel is running on.
environment_report = (
    ("Python Version:", sys.version),
    ("PyTorch Version:", torch.__version__),
    ("CUDA Version (PyTorch):", torch.version.cuda),
    ("CUDA Available:", torch.cuda.is_available()),
)
for label, value in environment_report:
    print(label, value)

Python Version: 3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
PyTorch Version: 2.7.1+cu118
CUDA Version (PyTorch): 11.8
CUDA Available: True


In [12]:
# 02_text_chunk_and_doc_build.ipynb

from pathlib import Path
import json
import hashlib
from datetime import datetime, timezone
from typing import List, Dict, Any

from haystack.dataclasses import Document
from haystack.components.preprocessors import DocumentSplitter


In [19]:
# Input/output locations, resolved relative to the parent of the current working directory.
# Adjust the filenames to match what step 01 actually produced.
ROOT = Path.cwd().parent
WORK_DIR = ROOT.joinpath("work")
# Paragraph-level records: output of 01_toc_and_mineru_probe.py
JSON_IN = WORK_DIR.joinpath("json_out", "uvm_text_paragraphs.jsonl")
# Chunk-level records written by this notebook
JSON_OUT = WORK_DIR.joinpath("json_out", "uvm_text_chunks.haystack.jsonl")

JSON_IN, JSON_OUT


(WindowsPath('c:/Users/41v1r/NEU/NLP/UVM-RAG/work/json_out/uvm_text_paragraphs.jsonl'),
 WindowsPath('c:/Users/41v1r/NEU/NLP/UVM-RAG/work/json_out/uvm_text_chunks.haystack.jsonl'))

In [17]:
def load_paragraph_jsonl(path: Path) -> List[Dict[str, Any]]:
    """
    Load paragraph-level JSONL produced by 01_toc_and_mineru_probe.py.

    Every non-blank line is parsed as one JSON value and appended to the result
    in file order; blank / whitespace-only lines are skipped. Each record is
    expected (not validated here) to be a dict with at least: content, type,
    page_from, page_to, std, uri, anchor, section_title, etc.

    The file is decoded as "utf-8-sig" so that a leading byte-order mark is
    dropped instead of breaking the parse of the first line. Files without a
    BOM decode exactly as plain UTF-8.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The parsed records, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with "<path>:<line number>" to locate the bad record.
        OSError: If the file cannot be opened.
    """
    records: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with file/line context added.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records


def normalize_text(text: str) -> str:
    """
    Return `text` with each run of whitespace replaced by a single space and
    leading/trailing whitespace removed.

    Used only to build the checksum input; the result is not written back
    into Document.content.
    """
    words = text.split()
    return " ".join(words)


def compute_checksum(text: str) -> str:
    """
    Return the SHA-1 hex digest of `text` after whitespace normalization
    (via normalize_text) and UTF-8 encoding.
    """
    payload = normalize_text(text).encode("utf-8")
    digest = hashlib.sha1(payload)
    return digest.hexdigest()


In [21]:
# Read every paragraph record from JSON_IN into memory; the cell's output is the record count.
paragraph_rows = load_paragraph_jsonl(JSON_IN)
len(paragraph_rows)


8167

In [22]:
from copy import deepcopy
from pathlib import PurePosixPath

def rows_to_paragraph_docs(rows: List[Dict[str, Any]]) -> List[Document]:
    """
    Build one Haystack Document per paragraph row that has non-empty content.

    For each row, the "content" value (stripped of surrounding whitespace)
    becomes Document.content and every remaining key becomes Document.meta.
    Rows whose content is missing, null, or blank are skipped. If the row has
    no "source" key, one is added: the final path component of "uri", or ""
    when there is no uri.

    Args:
        rows: Paragraph records as loaded from the step-01 JSONL. The input
            dicts are deep-copied and never mutated.

    Returns:
        The Documents, in input order, excluding skipped rows.
    """
    docs: List[Document] = []
    for row in rows:
        # Work on a private copy so popping "content" does not alter the caller's data.
        meta: Dict[str, Any] = deepcopy(row)

        # Basic sanity: a JSON null content is treated as empty rather than
        # raising AttributeError on None.strip().
        content = (meta.pop("content", None) or "").strip()
        if not content:
            continue

        # Derive a "source" if not already present
        # e.g. "/pdf/UVM_Class_Reference_Manual_1.2.pdf" -> "UVM_Class_Reference_Manual_1.2.pdf"
        uri = meta.get("uri", "")
        default_source = PurePosixPath(uri).name if uri else ""
        meta.setdefault("source", default_source)

        # The Document is created before checksum/indexed_at are added to meta,
        # so its auto-generated id is based on the fields present here.
        docs.append(Document(content=content, meta=meta))

    return docs

# Convert the loaded rows to Documents and show the count plus the first Document.
# Indexing [0] assumes at least one row had non-empty content.
paragraph_docs = rows_to_paragraph_docs(paragraph_rows)
len(paragraph_docs), paragraph_docs[0]


(8167,
 Document(id=ff8f74d96e6e7041e5a28982ca931bc415943c27c7da45c7e1c6ae09224fb955, content: 'Universal Verification Methodology (UVM) 1.2 Class Reference', meta: {'type': 'text', 'page_from': 1, 'page_to': 1, 'std': 'UVM-1.2', 'uri': '/pdf/UVM_Class_Reference_Manual_1.2.pdf', 'anchor': '#page=1', 'section_title': 'UVM Class 1.2 Reference', 'source': 'UVM_Class_Reference_Manual_1.2.pdf'}))

In [23]:
# Word-based splitter used by the chunking cell that follows.
splitter = DocumentSplitter(
    split_by="word",
    split_length=240,   # target chunk size, counted in words because split_by="word"
    split_overlap=20,   # words of overlap between consecutive chunks
    split_threshold=50  # do not emit tiny fragments; presumably pieces under 50 words are merged into the previous chunk (confirm in DocumentSplitter docs)
)

splitter


<haystack.components.preprocessors.document_splitter.DocumentSplitter object at 0x000001DAC9C7E660>
Inputs:
  - documents: list[Document]
Outputs:
  - documents: list[Document]

In [25]:
# Run the splitter over all paragraph Documents; the chunks are under the "documents" key of the result.
split_result = splitter.run(documents=paragraph_docs)
chunk_docs: List[Document] = split_result["documents"]

# Displayed value: number of chunks produced.
len(chunk_docs)


8168

In [26]:
# One UTC timestamp (ISO 8601) shared by every chunk processed in this run.
INDEXED_AT = datetime.now(timezone.utc).isoformat()

for chunk in chunk_docs:
    # Chunks with empty content get neither a checksum nor a timestamp.
    if chunk.content:
        chunk.meta["checksum"] = compute_checksum(chunk.content)
        # Run-level timestamp (per the original note, it does not affect Document.id);
        # setdefault leaves any indexed_at value that is already present untouched.
        chunk.meta.setdefault("indexed_at", INDEXED_AT)


In [27]:
# Inspect the first chunk's metadata to confirm the checksum and indexed_at fields were added.
chunk_docs[0].meta


{'type': 'text',
 'page_from': 1,
 'page_to': 1,
 'std': 'UVM-1.2',
 'uri': '/pdf/UVM_Class_Reference_Manual_1.2.pdf',
 'anchor': '#page=1',
 'section_title': 'UVM Class 1.2 Reference',
 'source': 'UVM_Class_Reference_Manual_1.2.pdf',
 'source_id': 'ff8f74d96e6e7041e5a28982ca931bc415943c27c7da45c7e1c6ae09224fb955',
 'page_number': 1,
 'split_id': 0,
 'split_idx_start': 0,
 '_split_overlap': [],
 'checksum': '46aa7f6e66d720d7a15150a55b84782ff5146627',
 'indexed_at': '2025-12-06T00:27:56.145286+00:00'}

In [28]:
def write_docs_jsonl(docs: List[Document], path: Path) -> None:
    """
    Serialize `docs` to `path` as UTF-8 JSONL, one Document per line.

    Missing parent directories are created, and an existing file at `path`
    is overwritten. Non-ASCII characters are written as-is (ensure_ascii=False).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as out:
        # flatten=True places meta keys at the top level of each record;
        # per the original note, this matches Haystack 2.x expectations.
        out.writelines(
            json.dumps(doc.to_dict(flatten=True), ensure_ascii=False) + "\n"
            for doc in docs
        )

# Write all chunks to JSON_OUT, then display the path and whether the file now exists.
write_docs_jsonl(chunk_docs, JSON_OUT)
JSON_OUT, JSON_OUT.exists()


(WindowsPath('c:/Users/41v1r/NEU/NLP/UVM-RAG/work/json_out/uvm_text_chunks.haystack.jsonl'),
 True)

In [30]:
# 1) Chunk-count check: intentionally left disabled, as in the original cell.
#    Re-enable it if a minimum corpus size should be enforced.
# assert len(chunk_docs) >= 10_000, f"Only {len(chunk_docs)} chunks; maybe adjust split_length or check input."

# 2) Spot-check that citation fields are present on the first chunk.
#    Fail with a clear message (not a bare IndexError) when no chunks exist.
assert chunk_docs, "No chunks were produced - check the input JSONL and splitter settings."
sample = chunk_docs[0]
assert "uri" in sample.meta and "anchor" in sample.meta, "Missing uri/anchor in meta - check step 01 JSON."
assert sample.meta.get("type") == "text", "Chunk type should remain 'text'."

# Displayed summary: chunk count plus the sample's citation fields.
len(chunk_docs), sample.meta.get("section_title"), sample.meta.get("uri"), sample.meta.get("anchor")


(8168,
 'UVM Class 1.2 Reference',
 '/pdf/UVM_Class_Reference_Manual_1.2.pdf',
 '#page=1')