In [15]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF, given relative to the current working directory.
PDF_PATH = "./data/raw/The_Merck_Manual.pdf"

# Read the PDF through LangChain; the result holds one document per page.
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

# Quick sanity check on the first page: page count, metadata, leading text.
first_page = docs[0]
print("Loaded pages:", len(docs))
print("Example metadata:", first_page.metadata)
print(first_page.page_content[:500])


Loaded pages: 4114
Example metadata: {'producer': 'Atop CHM to PDF Converter', 'creator': 'Atop CHM to PDF Converter', 'creationdate': '2012-06-15T05:44:40+00:00', 'moddate': '2014-04-21T07:53:19+10:00', 'title': 'The Merck Manual of Diagnosis & Therapy, 19th Edition', 'source': './data/raw/The_Merck_Manual.pdf', 'total_pages': 4114, 'page': 0, 'page_label': 'i'}



In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter below.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

# Separators are listed from coarsest (blank line) to finest (empty string).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Each resulting chunk is the unit that retrieval will operate on.
chunks = splitter.split_documents(docs)

# Sanity check: compare page and chunk counts, then peek at the first chunk.
# NOTE(review): in the recorded run the first chunk is table-of-contents
# dot leaders; consider filtering such front matter before indexing.
first_chunk = chunks[0]
print("Pages:", len(docs))
print("Chunks:", len(chunks))
print("Example chunk metadata:", first_chunk.metadata)
print(first_chunk.page_content[:400])


Pages: 4114
Chunks: 16634
Example chunk metadata: {'producer': 'Atop CHM to PDF Converter', 'creator': 'Atop CHM to PDF Converter', 'creationdate': '2012-06-15T05:44:40+00:00', 'moddate': '2014-04-21T07:53:19+10:00', 'title': 'The Merck Manual of Diagnosis & Therapy, 19th Edition', 'source': './data/raw/The_Merck_Manual.pdf', 'total_pages': 4114, 'page': 2, 'page_label': 'iii'}
Table of Contents
1
Front  
  ................................................................................................................................................................................................................
1
Cover  
  .....................................................................................................................................................


In [17]:
import os, json

# Save the chunks to disk as JSON Lines (one JSON object per line) so later
# steps can reload the same retrieval units without re-parsing the PDF.
#
# The output directory sits under the same "./data" tree the raw PDF is read
# from ("./data/raw/..."). The previous "../data/processed" pointed one level
# ABOVE the working directory, so the processed file ended up outside the
# data folder that holds the input.
out_dir = "./data/processed"
os.makedirs(out_dir, exist_ok=True)
out_path = f"{out_dir}/chunks.jsonl"

with open(out_path, "w", encoding="utf-8") as f:
    for i, d in enumerate(chunks):
        # chunk_id is the chunk's position in `chunks`; it is only stable as
        # long as the PDF and the splitter settings stay the same.
        record = {
            "chunk_id": i,
            "text": d.page_content,
            "metadata": d.metadata,
        }
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        # instead of escaping them as \uXXXX.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Saved:", out_path)


Saved: ../data/processed/chunks.jsonl
