In [None]:
import os
import json
import hashlib
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker


# === Setup ===
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
chunker = SemanticChunker(embeddings=embedder)

literature_dir = r"C:\Users\nagir\OneDrive\Desktop\projects\chunker\cedlib"
handbook_path = r"C:\Users\nagir\OneDrive\Desktop\projects\chunker\The Doctor-Approved Cannabis Handbook_DRC.pdf"
output_path = r"C:\Users\nagir\OneDrive\Desktop\projects\chunker\chunk_output.json"


def extract_text(pdf_path: str) -> str:
    try:
        reader = PdfReader(pdf_path)
        return " ".join([page.extract_text() or "" for page in reader.pages])
    except Exception as e:
        print(f" Error reading {pdf_path}: {e}")
        return ""


def generate_doi(title: str) -> str:
    safe_title = title.lower().replace(" ", "-").replace(".", "")
    return f"10.2025/medkb.{safe_title}"


def hash_chunk_id(text: str) -> str:
    return hashlib.md5(text.encode("utf-8")).hexdigest()[:10]


def chunk_pdf(pdf_path: str, is_literature: bool = False) -> list:
    filename = os.path.basename(pdf_path)
    base_title = os.path.splitext(filename)[0]
    doi = generate_doi(base_title) if is_literature else None

    print(f"\n Processing: {filename}")
    text = extract_text(pdf_path)

    if not text.strip() or len(text.strip()) < 50:
        print("⚠️ Empty or image-based PDF. Skipping.")
        return []

    try:
        chunks = chunker.split_text(text)
    except Exception as e:
        print(f" Chunking failed for {pdf_path}: {e}")
        return []

    output_chunks = []
    for chunk in chunks:
        chunk_id = hash_chunk_id(chunk)
        entry = {
            "chunk_id": chunk_id,
            "chunk_source": pdf_path
        }
        if is_literature:
            entry["chunk_source_doi"] = doi
        output_chunks.append(entry)

    print(f" {len(output_chunks)} chunks created.")
    return output_chunks


def main():
    output = {"literature": [], "handbook": []}

    # === Literature PDFs ===
    for filename in os.listdir(literature_dir):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(literature_dir, filename)
            output["literature"].extend(chunk_pdf(pdf_path, is_literature=True))

    # === Handbook PDF ===
    if os.path.exists(handbook_path):
        output["handbook"].extend(chunk_pdf(handbook_path, is_literature=False))

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"\n All done. Output saved to: {output_path}")


if __name__ == "__main__":
    main()


In [1]:
import os
import json
import hashlib
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
chunker = SemanticChunker(embeddings=embedder)

def extract_text(pdf_path: str) -> str:
    try:
        reader = PdfReader(pdf_path)
        return " ".join([page.extract_text() or "" for page in reader.pages])
    except Exception:
        return ""

def generate_doi(title: str) -> str:
    safe_title = title.lower().replace(" ", "-").replace(".", "")
    return f"10.2025/medkb.{safe_title}"

def hash_chunk_id(text: str) -> str:
    return hashlib.md5(text.encode("utf-8")).hexdigest()[:10]

def chunk_pdf(pdf_path: str, is_literature: bool = False) -> list:
    filename = os.path.basename(pdf_path)
    base_title = os.path.splitext(filename)[0]
    doi = generate_doi(base_title) if is_literature else None
    text = extract_text(pdf_path)

    if not text.strip() or len(text.strip()) < 50:
        return []

    try:
        chunks = chunker.split_text(text)
    except Exception:
        return []

    output_chunks = []
    for chunk in chunks:
        chunk_id = hash_chunk_id(chunk)
        entry = {
            "chunk_id": chunk_id,
            "chunk_source": pdf_path
        }
        if is_literature:
            entry["chunk_source_doi"] = doi
        output_chunks.append(entry)
    return output_chunks

def main(literature_dir: str, handbook_path: str, output_path: str):
    output = {"literature": [], "handbook": []}

    for filename in os.listdir(literature_dir):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(literature_dir, filename)
            output["literature"].extend(chunk_pdf(pdf_path, is_literature=True))

    if os.path.exists(handbook_path):
        output["handbook"].extend(chunk_pdf(handbook_path, is_literature=False))

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

if __name__ == "__main__":
    literature_dir = "<path_to_literature_folder>"
    handbook_path = "<path_to_handbook_pdf>"
    output_path = "<path_to_output_json>"
    main(literature_dir, handbook_path, output_path)


  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: '<path_to_literature_folder>'