In [2]:
# Install dependencies if needed
# !pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
import shutil
import subprocess

from langchain_experimental.open_clip import OpenCLIPEmbeddings

from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    semantic_chunk_texts,
    enrich_image_context,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)


In [3]:
# Input PDF plus the output locations used later in the notebook
# (extracted figures and the persisted Chroma store).
pdf_file = './data/MediumExample_Ears_17Pgs.pdf'
image_output_dir = './figures/Ears_17Pgs'
chroma_persist_dir = './chroma/Ears_17Pgs/'

# Fail fast, before any expensive processing, if the inputs are missing.
assert os.path.exists('./data'), (
    "Error: './data' directory not found."
)
assert os.path.exists(pdf_file), (
    f"Error: PDF file not found at {pdf_file}."
)

In [None]:
# 1. Load the book: parse the PDF into raw elements, passing image_output_dir
# as the destination argument (presumably where extracted figures are written
# — TODO confirm against load_book).
print("📝 Unstructuring textbooks, filtering junk, semantic chunking...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
# The completion message names the function actually called in this cell.
print("🎉 1. load_book complete.")


In [5]:
# 2. Clean and categorize
# Unpack the eight collections returned for the raw PDF elements; only
# texts, tables and images_raw are consumed by the later cells shown here.
(
    texts,
    tables,
    images_raw,
    headers_raw,
    titles_raw,
    footers_raw,
    figure_captions_raw,
    list_items_raw,
) = clean_and_categorize_elements(raw_pdf_elements)


In [6]:
# 3. Semantic chunking
# The embedding model built here is also passed to the image-context step.
embedding_model = OpenCLIPEmbeddings(
    model_name="ViT-L-14",
    checkpoint="laion2b_s32b_b82k",
)
semantic_chunks = semantic_chunk_texts(
    texts,
    embedding_model,
    n_clusters=30,
)


In [7]:
# 4. Enrich image context (hybrid)
# The return value is not captured here; presumably images_raw is updated
# in place by the helper — TODO confirm.
enrich_image_context(
    images_raw,
    raw_pdf_elements,
    embedding_model=embedding_model,
    semantic_chunks=semantic_chunks,
)


In [None]:
# 5. Summarize the text, table and image elements; the five results feed the
# ChromaDB storage step.
(
    text_summaries,
    table_summaries,
    image_paths,
    relevant_images_to_summarize,
    image_summaries,
) = summarize_elements(texts, tables, images_raw)

In [9]:
# 6. Store summaries and originals in ChromaDB under chroma_persist_dir and
# keep the returned retriever for the query cells below.
retriever = store_in_chromadb(
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_paths,
    relevant_images_to_summarize,
    image_summaries,
    persist_directory=chroma_persist_dir,
)

In [None]:
# 7. Clean up images that were not selected for summarization.
# NOTE(review): judging by the name this removes image files from disk, which
# would make the cell destructive and not safely re-runnable — confirm against
# textbook_loading.delete_irrelevant_images.
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

In [None]:
# Optional audible "pipeline finished" notification.
# `afplay` and the system sound below are macOS-specific, so the cell is a
# silent no-op wherever either one is unavailable instead of emitting a shell
# error and a non-zero exit status.
sound_file = "/System/Library/Sounds/Glass.aiff"
afplay_path = shutil.which("afplay")
if afplay_path and os.path.exists(sound_file):
    # Argument list (no shell) avoids quoting issues in the file path;
    # check=False keeps this best-effort so a playback failure never raises.
    subprocess.run([afplay_path, sound_file], check=False)

# Inspecting Retrieved Docs

In [12]:
# Natural-language question used by the retrieval cells below.
query = "tell me about the anatomy of the ear?"

In [None]:
from IPython.display import Image, display

# Banner for the retrieval report.
print('-'*40, "Here are the retrieved original_docs with similarity scores",'-'*40)

# Fetch the top 10 multi-modal hits for the query, then report each in turn.
results = retriever.retrieve_multi_modal(query, k=10)
for res in results:
    # For image hits, "summary" is treated as a file path: render the picture
    # inline only when that file is actually present on disk.
    is_image = res["modality"] == "image"
    if is_image and os.path.exists(res["summary"]):
        display(Image(filename=res["summary"]))

    # Textual details are printed for every hit, image or not.
    print('-' * 40)
    print(f"🔎 Similarity Score: {res['score']:.4f}")
    print(f"🖼️ Modality: {res['modality']}")
    print("📝 Summary/Chunk:", res["summary"])
    print("Summary Metadata:", res["original_metadata"])
    print("Doc ID:", res["doc_id"])

In [14]:
# Same query, top 10 results, restricted to one "type" per call.
# Only images
image_results = retriever.retrieve(
    query,
    k=10,
    filter={"type": "image"},
)
# Only text
text_results = retriever.retrieve(
    query,
    k=10,
    filter={"type": "text"},
)