In [1]:
# Install dependencies if needed (use %pip so packages are installed into this kernel's environment)
# %pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [2]:
# Input PDF, the directory handed to load_book for extracted figures, and the
# directory handed to store_in_chromadb as its persist_directory.
pdf_file = './data/shortExample1P_PickupCat.pdf'
image_output_dir = './figures/figures_pickupCat'
chroma_persist_dir = './chroma/textbook_test_pickupCat/'

# Make sure the data directory and the PDF exist before doing any work.
# Raise explicitly instead of using `assert`: assert statements are removed
# when Python runs with -O, which would silently skip these checks, and a
# missing path is better reported as FileNotFoundError than AssertionError.
if not os.path.exists('./data'):
    raise FileNotFoundError("Error: './data' directory not found.")
if not os.path.exists(pdf_file):
    raise FileNotFoundError(f"Error: PDF file not found at {pdf_file}.")

In [3]:
# Partition the PDF into raw elements. load_book also receives the image
# output directory (presumably where extracted figures are written — confirm
# in textbook_loading).
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
# Report how many elements the partitioning step produced.
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


✅ Found 55 raw elements.


In [4]:
print("🧹 Cleaning and categorizing elements...")
# Only the first three groups returned by clean_and_categorize_elements are
# used in this notebook. The remaining ones are collected into a named
# throwaway rather than `_`: in IPython/Jupyter `_` is the last-output
# variable, and assigning to it shadows that shortcut for the session.
# The starred target also keeps this cell working if the helper changes how
# many trailing groups it returns.
texts, tables, images_raw, *_unused_groups = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

🧹 Cleaning and categorizing elements...
Categorized: 9 text chunks, 0 tables, 10 images.


In [5]:
# Summarize the text and table chunks and filter the extracted images.
# summarize_elements returns four values: text summaries, table summaries,
# image paths, and the images kept as relevant (the latter two are consumed
# by store_in_chromadb and delete_irrelevant_images further down).
print("📝 Summarizing text, tables, and Checking relevant images...")
text_summaries, table_summaries, image_paths, relevant_images_to_summarize = summarize_elements(
    texts, tables, images_raw
)
# Only the text/table counts are reported here; the helper prints the number
# of relevant images itself.
print(f"📊 Summarized: {len(text_summaries)} text summaries, {len(table_summaries)} table summaries")

📝 Summarizing text, tables, and Checking relevant images...
Texts and Tables Summary Done!
Checking image relevance with local textual context...
Skipping irrelevant image: ./figures/figures_pickupCat/figure-2-5.jpg
Number of relevant images: 9
📊 Summarized: 9 text summaries, 0 table summaries


In [6]:
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
# store_in_chromadb returns a UnifiedRetriever, whose constructor is
# __init__(self, vectorstore, docstore, id_key="doc_id").
# Summaries are passed alongside their original texts/tables; images are
# passed as paths only. Arguments are positional, so their order must match
# the helper's signature (not visible here — confirm in textbook_loading).
retriever = store_in_chromadb(
    text_summaries, texts, table_summaries, tables, image_paths, persist_directory=chroma_persist_dir
)
print("🎉 Data ingestion complete. Retriever initialized.")

💾 Storing data into ChromaDB at ./chroma/textbook_test_pickupCat/...
🎉 Data ingestion complete. Retriever initialized.


In [7]:
# Clean up extracted figures that were not kept as relevant. The helper gets
# the full list of raw images plus the relevant subset and reports each file
# it deletes.
# NOTE(review): this removes files from disk, so re-running this cell alone
# may behave differently from the first run — confirm the helper tolerates
# already-deleted files.
print("🗑️ Deleting irrelevant images...")
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

🗑️ Deleting irrelevant images...
Successfully deleted irrelevant image: ./figures/figures_pickupCat/figure-2-5.jpg
Finished deleting images. Total deleted: 1


# Inspecting Retrieved Docs

In [10]:
# Natural-language question used to probe the retriever in the next cell.
query = "How to properly pick up a cat?"

In [11]:
# Shared horizontal rule for the banner and the per-result separators.
rule = '-' * 40
print(rule, "Here are the retrieved original_docs with similarity scores", rule)

# (printed label, key in each result dict), in display order.
labelled_fields = (
    ("📝 Summary/Chunk:", "summary"),
    ("📄 Full Original Content:", "original"),
    ("Summary Metadata:", "summary_metadata"),
    ("Original Metadata:", "original_metadata"),
)

# Ask the retriever for up to 10 matches and dump every field of each one.
results = retriever.retrieve(query, k=10)
for res in results:
    print(rule)
    # NOTE(review): the recorded output lists 0.6620 before 0.8390, so either
    # results are not sorted by score or the score is a distance (lower =
    # closer) — confirm before relying on the "Similarity" label.
    print(f"🔎 Similarity Score: {res['score']:.4f}")
    for label, field in labelled_fields:
        print(label, res[field])

---------------------------------------- Here are the retrieved original_docs with similarity scores ----------------------------------------
----------------------------------------
🔎 Similarity Score: 0.6620
📝 Summary/Chunk: Picking up a cat involves gentle handling to avoid stressing or injuring them. Key points:

* Hold the cat securely but not too tightly, supporting their body and legs
* Lift from under the cat, avoiding picking them up by their neck or tail
* Keep your hands close to their body to prevent them from being injured or dropped
* Be mindful of their claws and avoid sudden movements
📄 Full Original Content: PICKING UP A CAT —o— 04_095300 ch01.qxp 10/29/07 3:41 PM Page 3
Summary Metadata: {'type': 'text', 'doc_id': 'a6f8099d-9843-416c-99f8-968ef7d5f937'}
Original Metadata: {'type': 'text', 'doc_id': 'a6f8099d-9843-416c-99f8-968ef7d5f937'}
----------------------------------------
🔎 Similarity Score: 0.8390
📝 Summary/Chunk: For emergency situations, pick up apprehensive 