In [12]:
# Install dependencies if needed (%pip installs into the running kernel's environment)
# %pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [13]:
# Input PDF plus the two output locations used by later cells.
pdf_file = './data/first_n_second_Chapter_EmergencyInfectiveDisease.pdf'
image_output_dir = './figures/Emergency_and_InfectiveDisease'  # handed to load_book
chroma_persist_dir = './chroma/Emergency_and_InfectiveDisease/'  # handed to store_in_chromadb as persist_directory

# Fail fast on missing inputs. isdir/isfile (rather than exists) also reject a
# path of the wrong kind, e.g. a directory sitting where the PDF should be.
assert os.path.isdir('./data'), "Error: './data' directory not found."
assert os.path.isfile(pdf_file), f"Error: PDF file not found at {pdf_file}."

In [14]:
# Partition the PDF into raw elements; load_book receives both the PDF path
# and the image output directory configured earlier.
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(
    pdf_file,
    image_output_dir,
)
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...
✅ Found 1555 raw elements.


In [16]:
print("🧹 Cleaning and categorizing elements...")
# The call is unpacked into eight targets; only the first three are kept,
# the remaining five are discarded.
(
    texts,
    tables,
    images_raw,
    _, _, _, _, _,
) = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

🧹 Cleaning and categorizing elements...
Categorized: 301 text chunks, 11 tables, 140 images.


In [17]:
print("📝 Summarizing text, tables, and Checking relevant images...")
# Four results come back: text summaries, table summaries, image paths, and
# the images judged relevant (the last is reused by the image clean-up cell).
(
    text_summaries,
    table_summaries,
    image_paths,
    relevant_images_to_summarize,
) = summarize_elements(texts, tables, images_raw)
print(f"📊 Summarized: {len(text_summaries)} text summaries, {len(table_summaries)} table summaries")

📝 Summarizing text, tables, and Checking relevant images...
Texts and Tables Summary Done!
Checking image relevance with local textual context...
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-1-1.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-3-6.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-8-14.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-9-18.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-10-19.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-13-24.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-14-25.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-15-26.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-16-28.jpg
Skipping irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-17-30.jpg
Skipping i

In [18]:
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
# store_in_chromadb returns a UnifiedRetriever, whose constructor is
#   def __init__(self, vectorstore, docstore, id_key="doc_id")
retriever = store_in_chromadb(
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_paths,
    persist_directory=chroma_persist_dir,
)
print("🎉 Data ingestion complete. Retriever initialized.")

💾 Storing data into ChromaDB at ./chroma/Emergency_and_InfectiveDisease/...
🎉 Data ingestion complete. Retriever initialized.


In [19]:
print("🗑️ Deleting irrelevant images...")
# Receives every extracted image plus the relevant ones from the summarize
# step; the recorded output shows one "Successfully deleted" line per file.
# NOTE(review): presumably removes images_raw entries that are not in
# relevant_images_to_summarize - confirm against textbook_loading.
delete_irrelevant_images(
    images_raw,
    relevant_images_to_summarize,
)

🗑️ Deleting irrelevant images...
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-1-1.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-3-6.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-8-14.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-9-18.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-10-19.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-13-24.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-14-25.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-15-26.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-16-28.jpg
Successfully deleted irrelevant image: ./figures/Emergency_and_InfectiveDisease/figure-17-30.jpg
Suc

# Inspecting Retrieved Docs

In [20]:
# Sample question passed to retriever.retrieve in the next cell.
query: str = "How to properly pick up a cat?"

In [21]:
# Shared horizontal rule for the banner and the per-result separators.
rule = '-' * 40
print(rule, "Here are the retrieved original_docs with similarity scores", rule)

# Fetch up to 10 hits for the sample query.
results = retriever.retrieve(query, k=10)

# (label, result key) pairs, printed in this order for every hit.
labelled_fields = (
    ("📝 Summary/Chunk:", "summary"),
    ("📄 Full Original Content:", "original"),
    ("Summary Metadata:", "summary_metadata"),
    ("Original Metadata:", "original_metadata"),
)

for res in results:
    print(rule)
    print(f"🔎 Similarity Score: {res['score']:.4f}")
    for label, key in labelled_fields:
        print(label, res[key])

---------------------------------------- Here are the retrieved original_docs with similarity scores ----------------------------------------
----------------------------------------
🔎 Similarity Score: 0.7005
📝 Summary/Chunk: Picking up a cat requires care to avoid injuring them. 

- Hold the cat securely but gently around the middle.
- Keep your arms under the cat's front legs and hold below their back leg to minimize strain on their joints.
- Support the cat's body and lift them off the ground, lifting from beneath rather than above.
- Avoid picking up kittens or pregnant cats as they are more fragile.
- Lift slowly and carefully to prevent sudden movements.
📄 Full Original Content: PICKING UP A CAT —o— 04_095300 ch01.qxp 10/29/07 3:41 PM Page 3
Summary Metadata: {'type': 'text', 'doc_id': '9e3619bc-10ed-44c2-b925-e2853ee00503'}
Original Metadata: {'type': 'text', 'doc_id': '9e3619bc-10ed-44c2-b925-e2853ee00503'}
----------------------------------------
🔎 Similarity Score: 0.8617
📝 