In [1]:
# Install dependencies if needed (use %pip so the packages land in this kernel's environment)
# %pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [4]:
# Configuration: the input PDF and the locations used for extracted figures
# and for the persisted Chroma store.
pdf_file = './data/shortExample2_Nutrition_20Pgs.pdf'
image_output_dir = './figures/figures_Nutrition'
chroma_persist_dir = './chroma/textbook_test_Nutrition/'

# Derive the data directory from pdf_file so the check below stays in sync
# when the input path is changed ('.' covers a bare file name).
data_dir = os.path.dirname(pdf_file) or '.'

# Fail fast, before any processing starts, if the input is missing.
# isdir/isfile (rather than exists) also reject a path of the wrong kind,
# e.g. a directory that happens to be named like the PDF.
assert os.path.isdir(data_dir), f"Error: '{data_dir}' directory not found."
assert os.path.isfile(pdf_file), f"Error: PDF file not found at {pdf_file}."

In [5]:
# Partition the PDF into raw elements. load_book is given the PDF path and the
# figure output directory; presumably extracted figures are written there
# (later cell output lists figure-*.jpg files under it) - confirm in textbook_loading.
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
# Report how many elements came back as a quick sanity check.
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


✅ Found 316 raw elements.


In [7]:
print("🧹 Cleaning and categorizing elements...")
# Only the first three return values (text chunks, tables, raw images) are
# used by the cells that follow. The remaining values are gathered into a
# named throwaway list instead of `_`, because `_` is IPython's "last output"
# variable and should not be rebound inside a notebook.
texts, tables, images_raw, *_unused_categories = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

🧹 Cleaning and categorizing elements...
Categorized: 53 text chunks, 4 tables, 21 images.


In [8]:
print("📝 Summarizing text, tables, and relevant images...")
# summarize_elements yields five values: the three summary collections, the
# image paths, and the images that were kept for summarization (the latter is
# consumed by the clean-up cell further down).
(
    text_summaries,
    table_summaries,
    img_summaries,
    image_paths,
    relevant_images_to_summarize,
) = summarize_elements(texts, tables, images_raw)
# One-line recap of how many summaries were produced per category.
print(
    f"📊 Summarized: {len(text_summaries)} text summaries, "
    f"{len(table_summaries)} table summaries, "
    f"{len(img_summaries)} image summaries."
)

📝 Summarizing text, tables, and relevant images...
Checking image relevance with local textual context...
Skipping irrelevant image: ./figures/figures_Nutrition/figure-1-1.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-4-2.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-5-3.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-6-4.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-7-5.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-8-6.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-9-7.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-10-8.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-11-9.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-12-10.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-13-11.jpg
Skipping irrelevant image: ./figures/figures_Nutrition/figure-14-13.jpg
Skipping irrelevant image: ./figures/figures_N

In [9]:
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
# Per the original author's note, store_in_chromadb returns a UnifiedRetriever
# instance whose constructor is __init__(self, vectorstore, docstore, id_key="doc_id").
# The positional arguments presumably pair each summary collection with its
# source items (text, table, image) - confirm against textbook_loading.
retriever = store_in_chromadb(
    text_summaries,
    texts,
    table_summaries,
    tables,
    img_summaries,
    image_paths,
    persist_directory=chroma_persist_dir,
)
print("🎉 Data ingestion complete. Retriever initialized.")

💾 Storing data into ChromaDB at ./chroma/textbook_test_Nutrition/...
🎉 Data ingestion complete. Retriever initialized.


In [10]:
# Clean up extracted figures that were not kept for summarization. The call
# receives all raw images plus the subset that was summarized; presumably the
# difference is what gets removed - confirm in textbook_loading.
# NOTE(review): the recorded output shows files being deleted from disk, so
# re-running earlier cells afterwards may depend on load_book regenerating
# the figures - confirm before relying on a partial re-run.
print("🗑️ Deleting irrelevant images...")
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

🗑️ Deleting irrelevant images...
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-1-1.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-4-2.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-5-3.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-6-4.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-7-5.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-8-6.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-9-7.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-10-8.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-11-9.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-12-10.jpg
Successfully deleted irrelevant image: ./figures/figures_Nutrition/figure-13-11.jpg
Successfully deleted irrelevant image: ./figures/figures_Nu

# Inspecting Retrieved Docs

In [13]:
# Sample question used to exercise the retriever built earlier in the notebook.
query = "What would be the ideal weight of a 1 year old and a 4 years old neutered cat?"

In [14]:
# Visual divider reused for the banner and between individual results.
separator = '-' * 40
print(separator, "Here are the retrieved original_docs with similarity scores", separator)

# Ask the retriever for up to 10 matches for the query.
results = retriever.retrieve(query, k=10)
for res in results:
    print(separator)
    # The score is shown with four decimal places.
    print(f"🔎 Similarity Score: {res['score']:.4f}")
    # Remaining fields are printed verbatim, in a fixed order, one per line.
    for label, field in (
        ("📝 Summary/Chunk:", "summary"),
        ("📄 Full Original Content:", "original"),
        ("Summary Metadata:", "summary_metadata"),
        ("Original Metadata:", "original_metadata"),
    ):
        print(label, res[field])

---------------------------------------- Here are the retrieved original_docs with similarity scores ----------------------------------------
----------------------------------------
🔎 Similarity Score: 0.6388
📝 Summary/Chunk: For geriatric cats, adjust their diet to prevent obesity:

* Reduce calories by up to 30%
* Feed canned food in smaller portions (divide daily ration into 2-3 parts) and warm it slightly before feeding
* Consider 3-4 meals a day for underweight cats.
📄 Full Original Content: FEEDING GERIATRIC CATS Preventing obesity is the single most important thing you can do to prolong the life of an older cat. Geriatric cats are less active and may require up to 30 percent fewer calories than do younger cats. If the cat’s diet is not adjusted accord- ingly, overfeeding will result in weight gain. If you are feeding canned cat food, divide the daily ration into two or three equal parts and feed them at regular intervals throughout the day. Although canned foods need to be stor

# Let's Generate A Test Query