In [1]:
# Install dependencies if needed
# !pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
import uuid
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [2]:
pdf_file = './data/shortExample1P_PickupCat.pdf'
image_output_dir = './figures'
chroma_persist_dir = './chroma/textbook_test_example1/'

# Make sure the data directory exists
assert os.path.exists('./data'), "Error: './data' directory not found."
assert os.path.exists(pdf_file), f"Error: PDF file not found at {pdf_file}."

In [3]:
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


✅ Found 55 raw elements.


In [4]:
print("🧹 Cleaning and categorizing elements...")
texts, tables, images_raw, _, _, _, _, _ = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

🧹 Cleaning and categorizing elements...
Categorized: 9 text chunks, 0 tables, 10 images.


In [5]:
print("📝 Summarizing text, tables, and relevant images...")
text_summaries, table_summaries, img_summaries, image_paths, relevant_images_to_summarize = summarize_elements(
    texts, tables, images_raw
)
print(f"📊 Summarized: {len(text_summaries)} text summaries, {len(table_summaries)} table summaries, {len(img_summaries)} image summaries.")

📝 Summarizing text, tables, and relevant images...
Checking image relevance with local textual context...
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-4-9.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-5-10.jpg
Number of relevant images for summarization: 8
📊 Summarized: 9 text summaries, 0 table summaries, 8 image summaries.


In [6]:
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
retriever = store_in_chromadb(
    text_summaries, texts, table_summaries, tables, img_summaries, image_paths, persist_directory=chroma_persist_dir
)
print("🎉 Data ingestion complete. Retriever initialized.")

💾 Storing data into ChromaDB at ./chroma/textbook_test_example1/...
Using ChromaDB for summaries at: ./chroma/textbook_test_example1/ (collection: summaries)
Using ChromaDB for original texts at: ./chroma/textbook_test_example1/ (collection: original_texts)
Using ChromaDB for original images at: ./chroma/textbook_test_example1/ (collection: original_images)
🎉 Data ingestion complete. Retriever initialized.


In [7]:
# Check total documents in the vectorstore (summaries)
vectorstore_ids = retriever.vectorstore.get(include=[])['ids']
print(f"Total documents in vectorstore (summaries): {len(vectorstore_ids)}")

# Check total documents in the docstore (original content)
docstore_keys = retriever.docstore.mget(vectorstore_ids)
print(f"Total documents in docstore (original content): {len(docstore_keys)}")

if len(vectorstore_ids) > 0:
    print("Sample of docstore original content (first 1 document if available):")
    sample_doc_ids = vectorstore_ids[:1]
    retrieved_docs = retriever.docstore.mget(sample_doc_ids)
    for i, doc_content in enumerate(retrieved_docs):
        if doc_content:
            print(f"--- Document {i+1} Content (from docstore) ---")
            print(str(doc_content)[:500] + "...")
        else:
            print(f"Could not retrieve content for doc ID: {sample_doc_ids[i]}")
else:
    print("No documents to sample from.")
print("--- End of Inspection ---\n")

Total documents in vectorstore (summaries): 17
Total documents in docstore (original content): 17
Sample of docstore original content (first 1 document if available):
Could not retrieve content for doc ID: b9de49ce-0f24-481d-965b-d396391e48ea
--- End of Inspection ---



In [8]:
print("🗑️ Deleting irrelevant images...")
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

🗑️ Deleting irrelevant images...
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-4-9.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-5-10.jpg
Finished deleting images. Total deleted: 2


In [9]:
query = "Can you show me how to pick up a cat? Include images"
print('-'*40, "Here are the retrieved original_docs with similarity scores",'-'*40)

# Retrieve documents and similarity scores using the underlying vectorstore
results = retriever.vectorstore.similarity_search_with_score(query, k=8)

print(f"\nThe query: '{query}':")
print(f"\nIn total there are {len(results)} docs retrieved.")
for doc, score in results:
    print('-' * 40)
    print(f"🔎 Similarity Score: {score:.4f}")
    if hasattr(doc, "page_content") and doc.page_content:
        print("📝 Summary/Chunk:", doc.page_content)
        if isinstance(doc.page_content, str) and doc.page_content.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            print("🖼️ This doc is likely an image. Image path:", doc.page_content)
    else:
        print("❓ No page_content in this doc. Full doc object:", doc)
    doc_id = doc.metadata.get('doc_id') if hasattr(doc, "metadata") else None
    if doc_id:
        original = retriever.docstore.mget([doc_id])[0]
        print("📄 Full Original Content:", original)
        if isinstance(original, str) and original.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            print("🖼️ Original content is an image. Image path:", original)
    else:
        print("🚫 No doc_id found in metadata for this doc.")

---------------------------------------- Here are the retrieved original_docs with similarity scores ----------------------------------------

The query: 'Can you show me how to pick up a cat? Include images':

In total there are 8 docs retrieved.
----------------------------------------
🔎 Similarity Score: 0.7567
📝 Summary/Chunk: To pick up a cat safely:

* Reach down from above to avoid confrontation
* Place one hand under the chest and take hold of front legs, ensuring a secure grip
* Support hind legs if needed
* Snuggle close to body, cradling chin for comfort
📄 Full Original Content: page_content='PICKING UP A CAT As a general rule, it is advisable to reach down and pick up a cat from above. A face-to-face confrontation might provoke the cat into becoming uncooper- ative or aggressive. Cooperative cats can be picked up by placing one hand around the cat beneath the chest and taking hold of the cat’s front legs so they cross over each other, keeping your index finger between them 

In [10]:
import os
os.system('afplay /System/Library/Sounds/Glass.aiff')

0