In [1]:
# Install dependencies if needed
# !pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [2]:
pdf_file = './data/shortExample2_Nutrition_20Pgs.pdf'
image_output_dir = './figures'
chroma_persist_dir = './chroma/textbook_test_Nutrition/'

# Make sure the data directory exists
assert os.path.exists('./data'), "Error: './data' directory not found."
assert os.path.exists(pdf_file), f"Error: PDF file not found at {pdf_file}."

In [3]:
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


✅ Found 316 raw elements.


In [4]:
print("🧹 Cleaning and categorizing elements...")
texts, tables, images_raw, _, _, _, _, _ = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

🧹 Cleaning and categorizing elements...
Categorized: 53 text chunks, 4 tables, 21 images.


In [5]:
print("📝 Summarizing text, tables, and relevant images...")
text_summaries, table_summaries, img_summaries, image_paths, relevant_images_to_summarize = summarize_elements(
    texts, tables, images_raw
)
print(f"📊 Summarized: {len(text_summaries)} text summaries, {len(table_summaries)} table summaries, {len(img_summaries)} image summaries.")

📝 Summarizing text, tables, and relevant images...
Checking image relevance with local textual context...
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-1.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-4-2.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-5-3.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-6-4.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-7-5.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-8-6.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-9-7.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-10-8.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-11-9.jpg
Skipping irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-12-10.jpg
Skipping irrelevant 

In [6]:
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
retriever = store_in_chromadb(
    text_summaries, texts, table_summaries, tables, img_summaries, image_paths, persist_directory=chroma_persist_dir
)
print("🎉 Data ingestion complete. Retriever initialized.")

💾 Storing data into ChromaDB at ./chroma/textbook_test_Nutrition/...
Using ChromaDB for summaries at: ./chroma/textbook_test_Nutrition/ (collection: summaries)
Using ChromaDB for original texts at: ./chroma/textbook_test_Nutrition/ (collection: original_texts)
Using ChromaDB for original images at: ./chroma/textbook_test_Nutrition/ (collection: original_images)
🎉 Data ingestion complete. Retriever initialized.


In [7]:
# Check total documents in the vectorstore (summaries)
vectorstore_ids = retriever.vectorstore.get(include=[])['ids']
print(f"Total documents in vectorstore (summaries): {len(vectorstore_ids)}")

# Check total documents in the docstore (original content)
docstore_keys = retriever.docstore.mget(vectorstore_ids)
print(f"Total documents in docstore (original content): {len(docstore_keys)}")

if len(vectorstore_ids) > 0:
    print("Sample of docstore original content (first 1 document if available):")
    sample_doc_ids = vectorstore_ids[:1]
    retrieved_docs = retriever.docstore.mget(sample_doc_ids)
    for i, doc_content in enumerate(retrieved_docs):
        if doc_content:
            print(f"--- Document {i+1} Content (from docstore) ---")
            print(str(doc_content)[:500] + "...")
        else:
            print(f"Could not retrieve content for doc ID: {sample_doc_ids[i]}")
else:
    print("No documents to sample from.")
print("--- End of Inspection ---\n")

Total documents in vectorstore (summaries): 61
Total documents in docstore (original content): 61
Sample of docstore original content (first 1 document if available):
Could not retrieve content for doc ID: d6814a70-870d-40c0-ab58-b3b5f8126b2c
--- End of Inspection ---



In [8]:
print("🗑️ Deleting irrelevant images...")
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

🗑️ Deleting irrelevant images...
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-1.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-4-2.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-5-3.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-6-4.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-7-5.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-8-6.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-9-7.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-10-8.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-11-9.jpg
Successfully deleted irrelevant image: /Users/mas/Desktop/LLM_Veterinary_A

# Inspecting Retrieved Docs

In [9]:
query = "My cat's ear has dirty and greasy thing, what should I do?"
print('-'*40, "Here are the retrieved original_docs with similarity scores",'-'*40)

# Retrieve documents and similarity scores using the underlying vectorstore
results = retriever.vectorstore.similarity_search_with_score(query, k=8)

print(f"\nThe query: '{query}':")
print(f"\nIn total there are {len(results)} docs retrieved.")
for doc, score in results:
    print('-' * 40)
    print(f"🔎 Similarity Score: {score:.4f}")
    if hasattr(doc, "page_content") and doc.page_content:
        print("📝 Summary/Chunk:", doc.page_content)
        if isinstance(doc.page_content, str) and doc.page_content.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            print("🖼️ This doc is likely an image. Image path:", doc.page_content)
    else:
        print("❓ No page_content in this doc. Full doc object:", doc)
    doc_id = doc.metadata.get('doc_id') if hasattr(doc, "metadata") else None
    if doc_id:
        original = retriever.docstore.mget([doc_id])[0]
        print("📄 Full Original Content:", original)
        if isinstance(original, str) and original.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            print("🖼️ Original content is an image. Image path:", original)
    else:
        print("🚫 No doc_id found in metadata for this doc.")

---------------------------------------- Here are the retrieved original_docs with similarity scores ----------------------------------------

The query: 'My cat's ear has dirty and greasy thing, what should I do?':

In total there are 8 docs retrieved.
----------------------------------------
🔎 Similarity Score: 0.8479
📝 Summary/Chunk: Cats may need to change their diet due to health issues, which can be done to help them adjust to the new food more easily.
📄 Full Original Content: page_content='Switching Diets It may become necessary to adjust a cat’s diet and switch to a new food because of a health problem. This is yet another reason to get your cat accustomed to —o—' metadata={'doc_id': '42bfdbea-b1a6-4ea3-aead-0e5c7ce6218b'}
----------------------------------------
🔎 Similarity Score: 0.8487
📝 Summary/Chunk: Here's a concise summary:

Cats have limited taste buds (9,000) and can only detect basic tastes of sour, bitter, and salty. However, their sense of smell is superior and hig

# Let's Generate A Test Query

In [10]:
query = "What would be a health weight for a 4 years old neutered cat?"
# Retrieve documents and similarity scores using the underlying vectorstore
results = retriever.vectorstore.similarity_search_with_score(query, k=8)

def build_context(docs):
    context = ""
    for doc in docs:
        # If the doc is a Document object with an image path in page_content or metadata
        if hasattr(doc, "page_content") and isinstance(doc.page_content, str) and doc.page_content.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            context += f"![Image]({doc.page_content})\n"
        elif hasattr(doc, "metadata") and "image_path" in doc.metadata:
            context += f"![Image]({doc.metadata['image_path']})\n"
        elif hasattr(doc, "page_content"):
            context += doc.page_content + "\n"
        else:
            context += str(doc) + "\n"
    return context

# Extract original docs from results
original_docs = []
for doc, score in results:
    doc_id = doc.metadata.get('doc_id') if hasattr(doc, "metadata") else None
    if doc_id:
        original = retriever.docstore.mget([doc_id])[0]
        if original:
            original_docs.append(original)

context = build_context(original_docs)

In [11]:
import ollama
from IPython.display import display, Markdown

def ask_ollama(context, user_query, model="llama3.2:latest"):
    prompt = f"""You are a helpful assistant. Using the following context, answer the user's question in markdown. 
        If there are images, include them using markdown image syntax with the correct path.

        Context:
        {context}

        User question: {user_query}

        Answer in markdown:"""
    response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
    return response['message']['content']

def render_markdown_with_images(markdown_text):
    display(Markdown(markdown_text))

# Use the context and user query
markdown_answer = ask_ollama(context, query)
render_markdown_with_images(markdown_answer)

### Health Weight for a 4-Year-Old Neutered Cat

The ideal weight for a 4-year-old neutered cat can vary depending on several factors, including breed, size, and body condition. However, according to general guidelines, a healthy adult cat typically weighs between 8-12 pounds (3.6-5.4 kg) for males and 6-9 pounds (2.7-4 kg) for females.

For a neutered cat, it's essential to maintain a lean and muscular build to prevent obesity-related health issues. A general rule of thumb is to weigh your cat about 10-15% below their ideal weight.

### Example Weights for a 4-Year-Old Neutered Cat

* Male: 8.5-11 pounds (3.9-5 kg)
* Female: 6.5-8.5 pounds (2.9-3.8 kg)

Keep in mind that these are general guidelines, and the ideal weight for your specific cat may vary. It's always best to consult with your veterinarian to determine a healthy weight range for your feline friend.