In [1]:
# Install dependencies if needed
# !pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [2]:
pdf_file = './data/shortExample_PARASITES.pdf'
image_output_dir = './figures_Parasites'
chroma_persist_dir = './chroma/textbook_test_parasites/'

# Make sure the data directory exists
assert os.path.exists('./data'), "Error: './data' directory not found."
assert os.path.exists(pdf_file), f"Error: PDF file not found at {pdf_file}."

In [3]:
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


✅ Found 193 raw elements.


In [4]:
print("🧹 Cleaning and categorizing elements...")
texts, tables, images_raw, _, _, _, _, _ = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

🧹 Cleaning and categorizing elements...
Categorized: 40 text chunks, 1 tables, 18 images.


In [5]:
print("📝 Summarizing text, tables, and relevant images...")
text_summaries, table_summaries, img_summaries, image_paths, relevant_images_to_summarize = summarize_elements(
    texts, tables, images_raw
)
print(f"📊 Summarized: {len(text_summaries)} text summaries, {len(table_summaries)} table summaries, {len(img_summaries)} image summaries.")

📝 Summarizing text, tables, and relevant images...
Checking image relevance with local textual context...
Skipping irrelevant image: ./figures_Parasites/figure-1-1.jpg
Skipping irrelevant image: ./figures_Parasites/figure-2-3.jpg
Skipping irrelevant image: ./figures_Parasites/figure-3-4.jpg
Skipping irrelevant image: ./figures_Parasites/figure-4-5.jpg
Skipping irrelevant image: ./figures_Parasites/figure-6-7.jpg
Skipping irrelevant image: ./figures_Parasites/figure-7-8.jpg
Skipping irrelevant image: ./figures_Parasites/figure-8-10.jpg
Skipping irrelevant image: ./figures_Parasites/figure-10-14.jpg
Skipping irrelevant image: ./figures_Parasites/figure-11-15.jpg
Skipping irrelevant image: ./figures_Parasites/figure-12-16.jpg
Skipping irrelevant image: ./figures_Parasites/figure-13-17.jpg
Skipping irrelevant image: ./figures_Parasites/figure-14-18.jpg
Number of relevant images for summarization: 6
📊 Summarized: 40 text summaries, 1 table summaries, 6 image summaries.


In [6]:
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
retriever = store_in_chromadb(
    text_summaries, texts, table_summaries, tables, img_summaries, image_paths, persist_directory=chroma_persist_dir
)
print("🎉 Data ingestion complete. Retriever initialized.")

💾 Storing data into ChromaDB at ./chroma/textbook_test_parasites/...
Using ChromaDB for summaries at: ./chroma/textbook_test_parasites/ (collection: summaries)
Using ChromaDB for original texts at: ./chroma/textbook_test_parasites/ (collection: original_texts)
Using ChromaDB for original images at: ./chroma/textbook_test_parasites/ (collection: original_images)
🎉 Data ingestion complete. Retriever initialized.


In [7]:
# Check total documents in the vectorstore (summaries)
vectorstore_ids = retriever.vectorstore.get(include=[])['ids']
print(f"Total documents in vectorstore (summaries): {len(vectorstore_ids)}")

# Check total documents in the docstore (original content)
docstore_keys = retriever.docstore.mget(vectorstore_ids)
print(f"Total documents in docstore (original content): {len(docstore_keys)}")

if len(vectorstore_ids) > 0:
    print("Sample of docstore original content (first 1 document if available):")
    sample_doc_ids = vectorstore_ids[:1]
    retrieved_docs = retriever.docstore.mget(sample_doc_ids)
    for i, doc_content in enumerate(retrieved_docs):
        if doc_content:
            print(f"--- Document {i+1} Content (from docstore) ---")
            print(str(doc_content)[:500] + "...")
        else:
            print(f"Could not retrieve content for doc ID: {sample_doc_ids[i]}")
else:
    print("No documents to sample from.")
print("--- End of Inspection ---\n")

Total documents in vectorstore (summaries): 47
Total documents in docstore (original content): 47
Sample of docstore original content (first 1 document if available):
Could not retrieve content for doc ID: 4bb742e9-889a-4a86-b4ce-f57f1a17e1bd
--- End of Inspection ---



In [8]:
print("🗑️ Deleting irrelevant images...")
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

🗑️ Deleting irrelevant images...
Successfully deleted irrelevant image: ./figures_Parasites/figure-1-1.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-2-3.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-3-4.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-4-5.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-6-7.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-7-8.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-8-10.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-10-14.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-11-15.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-12-16.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-13-17.jpg
Successfully deleted irrelevant image: ./figures_Parasites/figure-14-18.jpg
Finished deleting images. Total deleted: 12


# Inspecting Retrieved Docs

In [9]:
query = "What kinds of parasites or worm does cat often have? What does they look like?"

In [10]:
print('-'*40, "Here are the retrieved original_docs with similarity scores",'-'*40)

# Retrieve documents and similarity scores using the underlying vectorstore
results = retriever.vectorstore.similarity_search_with_score(query, k=10)

print(f"\nThe query: '{query}':")
print(f"\nIn total there are {len(results)} docs retrieved.")
for doc, score in results:
    print('-' * 40)
    print(f"🔎 Similarity Score: {score:.4f}")
    if hasattr(doc, "page_content") and doc.page_content:
        print("📝 Summary/Chunk:", doc.page_content)
        if isinstance(doc.page_content, str) and doc.page_content.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            print("🖼️ This doc is likely an image. Image path:", doc.page_content)
    else:
        print("❓ No page_content in this doc. Full doc object:", doc)
    doc_id = doc.metadata.get('doc_id') if hasattr(doc, "metadata") else None
    if doc_id:
        original = retriever.docstore.mget([doc_id])[0]
        print("📄 Full Original Content:", original)
        if isinstance(original, str) and original.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            print("🖼️ Original content is an image. Image path:", original)
    else:
        print("🚫 No doc_id found in metadata for this doc.")

---------------------------------------- Here are the retrieved original_docs with similarity scores ----------------------------------------

The query: 'What kinds of parasites or worm does cat often have? What does they look like?':

In total there are 10 docs retrieved.
----------------------------------------
🔎 Similarity Score: 0.5322
📝 Summary/Chunk: Here is a concise summary:

Whipworms are long, slender parasites (2-3 inches) that live in a cat's large intestine. They typically don't cause disease in cats and can be treated with no intervention.
📄 Full Original Content: page_content='WHIPWORMS These are slender parasites, 2 to 3 inches (50 to 76 mm) long that live in the cecum (the first part of the large intestine). Since they are thicker at one end, they have the appearance of a whip. Whipworms are usually found inciden- tally and are not known to cause disease in cats. Treatment: No treatment is necessary.' metadata={'doc_id': '92993530-90ac-49ff-b5f5-2e55d4f82d87'}
-------

# Let's Generate A Test Query

In [11]:
# Retrieve documents and similarity scores using the underlying vectorstore
results = retriever.vectorstore.similarity_search_with_score(query, k=8)

def build_context(docs):
    context = ""
    for doc in docs:
        # If the doc is a Document object with an image path in page_content or metadata
        if hasattr(doc, "page_content") and isinstance(doc.page_content, str) and doc.page_content.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            context += f"![Image]({doc.page_content})\n"
        elif hasattr(doc, "metadata") and "image_path" in doc.metadata:
            context += f"![Image]({doc.metadata['image_path']})\n"
        elif hasattr(doc, "page_content"):
            context += doc.page_content + "\n"
        else:
            context += str(doc) + "\n"
    return context

# Extract original docs from results
original_docs = []
for doc, score in results:
    doc_id = doc.metadata.get('doc_id') if hasattr(doc, "metadata") else None
    if doc_id:
        original = retriever.docstore.mget([doc_id])[0]
        if original:
            original_docs.append(original)

context = build_context(original_docs)

In [12]:
import ollama
from IPython.display import display, Markdown

def ask_ollama(context, user_query, model="llama3.2:latest"):
    prompt = f"""You are a helpful assistant for users. Using only the text, tables, and images provided in the context below, answer the user's question in markdown.
        If you include images, use only those explicitly provided in the context, and include them using markdown image syntax with the correct path from the context. Do not use or invent any images that are not present in the context.

        Context:
        {context}

        User question: {user_query}

        Answer in markdown:"""
    response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
    return response['message']['content']

def render_markdown_with_images(markdown_text):
    display(Markdown(markdown_text))

# Use the context and user query
markdown_answer = ask_ollama(context, query)
render_markdown_with_images(markdown_answer)

* Whipworms: slender parasites 2 to 3 inches (50 to 76 mm) long that live in the cecum, appear like a whip due to being thicker at one end.
* Ascarids (Roundworms): most common worm parasite in cats, can grow up to 5 inches (13 cm) long, eggs are protected by a hard shell.
* Tapeworms: internal parasite that live in the small intestines, vary in length from less than 1 inch (25 mm) to several feet, scolex (head) fastens itself to intestinal wall.

In [13]:
import os

# Play the system sound (replace with any .wav or .mp3 file you have)
os.system('afplay /System/Library/Sounds/Glass.aiff')

0