In [1]:
# Install dependencies if needed (run once; %pip installs into the running kernel's environment)
# %pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images,
)


In [2]:
# Input PDF plus the output locations handed to the later cells
# (extracted figures and the persisted Chroma store).
pdf_file = './data/MediumExample_Ears_17Pgs.pdf'
image_output_dir = './figures/Ears'
chroma_persist_dir = './chroma/Ears/'

# Fail fast, before any expensive processing, if the inputs are missing.
# The data directory is checked first so its more specific message wins.
for _required_path, _failure_message in (
    ('./data', "Error: './data' directory not found."),
    (pdf_file, f"Error: PDF file not found at {pdf_file}."),
):
    assert os.path.exists(_required_path), _failure_message

In [3]:
# Parse the PDF into raw elements. image_output_dir is passed along to
# load_book (presumably the destination for extracted figures; confirm in
# textbook_loading).
# The status strings previously contained mis-decoded emoji bytes and typos;
# they are restored to the intended characters here.
print("📝 Unstructuring textbooks, filtering junk, semantic chunking...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
print("🎉 1.process_pdf_with_semantic_chunking complete.")


📝 Unstructuring textbooks, filtering junks, semanic chunking...


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


🎉 1.process_pdf_with_semantic_chunking complete.


In [4]:
# Split the raw PDF elements into one collection per element category.
# window_size and min_meaningful_text_length are tuning arguments of
# clean_and_categorize_elements; their exact meaning is defined in textbook_loading.
(
    texts,
    tables,
    images_raw,
    headers_raw,
    titles_raw,
    footers_raw,
    figure_captions_raw,
    list_items_raw,
) = clean_and_categorize_elements(
    raw_pdf_elements,
    window_size=2,
    min_meaningful_text_length=75,
)


In [5]:
# Produce summaries for the text, table and image elements.
# The five returned collections are all consumed by the storage cell that follows.
(
    text_summaries,
    table_summaries,
    image_paths,
    relevant_images_to_summarize,
    image_summaries,
) = summarize_elements(texts, tables, images_raw)

Texts and Tables Summary Done!
Checking image relevance with local textual context...
Skipping decorative image: ./figures/Ears/figure-1-1.jpg
Skipping decorative image: ./figures/Ears/figure-2-3.jpg
Skipping decorative image: ./figures/Ears/figure-3-5.jpg
Skipping decorative image: ./figures/Ears/figure-6-10.jpg
Skipping decorative image: ./figures/Ears/figure-8-12.jpg
Skipping decorative image: ./figures/Ears/figure-9-14.jpg
Skipping decorative image: ./figures/Ears/figure-10-15.jpg
Skipping decorative image: ./figures/Ears/figure-11-17.jpg
Skipping decorative image: ./figures/Ears/figure-12-18.jpg
Skipping decorative image: ./figures/Ears/figure-13-19.jpg
Skipping decorative image: ./figures/Ears/figure-14-20.jpg
Skipping decorative image: ./figures/Ears/figure-15-21.jpg
Skipping decorative image: ./figures/Ears/figure-16-22.jpg
Number of relevant images: 9
Generating LLM summaries for relevant images...


In [6]:
# Store the originals together with their summaries and keep the returned
# retriever, which the query cells further down call.
# Arguments are positional, so their order must match store_in_chromadb's signature.
retriever = store_in_chromadb(
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_paths,
    relevant_images_to_summarize,
    image_summaries,
    persist_directory=chroma_persist_dir,
)

In [None]:
# NOTE(review): this cell has no execution count in the saved notebook, so it
# was not run in the recorded session. Judging by its name, the helper removes
# image files that were not selected as relevant — confirm in textbook_loading
# before running, since that would be destructive.
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

In [8]:
# Audible "done" cue after the long-running ingestion cells.
# `afplay` and the /System/Library/Sounds file are macOS-specific, so the cue is
# skipped elsewhere instead of emitting a shell error. The command is passed as
# an argument list, so the path never goes through shell quoting.
import shutil
import subprocess

sound_file = "/System/Library/Sounds/Glass.aiff"
if shutil.which("afplay") and os.path.exists(sound_file):
    # check=False: a failed playback must never abort the notebook run.
    subprocess.run(["afplay", sound_file], check=False)
else:
    print("Skipping completion sound: 'afplay' or the sound file is unavailable.")

0

# Inspecting Retrieved Docs

In [27]:
# Example query against the retriever returned by store_in_chromadb.
# k=5 is forwarded to retrieve_multi_modal (presumably the number of hits to
# return — confirm in textbook_loading). The display cell below reads `results`.
query = "what do to if my cat got ear mite?"
results = retriever.retrieve_multi_modal(query, k=5)


In [28]:
from IPython.display import display, HTML
import html
import os

# Show what the retriever returned for `results`: image hits as a thumbnail
# strip, then a short preview of every text hit.

# 1. Display all images together as thumbnails.
# The paths are collected under a dedicated name: `image_paths` already holds
# the image list returned by summarize_elements and passed to store_in_chromadb,
# so rebinding it here would silently break a re-run of those cells.
# A dict serves as an insertion-ordered set: duplicates collapse while the
# thumbnails keep the order in which the results were returned.
retrieved_image_paths = {}
for res in results:
    if res["modality"] == "image":
        # For raw image hits the file path is carried in the "summary" field.
        candidate_path = res["summary"]
    elif res["modality"] == "image_summary":
        candidate_path = res["original_metadata"].get("image_path")
    else:
        continue
    # Guard against a missing path before touching the filesystem.
    if candidate_path and os.path.exists(candidate_path):
        retrieved_image_paths[candidate_path] = None

if retrieved_image_paths:
    # Escape each path so quotes or angle brackets in a file name cannot break
    # out of the src attribute.
    html_imgs = " ".join(
        f'<img src="{html.escape(img, quote=True)}" width="100" style="margin:2px; border:1px solid #ccc;">'
        for img in retrieved_image_paths
    )
    display(HTML(html_imgs))
else:
    print("No images found in results.")

# 2. Display original text for each text result.
PREVIEW_CHARS = 300  # maximum number of characters printed per text hit
print('-'*40, f"Retrieved Text Chunks (first {PREVIEW_CHARS} chars)", '-'*40)
for res in results:
    if res["modality"] != "text":
        continue
    doc_id = res["original_metadata"].get("doc_id")
    original_text = None
    if doc_id and hasattr(retriever, "docstore"):
        # NOTE(review): this reaches into the docstore's private `_collection`
        # attribute; it may break if the underlying store changes.
        doc = retriever.docstore._collection.get(ids=[doc_id], include=["documents"])
        if doc and doc.get("documents") and doc["documents"][0]:
            original_text = doc["documents"][0]
    if not original_text:
        # Fall back to the stored summary; tolerate a missing one.
        original_text = res["summary"] or ""
    text_display = original_text[:PREVIEW_CHARS] + ("..." if len(original_text) > PREVIEW_CHARS else "")
    print(text_display)
    print('-'*20)

---------------------------------------- Retrieved Text Chunks (first 300 chars) ----------------------------------------
Otitis Interna Otitis interna is an inner ear infection—often one that started out as a middle ear infection. Suspect otitis interna if your cat vomits, staggers, or falls toward the affected side, circles toward that side, or shows rhythmic jerking move- ments of her eyeballs. These are signs of ve...
--------------------
Structure of the Ears 206 • CAT OWNER’S HOME VETERINARY HANDBOOK Your cat has an ear problem if you notice ear scratching, repeated head shaking, a bad odor emanating from the ear, or large amounts of waxy dis- charge or pus draining. In a younger cat, the most likely cause is ear mites, but other d...
--------------------
BITES AND LACERATIONS Cats give and receive painful bites and scratches that are prone to severe infec- tion. The pinna is a frequent site for such injuries. Some occur during mating.
--------------------
EAR MITES Ear mit