In [1]:
# Install dependencies if needed (use %pip so packages are installed into the running kernel's environment)
# %pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [2]:
# Run configuration: the source PDF plus the two output locations that are
# passed to the loading and storage steps further down the notebook.
pdf_file = './data/shortExample_PARASITES.pdf'
image_output_dir = './figures/figures_Parasites'
chroma_persist_dir = './chroma/textbook_test_Parasites/'

# Fail fast before any expensive work starts. isdir/isfile are used instead of
# exists so that a regular file named "data", or a directory sitting at the
# PDF path, is rejected with the matching message rather than slipping through.
assert os.path.isdir('./data'), "Error: './data' directory not found."
assert os.path.isfile(pdf_file), f"Error: PDF file not found at {pdf_file}."

In [3]:
# Step 1: parse the PDF. load_book receives the PDF path and the image output
# directory configured above; its result supports len() and is handed to the
# categorization step in the next cell.
# NOTE(review): presumably extracted figures are written to image_output_dir —
# confirm in textbook_loading.load_book.
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


✅ Found 193 raw elements.


In [4]:
# Step 2: split the raw elements into categories. Only the first three
# returned values (text chunks, tables, raw images) are used by this notebook.
print("🧹 Cleaning and categorizing elements...")
# The remaining return values are collected into a named throwaway list rather
# than unpacked into `_`: rebinding `_` in a notebook interferes with IPython's
# last-output variable, and a starred target does not hard-code how many extra
# values the helper returns.
texts, tables, images_raw, *_unused_categories = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

🧹 Cleaning and categorizing elements...
Categorized: 40 text chunks, 1 tables, 18 images.


In [5]:
# Step 3: summarize every category. Besides the three summary collections,
# summarize_elements also returns the image paths and the subset of images it
# kept for summarization; both are needed by later cells (storage and cleanup).
print("📝 Summarizing text, tables, and relevant images...")
(
    text_summaries,
    table_summaries,
    img_summaries,
    image_paths,
    relevant_images_to_summarize,
) = summarize_elements(texts, tables, images_raw)
print(
    f"📊 Summarized: {len(text_summaries)} text summaries, "
    f"{len(table_summaries)} table summaries, "
    f"{len(img_summaries)} image summaries."
)

📝 Summarizing text, tables, and relevant images...
Checking image relevance with local textual context...
Skipping irrelevant image: ./figures/figures_Parasites/figure-1-1.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-2-3.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-3-4.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-4-5.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-6-7.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-7-8.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-8-10.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-10-14.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-11-15.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-12-16.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-13-17.jpg
Skipping irrelevant image: ./figures/figures_Parasites/figure-14-18.jpg
Number of relevant images for summarization

In [6]:
# Step 4: persist summaries together with their original content.
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
# store_in_chromadb hands back a UnifiedRetriever instance; its constructor is
# __init__(self, vectorstore, docstore, id_key="doc_id").
retriever = store_in_chromadb(
    text_summaries,
    texts,
    table_summaries,
    tables,
    img_summaries,
    image_paths,
    persist_directory=chroma_persist_dir,
)
print("🎉 Data ingestion complete. Retriever initialized.")

💾 Storing data into ChromaDB at ./chroma/textbook_test_Parasites/...
🎉 Data ingestion complete. Retriever initialized.


In [7]:
# Step 5: clean up extracted figures. The helper is given every raw image and
# the subset that was kept for summarization.
# NOTE(review): judging by the recorded output this deletes files from disk, so
# re-running earlier cells afterwards may reference images that no longer
# exist — confirm against textbook_loading.delete_irrelevant_images.
print("🗑️ Deleting irrelevant images...")
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

🗑️ Deleting irrelevant images...
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-1-1.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-2-3.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-3-4.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-4-5.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-6-7.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-7-8.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-8-10.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-10-14.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-11-15.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-12-16.jpg
Successfully deleted irrelevant image: ./figures/figures_Parasites/figure-13-17.jpg
Successfully deleted irrelevant image: ./figures/figures

# Inspecting Retrieved Docs

In [14]:
# Sample question used to inspect what the retriever returns in the next cell.
query = "What is dipylidium caninum? What's the life cycle of dipylidium caninum?"

In [15]:
# Dump the top matches for `query`, one block per hit, so that the stored
# summary can be compared with the original content it points to.
divider = '-' * 40
print(divider, "Here are the retrieved original_docs with similarity scores", divider)

# Printed label -> key of the hit dictionary, in display order.
hit_fields = (
    ("📝 Summary/Chunk:", "summary"),
    ("📄 Full Original Content:", "original"),
    ("Summary Metadata:", "summary_metadata"),
    ("Original Metadata:", "original_metadata"),
)

results = retriever.retrieve(query, k=10)
for res in results:
    print(divider)
    # NOTE(review): the recorded output shows a score above 1, so this value
    # may be a distance rather than a normalized similarity — confirm in
    # UnifiedRetriever.retrieve.
    print(f"🔎 Similarity Score: {res['score']:.4f}")
    for field_label, field_key in hit_fields:
        print(field_label, res[field_key])

---------------------------------------- Here are the retrieved original_docs with similarity scores ----------------------------------------
----------------------------------------
🔎 Similarity Score: 1.1406
📝 Summary/Chunk: The image illustrates a detailed depiction of the life cycle of Dipylidium caninum, also known as canine hair mites. As part of a veterinary handbook for pet owners, it serves to educate on parasitic infections affecting domestic animals.

From this perspective:
- The first stage shows segments passing in stool or found around a cat's recumbent body.
  - **Search Term:** Cat fecal examination
   This highlights the initial stages where mite segments are likely present without active infection symptoms, indicating potential exposure to parasites. 

- In the second phase, these segments are ingested by fleas when they bite and attach to them for feeding.

  - **Search Term:** Flea lifecycle control

- The third depiction shows a cat ingesting the flea containing mi