In [1]:
# Install dependencies if needed
# !pip install langchain langchain-experimental langchain-chroma pillow open_clip_torch torch matplotlib unstructured pydantic
import os
import shutil
import subprocess

from textbook_loading import (
    load_book,
    clean_and_categorize_elements,
    summarize_elements,
    store_in_chromadb,
    delete_irrelevant_images
)

In [2]:
# Input PDF plus the output locations handed to the loading and storage cells.
pdf_file = './data/first_n_second_Chapter_EmergencyInfectiveDisease.pdf'
image_output_dir = './figures/Emergency_and_InfectiveDisease'
chroma_persist_dir = './chroma/Emergency_and_InfectiveDisease/'

# Fail fast if an input is missing: the data directory is checked first,
# then the PDF itself, so the error names the outermost missing path.
for required_path, failure_message in (
    ('./data', "Error: './data' directory not found."),
    (pdf_file, f"Error: PDF file not found at {pdf_file}."),
):
    assert os.path.exists(required_path), failure_message

In [None]:
# Partition the PDF into raw elements. load_book also receives the image
# output directory (presumably where extracted figures are written -- confirm
# in textbook_loading).
print("📚 Loading and partitioning PDF elements...")
raw_pdf_elements = load_book(pdf_file, image_output_dir)
print(f"✅ Found {len(raw_pdf_elements)} raw elements.")

📚 Loading and partitioning PDF elements...


In [None]:
print("🧹 Cleaning and categorizing elements...")
# Only the first three returned values are used in this notebook. The rest are
# collected into a throwaway list instead of being bound to `_`: at notebook
# scope `_` is IPython's "last output" variable, and rebinding it shadows that.
texts, tables, images_raw, *_unused_categories = clean_and_categorize_elements(raw_pdf_elements)
print(f"Categorized: {len(texts)} text chunks, {len(tables)} tables, {len(images_raw)} images.")

In [None]:
print("📝 Summarizing text, tables, and Checking relevant images...")
# summarize_elements returns five values; their exact structure is defined in
# textbook_loading. All five are passed on to store_in_chromadb, and
# relevant_images_to_summarize is also used when pruning images later.
text_summaries, table_summaries, image_paths, relevant_images_to_summarize, image_contexts = summarize_elements(
    texts, tables, images_raw
)
print(f"📊 Summarized: {len(text_summaries)} text summaries, {len(table_summaries)} table summaries")

In [None]:
print(f"💾 Storing data into ChromaDB at {chroma_persist_dir}...")
# Per the original author's note, store_in_chromadb returns a UnifiedRetriever,
# whose constructor is UnifiedRetriever(vectorstore, docstore, id_key="doc_id").
# The summaries are passed alongside the original texts/tables and image data.
retriever = store_in_chromadb(
    text_summaries, texts, table_summaries, tables, image_paths,
    relevant_images_to_summarize, image_contexts,
    persist_directory=chroma_persist_dir
)
print("🎉 Data ingestion complete. Retriever initialized.")

In [None]:
print("🗑️ Deleting irrelevant images...")
# NOTE(review): presumably removes extracted images that are not in
# relevant_images_to_summarize; if so this is destructive and re-running the
# earlier cells would be needed to get them back -- confirm in textbook_loading.
delete_irrelevant_images(images_raw, relevant_images_to_summarize)

In [None]:
# Play a short system sound (presumably an audible "done" signal after the
# long-running cells). `afplay` ships only with macOS, so on any other system,
# or if the sound file is missing, the cell reports that and moves on instead
# of failing with a shell error.
sound_file = "/System/Library/Sounds/Glass.aiff"  # stock macOS system sound

afplay_path = shutil.which("afplay")
if afplay_path and os.path.exists(sound_file):
    # Argument list, no shell: the path is passed verbatim, so spaces or
    # quotes in it cannot break (or inject into) a command line.
    subprocess.run([afplay_path, sound_file], check=False)
else:
    print("Completion sound skipped: 'afplay' or the sound file is not available on this system.")

# Inspecting Retrieved Docs

In [20]:
# Free-text question passed to retriever.retrieve(...) in the inspection cell.
query = "How to properly pick up a cat?"

In [None]:
# Show the top-10 matches for `query`. Each result is read as a mapping with
# the keys 'score', 'summary', 'original', 'summary_metadata' and
# 'original_metadata'.
separator = '-' * 40
print(separator, "Here are the retrieved original_docs with similarity scores", separator)

results = retriever.retrieve(query, k=10)
if not results:
    # Say so explicitly; otherwise the cell would print only the banner.
    print("No documents were retrieved for this query.")
for res in results or []:
    print(separator)
    print(f"🔎 Similarity Score: {res['score']:.4f}")
    print("📝 Summary/Chunk:", res["summary"])
    print("📄 Full Original Content:", res["original"])
    print("Summary Metadata:", res["summary_metadata"])
    print("Original Metadata:", res["original_metadata"])