In [None]:
! pip install langchain langchain-chroma "unstructured[all-docs]" pydantic lxml

# Partitioning Source Files using Unstructured

In [1]:
from typing import Any
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Path to save images
# path = "./figures/shortExample1/"

# Get elements
raw_pdf_elements = partition_pdf(
    filename='./data/shortExample1P_PickupCat.pdf',
    languages=['eng'],
    strategy='hi_res',
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
 
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks   


    # --- Unstructure can't do semantic chunking, it only does fixed sized and by_title. 
    # --- I will use langChain semantic chunker for that.
    
    # ----If letting it to do chunking---
    # chunking_strategy="by_title",
    # max_characters=4000,
    # new_after_n_chars=3800,
    # combine_text_under_n_chars=2000,
    # ------------------------------

    image_output_dir_path="./figures/shortExample1/",
)


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [3]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
# TableChunk if Table > max chars set above
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.Text'>": 8,
 "<class 'unstructured.documents.elements.Header'>": 3,
 "<class 'unstructured.documents.elements.ListItem'>": 3,
 "<class 'unstructured.documents.elements.Title'>": 8,
 "<class 'unstructured.documents.elements.NarrativeText'>": 16,
 "<class 'unstructured.documents.elements.Image'>": 10,
 "<class 'unstructured.documents.elements.FigureCaption'>": 7}

# Cleaning Partitioned Data

In [16]:
from typing import Any, Optional
from pydantic import BaseModel
import re

# Re-defining Element class with optional context field and original_index
class Element(BaseModel):
    type: str
    text: Any
    context: Optional[str] = None
    original_index: Optional[int] = None # Added to store original position

# Initialize lists for categorized elements
text_for_semantic_chunking = []
tables_raw = []
images_raw = []
headers_raw = []
titles_raw = []
footers_raw = []
figure_captions_raw = []
list_items_raw = []

# Variables to build coherent text blocks and manage local context
current_text_block = ""
current_context_prefix = "" # This will capture the most recent Header/Title
min_meaningful_text_length = 20 # Minimum length for a text block to be considered meaningful

# Helper to finalize and append current text block - defined globally
def finalize_text_block():
    global current_text_block
    global current_context_prefix
    if current_text_block.strip() and len(current_text_block.strip()) >= min_meaningful_text_length:
        # No debug prints in final version of this helper
        text_for_semantic_chunking.append(Element(type="text", text=current_text_block.strip()))
    current_text_block = "" # Reset for next block

for i, element in enumerate(raw_pdf_elements):
    element_type_str = str(type(element))
    element_text = str(element).strip()

    if "unstructured.documents.elements.Header" in element_type_str:
        finalize_text_block()
        
        is_running_header = False
        lower_element_text = element_text.lower()
        if (
            "qxp" in lower_element_text or
            "pm" in lower_element_text or
            "am" in lower_element_text or
            "page" in lower_element_text or
            re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', lower_element_text)
        ):
            is_running_header = True

        if not is_running_header:
            current_context_prefix = element_text + " "
        headers_raw.append(Element(type="header", text=element_text, original_index=i))
    elif "unstructured.documents.elements.Title" in element_type_str:
        finalize_text_block()
        current_context_prefix = element_text + " "
        titles_raw.append(Element(type="title", text=element_text, original_index=i))
    elif "unstructured.documents.elements.NarrativeText" in element_type_str or \
         "unstructured.documents.elements.ListItem" in element_type_str or \
         "unstructured.documents.elements.Text" in element_type_str:
        
        if len(element_text) < 5 and not any(char.isalpha() for char in element_text):
            continue

        if not current_text_block and current_context_prefix:
            current_text_block += current_context_prefix
            
        current_text_block += element_text + " "
        # We don't add these directly to text_for_semantic_chunking here,
        # they are accumulated in current_text_block and added via finalize_text_block()
        if "unstructured.documents.elements.ListItem" in element_type_str:
            list_items_raw.append(Element(type="list_item", text=element_text, original_index=i))

    elif "unstructured.documents.elements.Table" in element_type_str:
        finalize_text_block()
        tables_raw.append(Element(type="table", text=element_text, original_index=i))
        # current_context_prefix is NOT reset here
    elif "unstructured.documents.elements.Image" in element_type_str:
        finalize_text_block() # Finalize any text block *before* the image

        image_path = getattr(element.metadata, "image_path", "N/A")
        # For initial context, we'll assign it a placeholder or previous context.
        # The full context will be enriched in a post-processing step.
        images_raw.append(Element(type="image", text=image_path, context="", original_index=i))
        current_text_block = "" # Reset text block after an image

    elif "unstructured.documents.elements.FigureCaption" in element_type_str:
        finalize_text_block() # Finalize current text block before a caption
        figure_captions_raw.append(Element(type="figure_caption", text=element_text, original_index=i))
        # current_context_prefix is NOT reset here
    elif "unstructured.documents.elements.Footer" in element_type_str:
        footers_raw.append(Element(type="footer", text=element_text, original_index=i))

# Finalize any remaining text block after the loop
finalize_text_block()


# --- New Function for Image Context Enrichment ---
def enrich_image_context(images, all_raw_elements, window_size=3):
    """
    Enriches the context for each image by looking at a window of surrounding text and captions.
    window_size: Number of elements to look before and after the image.
    """
    for img_element in images:
        img_index = img_element.original_index
        if img_index is None:
            continue

        start_index = max(0, img_index - window_size)
        end_index = min(len(all_raw_elements), img_index + window_size + 1)
        
        surrounding_text_elements = []
        for j in range(start_index, end_index):
            surrounding_element = all_raw_elements[j]
            element_type_str = str(type(surrounding_element))
            element_text = str(surrounding_element).strip()

            # Include NarrativeText, ListItem, Text, and FigureCaption
            if "unstructured.documents.elements.NarrativeText" in element_type_str or \
               "unstructured.documents.elements.ListItem" in element_type_str or \
               "unstructured.documents.elements.Text" in element_type_str or \
               "unstructured.documents.elements.FigureCaption" in element_type_str:
                
                # Also, try to get the most recent non-running header/title before this window
                # This is implicitly handled by `current_context_prefix` during the initial pass
                # but for post-processing, we might need a more direct way if we want to include
                # it for elements outside `current_text_block` capture.

                # For simplicity here, we'll just concatenate nearby text.
                # The assumption is that `current_context_prefix` will have set the general section context.
                if len(element_text) >= 5 or any(char.isalpha() for char in element_text): # Avoid very short, non-alphabetic
                     surrounding_text_elements.append(element_text)
            
            # Special handling for Headers/Titles that are not running headers
            if "unstructured.documents.elements.Header" in element_type_str or \
               "unstructured.documents.elements.Title" in element_type_str:
                lower_element_text = element_text.lower()
                is_running_header = (
                    "qxp" in lower_element_text or "pm" in lower_element_text or
                    "am" in lower_element_text or "page" in lower_element_text or
                    re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', lower_element_text)
                )
                if not is_running_header:
                    surrounding_text_elements.append(element_text)


        # Combine the relevant surrounding text to form the image context
        enriched_context = " ".join(surrounding_text_elements).strip()
        if not enriched_context:
            enriched_context = "No specific text context available around this image."
        
        img_element.context = enriched_context

# Call the enrichment function after initial parsing
enrich_image_context(images_raw, raw_pdf_elements, window_size=5) # Increased window_size for broader context

# Now, 'texts' should contain the cleaned and combined narrative chunks
texts = [e.text for e in text_for_semantic_chunking]
tables = [e.text for e in tables_raw]

print(f"Number of tables extracted: {len(tables)}")
print(f"Number of text elements for semantic chunking: {len(texts)}")
print(f"Number of images detected: {len(images_raw)}")
print(f"Number of headers: {len(headers_raw)}")
print(f"Number of titles: {len(titles_raw)}")
print(f"Number of footers (ignored for RAG): {len(footers_raw)}")
print(f"Number of figure captions: {len(figure_captions_raw)}")
print(f"Number of list items: {len(list_items_raw)}")

# You can inspect an image element to see its context:
# if images_raw:
#     print("\nExample Image Element with Context:")
#     print(images_raw[0].dict())


Number of tables extracted: 0
Number of text elements for semantic chunking: 9
Number of images detected: 10
Number of headers: 3
Number of titles: 8
Number of footers (ignored for RAG): 0
Number of figure captions: 7
Number of list items: 3


# Text & Table Summary Generation

In [9]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Prompt
model = ChatOllama(model="llama3.2")

# New: Text Summarization Prompt
prompt_text_summary = """You are an assistant tasked with concisely summarizing text sections related to veterinary advice and pet care. Focus on key information, main ideas, and any actionable advice. Just give me the summary, be concise and do not be verbose. Text chunk: {element} """
prompt_text = ChatPromptTemplate.from_template(prompt_text_summary)
text_summarize_chain = {"element": lambda x: x} | prompt_text | model | StrOutputParser()

# New: Table Summarization Prompt
prompt_table_summary = """You are an assistant tasked with extracting key data, trends, and important numerical information from the provided table, especially when related to pet nutrition, health, or statistics. Just give me the summary, be concise and do not be verbose. Table chunk: {element} """
prompt_table = ChatPromptTemplate.from_template(prompt_table_summary)
table_summarize_chain = {"element": lambda x: x} | prompt_table | model | StrOutputParser()

In [10]:

# Apply to text
# texts = [i.text for i in text_elements if i.text != ""]
text_summaries = text_summarize_chain.batch(texts, {"max_concurrency": 8})

# Apply to tables
# tables = [i.text for i in table_elements]
table_summaries = table_summarize_chain.batch(tables, {"max_concurrency": 8})

# Image Summary Generatetion
From the LangChain Cookbook, they used a LLaVA 7B model to generate image summaries in .txt. Those files will be in the same dir as those images. However, I have llama3.2-vision already installed and setup on my local machine. 

Llama3.2-vision model is a 11B model which may require a strong computing power and large memory. Switch model if necessary. i.e. Llava, Qwen, Gemma, etc.

# ⚠️Patch:
Generic query often won't trigger the LLM to include image in the response. AI analysis shows it could be a problem of how images summaries were created. Retriever performs sematic matching to retrieve relevant chunks, but the current summaries was written in a outsider perspective. It'd be better to try following prompt in making summaries.

'content': 'Describe the image in detail, focusing on any actions, techniques, or procedures depicted related to pet handling or care. Explain the purpose or context of the actions shown, if clear. Be concise and relevant to veterinary advice.'

# ✅ Side Note:
There are many irrelevant images exist in the textbook, like paragraph divider, section dividers, etc. In future development, consider using a **Node/Agent** to decide if a image should be filtered out for summarization.


In [17]:
# 1. Redefine the prompt for image relevance to include image_context
image_relevance_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an intelligent assistant. Your task is to determine if an image could be relevant to the provided local textual context within a veterinary handbook. Consider the filename and the immediate context."),
        ("human", "Local Textual Context: {image_context}\n\nImage Filename: {image_filename}\n\nIs this image relevant to the content? Respond with 'yes' if relevant, 'no' if not. Only respond with 'yes' or 'no'.")
    ]
)

# 2. The image relevance chain setup remains the same
image_relevance_chain = image_relevance_prompt | ChatOllama(model="llama3.2-vision")

In [30]:
import ollama # Ensure this is imported
import os     # Ensure this is imported
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


relevant_images_to_summarize = []

print("Checking image relevance with local textual context (now using Ollama's direct image passing)...")
for image_element in images_raw:
    image_filename = image_element.text # This is the image path/filename
    image_context = image_element.context # This is the locally associated context

    print(f"\nDEBUG: Processing image relevance for: {image_filename}")
    print(f"DEBUG: Associated text context for model: '{image_context[:100]}...'")

    if not image_context:
        image_context = "No specific text context was captured for this image, infer relevance from filename."

    # Construct the multimodal messages list for ollama.chat
    # Ollama library allows passing file paths directly in the 'images' list
    messages_for_ollama = [
        {
            "role": "user",
            "content": f"Local Textual Context: {image_context}\n\nImage Filename: {os.path.basename(image_filename)}\n\nIs this image relevant to the content? Respond with 'yes' if relevant, 'no' if not. Only respond with 'yes' or 'no'.",
            "images": [] # Initialize empty list, add image_filename if exists
        }
    ]

    if os.path.exists(image_filename):
        messages_for_ollama[0]["images"].append(image_filename)
        print(f"DEBUG: Image file path '{os.path.basename(image_filename)}' IS included in the messages.")
    else:
        print(f"WARNING: Image file not found: {image_filename}. Cannot pass image data to model.")

    # Now, invoke ollama.chat directly
    response_content = "error" # Default in case of invocation failure
    try:
        response_obj = ollama.chat(
            model="minicpm-v:8b",
            messages=messages_for_ollama,
            options={"temperature": 0.0} 
        )
        response_content = response_obj['message']['content']
        print(f"DEBUG: LLM Raw Response for '{os.path.basename(image_filename)}': '{response_content.strip()}'")
    except Exception as e:
        print(f"ERROR: Failed to invoke Ollama for image relevance check on {image_filename}: {e}")
        # If there's an error, assume not relevant to avoid proceeding with bad data
        response_content = "no"

    # Check the response for relevance (case-insensitive and stripping whitespace)
    if "yes" in response_content.lower().strip():
        print(f"✅ Image '{image_filename}' is RELEVANT.")
        relevant_images_to_summarize.append(image_element)
    else:
        print(f" ❌ Image '{image_filename}' is NOT relevant to its context. Skipping summarization.")

print(f"\nNumber of relevant images for summarization: {len(relevant_images_to_summarize)}")

# Now, you would proceed with batch summarization using 'relevant_images_to_summarize'
# instead of the full 'images_raw' list.
# For example:
# image_summaries = summarize_chain.batch([{'element': img} for img in relevant_images_to_summarize])

Checking image relevance with local textual context (now using Ollama's direct image passing)...

DEBUG: Processing image relevance for: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-1.jpg
DEBUG: Associated text context for model: 'Any cat, no matter how docile he may be, has the potential to bite when he is severely injured, frig...'
DEBUG: LLM Raw Response for 'figure-1-1.jpg': 'Yes.'
✅ Image '/Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-1.jpg' is RELEVANT.

DEBUG: Processing image relevance for: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-2.jpg
DEBUG: Associated text context for model: 'There are several effective ways to handle and restrain a cat. Your choice will depend on whether th...'
DEBUG: LLM Raw Response for 'figure-1-2.jpg': 'Yes.'
✅ Image '/Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-2.jpg' is RELEVANT.

DEBUG: Processing image relevance for: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-2-3.jpg
DEBUG: Associated text context for 

<h3>Remove Irrelevant Images From The Folder</h3>

In [31]:
import os

# Assuming 'images_raw' contains all original image elements and
# 'relevant_images_to_summarize' contains only the relevant ones from previous steps.

# Get a set of relevant image paths for efficient lookup
relevant_image_paths = {img_elem.text for img_elem in relevant_images_to_summarize}

print("\n--- Deleting irrelevant images ---")
images_deleted_count = 0
for image_element in images_raw:
    image_path = image_element.text
    if image_path not in relevant_image_paths:
        if os.path.exists(image_path):
            try:
                os.remove(image_path)
                print(f"✅ Successfully deleted irrelevant image: {image_path}")
                images_deleted_count += 1
            except OSError as e:
                print(f" ❌ Error deleting file {image_path}: {e}")
        else:
            print(f"❓ Skipping deletion: Image file not found at {image_path}")
    else:
        print(f"Keeping relevant image: {image_path}")

print(f"--- 🏁 Finished deleting images. Total deleted: {images_deleted_count} ---")

# After deletion, you might want to update images_raw to only contain relevant ones
# if you plan to reuse it later in the notebook, though `relevant_images_to_summarize`
# already holds what you need for subsequent steps like summarization.
# For example:
# images_raw = relevant_images_to_summarize


--- Deleting irrelevant images ---
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-1.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-1-2.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-2-3.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-2-4.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-2-5.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-3-6.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-4-7.jpg
❓ Skipping deletion: Image file not found at /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-4-8.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-4-9.jpg
Keeping relevant image: /Users/mas/Desktop/LLM_Veterinary_AI/figures/figure-5-10.jpg
--- 🏁 Finished deleting images. Total deleted: 0 ---


In [None]:
import ollama
import os
import base64

# Directories
image_directory = "./figures/shortExample2/"
output_directory = "./figures/shortExample2/"

# Ensure output directory exists
os.makedirs(output_directory, exist_ok=True)

# Supported image formats
supported_extensions = ('.png', '.jpg', '.jpeg')

for filename in os.listdir(image_directory):
    if filename.lower().endswith(supported_extensions):
        image_path = os.path.join(image_directory, filename)
        output_filename = os.path.splitext(filename)[0] + ".txt"
        output_path = os.path.join(output_directory, output_filename)

        print(f"⏳ Working on summary for {filename}")

        # Check if the summary file already exists
        if os.path.exists(output_path):
            print(f"☑️ Summary for {filename} already exists at {output_path}, skipping.")
            continue

        try:
            # Read and encode image in base64
            with open(image_path, 'rb') as f:
                image_data = base64.b64encode(f.read()).decode('utf-8')

            # Send image to ollama for vision model processing
            response = ollama.chat(
                model='llava:7b',
                messages=[
                    {
                        'role': 'user',
                        'content': 'Describe the image in detail, focusing on any actions, techniques, or procedures depicted related to pet handling or care. Explain the purpose or context of the actions shown, if clear. Be concise and relevant to veterinary advice. If you think the images has nothing to do with veterinary, do not do anything.',  
                        'images': [image_data]
                    }
                ]
            )

            # Extract and save the generated summary
            summary = response['message']['content']
            
            with open(output_path, 'w') as f:
                f.write(summary)

            print(f"✅ Summary for {filename} saved to {output_path}")

        except Exception as e:
            print(f"❌ Error processing {filename}: {e}") 

# Read image and summaries from folder

In [13]:
import glob
import os

path =  "./figures/shortExample2/"
# Get all .txt files in the directory
file_paths = glob.glob(os.path.expanduser(os.path.join(path, "*.txt")))

# Supported image formats
supported_extensions = ('.png', '.jpg', '.jpeg')

# Read each file and store its content in a list, and collect corresponding image paths
img_summaries = []
image_paths = []
for file_path in file_paths:
    with open(file_path, "r") as file:
        img_summaries.append(file.read())
    
    # Derive the original image path from the summary file path
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    found_image_path = None
    for ext in supported_extensions:
        potential_image_path = os.path.join(path, base_filename + ext)
        if os.path.exists(potential_image_path):
            found_image_path = potential_image_path
            break
    image_paths.append(found_image_path)

# Clean up residual logging
# cleaned_img_summary = [
#     s.split("clip_model_load: total allocated memory: 201.27 MB\\n\\n", 1)[1].strip() #Llava Model could left this message at every summary. This line is here to remove the overhead.
#     for s in img_summaries
# ]

# Filter out entries where no corresponding image was found
# This ensures cleaned_img_summary and image_paths remain aligned
# filtered_img_summaries = []
# filtered_image_paths = []
# for i, summary in enumerate(cleaned_img_summary):
#     if image_paths[i] is not None:
#         filtered_img_summaries.append(summary)
#         filtered_image_paths.append(image_paths[i])

cleaned_img_summary = img_summaries
image_paths

['./figures/shortExample2/figure-15-14.jpg',
 './figures/shortExample2/figure-20-20.jpg',
 './figures/shortExample2/figure-14-12.jpg',
 './figures/shortExample2/figure-19-18.jpg']

# Now, storing all those in a vector DB.


In [14]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_core.documents import Document

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="summaries", embedding_function=GPT4AllEmbeddings()
)

# The storage layer for the parent documents
store = InMemoryStore()  # <- Can we extend this to images
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


In [35]:
# Ensure you have initialized your vectorstore variable (e.g., by running previous cells)
# This will delete the Chroma collection named "summaries"
vectorstore.delete_collection()
print("Chroma collection 'summaries' has been deleted.")

Chroma collection 'summaries' has been deleted.


In [15]:
# Add texts
if texts:
    doc_ids = [str(uuid.uuid4()) for _ in texts]
    summary_texts = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, texts)))


# Add tables
if tables:
    table_ids = [str(uuid.uuid4()) for _ in tables]
    summary_tables = [
        Document(page_content=s, metadata={id_key: table_ids[i]})
        for i, s in enumerate(table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))

    # Add images
# Add images
if cleaned_img_summary:
    img_ids = [str(uuid.uuid4()) for _ in cleaned_img_summary]
    summary_img = [
        Document(page_content=s, metadata={id_key: img_ids[i]})
        for i, s in enumerate(cleaned_img_summary)
    ]
    retriever.vectorstore.add_documents(summary_img)
    # Store the image path as the raw document for retrieval
    retriever.docstore.mset(
        list(zip(img_ids, image_paths))
    )
 # Store the image summary as the raw document

In [16]:
try_result = retriever.invoke("Images / figures with cat in a white background")
try_result

['./figures/shortExample2/figure-14-12.jpg',
 './figures/shortExample2/figure-15-14.jpg',
 './figures/shortExample2/figure-20-20.jpg',
 './figures/shortExample2/figure-19-18.jpg']

In [17]:
import io
import re

from IPython.display import HTML, display
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.messages import HumanMessage
from PIL import Image

def plt_img_base64(img_base64):
    """Disply base64 encoded string as image"""
    # Create an HTML img tag with the base64 string as the source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'
    # Display the image by rendering the HTML
    display(HTML(image_html))

def looks_like_base64(sb):
    """Check if the string looks like base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None



def split_image_text_types(docs):
    """
    Split base64-encoded images and texts from a list of documents/strings.
    Handles both Document objects and direct image path strings.
    Only the first identified image will be base64 encoded and marked for Markdown rendering.
    Subsequent image paths will be added as plain text references.
    """
    b64_images = []
    texts = []
    image_processed = False # Flag to ensure only one image is processed visually

    for doc in docs:
        # Determine the content based on whether 'doc' is a Document object or a string
        doc_content = doc.page_content if isinstance(doc, Document) else str(doc)

        # Check if the content is an image path and if the file exists
        if doc_content.lower().endswith(('.png', '.jpg', '.jpeg')) and os.path.exists(doc_content):
            if not image_processed: # Process only the first image visually
                image_path = doc_content
                try:
                    with open(image_path, 'rb') as f:
                        image_data = base64.b64encode(f.read()).decode('utf-8')
                        resized_image_data = resize_base64_image(image_data, size=(1300, 600))
                        b64_images.append(resized_image_data)
                    # Add the image path for Markdown rendering in the LLM response
                    texts.append(f"IMAGE_PATH_FOR_MD: {image_path}")
                    image_processed = True # Set flag to true after processing the first image
                except Exception as e:
                    print(f"Error loading image {image_path}: {e}")
                    texts.append(doc_content) # If error, treat as text
            else:
                # For subsequent image paths, just add a textual reference without the special tag
                texts.append(f"An additional image related to the context was found at: {os.path.basename(doc_content)}")
        else:
            # It's text or a table summary
            texts.append(doc_content)
    return {"images": b64_images, "texts": texts}


def img_prompt_func(data_dict):
    """
    Construct the messages for the multimodal LLM.
    `data_dict` will contain keys: 'context' (which is {'images': [...], 'texts': [...]}) and 'question'.
    Instructs the LLM to render ALL image paths found via Markdown, striving to include images where relevant.
    """
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []

    # Debugging: Print images being passed to the model
    print(f"DEBUG: Images being passed to LLM (visually): {len(data_dict['context']['images'])} image(s)")
    if data_dict["context"]["images"]:
        print(f"DEBUG: First visual image (base64 snippet): {data_dict['context']['images'][0][:50]}...")

    # Adding image(s) to the messages if present (this is the visual input)
    if data_dict["context"]["images"]:
        for image in data_dict["context"]["images"]:
            image_message = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
            messages.append(image_message)

    # Adding the text for analysis, with stronger instructions for Markdown rendering
    text_message = {
        "type": "text",
        "text": (
            "You are a veterinary assistant tasked with providing veterinary advice. "
            "You will be given a mixed of text, tables, and image references. "
            "Your primary goal is to use all provided information, including images, to answer the user's question comprehensively. "
            "It is ESSENTIAL that you identify ALL instances of `IMAGE_PATH_FOR_MD: /path/to/image.jpg` in the context "
            "and convert them directly into Markdown image syntax within your response. "
            "For each image, provide a brief, accurate alt text description like `![Description of image content](/path/to/image.jpg)`. "
            "For example, if the context contains `IMAGE_PATH_FOR_MD: ./figures/cat_pickup.jpg`, you MUST output `![Illustration of cat pickup technique](./figures/cat_pickup.jpg)`. "
            "Include these Markdown images strategically where they best illustrate your points in the answer. "
            "Do NOT omit any `IMAGE_PATH_FOR_MD:` entries; they must be rendered as Markdown images. "
            f"User-provided question: {data_dict['question']}\n\n"
            "Text and / or tables:\n"
            f"{formatted_texts}"
        ),
    }
    messages.append(text_message)
    return [HumanMessage(content=messages)]

def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data
    """
    image_signatures = {
        b"\xff\xd8\xff": "jpg",
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, format in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False


def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize an image encoded as a Base64 string
    """
    # Decode the Base64 string
    img_data = base64.b64decode(base64_string)
    img = Image.open(io.BytesIO(img_data))

    # Resize the image
    resized_img = img.resize(size, Image.LANCZOS)

    # Save the resized image to a bytes buffer
    buffered = io.BytesIO()
    resized_img.save(buffered, format=img.format)

    # Encode the resized image to Base64
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def multi_modal_rag_chain(retriever):
    """
    Multi-modal RAG chain
    """

    # Multi-modal LLM
    model = ChatOllama(model="llama3.2-vision")

    # RAG pipeline
    chain = (
        {
            "context": retriever | RunnableLambda(split_image_text_types),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func) # This consumes the {"context": {"images": ..., "texts": ...}, "question": ...} dict
        | model
        | StrOutputParser()
    )

    return chain

# Create RAG chain
# Assuming 'retriever' is already defined from previous cells
chain_multimodal_rag = multi_modal_rag_chain(retriever)

In [18]:
# Check retrieval
query = "What kind of cat need more calories per day?"
docs = retriever.invoke(query, limit=8)

# We get 4 docs
len(docs)

4

In [19]:
# Your query
query = "What kind of chat need more calories per day? Include images"

# Retrieve documents
docs = retriever.invoke(query, limit=6)

print(f"Retrieved {len(docs)} documents:")

# Iterate and display each document
for i, doc in enumerate(docs):
    doc_content = doc.page_content if isinstance(doc, Document) else str(doc)

    print(f"\n--- Document {i+1} ---")
    if doc_content.lower().endswith(('.png', '.jpg', '.jpeg')) and os.path.exists(doc_content):
        # It's an image path
        image_path = doc_content
        try:
            with open(image_path, 'rb') as f:
                image_data = base64.b64encode(f.read()).decode('utf-8')
                # Resize for display if needed, using your existing function
                # You might want to use a smaller size here for better display in notebook
                resized_image_data = resize_base64_image(image_data, size=(600, 300))
                plt_img_base64(resized_image_data)
                print(f"Displayed image from: {image_path}")
        except Exception as e:
            print(f"Error reading or displaying image {image_path}: {e}")
    else:
        # It's text or a table summary
        print(f"Text Content:\n{doc_content}")

print("\n--- LLM Response ---")
# Finally, get the LLM's answer using the chain
llm_response = chain_multimodal_rag.invoke(query)
print(llm_response)


Retrieved 4 documents:

--- Document 1 ---
Text Content:
Counting Calories

Unless maintaining a good body weight is a problem, senior cats should be on a reduced-calorie diet. In general, an older cat who is neither too fat nor too thin needs about 20 calories per pound (.45 kg) of body weight per day—and sometimes even less—to meet her caloric needs. These are guidelines, and the exact amount needed to keep your cat at an ideal weight may vary. Various health conditions may also dictate that your cat needs more or fewer calories.

--- Document 2 ---
Text Content:
1 The values for amount per kilogram of dry matter have been calculated assuming a dietary energy density of 4,000 calories ME per kilogram of food. If the energy density of the diet is not 4,000 calo- ries ME per kilogram, then to calculate the per kilogram of dry matter for each nutrient, multiply the value for the nutrient by the energy density of the pet food (in calories ME per kilogram) and divide by 4,000.

2 0.02 g a

In [20]:
from IPython.display import Markdown

# Assuming llm_response contains the string output from your chain
# llm_response = chain_multimodal_rag.invoke(query) # Run this to get the response
display(Markdown(llm_response))

The type of cat that needs more calories per day is typically an active or pregnant/nursing cat. Here's a breakdown of the different calorie needs for cats:

**Active Cats**
An active adult cat will need about 30 to 35 calories per pound (.45 kg) of body weight per day, and some will do well with about 25 calories per pound per day. An illustration of an active cat playing with a ball of yarn is shown: ![Active cat playing with yarn](IMAGE_PATH_FOR_MD: ./figures/active_cat_yarn.jpg)

**Pregnant and Nursing Cats**
Pregnant and nursing cats have much higher requirements—figure about 45 calories per pound of body weight per day during the last trimester of pregnancy and as high as 140 calories per pound during the peak of lactation. An image of a pregnant cat with her kittens is shown: ![Pregnant cat with kittens](IMAGE_PATH_FOR_MD: ./figures/pregnant_cat_kittens.jpg)

**Spayed/Neutered Cats**
Spayed and neutered cats have a much lower metabolism than intact cats. Even if they are active, many spayed and neutered cats do very well on the lower calorie estimate. A picture of a spayed cat is shown: ![Spayed cat](IMAGE_PATH_FOR_MD: ./figures/spayed_cat.jpg)

**Senior Cats**
Unless maintaining a good body weight is a problem, senior cats should be on a reduced-calorie diet. In general, an older cat who is neither too fat nor too thin needs about 20 calories per pound (.45 kg) of body weight per day—and sometimes even less—to meet her caloric needs. An image of a senior cat is shown: ![Senior cat](IMAGE_PATH_FOR_MD: ./figures/senior_cat.jpg)

In summary, cats that need more calories per day are typically active or pregnant/nursing cats. Spayed/Neutered cats may also require more calories, but the exact amount will depend on their individual metabolism and activity level. Senior cats, on the other hand, may require fewer calories to maintain a healthy weight.