# Week5-2: RAG — Multimodal (Text + Images/Charts)

In [2]:

!pip install -q transformers torch pillow sentence-transformers accelerate chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m78.0 MB/s[0m eta [36m0:00:

In [9]:
import os, json, numpy as np, pandas as pd
from PIL import Image
import glob

# Check what images you have
IMG_DIR = './week5_images'
image_files = glob.glob(os.path.join(IMG_DIR, '*.png')) + glob.glob(os.path.join(IMG_DIR, '*.jpg'))
print(f"Found {len(image_files)} images in {IMG_DIR}")
for img in image_files[:3]:  # Shows all
    print(f"  - {os.path.basename(img)}")

selected_images = image_files

print(f"\nSelected {len(selected_images)} images for multimodal RAG:")
for img in selected_images:
    print(f"  - {os.path.basename(img)}")


Found 3 images in ./week5_images
  - Remove_old_paint.png
  - brushes.png
  - Prep_for_painting.png

Selected 3 images for multimodal RAG:
  - Remove_old_paint.png
  - brushes.png
  - Prep_for_painting.png


In [10]:
# 1) Embeddings (real using CLIP) + joint index
!pip install transformers torch -q  # Install if not already in Colab

import numpy as np, pandas as pd
import torch
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration

# Load BLIP2 for captioning (to generate real captions for images)
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load CLIP for joint text/image embeddings
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Assuming selected_images from previous block (list of paths)
# Create df_imgs with generated captions
meta = []
for i, path in enumerate(selected_images, 1):
    img = Image.open(path)
    inputs = blip_processor(images=img, return_tensors="pt")
    with torch.no_grad():
        output = blip_model.generate(**inputs)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)  # Decode the output tensor
    image_id = f'img{i}'  # Or use os.path.basename(path) for ID
    meta.append({'image_id': image_id, 'path': path, 'caption': caption})

df_imgs = pd.DataFrame(meta)
print('✅ Images with captions:', len(df_imgs))
print(df_imgs.head())

# Define text_corpus (adapt to your project; here, example texts referencing images)
# Replace these with your actual project texts/documents
text_corpus = pd.DataFrame({
    'doc_id': [f'doc{i}' for i in range(1, len(selected_images) + 1)],
    'text': [f'This document discusses painting for the walls shown in {os.path.basename(path)}.' for path in selected_images]
})
print('✅ Text docs:', len(text_corpus))

# Generate real image embeddings using CLIP
img_emb = {}
for _, row in df_imgs.iterrows():
    img = Image.open(row['path'])
    inputs = clip_processor(images=img, return_tensors="pt")
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs).squeeze().cpu().numpy()
    emb /= (np.linalg.norm(emb) + 1e-9)  # Normalize
    img_emb[row['image_id']] = emb.astype('float32')

# Generate real text embeddings using CLIP
text_emb = {}
for _, row in text_corpus.iterrows():
    inputs = clip_processor(text=row['text'], return_tensors="pt")
    with torch.no_grad():
        emb = clip_model.get_text_features(**inputs).squeeze().cpu().numpy()
    emb /= (np.linalg.norm(emb) + 1e-9)  # Normalize
    text_emb[row['doc_id']] = emb.astype('float32')

# Cosine similarity (unchanged)
def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) + 1e-9) / (np.linalg.norm(b) + 1e-9))

# Real text encoder using CLIP (for queries)
def encode_text(q):
    inputs = clip_processor(text=q, return_tensors="pt")
    with torch.no_grad():
        v = clip_model.get_text_features(**inputs).squeeze().cpu().numpy()
    return v / (np.linalg.norm(v) + 1e-9)

# Embedding dimension (from CLIP)
emb_dim = list(img_emb.values())[0].shape[0]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Images with captions: 3
  image_id                                  path  \
0     img1   ./week5_images/Remove_old_paint.png   
1     img2            ./week5_images/brushes.png   
2     img3  ./week5_images/Prep_for_painting.png   

                                             caption  
0       a person using a nail to remove a wood floor  
1  a book with three different brushes and a blue...  
2  a poster with a list of the different types of...  
✅ Text docs: 3


In [11]:
# 2) Retrieval modes
def retrieve_text(query, k=3):
    """
    Retrieve top-k text documents based on a text query using cosine similarity.
    """
    q = encode_text(query)  # Uses CLIP-based text encoding from previous block
    scores = [(doc, cosine(q, text_emb[doc])) for doc in text_emb]
    return sorted(scores, key=lambda x: -x[1])[:k]

def retrieve_image_by_text(query, k=3):
    """
    Retrieve top-k images based on a text query using cosine similarity.
    """
    q = encode_text(query)  # Uses CLIP-based text encoding
    scores = [(img, cosine(q, img_emb[img])) for img in img_emb]
    return sorted(scores, key=lambda x: -x[1])[:k]

def retrieve_by_image(image_id, k=3):
    """
    Retrieve top-k text documents and images based on an image ID using cosine similarity.
    """
    q = img_emb[image_id]
    t_scores = [(doc, cosine(q, text_emb[doc])) for doc in text_emb]
    i_scores = [(img, cosine(q, img_emb[img])) for img in img_emb if img != image_id]
    return sorted(t_scores, key=lambda x: -x[1])[:k], sorted(i_scores, key=lambda x: -x[1])[:k]

# Example usage with project-specific queries
print('Text->Docs:', retrieve_text('diagram of dealing with wall paint', 3))
print('Text->Images:', retrieve_image_by_text('photo of certain brushes', 3))
print('Image->Docs/Images:', retrieve_by_image(df_imgs['image_id'].iloc[0], 3))

# Prepare data for Chroma DB with modality tags
#!pip install chromadb -q  # Install if not already in Colab
import chromadb
from chromadb.utils import embedding_functions

# Initialize Chroma DB client
client = chromadb.Client()
collection_name = "multimodal_rag"
try:
    collection = client.create_collection(name=collection_name)
except:
    collection = client.get_collection(name=collection_name)

# Add text embeddings with modality tag
for _, row in text_corpus.iterrows():
    collection.add(
        embeddings=text_emb[row['doc_id']].tolist(),
        documents=[row['text']],
        metadatas=[{"doc_id": row['doc_id'], "modality": "text"}],
        ids=[row['doc_id']]
    )

# Add image embeddings with modality tag
for _, row in df_imgs.iterrows():
    collection.add(
        embeddings=img_emb[row['image_id']].tolist(),
        documents=[row['caption']],
        metadatas=[{"image_id": row['image_id'], "path": row['path'], "modality": "image"}],
        ids=[row['image_id']]
    )

# Example Chroma DB query
query_emb = encode_text("Depictions on dealing with paint").tolist()
results = collection.query(query_embeddings=[query_emb], n_results=3)
print('Chroma DB Query Results:', results)

Text->Docs: [('doc3', 0.8598296046257019), ('doc1', 0.8406165838241577), ('doc2', 0.8298824429512024)]
Text->Images: [('img2', 0.34373119473457336), ('img1', 0.2556796967983246), ('img3', 0.19629265367984772)]
Image->Docs/Images: ([('doc2', 0.274524450302124), ('doc3', 0.27356797456741333), ('doc1', 0.2689427435398102)], [('img2', 0.6604979634284973), ('img3', 0.5101344585418701)])
Chroma DB Query Results: {'ids': [['doc3', 'doc1', 'doc2']], 'embeddings': None, 'documents': [['This document discusses the medical image or chart shown in Prep_for_painting.png.', 'This document discusses the medical image or chart shown in Remove_old_paint.png.', 'This document discusses the medical image or chart shown in brushes.png.']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'modality': 'text', 'doc_id': 'doc3'}, {'modality': 'text', 'doc_id': 'doc1'}, {'modality': 'text', 'doc_id': 'doc2'}]], 'distances': [[0.3544785976409912, 0.358754754066467

In [12]:
# 3) Prompt assembly
def assemble_prompt(query, text_hits, image_hits):
    """
    Assemble a prompt combining query, text evidence, and image evidence with citations.
    """
    tbits = [f'[{doc}] {text_corpus[text_corpus.doc_id==doc].iloc[0].text}' for doc, _ in text_hits]
    ibits = [f'[{img}] {df_imgs[df_imgs.image_id==img].iloc[0].caption}' for img, _ in image_hits]
    return f"""System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: {query}
Evidence:
- Text: {' | '.join(tbits)}
- Images: {' | '.join(ibits)}
Answer:
"""

# Example usage with home repair-specific query
q = 'How to prepare a wall for painting?'
text_hits = retrieve_text(q, 2)  # From previous retrieval function
image_hits = retrieve_image_by_text(q, 2)  # From previous retrieval function
print("Prompt from retrieval functions:")
print(assemble_prompt(q, text_hits, image_hits))

# Alternative: Assemble prompt using Chroma DB results
query_emb = encode_text(q).tolist()
chroma_results = collection.query(query_embeddings=[query_emb], n_results=4)  # Get top 4 results (text + images)

# Separate text and image hits from Chroma DB results
text_hits_chroma = []
image_hits_chroma = []
for idx, id_ in enumerate(chroma_results['ids'][0]):
    metadata = chroma_results['metadatas'][0][idx]
    score = 1 - chroma_results['distances'][0][idx]  # Convert distance to similarity
    if metadata['modality'] == 'text':
        text_hits_chroma.append((id_, score))
    elif metadata['modality'] == 'image':
        image_hits_chroma.append((id_, score))

# Print prompt using Chroma DB results
print("\nPrompt from Chroma DB results:")
print(assemble_prompt(q, text_hits_chroma[:2], image_hits_chroma[:2]))

Prompt from retrieval functions:
System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: How to prepare a wall for painting?
Evidence:
- Text: [doc3] This document discusses painting for the walls shown in Prep_for_painting.png. | [doc1] This document discusses painting for the walls shown in Remove_old_paint.png.
- Images: [img2] a book with three different brushes and a blue background | [img1] a person using a nail to remove a wood floor
Answer:


Prompt from Chroma DB results:
System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: How to prepare a wall for painting?
Evidence:
- Text: [doc3] This document discusses painting for the walls shown in Prep_for_painting.png. | [doc1] This document discusses painting for the walls shown in Remove_old_paint.png.
- Images: [img2] a book with three different brushes and a blue background
Answer:



In [13]:
print('A) Text-only -> retrieve text + images-by-text')
q1 = 'Summarize how to remove old paint'
print('Text:', retrieve_text(q1,3)); print('Images-from-text:', retrieve_image_by_text(q1,3))
print('\nB) Image-only -> retrieve related docs & similar images')
t_hits, i_hits = retrieve_by_image('img2', 3); print('Docs:', t_hits); print('Images:', i_hits)

A) Text-only -> retrieve text + images-by-text
Text: [('doc1', 0.8457539081573486), ('doc3', 0.8250333070755005), ('doc2', 0.7813454866409302)]
Images-from-text: [('img2', 0.30430376529693604), ('img1', 0.27715954184532166), ('img3', 0.22156739234924316)]

B) Image-only -> retrieve related docs & similar images
Docs: [('doc2', 0.3305499851703644), ('doc1', 0.3010556399822235), ('doc3', 0.29564806818962097)]
Images: [('img1', 0.6604979634284973), ('img3', 0.5231462121009827)]


In [14]:
# Save environment to JSON file
import subprocess
import json
import sys

def save_environment_to_json(filename="env_multi_rag_adv.json"):
    result = subprocess.run([sys.executable, '-m', 'pip', 'list', '--format=json'],
                          capture_output=True, text=True, check=True)
    packages = json.loads(result.stdout)

    env_info = {
        "python_version": sys.version,
        "platform": sys.platform,
        "packages": packages
    }

    with open(filename, 'w') as f:
        json.dump(env_info, f, indent=2)

    print(f"Environment saved to {filename}")

save_environment_to_json("env_rag_adv.json")

Environment saved to env_rag_adv.json
