In [None]:
import os
import io
import uuid
import torch
import fitz
import docx
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import chromadb
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [3]:

DATA_FOLDER = "/home/logan78/projects/sih/database" # Folder containing documents
PERSIST_DIR = "embeddings7/chromadb8"  # Directory to persist ChromaDB
CHUNKS_DIR = "chunks"  # Folder to store separate text chunks
TEXT_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5" # Text embedding model
IMAGE_MODEL_NAME = "openai/clip-vit-large-patch14"  # Image embedding
MAX_WORDS_PER_CHUNK = 2000
device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(CHUNKS_DIR, exist_ok=True)


In [4]:
text_model = SentenceTransformer(TEXT_MODEL_NAME, device=device, trust_remote_code=True)
text_model.max_seq_length = 4096
text_model.eval()

clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
clip_model.eval()

<All keys matched successfully>
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [5]:
def embed_image(image: Image.Image) -> np.ndarray:
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        features = features / features.norm(dim=-1, keepdim=True)
    return features.squeeze().cpu().numpy()

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=MAX_WORDS_PER_CHUNK,     
    chunk_overlap=200                  
)

def split_text_to_chunks(text: str):
    """
    Split text into chunks using LangChain RecursiveCharacterTextSplitter.
    """
    if not text.strip():
        return []
    chunks = splitter.split_text(text)
    return chunks


In [6]:
def split_text_to_chunks(text: str, max_words: int = MAX_WORDS_PER_CHUNK):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk_words = words[start:start+max_words]
        chunks.append(" ".join(chunk_words))
        start += max_words
    return chunks

def extract_text_from_pdf(pdf_path: str):
    txts = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            txts.append(page.get_text("text"))
    return txts  # Return list of pages

def extract_images_from_pdf(pdf_path: str):
    images = []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    pil_image = Image.open(io.BytesIO(base_image["image"])).convert("RGB")
                    image_id = f"{os.path.basename(pdf_path)}_page_{i}_img_{img_index}"
                    images.append((i, pil_image, image_id))
                except Exception as e:
                    print(f"[!] Error extracting image {img_index} on page {i}: {e}")
    return images

def extract_text_from_docx(path: str):
    doc = docx.Document(path)
    return [p.text for p in doc.paragraphs if p.text.strip()]

In [7]:
def embed_text_chunk(text: str) -> np.ndarray:
    """
    Generate embedding for a text chunk using SentenceTransformer.
    """
    if not text.strip():
        return None
    emb = text_model.encode(text, convert_to_tensor=True, normalize_embeddings=True)
    return emb.cpu().numpy()


In [8]:
client = chromadb.PersistentClient(path=PERSIST_DIR)
collection_name = "multimodal_embeddings"

if collection_name in [c.name for c in client.list_collections()]:
    col = client.get_collection(collection_name)
else:
    col = client.create_collection(name=collection_name)

def store_in_chroma(ids, embeddings, metadatas, documents):
    col.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=documents)


In [9]:

def process_folder_incremental(folder_path: str):
    file_counter = 0
    image_store = {}

    for root, _, files in os.walk(folder_path):
        for file in tqdm(files):
            path = os.path.join(root, file)
            ext = os.path.splitext(file)[1].lower()
            try:
                if ext == ".pdf":
                    pdf_pages = extract_text_from_pdf(path)
                    for page_index, page_text in enumerate(pdf_pages):
                        chunks = split_text_to_chunks(page_text)
                        for chunk_id, chunk_text in enumerate(chunks):
                            emb = embed_text_chunk(chunk_text)
                            if emb is None: continue
                            uid = f"txt-{file_counter}-{uuid.uuid4()}"
                            meta = {
                                "path": path,
                                "filename": file,
                                "page": page_index,
                                "type": "text",
                                "image_id": ""
                            }
                            store_in_chroma([uid], [emb.tolist()], [meta], [chunk_text])
                            chunk_filename = f"{os.path.splitext(file)[0]}_p{page_index}_c{chunk_id}.txt"
                            with open(os.path.join(CHUNKS_DIR, chunk_filename), "w", encoding="utf-8") as f:
                                f.write(chunk_text)

                            file_counter += 1

                    
                    pdf_images = extract_images_from_pdf(path)
                    for page_index, pil_image, image_id in pdf_images:
                        emb = embed_image(pil_image)
                        uid = f"img-{file_counter}-{uuid.uuid4()}"
                        meta = {
                            "path": path,
                            "filename": file,
                            "page": page_index,
                            "type": "image",
                            "image_id": image_id
                        }
                        store_in_chroma([uid], [emb.tolist()], [meta], [f"[Image: {image_id}]"])
                        image_store[image_id] = pil_image
                        file_counter += 1
                    print(f"[+] Stored PDF: {file}")
                elif ext == ".docx":
                    paragraphs = extract_text_from_docx(path)
                    for chunk_id, chunk_text in enumerate(split_text_to_chunks("\n".join(paragraphs))):
                        emb = embed_text_chunk(chunk_text)
                        if emb is None: continue
                        uid = f"txt-{file_counter}-{uuid.uuid4()}"
                        meta = {
                            "path": path,
                            "filename": file,
                            "page": -1,
                            "type": "text",
                            "image_id": ""
                        }
                        store_in_chroma([uid], [emb.tolist()], [meta], [chunk_text])
                        chunk_filename = f"{os.path.splitext(file)[0]}_p-1_c{chunk_id}.txt"
                        with open(os.path.join(CHUNKS_DIR, chunk_filename), "w", encoding="utf-8") as f:
                            f.write(chunk_text)

                        file_counter += 1
                    print(f"[+] Stored DOCX: {file}")
                elif ext == ".txt":
                    with open(path, "r", encoding="utf-8") as f:
                        text = f.read()
                    for chunk_id, chunk_text in enumerate(split_text_to_chunks(text)):
                        emb = embed_text_chunk(chunk_text)
                        if emb is None: continue
                        uid = f"txt-{file_counter}-{uuid.uuid4()}"
                        meta = {
                            "path": path,
                            "filename": file,
                            "page": -1,
                            "type": "text",
                            "image_id": ""
                        }
                        store_in_chroma([uid], [emb.tolist()], [meta], [chunk_text])
                        chunk_filename = f"{os.path.splitext(file)[0]}_p-1_c{chunk_id}.txt"
                        with open(os.path.join(CHUNKS_DIR, chunk_filename), "w", encoding="utf-8") as f:
                            f.write(chunk_text)

                        file_counter += 1
                    print(f"[+] Stored TXT: {file}")
                elif ext in [".png", ".jpg", ".jpeg", ".bmp"]:
                    pil_image = Image.open(path).convert("RGB")
                    emb = embed_image(pil_image)
                    uid = f"img-{file_counter}-{uuid.uuid4()}"
                    meta = {
                        "path": path,
                        "filename": file,
                        "page": -1,
                        "type": "image",
                        "image_id": ""
                    }
                    store_in_chroma([uid], [emb.tolist()], [meta], [f"[Image: {file}]"])
                    image_store[uid] = pil_image
                    file_counter += 1
                    print(f"[+] Stored standalone image: {file}")

            except Exception as e:
                print(f"[!] Error processing {file}: {e}")
                continue

    print(f"[✓] Done! Processed {file_counter} items in total.")
    return image_store

if __name__ == "__main__":
    image_data_store = process_folder_incremental(DATA_FOLDER)


  0%|          | 0/2 [00:00<?, ?it/s]

 50%|█████     | 1/2 [00:00<00:00,  1.37it/s]

[+] Stored standalone image: image.png


100%|██████████| 2/2 [00:09<00:00,  4.87s/it]

[+] Stored PDF: NIPS-2017-attention-is-all-you-need-Paper.pdf
[✓] Done! Processed 15 items in total.





In [9]:
import numpy as np
import torch
from PIL import Image

def retrieve_from_chroma(query_text=None, query_image=None, top_k=10, mode=None):
    """
    Retrieve top_k items from ChromaDB using multimodal query modes.

    Args:
        query_text (str): Text query
        query_image (str | PIL.Image): Image path or PIL.Image
        top_k (int): Number of results to return
        mode (str): Retrieval mode:
            - "text_to_text": text using SentenceTransformer
            - "image_to_image": image using CLIP image encoder
            - "text_to_image": text using CLIP text encoder
            - "text_and_image_to_image": combine CLIP text & image
            - "image_to_text": image using CLIP image encoder (search text space)
            If not provided, inferred automatically.

    Returns:
        List[dict]: Retrieved items with id, document, metadata, distance.
    """


    if not query_text and not query_image:
        raise ValueError("Provide at least a text or image query.")

    if mode is None:
        if query_text and not query_image:
            mode = "text_to_text"
        elif not query_text and query_image:
            mode = "image_to_image"
        elif query_text and query_image:
            mode = "text_and_image_to_image"
        else:
            raise ValueError("Unable to infer retrieval mode.")

    query_emb = None

    if mode == "text_to_text":
        emb_text = text_model.encode(
            query_text,
            convert_to_tensor=True,
            normalize_embeddings=True
        ).cpu().numpy()
        query_emb = emb_text.reshape(1, -1)


    elif mode == "image_to_image":
        if isinstance(query_image, str):
            query_image = Image.open(query_image).convert("RGB")
        emb_image = embed_image(query_image)
        query_emb = emb_image.reshape(1, -1)


    elif mode == "text_to_image":
        inputs = clip_processor(text=query_text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            emb_text_clip = clip_model.get_text_features(**inputs)
            emb_text_clip = emb_text_clip / emb_text_clip.norm(dim=-1, keepdim=True)
        query_emb = emb_text_clip.cpu().numpy().reshape(1, -1)


    elif mode == "text_and_image_to_image":
        if isinstance(query_image, str):
            query_image = Image.open(query_image).convert("RGB")

      
        inputs_text = clip_processor(text=query_text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            emb_text_clip = clip_model.get_text_features(**inputs_text)
            emb_text_clip = emb_text_clip / emb_text_clip.norm(dim=-1, keepdim=True)

   
        inputs_img = clip_processor(images=query_image, return_tensors="pt").to(device)
        with torch.no_grad():
            emb_image_clip = clip_model.get_image_features(**inputs_img)
            emb_image_clip = emb_image_clip / emb_image_clip.norm(dim=-1, keepdim=True)

    
        emb_combined = (emb_text_clip + emb_image_clip) / 2
        emb_combined = emb_combined / emb_combined.norm(dim=-1, keepdim=True)
        query_emb = emb_combined.cpu().numpy().reshape(1, -1)


    elif mode == "image_to_text":
        if isinstance(query_image, str):
            query_image = Image.open(query_image).convert("RGB")

        inputs = clip_processor(images=query_image, return_tensors="pt").to(device)
        with torch.no_grad():
            emb_image_clip = clip_model.get_image_features(**inputs)
            emb_image_clip = emb_image_clip / emb_image_clip.norm(dim=-1, keepdim=True)
        query_emb = emb_image_clip.cpu().numpy().reshape(1, -1)

    else:
        raise ValueError(f"Unknown mode: {mode}")

  
    results = col.query(
        query_embeddings=query_emb.tolist(),
        n_results=top_k
    )


    retrieved = []
    for idx, doc_id in enumerate(results["ids"][0]):
        retrieved.append({
            "id": doc_id,
            "document": results["documents"][0][idx],
            "metadata": results["metadatas"][0][idx],
            "distance": results["distances"][0][idx],
            "retrieval_mode": mode
        })

    return retrieved


In [11]:
results = retrieve_from_chroma(query_text = "a pie chart containing document by subject area" ,top_k=10,mode="text_to_image")

for r in results:
    print("ID:", r['id'])
    print("Document:", r['document'])
    print("Metadata:", r['metadata'])
    print("Distance:", r['distance'])
    print("-"*50)


ID: img-0-8025f59f-9fd5-4ada-84ba-475a53b919c6
Document: [Image: image.png]
Metadata: {'page': -1, 'filename': 'image.png', 'image_id': '', 'type': 'image', 'path': '/home/logan78/projects/sih/database/image.png'}
Distance: 1.4480831623077393
--------------------------------------------------
ID: img-13-d261ece2-e64f-46d1-900b-27be3cb08c5d
Document: [Image: NIPS-2017-attention-is-all-you-need-Paper.pdf_page_3_img_0]
Metadata: {'page': 3, 'image_id': 'NIPS-2017-attention-is-all-you-need-Paper.pdf_page_3_img_0', 'path': '/home/logan78/projects/sih/database/NIPS-2017-attention-is-all-you-need-Paper.pdf', 'filename': 'NIPS-2017-attention-is-all-you-need-Paper.pdf', 'type': 'image'}
Distance: 1.6274502277374268
--------------------------------------------------
ID: img-12-21634407-6690-4a24-8569-d45e9e390dc8
Document: [Image: NIPS-2017-attention-is-all-you-need-Paper.pdf_page_2_img_0]
Metadata: {'page': 2, 'filename': 'NIPS-2017-attention-is-all-you-need-Paper.pdf', 'type': 'image', 'image_