In [29]:
import os
import io
import uuid
import torch
import fitz
import docx
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import chromadb
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import json
load_dotenv()

True

In [13]:

DATA_FOLDER = "/home/logan78/projects/sih/database" # Folder containing documents
PERSIST_DIR = "embeddings7/chromadb8"  # Directory to persist ChromaDB
CHUNKS_DIR = "chunks"  # Folder to store separate text chunks
TEXT_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5" # Text embedding model
IMAGE_MODEL_NAME = "openai/clip-vit-large-patch14"  # Image embedding
MAX_WORDS_PER_CHUNK = 2000
device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(CHUNKS_DIR, exist_ok=True)


In [14]:
text_model = SentenceTransformer(TEXT_MODEL_NAME, device=device, trust_remote_code=True)
text_model.max_seq_length = 4096
text_model.eval()

clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
clip_model.eval()

<All keys matched successfully>


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [15]:
def embed_image(image: Image.Image) -> np.ndarray:
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        features = features / features.norm(dim=-1, keepdim=True)
    return features.squeeze().cpu().numpy()

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=MAX_WORDS_PER_CHUNK,     
    chunk_overlap=200                  
)

def split_text_to_chunks(text: str):
    """
    Split text into chunks using LangChain RecursiveCharacterTextSplitter.
    """
    if not text.strip():
        return []
    chunks = splitter.split_text(text)
    return chunks
def split_text_to_chunks(text: str, max_words: int = MAX_WORDS_PER_CHUNK):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk_words = words[start:start+max_words]
        chunks.append(" ".join(chunk_words))
        start += max_words
    return chunks

def extract_text_from_pdf(pdf_path: str):
    txts = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            txts.append(page.get_text("text"))
    return txts  # Return list of pages

def extract_images_from_pdf(pdf_path: str):
    images = []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    pil_image = Image.open(io.BytesIO(base_image["image"])).convert("RGB")
                    image_id = f"{os.path.basename(pdf_path)}_page_{i}_img_{img_index}"
                    images.append((i, pil_image, image_id))
                except Exception as e:
                    print(f"[!] Error extracting image {img_index} on page {i}: {e}")
    return images

def extract_text_from_docx(path: str):
    doc = docx.Document(path)
    return [p.text for p in doc.paragraphs if p.text.strip()]

In [16]:
client = chromadb.PersistentClient(path=PERSIST_DIR)
collection_name = "multimodal_embeddings"

if collection_name in [c.name for c in client.list_collections()]:
    col = client.get_collection(collection_name)
else:
    col = client.create_collection(name=collection_name)

def store_in_chroma(ids, embeddings, metadatas, documents):
    col.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=documents)


In [17]:
import numpy as np
import torch
from PIL import Image

def retrieve_from_chroma(query_text=None, query_image=None, top_k=10, mode=None):
    """
    Retrieve top_k items from ChromaDB using multimodal query modes.

    Args:
        query_text (str): Text query
        query_image (str | PIL.Image): Image path or PIL.Image
        top_k (int): Number of results to return
        mode (str): Retrieval mode:
            - "text_to_text": text using SentenceTransformer
            - "image_to_image": image using CLIP image encoder
            - "text_to_image": text using CLIP text encoder
            - "text_and_image_to_image": combine CLIP text & image
            - "image_to_text": image using CLIP image encoder (search text space)
            If not provided, inferred automatically.

    Returns:
        List[dict]: Retrieved items with id, document, metadata, distance.
    """


    if not query_text and not query_image:
        raise ValueError("Provide at least a text or image query.")

    if mode is None:
        if query_text and not query_image:
            mode = "text_to_text"
        elif not query_text and query_image:
            mode = "image_to_image"
        elif query_text and query_image:
            mode = "text_and_image_to_image"
        else:
            raise ValueError("Unable to infer retrieval mode.")

    query_emb = None

    if mode == "text_to_text":
        emb_text = text_model.encode(
            query_text,
            convert_to_tensor=True,
            normalize_embeddings=True
        ).cpu().numpy()
        query_emb = emb_text.reshape(1, -1)


    elif mode == "image_to_image":
        if isinstance(query_image, str):
            query_image = Image.open(query_image).convert("RGB")
        emb_image = embed_image(query_image)
        query_emb = emb_image.reshape(1, -1)


    elif mode == "text_to_image":
        inputs = clip_processor(text=query_text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            emb_text_clip = clip_model.get_text_features(**inputs)
            emb_text_clip = emb_text_clip / emb_text_clip.norm(dim=-1, keepdim=True)
        query_emb = emb_text_clip.cpu().numpy().reshape(1, -1)


    elif mode == "text_and_image_to_image":
        if isinstance(query_image, str):
            query_image = Image.open(query_image).convert("RGB")

      
        inputs_text = clip_processor(text=query_text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            emb_text_clip = clip_model.get_text_features(**inputs_text)
            emb_text_clip = emb_text_clip / emb_text_clip.norm(dim=-1, keepdim=True)

   
        inputs_img = clip_processor(images=query_image, return_tensors="pt").to(device)
        with torch.no_grad():
            emb_image_clip = clip_model.get_image_features(**inputs_img)
            emb_image_clip = emb_image_clip / emb_image_clip.norm(dim=-1, keepdim=True)

    
        emb_combined = (emb_text_clip + emb_image_clip) / 2
        emb_combined = emb_combined / emb_combined.norm(dim=-1, keepdim=True)
        query_emb = emb_combined.cpu().numpy().reshape(1, -1)


    elif mode == "image_to_text":
        if isinstance(query_image, str):
            query_image = Image.open(query_image).convert("RGB")

        inputs = clip_processor(images=query_image, return_tensors="pt").to(device)
        with torch.no_grad():
            emb_image_clip = clip_model.get_image_features(**inputs)
            emb_image_clip = emb_image_clip / emb_image_clip.norm(dim=-1, keepdim=True)
        query_emb = emb_image_clip.cpu().numpy().reshape(1, -1)

    else:
        raise ValueError(f"Unknown mode: {mode}")

  
    results = col.query(
        query_embeddings=query_emb.tolist(),
        n_results=top_k
    )


    retrieved = []
    for idx, doc_id in enumerate(results["ids"][0]):
        retrieved.append({
            "id": doc_id,
            "document": results["documents"][0][idx],
            "metadata": results["metadatas"][0][idx],
            "distance": results["distances"][0][idx],
            "retrieval_mode": mode
        })

    return retrieved


In [37]:
results = retrieve_from_chroma(query_text = "tell me about attention mechanism and how it works can you show figure and diagam " ,top_k=10,mode="text_to_image")

for r in results:
    print("ID:", r['id'])
    print("Document:", r['document'])
    print("Metadata:", r['metadata'])
    print("Distance:", r['distance'])
    print("-"*50)


ID: img-13-d261ece2-e64f-46d1-900b-27be3cb08c5d
Document: [Image: NIPS-2017-attention-is-all-you-need-Paper.pdf_page_3_img_0]
Metadata: {'filename': 'NIPS-2017-attention-is-all-you-need-Paper.pdf', 'path': '/home/logan78/projects/sih/database/NIPS-2017-attention-is-all-you-need-Paper.pdf', 'type': 'image', 'page': 3, 'image_id': 'NIPS-2017-attention-is-all-you-need-Paper.pdf_page_3_img_0'}
Distance: 1.5104589462280273
--------------------------------------------------
ID: img-12-21634407-6690-4a24-8569-d45e9e390dc8
Document: [Image: NIPS-2017-attention-is-all-you-need-Paper.pdf_page_2_img_0]
Metadata: {'page': 2, 'image_id': 'NIPS-2017-attention-is-all-you-need-Paper.pdf_page_2_img_0', 'path': '/home/logan78/projects/sih/database/NIPS-2017-attention-is-all-you-need-Paper.pdf', 'filename': 'NIPS-2017-attention-is-all-you-need-Paper.pdf', 'type': 'image'}
Distance: 1.5252233743667603
--------------------------------------------------
ID: img-14-c74099d6-738e-4e7c-935b-69261f337cd6
Docume

In [39]:
results

[{'id': 'img-13-d261ece2-e64f-46d1-900b-27be3cb08c5d',
  'document': '[Image: NIPS-2017-attention-is-all-you-need-Paper.pdf_page_3_img_0]',
  'metadata': {'filename': 'NIPS-2017-attention-is-all-you-need-Paper.pdf',
   'path': '/home/logan78/projects/sih/database/NIPS-2017-attention-is-all-you-need-Paper.pdf',
   'type': 'image',
   'page': 3,
   'image_id': 'NIPS-2017-attention-is-all-you-need-Paper.pdf_page_3_img_0'},
  'distance': 1.5104589462280273,
  'retrieval_mode': 'text_to_image'},
 {'id': 'img-12-21634407-6690-4a24-8569-d45e9e390dc8',
  'document': '[Image: NIPS-2017-attention-is-all-you-need-Paper.pdf_page_2_img_0]',
  'metadata': {'page': 2,
   'image_id': 'NIPS-2017-attention-is-all-you-need-Paper.pdf_page_2_img_0',
   'path': '/home/logan78/projects/sih/database/NIPS-2017-attention-is-all-you-need-Paper.pdf',
   'filename': 'NIPS-2017-attention-is-all-you-need-Paper.pdf',
   'type': 'image'},
  'distance': 1.5252233743667603,
  'retrieval_mode': 'text_to_image'},
 {'id': 

In [None]:
os.environ["GOOGLE_API_KEY"] = "AIzaSyC2C2huU3oN-LQEvIZZCNff4vQhWYtXU5Q"


In [36]:
import os
import json
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI

# -----------------------
# LLM Setup
# -----------------------
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=GOOGLE_API_KEY
)
parser = StrOutputParser()

prompt_template = """
You are a powerful multimodal reasoning AI.
You are given:
1. A user query.
2. Retrieved context (text, images, or audio) with metadata.

Task:
- Only use relevant items to answer the query.
- For images or audio, summarize the content.
- Return ONLY JSON with fields:
{{
  "topic_name": "<key topic>",
  "response_text": "<detailed answer>",
  "retrieved_items": [
      {{
          "type": "<text|image|audio>",
          "path": "<path>",
          "page_number": <page_number or -1>,
          "content_summary": "<summary>"
      }}
  ]
}}

Query: {query}
Retrieved Items:
{retrieved_items}
"""

prompt = PromptTemplate.from_template(prompt_template)
chain = prompt | model | parser

# -----------------------
# Device Setup
# -----------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

def summarize_image(path):
    image = Image.open(path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        image_embeds = clip_model.get_image_features(**inputs)
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)

    return f"Image at {path} (CLIP embeddings processed). Likely contains diagrams or visual info."

# -----------------------
# Whisper for Audio Transcription
# -----------------------
whisper_model_name = "openai/whisper-small"
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)
whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)

def summarize_audio(path):
    import torchaudio
    speech_array, sr = torchaudio.load(path)
    inputs = whisper_processor(speech_array, sampling_rate=sr, return_tensors="pt").to(device)
    generated_ids = whisper_model.generate(inputs.input_features)
    transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return f"Audio at {path}: {transcription}"

# -----------------------
# Prepare Retrieved Items
# -----------------------
def prepare_retrieved_items(items):
    prepared = []
    for item in items:
        metadata = item.get("metadata", {})
        item_type = metadata.get("type", "text")
        path = metadata.get("path", "")
        page = metadata.get("page", -1)

        if item_type == "text":
            content_summary = item.get("document", "")
        elif item_type == "image":
            content_summary = summarize_image(path)
        elif item_type == "audio":
            content_summary = summarize_audio(path)
        else:
            content_summary = "[Unknown type]"

        prepared.append({
            "type": item_type,
            "path": path,
            "page_number": page,
            "content_summary": content_summary
        })
    return json.dumps(prepared, indent=2)

# -----------------------
# Main Multimodal Agent
# -----------------------
def multimodal_agent(query, retrieved_items):
    retrieved_json = prepare_retrieved_items(retrieved_items)
    response = chain.invoke({
        "query": query,
        "retrieved_items": retrieved_json
    })
    return response

# -----------------------
# Example Usage
# -----------------------
if __name__ == "__main__":
    query = "give me pie chart about document by subject area"
    retrieved_items = [
        {
            "document": "[Image: image.png]",
            "metadata": {
                "path": "/home/logan78/projects/sih/database/image.png",
                "page": -1,
                "type": "image"
            }
        },
        
        {
            "document": "Transformer uses self-attention for sequence modeling...",
            "metadata": {
                "path": "/home/logan78/projects/sih/database/NIPS-2017-attention-is-all-you-need.pdf",
                "page": 3,
                "type": "text"
            }
        }
    ]

    result = multimodal_agent(query, retrieved_items)
    print(result)


```json
{
  "topic_name": "Document Subject Area Pie Chart",
  "response_text": "The retrieved image at /home/logan78/projects/sih/database/image.png likely contains a pie chart or other diagram related to the subject area of the document. Without further processing of the image, I cannot provide specific details about the pie chart's contents. The text document, 'NIPS-2017-attention-is-all-you-need.pdf' on page 3, discusses the Transformer model and self-attention, which is not directly related to the pie chart or the overall subject distribution of a document.",
  "retrieved_items": [
    {
      "type": "image",
      "path": "/home/logan78/projects/sih/database/image.png",
      "page_number": -1,
      "content_summary": "Image at /home/logan78/projects/sih/database/image.png (CLIP embeddings processed). Likely contains diagrams or visual info."
    }
  ]
}
```


In [None]:
print