In [1]:
# Block 1: Installations
%pip install -qU langchain langchain-google-genai langchain_community pymupdf pillow chromadb transformers torch sentence-transformers
# Restart the kernel after this cell so the upgraded packages are picked up.

Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.37.1 requires pillow<11,>=7.1.0, but you have pillow 11.3.0 which is incompatible.
xformers 0.0.29.post3 requires torch==2.6.0, but you have torch 2.8.0 which is incompatible.


In [2]:
%pip install arxiv

Collecting arxiv
  Using cached arxiv-2.2.0-py3-none-any.whl.metadata (6.3 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Using cached arxiv-2.2.0-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py): started
  Building wheel for sgmllib3k (setup.py): finished with status 'done'
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6060 sha256=851e99e613c1f39681f9e783ebd14c91d5e7724d4389541373d05e181a3e22ef
  Stored in directory: c:\users\aagam\appdata\local\pip\cache\wheels\03\f5\1a\23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246
Successfully built sgmllib3k
Installing collecte

In [29]:
# utils.py

import os
import getpass
import fitz  # PyMuPDF
from PIL import Image
import io
import arxiv
import uuid

# LangChain Imports
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.docstore.document import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


# --- API Key Setup ---
# Reuse a key that is already exported in the environment; otherwise prompt for
# it without echoing, so the secret never ends up stored in the notebook.
if "GOOGLE_API_KEY" in os.environ:
    print("✅ API Key is already set.")
else:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API Key: ")
    print("✅ API Key has been set for this session.")

✅ API Key is already set.


In [42]:
import re
import arxiv
import os

def find_and_download_paper(query: str, min_pages: int = 15, max_pages: int = 25, download_path: str = ".", max_results: int = 20):
    """
    Searches ArXiv for papers on a topic and downloads the first result
    that falls within the specified page range.

    The page count is parsed from the free-text comment attached to each
    result (e.g. "24 pages, 10 figures"); results whose comment does not
    mention a page count are skipped.

    Args:
        query: Search string passed to ArXiv.
        min_pages: Smallest acceptable page count (inclusive).
        max_pages: Largest acceptable page count (inclusive).
        download_path: Directory the PDF is saved into. Created if missing.
        max_results: How many of the most relevant results to inspect.

    Returns:
        ``{"title": <paper title>, "filepath": <path of the saved PDF>}`` for
        the first paper that matched and downloaded successfully, or ``None``
        when nothing matched, every download failed, or the search raised.
    """
    print(f"Searching ArXiv for papers on '{query}' between {min_pages}-{max_pages} pages...")
    try:
        client = arxiv.Client(page_size=20, delay_seconds=3, num_retries=3)
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )

        for result in client.results(search):
            # `comment` may be None; the regex alone decides whether a page
            # count is present, so no separate substring pre-check is needed.
            match = re.search(r'(\d+)\s*pages', result.comment or "", re.IGNORECASE)
            if not match:
                continue

            pages = int(match.group(1))
            print(f"  > Found '{result.title}' ({pages} pages)... checking range.")
            if not (min_pages <= pages <= max_pages):
                continue

            print("    > Match found! Attempting to download...")
            try:
                # The last segment of the entry URL is used as the file name.
                filename = f"{result.entry_id.split('/')[-1]}.pdf"
                filepath = os.path.join(download_path, filename)
                # Without this, a missing target directory makes every
                # download fail and the function silently returns None.
                if download_path:
                    os.makedirs(download_path, exist_ok=True)
                result.download_pdf(dirpath=download_path, filename=filename)
                print(f"Successfully downloaded '{result.title}'")
                return {"title": result.title, "filepath": filepath}
            except Exception as e:
                # Best effort: a failed download should not stop the search.
                print(f"    > Download failed: {e}. Trying the next paper.")

        print(f"Sorry, could not find a paper between {min_pages}-{max_pages} pages in the top {max_results} results.")
        return None

    except Exception as e:
        print(f"An error occurred during the ArXiv search: {e}")
        return None

In [3]:
# --- Step 2: Extract Elements from PDF ---
def extract_pdf_elements(pdf_path):
    """Extracts text and images from a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(text_chunks, images)`` tuple. ``text_chunks`` holds one string per
        page that has non-whitespace text, in page order. ``images`` holds one
        PIL image per image reference found on each page, in page order.
    """
    text_chunks, images = [], []
    # The context manager closes the document, and its underlying file handle,
    # even when a page or an embedded image fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()
            if text.strip():
                text_chunks.append(text)
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                # The image bytes are wrapped in their own in-memory buffer, so
                # the PIL image does not depend on the document staying open.
                images.append(Image.open(io.BytesIO(base_image["image"])))
    return text_chunks, images

In [30]:
# --- Step 3: Generate Image Summaries (Corrected Version) ---
def generate_image_summaries(images):
    """
    Generates text summaries for a list of images using Gemini.

    Each image is re-encoded as a JPEG, sent to the model as a base64 data URI
    together with a fixed description prompt, and the model's reply is
    collected. A one-second pause follows every request for rate limiting.

    Args:
        images: Iterable of PIL images (anything with ``mode``, ``convert``
            and ``save``), e.g. the images returned by ``extract_pdf_elements``.

    Returns:
        A list with one summary per input image, in the same order.

    Raises:
        Whatever the model client raises; a failed request aborts the run.
    """
    import base64
    import io
    import time

    model = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0,
        google_api_key=os.environ.get("GOOGLE_API_KEY")
    )

    summaries = []
    print(f"Summarizing {len(images)} images (1 per second)...")

    for i, img in enumerate(images):
        buffered = io.BytesIO()
        # JPEG cannot store alpha, palette, 1-bit or 16-bit modes; PDFs often
        # embed such images, and saving them unconverted raises OSError.
        # Normalise everything except plain greyscale/RGB to RGB first.
        if img.mode not in ('RGB', 'L'):
            img = img.convert('RGB')
        img.save(buffered, format="JPEG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

        image_content = {
            "type": "image_url",
            "image_url": f"data:image/jpeg;base64,{img_base64}",
        }

        prompt = [HumanMessage(content=[
            {"type": "text", "text": "Describe this image from a research paper. What is it showing? Be detailed."},
            image_content,
        ])]

        response = model.invoke(prompt)
        summaries.append(response.content)
        print(f"  > Summarized image {i + 1}/{len(images)}")

        # Add a 1-second delay to stay within API rate limits
        time.sleep(1)

    return summaries

In [44]:
# --- Step 4: Build the Multi-Vector Retriever ---
def build_multimodal_retriever(pdf_path):
    """Builds the core RAG retriever from a PDF file.

    Page texts and image summaries are embedded into a Chroma vector store.
    The raw page text and the raw PIL images are kept in an in-memory docstore
    and linked to their vectors through the ``doc_id`` metadata key, so a hit
    on an image summary returns the image itself.

    Args:
        pdf_path: Path of the PDF to index.

    Returns:
        A ``MultiVectorRetriever`` configured to fetch the top 5 matches.
    """
    id_key = "doc_id"

    # The vectorstore to use to index the child chunks.
    # NOTE(review): the in-process Chroma client appears to be shared between
    # instances, so a fixed collection name would accumulate vectors from
    # earlier builds whose ids are missing from the new docstore. A unique
    # name per build keeps each retriever isolated. Confirm against the Chroma docs.
    vectorstore = Chroma(
        collection_name=f"multimodal_rag_{uuid.uuid4().hex}",
        embedding_function=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    )
    # The storage layer for the parent documents
    docstore = InMemoryStore()

    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=docstore,
        id_key=id_key,
        search_kwargs={'k': 5}
    )

    # Extract elements
    raw_text_chunks, raw_images = extract_pdf_elements(pdf_path)

    # Add text to retriever. Skipped for PDFs with no text layer: indexing an
    # empty batch is pointless and may be rejected by the vector store.
    if raw_text_chunks:
        doc_ids_text = [str(uuid.uuid4()) for _ in raw_text_chunks]
        text_docs = [
            Document(page_content=chunk, metadata={id_key: doc_id})
            for doc_id, chunk in zip(doc_ids_text, raw_text_chunks)
        ]
        retriever.vectorstore.add_documents(text_docs)
        retriever.docstore.mset(list(zip(doc_ids_text, raw_text_chunks)))

    # Add images and their summaries to retriever: the summary is what gets
    # embedded, while the docstore keeps the original image under the same id.
    if raw_images:
        image_summaries = generate_image_summaries(raw_images)
        doc_ids_img = [str(uuid.uuid4()) for _ in raw_images]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids_img, image_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids_img, raw_images)))

    return retriever


In [52]:
# In your notebook cell

def create_rag_chain(retriever):
    """Creates the final RAG chain for querying.

    The chain retrieves the stored documents for a question, turns them into
    multimodal message parts, wraps them in a single prompt, sends that to
    Gemini and returns the reply as a plain string.

    Args:
        retriever: Runnable that maps a question string to a list of stored
            documents, each either a ``str`` (page text) or a PIL image.

    Returns:
        A runnable chain; call ``chain.invoke(question)`` with a string.
    """
    import base64
    import io

    def _to_jpeg_data_uri(img):
        """Encodes a PIL image as a base64 JPEG data URI."""
        buffered = io.BytesIO()
        # JPEG cannot store alpha, palette, 1-bit or 16-bit modes; saving such
        # an image unconverted raises OSError, so normalise to RGB first.
        if img.mode not in ('RGB', 'L'):
            img = img.convert('RGB')
        img.save(buffered, format="JPEG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
        return f"data:image/jpeg;base64,{img_base64}"

    def format_context_for_gemini(docs):
        """
        Formats retrieved documents (text and images) for the Gemini model.
        Images are converted to Base64 data URIs.
        """
        context_parts = []
        for doc in docs:
            # The MultiVectorRetriever returns the raw doc from the docstore.
            # It can be a string (from text chunks) or a PIL Image object;
            # anything else is ignored.
            if isinstance(doc, str):
                context_parts.append({"type": "text", "text": doc})
            elif isinstance(doc, Image.Image):
                context_parts.append({
                    "type": "image_url",
                    "image_url": _to_jpeg_data_uri(doc),
                })
        return context_parts

    def create_prompt(context_parts, question):
        """Creates a multimodal prompt from the context parts and question."""
        prompt_str = f"""You are an expert research assistant. Synthesize a comprehensive answer to the user's question using all the provided context below. The context contains both text excerpts and images from a research paper. You must use information from both the text and the images to form your answer.

        Question: {question}

        Context:
        """
        # The instruction text comes first, followed by the retrieved parts.
        final_prompt_content = [{"type": "text", "text": prompt_str}]
        final_prompt_content.extend(context_parts)

        return [HumanMessage(content=final_prompt_content)]

    # Model that answers the final question
    model = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0.2,
        google_api_key=os.environ.get("GOOGLE_API_KEY")
    )

    # The question is fanned out to the retriever (for context) and passed
    # through unchanged, then both are merged into one prompt.
    chain = (
        {"context": retriever | RunnableLambda(format_context_for_gemini), "question": RunnablePassthrough()}
        | RunnableLambda(lambda x: create_prompt(x["context"], x["question"]))
        | model
        | StrOutputParser()
    )
    return chain

In [47]:
# Block 4: Define your topic and question, then run the entire process.

# `Markdown` is not a builtin; without this import the cell only works when the
# name happens to be left over in the kernel from an earlier session.
from IPython.display import Markdown, display

# 1. SET YOUR RESEARCH TOPIC HERE
topic = "multimodal large language models for robotics"

# 2. Find and process the paper
paper_info = find_and_download_paper(topic)

# Nothing below runs when no paper was found (the helper already reported why).
if paper_info:
    # 3. Build the RAG retriever for the downloaded paper
    retriever = build_multimodal_retriever(paper_info['filepath'])

    # 4. Create the final query chain
    rag_chain = create_rag_chain(retriever)

    # 5. SET YOUR QUESTION ABOUT THE PAPER HERE
    question = "According to the paper, what are the main challenges in applying multimodal models to robotics? Refer to any diagrams if possible."

    print("\n" + "="*50)
    print(f"Asking question: {question}")
    print("="*50 + "\n")

    # 6. Invoke the chain and get the answer
    response = rag_chain.invoke(question)

    # 7. Display the final answer
    display(Markdown(response))

Searching ArXiv for papers on 'multimodal large language models for robotics' between 15-25 pages...
  > Found 'Multimodal Fusion and Vision-Language Models: A Survey for Robot Vision' (27 pages)... checking range.
  > Found 'Integrating Large Language Models with Multimodal Virtual Reality Interfaces to Support Collaborative Human-Robot Construction Work' (39 pages)... checking range.
  > Found 'Multimodal Spatial Language Maps for Robot Navigation and Manipulation' (24 pages)... checking range.
    > Match found! Attempting to download...
Successfully downloaded 'Multimodal Spatial Language Maps for Robot Navigation and Manipulation'




Summarizing 14 images (1 per second)...
  > Summarized image 1/14
  > Summarized image 2/14
  > Summarized image 3/14
  > Summarized image 4/14
  > Summarized image 5/14
  > Summarized image 6/14
  > Summarized image 7/14
  > Summarized image 8/14
  > Summarized image 9/14
  > Summarized image 10/14
  > Summarized image 11/14
  > Summarized image 12/14
  > Summarized image 13/14
  > Summarized image 14/14

Asking question: According to the paper, what are the main challenges in applying multimodal models to robotics? Refer to any diagrams if possible.



According to the paper, the main challenges in applying multimodal models to robotics are:

1.  **Data Collection and Annotation:** It is difficult to acquire and accurately label high-dimensional, synchronized, and diverse data from robotic interactions, as this data is dynamic and highly dependent on the robot's state and environment.
2.  **Real-time Inference and Low-latency Control:** Complex multimodal models are computationally intensive, leading to high latency that is unacceptable for the immediate responses required for safe and effective real-time robotic control.
3.  **Generalization and Robustness:** Models trained on limited datasets struggle to adapt and perform reliably in novel, unstructured real-world environments, or when encountering unseen conditions or variations.
4.  **Interpretability and Explainability:** The black-box nature of many deep learning models makes it difficult to understand the reasoning behind a robot's decisions, which hinders debugging, diagnosis of failures, and building trust in autonomous systems.

Figure 2, titled "Challenges in Multimodal Robotics," explicitly illustrates these four points: "Data Collection & Annotation," "Real-time Inference & Control," "Generalization & Robustness," and "Interpretability & Explainability," along with brief descriptions for each.

In [53]:
# Ask a follow-up question against the chain built in the previous cell.
# `Markdown` is not a builtin; import it so this cell runs on a fresh kernel.
from IPython.display import Markdown, display

question = input("Enter your question: ")

print("\n" + "="*50)
print(f"Asking question: {question}")
print("="*50 + "\n")

# Invoke the chain (requires `rag_chain` from the previous cell) and get the answer
response = rag_chain.invoke(question)

# Display the final answer
display(Markdown(response))


Asking question: what is the important point research paper is trying to tell 



Based on the provided images, the important point the research paper is trying to convey is the ability to perform **cross-modal reasoning** (combining visual/object information with auditory/sound information) within a 3D environment.

This is demonstrated through:
1.  **VLMap Creation and Landmark Indexing:** Building a 3D map that integrates visual information and can be semantically indexed with open-vocabulary text labels for both objects and sounds.
2.  **Combined Queries:** The system can answer complex queries that relate objects to sounds (e.g., "the backpack near the sound of glass breaking" or "the shelf near the sound of door knock").
3.  **Explicit Cross-Modal Reasoning:** The final image explicitly shows how "Sound Prediction" and "Object Prediction" are combined through "Cross-Modal Reasoning" to answer a query like "the sound of baby crying near the sofa," indicating the system's capability to understand and locate entities based on their inter-modal spatial relationships.