### 1. Load Environment Variables

In [24]:
# Load environment variables via python-dotenv before any client is created;
# later cells construct ChatOpenAI / OpenAIEmbeddings without passing credentials explicitly.
from dotenv import load_dotenv
load_dotenv()

True

### 2. Import Required Libraries

In [25]:
# PDF partitioning
from unstructured.partition.pdf import partition_pdf


### 3. Partition PDF into Structured Elements

In [26]:
#  Partition a PDF into elements: headers, paragraphs, images, tables, etc.
#  Path to your PDF
#  NOTE(review): absolute, machine-specific path -- the notebook cannot run on another
#  machine until this points at a local copy of the paper.
pdf_path = r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\data\attention_paper.pdf"

#  Partition the PDF
#  "Image" and "Table" blocks are requested as image crops. With
#  extract_image_block_to_payload=False they are presumably written as files under
#  extract_image_block_output_dir (later cells read .jpg files from an "extracted_docs"
#  folder) -- TODO confirm against the unstructured version in use.
raw_pdf_elements = partition_pdf(
    filename=pdf_path,
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=False,
    extract_image_block_output_dir="extracted_docs"
)

#  Show the returned list of elements as the cell output.
raw_pdf_elements



[<unstructured.documents.elements.Text at 0x27a34edbe70>,
 <unstructured.documents.elements.Text at 0x27a34edbee0>,
 <unstructured.documents.elements.Text at 0x27a34edb930>,
 <unstructured.documents.elements.Text at 0x27a34edb9a0>,
 <unstructured.documents.elements.Text at 0x27a34edba10>,
 <unstructured.documents.elements.Header at 0x27a2507c050>,
 <unstructured.documents.elements.Text at 0x27a34edba80>,
 <unstructured.documents.elements.Text at 0x27a34edbaf0>,
 <unstructured.documents.elements.Text at 0x27a3432d470>,
 <unstructured.documents.elements.Title at 0x27a34edbb60>,
 <unstructured.documents.elements.Title at 0x27a34edbbd0>,
 <unstructured.documents.elements.Title at 0x27a34edbc40>,
 <unstructured.documents.elements.Title at 0x27a345ca190>,
 <unstructured.documents.elements.Title at 0x27a3432d710>,
 <unstructured.documents.elements.Title at 0x27a34edbcb0>,
 <unstructured.documents.elements.NarrativeText at 0x27a2de5b540>,
 <unstructured.documents.elements.Title at 0x27a2de5b5b

### 4. Categorize PDF Elements by Type

In [27]:
#  Bucket the text of every partitioned element by the element class it came from.

Header, Footer, Title = [], [], []
NarrativeTexts, Text, ListItem = [], [], []
Images, Tables = [], []

#  (marker, bucket) pairs are tried in order and the first hit wins, so
#  "NarrativeText" must stay ahead of the shorter "Text" marker it contains.
_ordered_buckets = (
    ("Header", Header),
    ("Footer", Footer),
    ("Title", Title),
    ("NarrativeText", NarrativeTexts),
    ("Text", Text),
    ("ListItem", ListItem),
    ("Image", Images),
    ("Table", Tables),
)

for element in raw_pdf_elements:
    type_repr = str(type(element))
    for marker, bucket in _ordered_buckets:
        if marker in type_repr:
            # Only the element's string form is kept, not the element object itself.
            bucket.append(str(element))
            break

#  Summary of extracted types
print(f"Headers: {len(Header)}, Footers: {len(Footer)}, Titles: {len(Title)}")
print(f"NarrativeTexts: {len(NarrativeTexts)}, Text: {len(Text)}, ListItems: {len(ListItem)}")
print(f"Tables: {len(Tables)}, Images: {len(Images)}")


Headers: 3, Footers: 8, Titles: 40
NarrativeTexts: 83, Text: 21, ListItems: 43
Tables: 4, Images: 7


This block loops over all elements and sorts them into lists so you can handle text, tables, and images separately later.

### 5. Helper Function for Encoding an Image File as a Base64 String



In [28]:
#  Helper to encode an image to base64
import base64

def encode_image(image_path):
    """
    Read the file at ``image_path`` in binary mode and return its contents
    base64-encoded as a UTF-8 ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


### 6. Helper Function for Image Summarization

In [30]:
#  Summarize an image using GPT-4 with vision
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

def image_summarize(img_b64, prompt):
    """
    Send ``prompt`` together with a base64-encoded image to the "gpt-4o" chat model
    and return the content of the model's reply.

    The image is embedded in the request as a ``data:image/jpeg;base64,...`` URL,
    and a fresh ``ChatOpenAI`` client (``max_tokens=1024``) is created on every call.
    """
    chat_model = ChatOpenAI(model="gpt-4o", max_tokens=1024)

    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
    }

    # One human message carrying both the instruction and the image.
    reply = chat_model.invoke([HumanMessage(content=[text_part, image_part])])
    return reply.content

These helper functions encode an image file to a base64 string and then ask GPT-4o (a vision-capable model) to summarize it.



### 7. Helper Function for generating Image Summaries

In [31]:
def generate_image_based_summaries(image_paths, prompt):
    """
    Encode every image in ``image_paths`` and summarize it with ``prompt``.

    Each path is encoded with ``encode_image`` and immediately summarized with
    ``image_summarize`` before the next path is touched.

    Returns: (list of base64 strings, list of summaries), both in input order.
    """
    encoded_and_summarized = []
    for image_path in image_paths:
        encoded = encode_image(image_path)
        encoded_and_summarized.append((encoded, image_summarize(encoded, prompt)))

    b64_list = [encoded for encoded, _ in encoded_and_summarized]
    summaries = [summary for _, summary in encoded_and_summarized]
    return b64_list, summaries


### 8. Prompts for Tables and Images separately

In [39]:
# Instruction sent with each figure image when requesting a retrieval summary.
IMAGE_PROMPT = " ".join([
    "You are an assistant tasked with summarizing images for retrieval.",
    "Describe the content of this image in detail.",
])

# Instruction sent with each table image when requesting a retrieval summary.
TABLE_PROMPT = " ".join([
    "You are an assistant tasked with summarizing tables from images for retrieval.",
    "Describe the data, key insights, and structure in detail.",
])


### 9. Generate Base64 and Summaries for Normal Images and Table Images

In [37]:
# For normal images
# NOTE(review): absolute, machine-specific paths to two hand-picked figure crops;
# point these at wherever the extracted images live on your machine.
image_paths = [
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\figure-3-1.jpg",
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\figure-4-2.jpg",
]

# Encode each file to base64 and request one summary per image using IMAGE_PROMPT.
image_b64_list, image_summaries = generate_image_based_summaries(image_paths, IMAGE_PROMPT)

# Print only the first summary as a spot check.
print("Sample image summary:", image_summaries[0])




Sample image summary: The image is a diagram representing the architecture of a Transformer model, commonly used in natural language processing. 

- **Left Section (Encoder):**
  - At the bottom, it starts with "Input Embedding" where inputs are embedded.
  - Positional encoding is applied before they enter the layers.
  - The encoder consists of multiple identical layers (Nx) wrapping two main components: "Multi-Head Attention" and "Feed Forward."
  - Each component is followed by "Add & Norm," indicating a residual connection and layer normalization.

- **Right Section (Decoder):**
  - Begins with "Output Embedding" and includes shifted outputs. 
  - Similar to the encoder, positional encoding is added.
  - The decoder has more components per layer: "Masked Multi-Head Attention", "Multi-Head Attention" (taking input from the encoder), and "Feed Forward," each followed by "Add & Norm."
  
- **Top Section:**
  - The final layers include a "Linear" transformation followed by a "Softmax"

In [38]:
# For table images
# NOTE(review): absolute, machine-specific paths to two hand-picked table crops;
# point these at wherever the extracted images live on your machine.
table_image_paths = [
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\table-6-1.jpg",
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\table-8-2.jpg",
]

# Encode each file to base64 and request one summary per table image using TABLE_PROMPT.
table_b64_list, table_summaries = generate_image_based_summaries(table_image_paths, TABLE_PROMPT)

# Print only the first summary as a spot check.
print("Sample table summary:", table_summaries[0])

Sample table summary: The table compares different types of neural network layers in terms of their computational characteristics. Here are the key components and insights:

### Structure:
1. **Columns:**
   - **Layer Type:** Identifies the type of neural network layer being analyzed.
   - **Complexity per Layer:** Describes the computational complexity for each layer type, often influenced by factors such as input size (n), dimension (d), and kernel size (k or r).
   - **Sequential Operations:** Determines the number of sequential operations required.
   - **Maximum Path Length:** Indicated the maximum number of steps required to combine information from the most distant parts of the layer.

2. **Rows:**
   - **Self-Attention:** Complexity of \(O(n^2 \cdot d)\), one sequential operation \(O(1)\), and maximum path length of \(O(1)\). This suggests efficiency in parallelizing operations.
   - **Recurrent:** Complexity of \(O(n \cdot d^2)\), more sequential steps \(O(n)\), and its maximu

In [None]:
from pymilvus import connections, Collection, utility, FieldSchema, CollectionSchema, DataType

# Connect to Milvus
# Host and port are hard-coded; this requires a Milvus server reachable at localhost:19530.
connections.connect("default", host="localhost", port="19530")

# Create a collection function
def create_milvus_collection(name, dim, index_type):
    """
    (Re)create the Milvus collection ``name`` and return it loaded.

    Any existing collection with that name is dropped first. The new collection has
    an auto-generated INT64 primary key ``id`` and a FLOAT_VECTOR field ``embedding``
    of dimension ``dim``, indexed with ``index_type`` using the COSINE metric and
    ``nlist=1024``.
    """
    if utility.has_collection(name):
        Collection(name).drop()

    id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
    vector_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim)
    collection = Collection(name, CollectionSchema([id_field, vector_field]))

    index_spec = {
        "index_type": index_type,
        "metric_type": "COSINE",
        "params": {"nlist": 1024},
    }
    collection.create_index(field_name="embedding", index_params=index_spec)

    collection.load()
    return collection


### 10. Store in Multi-Vector Retriever



In [260]:
#  Store all text/table/image summaries in a Multi-Vector Retriever

import uuid
from langchain_community.vectorstores import Chroma
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

#  Create retriever function
def create_multi_vector_retriever(vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images):
    """
    Build a MultiVectorRetriever that indexes summaries and returns the raw content.

    For each modality (text, table, image) the summaries are embedded into
    ``vectorstore`` and the corresponding raw contents are stored in an in-memory
    docstore under a shared ``doc_id``, so a hit on a summary resolves to its raw content.

    Args:
        vectorstore: vector store that receives the summary documents.
        text_summaries / texts: summaries and their raw text contents.
        table_summaries / tables: summaries and their raw table contents.
        image_summaries / images: summaries and their raw image contents.
            A modality is skipped entirely when its summaries are empty or None.
            Summary ``i`` is linked to content ``i``; the two lists should be the
            same length and in the same order.

    Returns:
        The populated MultiVectorRetriever.
    """
    import warnings

    store = InMemoryByteStore()
    id_key = "doc_id"

    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key
    )

    def add_documents(retriever, summaries, contents):
        # Materialise both inputs so lengths can be compared; a missing contents list is empty.
        summaries = list(summaries)
        contents = list(contents) if contents is not None else []

        if len(summaries) != len(contents):
            # Previously, more summaries than contents crashed with a bare IndexError.
            # Now the mismatch is reported and only summaries that have a content partner are linked.
            warnings.warn(
                f"Got {len(summaries)} summaries for {len(contents)} contents; "
                "pairing them by position and ignoring unpaired summaries.",
                stacklevel=3,
            )

        ids = [str(uuid.uuid4()) for _ in contents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for summary, doc_id in zip(summaries, ids)
        ]
        # Skip empty writes; there is nothing to index when no pair could be formed.
        if summary_docs:
            retriever.vectorstore.add_documents(summary_docs)
        if ids:
            retriever.docstore.mset(list(zip(ids, contents)))

    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever

#  Create vector store
vectorstore = Chroma(
    collection_name="mm_rag",
    embedding_function=OpenAIEmbeddings()
)

# Create retriever
# NOTE(review): table_summaries were generated from the two files in table_image_paths,
# while Tables holds the text of every table element found in the PDF; they are paired
# by position, so confirm summary i really describes Tables[i] (or pass table_b64_list).
retriever_multi_vector = create_multi_vector_retriever(
    vectorstore,
    text_summaries=None,  # if you have text summaries, use here
    texts=Text,
    table_summaries=table_summaries,
    tables=Tables,
    image_summaries=image_summaries,
    # Was `img_b64_list`, a name never defined in this notebook (NameError on a fresh
    # kernel); the base64 list produced alongside image_summaries is `image_b64_list`.
    images=image_b64_list
)


This cell stores all your summaries + raw content in a retriever — so later you can retrieve relevant content AND the original data.

### 11. Helper to Separate Base64 Images & Text

In [264]:
def separate_text_and_images(docs):
    """
    Split retrieved items into (text_chunks, image_chunks).

    Each item contributes its ``page_content`` when it has that attribute, otherwise
    ``str(item)``. A payload is treated as an image when either:
      * it is longer than 1000 characters and starts with "/9j/4" or "iVBOR", or
      * it contains "base64" (case-insensitive) and is longer than 500 characters.
    Everything else is treated as text. Input order is preserved within each list.
    """
    image_prefixes = ("/9j/4", "iVBOR")
    text_chunks, image_chunks = [], []

    for item in docs:
        payload = item.page_content if hasattr(item, "page_content") else str(item)

        # Heuristic only: the checks short-circuit in this order.
        is_image = (
            (len(payload) > 1000 and payload[:5] in image_prefixes)
            or ("base64" in payload.lower() and len(payload) > 500)
        )
        destination = image_chunks if is_image else text_chunks
        destination.append(payload)

    return text_chunks, image_chunks


### 12. Prompt for Vision Model

In [262]:
#  Create the prompt for the GPT-4 Vision model

from langchain_core.prompts import ChatPromptTemplate

# Two-message chat template: the first entry is a fixed system message, the second is
# the human message that carries the {context} and {input} placeholders.
rag_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a multi-modal assistant answering user queries using both text and images."),
    ("human", """
Answer the user's question based on the following retrieved content.

## Text:
{context}

## Question:
{input}
""")
])


### 13. Define the RAG Chain

In [263]:
# 📌 Full RAG chain for text + image context

from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser

from langchain_openai import ChatOpenAI

# LLM with Vision capability
# Module-level client; multi_modal_rag_query calls vision_llm.invoke(...) on it.
vision_llm = ChatOpenAI(model="gpt-4o")

# Text parser
# NOTE(review): not referenced by any code visible in this notebook -- remove it or wire it in.
parser = StrOutputParser()

# RAG chain: Retrieve → Separate → Format → Invoke Vision LLM
def multi_modal_rag_query(query):
    """
    Answer ``query`` with the vision LLM, grounded in retrieved text and images.

    Steps: retrieve items for the query, split them into text and base64 images,
    render the question plus retrieved text through ``rag_prompt``, attach every
    image as an ``image_url`` part, and return the content of the model's reply.

    Args:
        query: the user's question.

    Returns:
        The ``content`` of the LLM response.
    """
    # Step 1: Retrieve
    retrieved = retriever_multi_vector.invoke(query)

    # Step 2: Separate
    text_chunks, image_chunks = separate_text_and_images(retrieved)

    # Step 3: Compose message for LLM
    # rag_prompt renders [system, human]. Index 0 is the system message, which holds
    # neither the context nor the question, so using it dropped both from the request.
    # The human message is the last one and carries {context} and {input}.
    rendered = rag_prompt.format_messages(context='\n'.join(text_chunks), input=query)
    user_text = rendered[-1].content

    messages = [
        {"role": "system", "content": "You are a helpful assistant using both text and image context."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_text}
            ]
        }
    ]

    for img_b64 in image_chunks:
        if img_b64.startswith("data:"):
            # Already a complete data URL; prefixing it again would corrupt it.
            url = img_b64
        elif img_b64.startswith("iVBOR"):
            # "iVBOR" is how a base64-encoded PNG signature begins.
            url = f"data:image/png;base64,{img_b64}"
        else:
            url = f"data:image/jpeg;base64,{img_b64}"
        messages[1]["content"].append({
            "type": "image_url",
            "image_url": {
                "url": url
            }
        })

    # Step 4: Call LLM
    response = vision_llm.invoke(messages)
    return response.content

# Test it
# Smoke test: calls the retriever and the OpenAI API, so the cells that build
# retriever_multi_vector and vision_llm must have been run first.
result = multi_modal_rag_query("What does figure 3-1 show?")
print("Answer:", result)


Answer: This diagram depicts the architecture of a Transformer model, which is widely used in natural language processing tasks. Here's a brief explanation of its components:

1. **Input Embedding and Output Embedding**: 
   - Inputs and outputs are converted into continuous vector representations.

2. **Positional Encoding**: 
   - Adds information about the position of each word in the sequence since the model doesn't inherently recognize sequence order.

3. **Multi-Head Attention**: 
   - Allows the model to focus on different parts of the input sequence when generating an output, capturing various contextual relationships.

4. **Add & Norm**: 
   - Refers to adding input to the output of the layer and then normalizing it. This helps in stabilizing the learning process.

5. **Feed Forward**: 
   - A fully connected feed-forward network applied independently to each position in the sequence.

6. **Masked Multi-Head Attention**: 
   - Similar to Multi-Head Attention, but prevents posi