### 1. Load Environment Variables

In [24]:
from dotenv import load_dotenv
load_dotenv()

True

### 2. Import Required Libraries

In [25]:
# PDF partitioning
from unstructured.partition.pdf import partition_pdf


### 3. Partition PDF into Structured Elements

In [26]:
#  Partition a PDF into elements: headers, paragraphs, images, tables, etc.
#  Path to your PDF
pdf_path = r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\data\attention_paper.pdf"

#  Partition the PDF
raw_pdf_elements = partition_pdf(
    filename=pdf_path,
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=False,
    extract_image_block_output_dir="extracted_docs"
)

raw_pdf_elements



[<unstructured.documents.elements.Text at 0x27a34edbe70>,
 <unstructured.documents.elements.Text at 0x27a34edbee0>,
 <unstructured.documents.elements.Text at 0x27a34edb930>,
 <unstructured.documents.elements.Text at 0x27a34edb9a0>,
 <unstructured.documents.elements.Text at 0x27a34edba10>,
 <unstructured.documents.elements.Header at 0x27a2507c050>,
 <unstructured.documents.elements.Text at 0x27a34edba80>,
 <unstructured.documents.elements.Text at 0x27a34edbaf0>,
 <unstructured.documents.elements.Text at 0x27a3432d470>,
 <unstructured.documents.elements.Title at 0x27a34edbb60>,
 <unstructured.documents.elements.Title at 0x27a34edbbd0>,
 <unstructured.documents.elements.Title at 0x27a34edbc40>,
 <unstructured.documents.elements.Title at 0x27a345ca190>,
 <unstructured.documents.elements.Title at 0x27a3432d710>,
 <unstructured.documents.elements.Title at 0x27a34edbcb0>,
 <unstructured.documents.elements.NarrativeText at 0x27a2de5b540>,
 <unstructured.documents.elements.Title at 0x27a2de5b5b

### 4. Categorize PDF Elements by Type

In [27]:
#  Organize extracted PDF elements by type

Header, Footer, Title = [], [], []
NarrativeTexts, Text, ListItem = [], [], []
Images, Tables = [], []

for element in raw_pdf_elements:
    t = str(type(element))
    if "Header" in t:
        Header.append(str(element))
    elif "Footer" in t:
        Footer.append(str(element))
    elif "Title" in t:
        Title.append(str(element))
    elif "NarrativeText" in t:
        NarrativeTexts.append(str(element))
    elif "Text" in t:
        Text.append(str(element))
    elif "ListItem" in t:
        ListItem.append(str(element))
    elif "Image" in t:
        Images.append(str(element))
    elif "Table" in t:
        Tables.append(str(element))

#  Summary of extracted types
print(f"Headers: {len(Header)}, Footers: {len(Footer)}, Titles: {len(Title)}")
print(f"NarrativeTexts: {len(NarrativeTexts)}, Text: {len(Text)}, ListItems: {len(ListItem)}")
print(f"Tables: {len(Tables)}, Images: {len(Images)}")


Headers: 3, Footers: 8, Titles: 40
NarrativeTexts: 83, Text: 21, ListItems: 43
Tables: 4, Images: 7


This block loops over all elements and sorts them into lists so you can handle text, tables, and images separately later.

### 5.  Helper Functions for encoding image binary first into base64 and then into python string



In [28]:
#  Helper to encode an image to base64
import base64

def encode_image(image_path):
    """
    Encode an image as a base64 string.
    """
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


### 6. Helper Function for Image Summarization

In [30]:
#  Summarize an image using GPT-4 with vision
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

def image_summarize(img_b64, prompt):
    """
    Summarize a base64-encoded image with a text prompt using GPT-4 Vision.
    """
    model = ChatOpenAI(model="gpt-4o", max_tokens=1024)
    response = model.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{img_b64}"
                        }
                    }
                ]
            )
        ]
    )
    return response.content

These helper functions encode an image to base64 and then use GPT-4 Vision to summarize it.



### 7. Helper Function for generating Image Summaries

In [31]:
def generate_image_based_summaries(image_paths, prompt):
    """
    Generic helper: encode and summarize a list of image file paths using a given prompt.
    Returns: (list of base64 strings, list of summaries)
    """
    b64_list = []
    summaries = []

    for path in image_paths:
        b64 = encode_image(path)
        b64_list.append(b64)
        summary = image_summarize(b64, prompt)
        summaries.append(summary)

    return b64_list, summaries


### 8. Prompts for Tables and Images separately

In [39]:
IMAGE_PROMPT = (
    "You are an assistant tasked with summarizing images for retrieval. "
    "Describe the content of this image in detail."
)

TABLE_PROMPT = (
    "You are an assistant tasked with summarizing tables from images for retrieval. "
    "Describe the data, key insights, and structure in detail."
)


### 9. Generate Base64 and Summaries for Normal Images and Table Images

In [37]:
# For normal images
image_paths = [
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\figure-3-1.jpg",
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\figure-4-2.jpg",
]

image_b64_list, image_summaries = generate_image_based_summaries(image_paths, IMAGE_PROMPT)

print("Sample image summary:", image_summaries[0])




Sample image summary: The image is a diagram representing the architecture of a Transformer model, commonly used in natural language processing. 

- **Left Section (Encoder):**
  - At the bottom, it starts with "Input Embedding" where inputs are embedded.
  - Positional encoding is applied before they enter the layers.
  - The encoder consists of multiple identical layers (Nx) wrapping two main components: "Multi-Head Attention" and "Feed Forward."
  - Each component is followed by "Add & Norm," indicating a residual connection and layer normalization.

- **Right Section (Decoder):**
  - Begins with "Output Embedding" and includes shifted outputs. 
  - Similar to the encoder, positional encoding is added.
  - The decoder has more components per layer: "Masked Multi-Head Attention", "Multi-Head Attention" (taking input from the encoder), and "Feed Forward," each followed by "Add & Norm."
  
- **Top Section:**
  - The final layers include a "Linear" transformation followed by a "Softmax"

In [44]:
# 2️ For table images
table_image_paths = [
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\table-6-1.jpg",
    r"C:\Users\Anilchoudary R\Gitrepos\Multimodal-RAG-Application\note-books\extracted_docs\table-8-2.jpg",
]

table_b64_list, table_summaries = generate_image_based_summaries(table_image_paths, TABLE_PROMPT)

print("Sample table summary:", table_summaries[0])

Sample table summary: The table presents a comparison of different neural network layer types based on three criteria: Complexity per Layer, Sequential Operations, and Maximum Path Length.

### Data Columns:

1. **Layer Type**:
   - Lists four different types of neural network layers.
   - Types: Self-Attention, Recurrent, Convolutional, Self-Attention (restricted).

2. **Complexity per Layer**:
   - Indicates the computational complexity of each layer type.
   - Given as big O notation for each layer:
     - Self-Attention: \(O(n^2 \cdot d)\)
     - Recurrent: \(O(n \cdot d^2)\)
     - Convolutional: \(O(k \cdot n \cdot d^2)\)
     - Self-Attention (restricted): \(O(r \cdot n \cdot d)\)

3. **Sequential Operations**:
   - Denotes the number of operations that need to be performed in sequence.
   - Values:
     - Self-Attention: \(O(1)\)
     - Recurrent: \(O(n)\)
     - Convolutional: \(O(1)\)
     - Self-Attention (restricted): \(O(1)\)

4. **Maximum Path Length**:
   - Refers to the

### Semantic Chunking ( Optional )

In [73]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

all_texts = NarrativeTexts + Text + ListItem + Title  # + image/table summaries if needed

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

chunks = text_splitter.split_text("\n".join(all_texts))
print(f"Total chunks: {len(chunks)}")


Total chunks: 47


In [74]:
from langchain_core.documents import Document

text_docs = [Document(page_content=chunk, metadata={"type": "text"}) for chunk in chunks]
table_summary_docs = [Document(page_content=s, metadata={"type": "table_summary"}) for s in table_summaries]
table_docs = [Document(page_content=t, metadata={"type": "table_image"}) for t in table_b64_list]
image_summary_docs = [Document(page_content=s, metadata={"type": "image_summary"}) for s in image_summaries]
image_docs = [Document(page_content=i, metadata={"type": "image_base64"}) for i in image_b64_list]

all_docs = text_docs + table_summary_docs + table_docs + image_summary_docs + image_docs
print(f"Total documents for embedding: {len(all_docs)}")



Total documents for embedding: 55


 ### Setup Qdrant Collections with Different Index Types

In [77]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, HnswConfigDiff

qdrant_client = QdrantClient(url="http://localhost:6333")

embedding_size = 1536  # OpenAI embedding size

# Define collection parameters for flat (default)
flat_params = {
    "vectors_config": VectorParams(size=embedding_size, distance=Distance.COSINE),
}

# Define collection parameters for HNSW index
hnsw_params = {
    "vectors_config": VectorParams(size=embedding_size, distance=Distance.COSINE),
    "hnsw_config": HnswConfigDiff(m=16, ef_construct=200),
}

collections = {
    "mm_flat": flat_params,
    "mm_hnsw": hnsw_params,
}

for col_name, params in collections.items():
    try:
        qdrant_client.delete_collection(collection_name=col_name)
    except Exception:
        pass
    qdrant_client.create_collection(collection_name=col_name, **params)

print("Collections created successfully.")




Collections created successfully.


In [78]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant

embedding_fn = OpenAIEmbeddings()

def upload_to_qdrant(collection_name):
    vectorstore = Qdrant.from_documents(
        documents=all_docs,
        embedding=embedding_fn,
        url="http://localhost:6333",
        collection_name=collection_name,
    )
    return vectorstore

vectorstores = {}
for col_name in collections.keys():
    print(f"Uploading documents to collection {col_name} ...")
    vectorstores[col_name] = upload_to_qdrant(col_name)

print("All documents uploaded and embedded.")



Uploading documents to collection mm_flat ...
Uploading documents to collection mm_hnsw ...
All documents uploaded and embedded.


### Create Retriever

In [82]:
import time

retrievers = {}
query = "Explain attention mechanism in the paper."

for name, vs in vectorstores.items():
    # Use the `k` param instead of `search_kwargs`
    retriever = vs.as_retriever(k=5)  # ✅ THIS is the proper way
    start = time.time()
    docs = retriever.get_relevant_documents(query)
    duration = time.time() - start
    retrievers[name] = retriever
    print(f"Retriever '{name}' query time: {duration:.3f} seconds")






Retriever 'mm_flat' query time: 0.311 seconds
Retriever 'mm_hnsw' query time: 0.225 seconds


### Measure retrieval time:

In [67]:
import time

query = "Explain the attention mechanism"
start = time.time()
docs = retriever.get_relevant_documents(query)
end = time.time()

print(f"Retrieved in {end - start:.3f} sec")
for d in docs:
    print(d.page_content[:150])


Retrieved in 0.289 sec
3.2 Attention
3.2 Attention
Attention mechanisms have become an integral part of compelling sequence modeling and transduc- tion models in various tasks, allowing modeling of dep
Attention mechanisms have become an integral part of compelling sequence modeling and transduc- tion models in various tasks, allowing modeling of dep
3.2.3 Applications of Attention in our Model


In [83]:
# Make sure to install sklearn before running this cell: pip install scikit-learn
from sklearn.metrics import accuracy_score

queries = [
    "Explain attention",
    "What is the key result?",
    "Describe figure 3-1"
]

expected_keywords = [
    "attention",
    "key result",
    "figure 3"
]

correct = []
for q, keyword in zip(queries, expected_keywords):
    docs = retrievers["mm_flat"].get_relevant_documents(q)  # Use 'mm_flat' or loop all
    combined_text = " ".join([d.page_content.lower() for d in docs])
    correct.append(keyword.lower() in combined_text)

accuracy = accuracy_score(correct, [True]*len(correct))
print(f"Retriever accuracy: {accuracy:.2f}")




Retriever accuracy: 0.33


In [85]:
# Use built-in MMR support in the retriever
mmr_retriever = retrievers["mm_flat"].vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "lambda_mult": 0.5}
)

reranked_docs = mmr_retriever.get_relevant_documents("Explain attention")

print(f"Reranked top document preview:\n{reranked_docs[0].page_content[:500]}")


Reranked top document preview:
Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 27, 28, 22].
End-to-end memory networks are based on a recurrent attention mechanism instead of sequence- aligned recur


In [86]:
from langchain_core.prompts import ChatPromptTemplate

rag_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a multi-modal assistant answering user queries using text and images."),
    ("human", """
Answer the user's question based on the following retrieved content.

## Text:
{context}

## Question:
{input}
""")
])


In [87]:
from langchain_openai import ChatOpenAI

vision_llm = ChatOpenAI(model="gpt-4o")

def separate_text_and_images(docs):
    text_chunks = []
    image_chunks = []
    for doc in docs:
        content = doc.page_content
        # Basic heuristic: base64 strings likely for images; texts otherwise
        if len(content) > 1000 and (content.startswith("/9j/") or content.startswith("iVBOR")):
            image_chunks.append(content)
        elif content.lower().startswith("data:image") or "base64" in content.lower():
            image_chunks.append(content)
        else:
            text_chunks.append(content)
    return text_chunks, image_chunks

def multi_modal_rag_query(query, retriever):
    retrieved_docs = retriever.get_relevant_documents(query)
    text_chunks, image_chunks = separate_text_and_images(retrieved_docs)

    messages = [
        {"role": "system", "content": "You are a helpful assistant using text and images."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": rag_prompt.format_messages(context="\n".join(text_chunks), input=query)[0].content}
            ]
        }
    ]

    for img_b64 in image_chunks:
        messages[1]["content"].append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
        })

    response = vision_llm.invoke(messages)
    return response.content


In [88]:
query = "What does figure 3-1 show?"
answer = multi_modal_rag_query(query, retrievers["mm_flat"])
print("Answer:", answer)


Answer: These images depict different aspects of the Transformer model, commonly used in natural language processing.

1. **Architecture Diagram**: This shows the basic structure of the Transformer model, including the encoder-decoder architecture. Key components include multi-head attention, feed-forward layers, and positional encoding. The encoder and decoder are made up of stacked layers.

2. **BLEU Scores and Training Costs**: This table compares the BLEU scores and training costs (measured in FLOPs) of various models on different language translation tasks. The Transformer model is highlighted for its performance and efficiency.

3. **Complexity Table**: This table compares different neural network layers in terms of complexity per layer, sequential operations, and maximum path length. It highlights the efficiency of self-attention compared to recurrent and convolutional layers.

4. **Scaled Dot-Product Attention**: This diagram explains the process of scaled dot-product attention

In [90]:
from docx import Document

doc = Document()
doc.add_heading("LLM Generated Answer", 0)
doc.add_paragraph(answer)

output_path = "generated_answer.docx"
doc.save(output_path)
print(f"\nAnswer saved to: {output_path}")



Answer saved to: generated_answer.docx
