In [1]:
# For Linux
# !sudo apt-get install -y poppler-utils tesseract-ocr libmagic-dev

In [2]:
# %pip install -Uq "unstructured[all-docs]" pillow lxml
# %pip install -Uq chromadb tiktoken
# %pip install -Uq langchain langchain-community langchain-openai langchain-groq langchain-google-genai langchain-chroma langchain-classic
# %pip install -Uq python_dotenv

In [None]:
import os

# Keep credentials out of the notebook: reuse any value already exported in
# the environment and only fall back to an empty placeholder when it is missing.
os.environ.setdefault("GEMINI_KEY", "")
os.environ.setdefault("LANGCHAIN_API_KEY", "")
# Turn tracing on only when an API key is actually available.
os.environ["LANGCHAIN_TRACING_V2"] = "true" if os.environ["LANGCHAIN_API_KEY"] else "false"

In [4]:
from unstructured.partition.pdf import partition_pdf

# The source PDF sits in the same directory that is passed as the image output dir.
output_path = "./pdf/"
file_path = f"{output_path}attention.pdf"

# Partition the PDF into raw elements. No chunking options are passed here.
chunks = partition_pdf(
    filename=file_path,
    # Layout analysis: "hi_res" is required for table structure inference.
    strategy="hi_res",
    infer_table_structure=True,
    # Image handling: extract Image blocks (add 'Table' to also get table images)
    # and attach their base64 payload to the element for API usage.
    extract_image_block_types=["Image"],
    extract_image_block_to_payload=True,
    image_output_dir_path=output_path,
)



The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [None]:
from unstructured.documents.elements import Table, Image
from unstructured.chunking.basic import chunk_elements

# At this point 'chunks' holds every raw element returned by partition_pdf.

tables, images, text_elements = [], [], []

# --- 1. Route each element into the bucket matching its type ---
for el in chunks:
    if isinstance(el, Table):
        bucket = tables
    elif isinstance(el, Image):
        bucket = images
    else:
        # Everything else (NarrativeText, Title, Text, ...) is treated as text.
        bucket = text_elements
    bucket.append(el)

print(f"Found {len(tables)} tables and {len(images)} images.")
print(f"Found {len(text_elements)} text-based elements to be chunked.")

# --- 2. Chunk the text elements only; tables and images stay whole ---
text_chunks = chunk_elements(
    text_elements,
    new_after_n_chars=6000,
    max_characters=10000,
    include_orig_elements=True,
)
print(f"Chunked text into {len(text_chunks)} CompositeElements.")

# Tables first, then the chunked text.
final_chunks_list = tables + text_chunks

final_types = {str(type(item)) for item in final_chunks_list}
print(f"\nTypes in final list: {final_types}")

Found 4 tables and 7 images.
Found 206 text-based elements to be chunked.
Chunked text into 6 CompositeElements.

Types in final list: {"<class 'unstructured.documents.elements.CompositeElement'>", "<class 'unstructured.documents.elements.Table'>"}


In [6]:
# Number of entries in the combined list (tables + chunked text; images are not part of it)
len(final_chunks_list)

10

In [7]:
# Distinct element classes present in the combined list (tables + chunked text)
{str(type(item)) for item in final_chunks_list}

{"<class 'unstructured.documents.elements.CompositeElement'>",
 "<class 'unstructured.documents.elements.Table'>"}

In [8]:
# Show each entry's category next to its Python class.
for el in final_chunks_list:
    print(el.category, type(el))


Table <class 'unstructured.documents.elements.Table'>
Table <class 'unstructured.documents.elements.Table'>
Table <class 'unstructured.documents.elements.Table'>
Table <class 'unstructured.documents.elements.Table'>
CompositeElement <class 'unstructured.documents.elements.CompositeElement'>
CompositeElement <class 'unstructured.documents.elements.CompositeElement'>
CompositeElement <class 'unstructured.documents.elements.CompositeElement'>
CompositeElement <class 'unstructured.documents.elements.CompositeElement'>
CompositeElement <class 'unstructured.documents.elements.CompositeElement'>
CompositeElement <class 'unstructured.documents.elements.CompositeElement'>


In [9]:
# Rebind `chunks` from the raw partition output to the combined tables + text-chunk list.
chunks = final_chunks_list

In [10]:
# Build from `final_chunks_list` rather than from `chunks` itself so that
# re-running this cell does not append the images a second time.
chunks = final_chunks_list + images

In [11]:
# Dict form of the first entry (a Table, since tables come first in the combined list).
a = chunks[0].to_dict()
a

{'type': 'Table',
 'element_id': '58246ac1d674122055fa1f9b8e56318d',
 'text': 'Layer Type Complexity per Layer Sequential Maximum Path Length Operations Self-Attention O(n2 · d) O(1) O(1) Recurrent O(n · d2) O(n) O(n) Convolutional O(k · n · d2) O(1) O(logk(n)) Self-Attention (restricted) O(r · n · d) O(1) O(n/r)',
 'metadata': {'detection_class_prob': 0.928255021572113,
  'coordinates': {'points': ((np.float64(320.3291931152344),
     np.float64(312.45477294921875)),
    (np.float64(320.3291931152344), np.float64(519.1640014648438)),
    (np.float64(1363.98291015625), np.float64(519.1640014648438)),
    (np.float64(1363.98291015625), np.float64(312.45477294921875))),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-11-03T13:29:17',
  'text_as_html': '<table><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention</td><td>O(n? - d

In [12]:
# Metadata portion of the dict held in `a`.
a['metadata']

{'detection_class_prob': 0.928255021572113,
 'coordinates': {'points': ((np.float64(320.3291931152344),
    np.float64(312.45477294921875)),
   (np.float64(320.3291931152344), np.float64(519.1640014648438)),
   (np.float64(1363.98291015625), np.float64(519.1640014648438)),
   (np.float64(1363.98291015625), np.float64(312.45477294921875))),
  'system': 'PixelSpace',
  'layout_width': 1700,
  'layout_height': 2200},
 'last_modified': '2025-11-03T13:29:17',
 'text_as_html': '<table><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention</td><td>O(n? - d)</td><td>O(1)</td><td>O(1)</td></tr><tr><td>Recurrent</td><td>O(n- d?)</td><td>O(n)</td><td>O(n)</td></tr><tr><td>Convolutional</td><td>O(k-n-d?)</td><td>O(1)</td><td>O(logx(n))</td></tr><tr><td>Self-Attention (restricted)</td><td>O(r-n-d)</td><td>o(1)</td><td>O(n/r)</td></tr></tbody></table>',
 'filetype': 'application/pdf',
 'languages

In [14]:
# Plain-text content of the third entry in `chunks`.
chunks[2].text

'N dmodel dff h dk dv Pdrop ϵls train steps PPL (dev) BLEU params (dev) ×106 base 6 512 2048 8 64 64 0.1 0.1 100K 4.92 25.8 65 1 512 512 5.29 24.9 (A) 4 16 128 32 128 32 5.00 4.91 25.5 25.8 32 16 16 5.01 25.4 (B) 16 32 5.16 5.01 25.1 25.4 58 60 2 6.11 23.7 36 4 5.19 25.3 50 8 4.88 25.5 80 (C) 256 32 32 5.75 24.5 28 1024 128 128 4.66 26.0 168 1024 5.12 25.4 53 4096 4.75 26.2 90 0.0 5.77 24.6 (D) 0.2 0.0 4.95 4.67 25.5 25.3 0.2 5.47 25.7 (E) positional embedding instead of sinusoids 4.92 25.7'

In [15]:
from unstructured.documents.elements import CompositeElement, Table

# Separate tables from text chunks by class. isinstance() replaces substring
# matching on the type's repr, which would also match any unrelated class whose
# qualified name merely contains "Table" or "CompositeElement".
tables = []
texts = []

for chunk in chunks:
    if isinstance(chunk, Table):
        tables.append(chunk)
    elif isinstance(chunk, CompositeElement):
        texts.append(chunk)
    # Anything else (e.g. Image elements) is intentionally left out of both lists.

In [16]:
# Text chunks (CompositeElement objects) collected by the split.
texts

[<unstructured.documents.elements.CompositeElement at 0x72c52f9fd220>,
 <unstructured.documents.elements.CompositeElement at 0x72c52f9fd070>,
 <unstructured.documents.elements.CompositeElement at 0x72c52f9fd0a0>,
 <unstructured.documents.elements.CompositeElement at 0x72c52f9fce60>,
 <unstructured.documents.elements.CompositeElement at 0x72c52f9fcdd0>,
 <unstructured.documents.elements.CompositeElement at 0x72c52f9fcd70>]

In [17]:
# Elements that were merged into the first text chunk
# (kept because chunk_elements was called with include_orig_elements=True).
texts[0].metadata.orig_elements

[<unstructured.documents.elements.Text at 0x72c5872d1fa0>,
 <unstructured.documents.elements.Text at 0x72c58509e900>,
 <unstructured.documents.elements.Text at 0x72c5872d3aa0>,
 <unstructured.documents.elements.Text at 0x72c5872d3da0>,
 <unstructured.documents.elements.Text at 0x72c58712ed20>,
 <unstructured.documents.elements.Header at 0x72c58712f5c0>,
 <unstructured.documents.elements.Text at 0x72c5332c7140>,
 <unstructured.documents.elements.Text at 0x72c58509e0f0>,
 <unstructured.documents.elements.Text at 0x72c58509f3b0>,
 <unstructured.documents.elements.Text at 0x72c58509e090>,
 <unstructured.documents.elements.Text at 0x72c58509d0a0>,
 <unstructured.documents.elements.Text at 0x72c58509e300>,
 <unstructured.documents.elements.NarrativeText at 0x72c5872d1130>,
 <unstructured.documents.elements.Title at 0x72c5872d27b0>,
 <unstructured.documents.elements.Text at 0x72c5872d3ef0>,
 <unstructured.documents.elements.Text at 0x72c58712c8c0>,
 <unstructured.documents.elements.Text at 0x

In [18]:
# Print the second text chunk in its string form.
print(texts[1])

End-to-end memory networks are based on a recurrent attention mechanism instead of sequence- aligned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [34].

To the best of our knowledge, however, the Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequence- aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [17, 18] and [9].

3 Model Architecture

Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 35]. Here, the encoder maps an input sequence of symbol representations (x1,...,xn) to a sequence of continuous representations z = (z1,...,zn). Given z, the decoder then generates an output sequence (y1,...,ym) of symbols one element at a time. At each step the model is auto-regressiv

In [19]:
# Table elements collected by the split.
tables

[<unstructured.documents.elements.Table at 0x72c587172b70>,
 <unstructured.documents.elements.Table at 0x72c533026870>,
 <unstructured.documents.elements.Table at 0x72c53302e570>,
 <unstructured.documents.elements.Table at 0x72c53302c3b0>]

In [20]:
# Dict form of the first table: type, element id, text and metadata.
tables[0].to_dict()

{'type': 'Table',
 'element_id': '58246ac1d674122055fa1f9b8e56318d',
 'text': 'Layer Type Complexity per Layer Sequential Maximum Path Length Operations Self-Attention O(n2 · d) O(1) O(1) Recurrent O(n · d2) O(n) O(n) Convolutional O(k · n · d2) O(1) O(logk(n)) Self-Attention (restricted) O(r · n · d) O(1) O(n/r)',
 'metadata': {'detection_class_prob': 0.928255021572113,
  'coordinates': {'points': ((np.float64(320.3291931152344),
     np.float64(312.45477294921875)),
    (np.float64(320.3291931152344), np.float64(519.1640014648438)),
    (np.float64(1363.98291015625), np.float64(519.1640014648438)),
    (np.float64(1363.98291015625), np.float64(312.45477294921875))),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-11-03T13:29:17',
  'text_as_html': '<table><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention</td><td>O(n? - d

In [23]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [24]:
# Instruction template used for both text chunks and tables; `{element}` is
# replaced with the content to summarise.
prompt_text = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table or text chunk: {element}

"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Gemini chat model that produces the summaries.
model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    google_api_key=os.environ["GEMINI_KEY"],
)

# Pipeline: wrap the input as "element" -> fill the prompt -> call the model -> plain string.
summarize_chain = (
    {"element": lambda item: item}
    | prompt
    | model
    | StrOutputParser()
)

In [25]:
# Summarize text chunks, at most three requests in flight at a time
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})

# Summarize tables from their HTML rendering. If a table carries no HTML
# (text_as_html is empty or None), fall back to its plain text so the model
# is never asked to summarise the literal string "None".
tables_html = [table.metadata.text_as_html or table.text for table in tables]
table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 3})

In [26]:
# Inspect the generated text summaries.
text_summaries

["The Transformer is a new neural network architecture for sequence transduction tasks, such as machine translation, that relies solely on attention mechanisms, eliminating recurrence and convolutions. This design allows for greater parallelization and faster training times, while achieving superior translation quality. The paper demonstrates the Transformer's effectiveness on machine translation tasks and its generalization to other tasks like English constituency parsing.",
 "The Transformer is a transduction model that uses self-attention mechanisms instead of recurrent or convolutional layers. It employs an encoder-decoder architecture, where both components consist of stacked identical layers. Each encoder layer has a multi-head self-attention mechanism and a position-wise feed-forward network, with residual connections and layer normalization. The decoder has these two sub-layers plus an additional multi-head attention sub-layer that attends to the encoder's output. The self-atte

In [31]:
import uuid
from langchain_chroma import Chroma
from langchain_classic.storage import InMemoryStore
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_classic.retrievers.multi_vector import MultiVectorRetriever
# Metadata key that links each indexed summary to its parent document,
# and the in-memory store that holds those parents.
id_key = "doc_id"
store = InMemoryStore()

# Vector index over the child documents, embedded with a Gemini embedding model.
vectorstore = Chroma(
    collection_name="multi_modal_rag",
    embedding_function=GoogleGenerativeAIEmbeddings(
        model="models/gemini-embedding-001",
        google_api_key=os.environ["GEMINI_KEY"],
    ),
)

# Retriever tying the two together; it holds no documents yet.
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

In [29]:
# Inspect the generated text summaries.
text_summaries

["The Transformer is a new neural network architecture for sequence transduction tasks, such as machine translation, that relies solely on attention mechanisms, eliminating recurrence and convolutions. This design allows for greater parallelization and faster training times, while achieving superior translation quality. The paper demonstrates the Transformer's effectiveness on machine translation tasks and its generalization to other tasks like English constituency parsing.",
 "The Transformer is a transduction model that uses self-attention mechanisms instead of recurrent or convolutional layers. It employs an encoder-decoder architecture, where both components consist of stacked identical layers. Each encoder layer has a multi-head self-attention mechanism and a position-wise feed-forward network, with residual connections and layer normalization. The decoder has these two sub-layers plus an additional multi-head attention sub-layer that attends to the encoder's output. The self-atte

In [32]:
def _index_with_summaries(originals, summaries):
    """Index `summaries` in the vector store and keep `originals` in the docstore.

    Each original gets a fresh UUID; the matching summary is stored as a
    Document whose metadata carries that UUID under `id_key`, and the original
    is stored in the docstore under the same UUID.

    Args:
        originals: Parent objects to store (text chunks or tables).
        summaries: One summary string per original, in the same order.

    Returns:
        Tuple `(ids, summary_docs)` with the generated ids and summary Documents.

    Raises:
        ValueError: If the two sequences differ in length, since the pairing
            between summaries and originals would then be wrong.
    """
    if len(originals) != len(summaries):
        raise ValueError(
            f"Expected one summary per element, got {len(summaries)} summaries "
            f"for {len(originals)} elements."
        )

    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: parent_id})
        for parent_id, summary in zip(ids, summaries)
    ]

    # Nothing to index for an empty batch (e.g. a PDF without tables); skipping
    # also avoids handing an empty insert to the vector store.
    if summary_docs:
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(ids, originals)))

    return ids, summary_docs


# Add texts
doc_ids, summary_texts = _index_with_summaries(texts, text_summaries)

# Add tables
table_ids, summary_tables = _index_with_summaries(tables, table_summaries)

In [36]:
# Probe the retriever with a sample question
docs = retriever.invoke("what is multihead attention?")

In [37]:
# Raw retriever result: a list of the stored parent elements.
print(docs)

[<unstructured.documents.elements.CompositeElement object at 0x72c52f9fd070>, <unstructured.documents.elements.CompositeElement object at 0x72c52f9fd0a0>, <unstructured.documents.elements.CompositeElement object at 0x72c52f9fd220>, <unstructured.documents.elements.CompositeElement object at 0x72c52f9fcdd0>]


In [35]:
# Print every retrieved document followed by an 80-dash rule.
for doc in docs:
    print(str(doc), "-" * 80, sep="\n\n")

[16] Łukasz Kaiser and Samy Bengio. Can active memory replace attention? In Advances in Neural Information Processing Systems, (NIPS), 2016.

[17] Łukasz Kaiser and Ilya Sutskever. Neural GPUs learn algorithms. In International Conference on Learning Representations (ICLR), 2016.

[18] Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Ko- ray Kavukcuoglu. Neural machine translation in linear time. arXiv preprint arXiv:1610.10099v2, 2017.

[19] Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In International Conference on Learning Representations, 2017.

[20] Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In ICLR, 2015.

[21] Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks. arXiv preprint arXiv:1703.10722, 2017.

[22] Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. A structured self-attentive sentenc

In [38]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage,HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from base64 import b64decode

def parse_docs(docs):
    """Split retrieved items into base64-encoded images and everything else.

    Args:
        docs: Iterable of retrieved items (strings, elements, or any object).

    Returns:
        dict with two lists, input order preserved within each:
        "images" holds items that decode as strictly valid base64,
        "texts" holds every other item.
    """
    b64 = []
    text = []
    for doc in docs:
        try:
            # validate=True makes the decoder reject characters outside the
            # base64 alphabet. Without it they are silently discarded, so
            # ordinary prose can decode "successfully" and be taken for an image.
            b64decode(doc, validate=True)
        except (TypeError, ValueError):
            # TypeError: not a str/bytes-like object (e.g. a document element).
            # ValueError (includes binascii.Error): not valid base64.
            text.append(doc)
        else:
            b64.append(doc)
    return {"images": b64, "texts": text}


def build_prompt(kwargs):
    """Assemble the multimodal prompt for the answering model.

    Args:
        kwargs: Mapping with
            "context": dict with "texts" (objects exposing a `.text` attribute)
                and "images" (base64 strings), as returned by `parse_docs`;
            "question": the user's question.

    Returns:
        A ChatPromptTemplate wrapping a single HumanMessage whose content is
        the text prompt followed by one image part per image in the context.
    """
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    # Put a blank line between chunks so the last word of one chunk does not
    # run straight into the first word of the next.
    context_text = "\n\n".join(
        text_element.text for text_element in docs_by_type["texts"]
    )

    # construct prompt with context (including images)
    prompt_template = f"""
    Answer the question based only on the following context, which can include text, tables, and the below image.
    Context: {context_text}
    Question: {user_question}
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    # Each image is attached as a base64 data URL; an empty list adds nothing.
    for image in docs_by_type["images"]:
        prompt_content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
        )

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=prompt_content),
        ]
    )
    
# Answer-only chain: retrieve and split the documents, build the prompt,
# call Gemini, and return the reply as a plain string.
chain = (
    {"context": retriever | RunnableLambda(parse_docs), "question": RunnablePassthrough()}
    | RunnableLambda(build_prompt)
    | ChatGoogleGenerativeAI(
        model="gemini-2.5-flash-lite",
        google_api_key=os.environ["GEMINI_KEY"],
    )
    | StrOutputParser()
)

# Same pipeline, but the retrieved context and the question are kept in the
# output and the model's reply is added under "response".
chain_with_sources = (
    {"context": retriever | RunnableLambda(parse_docs), "question": RunnablePassthrough()}
    | RunnablePassthrough.assign(
        response=(
            RunnableLambda(build_prompt)
            | ChatGoogleGenerativeAI(
                model="gemini-2.5-flash-lite",
                google_api_key=os.environ["GEMINI_KEY"],
            )
            | StrOutputParser()
        )
    )
)



In [43]:
# Ask a question through the answer-only chain and show the reply.
response = chain.invoke("What is multihead?")
print(response)

Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. It is achieved by linearly projecting the queries, keys, and values multiple times with different learned projections, and then performing the attention function in parallel on each of these projected versions. The resulting outputs are concatenated and projected again to produce the final output. The paper uses h = 8 parallel attention layers (heads), with each head having a dimension of dk = dv = dmodel/h = 64.


In [None]:
# Ask the same question, this time keeping the retrieved context in the result.
response = chain_with_sources.invoke("What is multihead?")

print("Response:", response["response"])

# List every text chunk that was given to the model, with its page number.
print("\n\nContext:")
for text in response["context"]["texts"]:
    print(text.text)
    print("Page number: ", text.metadata.page_number)
    print(f"\n{'-' * 50}\n")