In [151]:
import base64
import os
import uuid

from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma
from unstructured.partition.pdf import partition_pdf
#import pytesseract

# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

In [152]:
# All paths are resolved against the directory the notebook is run from.
input_path = os.getcwd()
output_path = os.path.join(input_path, "figures")
pdf_file = os.path.join(input_path, "attention_is_all_you_need.pdf")

# Keyword options forwarded unchanged to partition_pdf; extracted images are
# written under output_path.
partition_options = {
    "extract_images_in_pdf": True,
    "infer_table_structure": True,
    "chunking_strategy": "by_title",
    "max_characters": 500,
    "new_after_n_chars": 400,
    "combine_text_under_n_chars": 490,
    "image_output_dir_path": output_path,
}

raw_pdf_elements = partition_pdf(filename=pdf_file, **partition_options)



In [None]:
# Show the directory that was passed to partition_pdf as the image output location.
output_path

In [None]:
# Show the directory the source PDF is read from.
input_path

In [155]:
# Show the list of elements returned by partition_pdf.
raw_pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x1dbc4b2a330>,
 <unstructured.documents.elements.CompositeElement at 0x1dbd605f590>,
 <unstructured.documents.elements.CompositeElement at 0x1dc19258ce0>,
 <unstructured.documents.elements.CompositeElement at 0x1dc1e8c2f30>,
 <unstructured.documents.elements.CompositeElement at 0x1dc00deea50>,
 <unstructured.documents.elements.CompositeElement at 0x1dc01735400>,
 <unstructured.documents.elements.CompositeElement at 0x1dbd591d040>,
 <unstructured.documents.elements.CompositeElement at 0x1dc017560c0>,
 <unstructured.documents.elements.CompositeElement at 0x1dc01610230>,
 <unstructured.documents.elements.CompositeElement at 0x1dbd6b63b90>,
 <unstructured.documents.elements.CompositeElement at 0x1dc00db9d30>,
 <unstructured.documents.elements.CompositeElement at 0x1dc010dede0>,
 <unstructured.documents.elements.CompositeElement at 0x1dbd2757590>,
 <unstructured.documents.elements.CompositeElement at 0x1dbc485c1a0>,
 <unstructured.docum

In [None]:
# Print the metadata of every partitioned element as a plain dict for inspection.
for pdf_element in raw_pdf_elements:
    metadata_dict = pdf_element.metadata.to_dict()
    print(metadata_dict)

In [None]:
import base64

# Buckets filled further down in this cell: text and table chunks taken from
# raw_pdf_elements, and base64-encoded images read from output_path.
text_elements = []
table_elements = []
image_elements = []

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    The file is opened in binary mode and the base64 output is decoded to a
    ``str`` using UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Split the partitioned chunks by the name of their class: chunks whose type name
# contains 'CompositeElement' go to the text bucket, those containing 'Table' go
# to the table bucket, and anything else is ignored. Only the chunk text is kept.
text_elements = []
table_elements = []
for element in raw_pdf_elements:
    element_type = str(type(element))
    if 'CompositeElement' in element_type:
        text_elements.append(element.text)
    elif 'Table' in element_type:
        table_elements.append(element.text)

# Tables
print("The length of table elements are :", len(table_elements))

# Text
print("The length of text elements are :", len(text_elements))

# Collect the extracted figures as base64 strings.
# - A missing directory is handled like an empty one instead of raising
#   FileNotFoundError from os.listdir.
# - The directory is listed once and sorted, so the image order does not depend
#   on the order the operating system happens to return.
image_elements = []
folder_entries = sorted(os.listdir(output_path)) if os.path.isdir(output_path) else []

if folder_entries:  # True if folder has any files or subfolders
    print("Folder has files, proceeding...")
    for image_file in folder_entries:
        # Compare the extension case-insensitively so ".PNG" / ".JPG" are not skipped.
        if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(output_path, image_file)
            image_elements.append(encode_image(image_path))

    # image
    print("The length of image elements are :", len(image_elements))
else:
    print("❌ Folder is empty. Skipping execution.")



The length of table elements are : 5
The length of text elements are : 106
Folder has files, proceeding...
The length of image elements are : 7


In [159]:
# Chat model client shared by the summarize_* helpers (a model object, not a
# chain, despite the name).
chain_gpt= ChatOpenAI(model="gpt-4o")

In [160]:
def summarize_text(text_element):
    """Ask the chat model to summarize ``text_element`` and return the reply text.

    The element is interpolated into a fixed prompt, sent as a single human
    message through the module-level ``chain_gpt``, and the ``content``
    attribute of the response is returned.
    """
    request = HumanMessage(
        content=f"Summarize the following text:\n\n{text_element}\n\nSummary:"
    )
    return chain_gpt.invoke([request]).content

In [161]:
# One model call per text chunk; the summaries keep the order of text_elements.
text_summaries = list(map(summarize_text, text_elements))
text_summaries

['Google grants permission to reproduce the tables and figures from the paper for journalistic or scholarly purposes, provided proper attribution is given.',
 '"Attention Is All You Need" is a seminal paper that introduces the Transformer model, a novel architecture for sequence-to-sequence tasks. The authors, a team of researchers from Google Brain and other institutions, propose a model that relies entirely on self-attention mechanisms, dispensing with recurrent and convolutional layers traditionally used for these tasks. The Transformer is designed to enhance parallelization and improve computational efficiency. It demonstrates superior performance in tasks such as translation, setting new benchmarks and significantly reducing training time compared to previous models. The paper\'s contributions have had a profound impact on natural language processing and have paved the way for advancements in machine learning models.',
 'The dominant sequence transduction models typically use comp

In [162]:
def summarize_table(table_element):
    """Ask the chat model to summarize ``table_element`` and return the reply text.

    The element is interpolated into a fixed prompt, sent as a single human
    message through the module-level ``chain_gpt``, and the ``content``
    attribute of the response is returned.
    """
    request = HumanMessage(
        content=f"Summarize the following table:\n\n{table_element}\n\nSummary:"
    )
    return chain_gpt.invoke([request]).content

In [163]:
# One model call per table; the summaries keep the order of table_elements.
table_summaries = list(map(summarize_table, table_elements))
table_summaries

['The table compares several machine translation models based on their performance in translating from English to German (EN-DE) and English to French (EN-FR), measured by BLEU scores, along with their training costs in FLOPs. The ByteNet model has a BLEU score of 23.75 for EN-DE. The Deep-Att + PosUnk model achieves a BLEU score of 39.2 for EN-FR. The GNMT + RL model records 24.6 for EN-DE and 39.92 for EN-FR, with training costs of 2.3x10^19 and 1.4x10^70 FLOPs, respectively. The ConvS2S model has scores of 25.16 for EN-DE and 40.46 for EN-FR, with costs of 9.6x10^19 and 1.5x10^70 FLOPs. The MoE model scores 26.03 for EN-DE and 40.56 for EN-FR, with costs of 2.0x10^19 and 1.2x10^79 FLOPs. Ensemble versions of the Deep-Att + PosUnk and GNMT + RL models improve EN-FR scores to 40.4 and 41.16, respectively, with varying costs. The ConvS2S Ensemble scores 26.36 for EN-DE and 41.29 for EN-FR, with costs of 7.7x10^19 and 1.2x10^71 FLOPs. The Transformer models achieve the highest BLEU scor

In [164]:

def summarize_image(encoded_image, mime_type=None):
    """Ask the chat model to describe a base64-encoded image.

    Args:
        encoded_image: Base64 text of the image file contents.
        mime_type: MIME type to declare in the data URL. When omitted it is
            inferred from the payload: PNG data is labelled ``image/png`` and
            anything else falls back to ``image/jpeg``.

    Returns:
        The ``content`` attribute of the model's response.
    """
    if mime_type is None:
        # Every PNG file starts with the same 8-byte signature, whose base64
        # encoding begins with "iVBORw0KGg".
        if encoded_image.startswith("iVBORw0KGg"):
            mime_type = "image/png"
        else:
            mime_type = "image/jpeg"

    prompt = [
        # The role instruction is sent as a system message; as an AIMessage it
        # would be presented to the model as one of its own earlier replies.
        SystemMessage(content="You are a bot that is good at analyzing images."),
        HumanMessage(content=[
            {"type": "text", "text": "Describe the contents of this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encoded_image}"
                },
            },
        ])
    ]
    response = chain_gpt.invoke(prompt)
    return response.content

In [165]:
# One model call per extracted image; the summaries keep the order of image_elements.
image_summaries = list(map(summarize_image, image_elements))

In [166]:
# Show the image descriptions returned by summarize_image.
image_summaries

['The image appears to be a diagram or visualization showing a sequence of words, likely representing a sentence. The words are connected with lines, possibly indicating relationships or dependencies between them. The sentence seems to be about American governments and voting laws, mentioning the difficulty of the registration or voting process since 2009. There are elements like "<EOS>" and "<pad>" which suggest this could be related to a machine learning model\'s processing of text data.',
 'This image appears to depict a visual representation of text alignment or attention mapping. It shows two sentences with words connected by lines of varying thickness and opacity, indicating associations or correspondences between the words. The sentences are:\n\n1. "The Law will never be perfect, but its application should be just."\n2. "This is what we are missing, in my opinion."\n\nThe connections suggest relationships between specific words in each sentence. Additionally, there are special t

In [167]:
# Chroma collection used as the retriever's vector store, with OpenAI embeddings.
vectorstore = Chroma(collection_name="summaris", embedding_function=OpenAIEmbeddings())
# In-memory store used as the retriever's docstore.
store = InMemoryStore()
# Metadata key under which a document's id is recorded.
id_key = "doc_id"
# `top_k` is not a MultiVectorRetriever field; the number of vector-store hits
# is controlled through `search_kwargs`.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
    search_kwargs={"k": 1},
)

In [168]:
def add_documents_to_retriever(summaries, original_contents):
    """Index ``summaries`` for search and store the matching originals.

    Each summary gets a fresh UUID. The summary is added to the retriever's
    vector store with that id recorded under ``id_key`` in its metadata, and the
    original content at the same position is written to the retriever's
    docstore under the same id.

    Args:
        summaries: Texts to add to the vector store, one per original.
        original_contents: Contents to store, aligned by position with
            ``summaries``.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    summaries = list(summaries)
    original_contents = list(original_contents)

    if len(summaries) != len(original_contents):
        # zip() would silently drop the unmatched tail, leaving indexed
        # summaries whose id has no entry in the docstore.
        raise ValueError(
            f"summaries ({len(summaries)}) and original_contents "
            f"({len(original_contents)}) must have the same length"
        )
    if not summaries:
        # Nothing to index; skip the store calls entirely.
        return

    doc_ids = [str(uuid.uuid4()) for _ in summaries]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for summary, doc_id in zip(summaries, doc_ids)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))


# Index every modality: the summary is what gets embedded and searched, the
# original content is what is stored for retrieval. The text and table entries
# pass their summaries (not the raw elements) as the first argument, the same
# way the image entry does.
for label, summaries, originals in (
    ("Text", text_summaries, text_elements),
    ("Table", table_summaries, table_elements),
    ("Image", image_summaries, image_elements),
):
    if len(summaries) > 0:
        print(f'{label} Summary Exist')
        add_documents_to_retriever(summaries, originals)
    else:
        print(f'{label} is empty')

Text Summary Exist
Table Summary Exist
Image Summary Exist


In [173]:
# template = """Extract the following personal details based only on the context provided below. 
# The context may include text, images, and tables.

# {context}

# Return the result in this exact format:

# Name                   : <Actual Name>
# Email                  : <Actual Email>
# Mobile Number          : <Actual Mobile Number>
# Latest Education       : <Actual Latest Education>
# KEY SKILL SET          : <Actual KEY SKILL SET >

# Question: {question}
# """

# Prompt text with two placeholders: {context} and {question}.
template = (
    "Extract the requested details.Do not give ambiguous answer.\n"
    "Answers should be to the point.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(model="gpt-4o", temperature=0)

# The chain input feeds both placeholders: the retriever supplies "context",
# and RunnablePassthrough supplies "question".
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = chain_inputs | prompt | model | StrOutputParser()


In [174]:
# Run one question through the chain and print the parsed string answer.
question= "What is SoftMax and MatMul in Decoder section"
answer = chain.invoke(question)
print(answer)

SoftMax and MatMul are operations typically used in the decoder section of neural network architectures, such as transformers. 

- **SoftMax**: This is a function that converts a vector of raw scores (logits) into probabilities. It is often used in the final layer of a neural network to produce a probability distribution over possible output classes.

- **MatMul**: This stands for matrix multiplication. It is a fundamental operation in neural networks used to compute the dot product between matrices, which is essential for transforming input data through the network layers.
