# Importing Dependencies

In [1]:
from typing import Any
import os
from unstructured.partition.pdf import partition_pdf
import pytesseract
import os
import base64
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
import uuid
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser


# Parse PDFs using Tesseract and Unstructured (ML-Based PDF Reader)

Using Tesseract/Unstructured allows us to extract images from PDFs. We can then use a vision model to create summaries of the images that can be used as context when generating content

This approach is especially useful because it can extract text in English, German and French (the required EPO languages), so in theory it should be able to process the majority of EPO applications.

In [2]:
input_path = os.getcwd()
# Directory where images extracted from the PDF are written (and later read back for encoding).
# NOTE(review): this is a machine-specific absolute path — consider making it relative to the project.
output_path = 'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/figures'

# Partition the patent PDF into chunked elements and extract its images to `output_path`
raw_pdf_elements = partition_pdf(
    filename=os.path.join(input_path, "us10745814.pdf"),
    extract_images_in_pdf=True,  # Get images
    infer_table_structure=True,  # Get tables
    chunking_strategy="by_title",  # Preserves sections (by headings and subheadings, as well as over pages where possible)
    max_characters=4000,  # Define chunk size
    new_after_n_chars=3800,  # Soft max chunk size
    combine_text_under_n_chars=2000,  # Min length of chunk size
    image_output_dir_path=output_path,  # Directory to store images (single source of truth)
)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

# Sort PDF Elements into Tables, Text and Images

In [None]:
# Bucket the partitioned elements by kind, keeping only each element's text.
# The kind is detected from the element's class name.
text_elements, table_elements, image_elements = [], [], []

for pdf_element in raw_pdf_elements:
    type_name = str(type(pdf_element))
    if 'CompositeElement' in type_name:
        text_elements.append(pdf_element.text)
    elif 'Table' in type_name:
        table_elements.append(pdf_element.text)

# Sanity check: how many tables, then how many text chunks, were extracted
print(len(table_elements))
print(len(text_elements))

# Function to Encode images for use with GPT-4 Vision Model 

In [None]:
# Function to encode images
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

### Encoding Images

In [None]:
# Rebuild the list from scratch so that re-running this cell never appends duplicates.
# Files are visited in sorted order so the image order is reproducible across runs,
# and the extension check is case-insensitive (e.g. ".PNG" is accepted too).
image_elements = [
    encode_image(os.path.join(output_path, image_file))
    for image_file in sorted(os.listdir(output_path))
    if image_file.lower().endswith(('.png', '.jpg', '.jpeg'))
]

# Define API KEY Globally

In [None]:
# SECURITY: never commit API keys to source control. The key that was previously
# hard-coded in this cell has been exposed and should be revoked.
# The key is read from the environment instead; clients constructed without an
# explicit key (e.g. OpenAIEmbeddings()) are presumed to read the same variable.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your environment before starting the notebook."
    )

# Functions for Creating Summaries for Text, Tables and Images in PDF(s)

In [None]:
# Both chat models share the same response-length cap and API key; only the model name differs.
_shared_chat_kwargs = {"max_tokens": 1024, "api_key": OPENAI_API_KEY}
chain_gpt_35 = ChatOpenAI(model="gpt-3.5-turbo", **_shared_chat_kwargs)  # text and table summaries
chain_gpt_4_vision = ChatOpenAI(model="gpt-4-vision-preview", **_shared_chat_kwargs)  # image summaries

# Function for text summaries
def summarize_text(text_element):
    """Ask the GPT-3.5 chat model to summarize one text chunk and return the reply text."""
    request = HumanMessage(
        content=f"Summarize the following text:\n\n{text_element}\n\nSummary:"
    )
    return chain_gpt_35.invoke([request]).content

# Function for table summaries
def summarize_table(table_element):
    """Ask the GPT-3.5 chat model to summarize one table (as text) and return the reply text."""
    request = HumanMessage(
        content=f"Summarize the following table:\n\n{table_element}\n\nSummary:"
    )
    return chain_gpt_35.invoke([request]).content

# Function for image summaries
def summarize_image(encoded_image, mime_type="image/jpeg"):
    """Ask the vision chat model to describe a base64-encoded image.

    Args:
        encoded_image: Base64 string of the image bytes (as produced by ``encode_image``).
        mime_type: MIME type placed in the data URL. Defaults to ``"image/jpeg"``,
            the value that was previously hard-coded; pass ``"image/png"`` for PNG files.

    Returns:
        The text content of the model's reply.
    """
    # Local import: SystemMessage is not among the notebook's top-level imports.
    from langchain.schema.messages import SystemMessage

    data_url = f"data:{mime_type};base64,{encoded_image}"
    messages = [
        # The instruction is a system prompt, not a previous assistant turn,
        # so it is sent as a SystemMessage rather than an AIMessage.
        SystemMessage(content="You are a bot that is good at analyzing images."),
        HumanMessage(content=[
            {"type": "text", "text": "Describe the contents of this image."},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]),
    ]
    response = chain_gpt_4_vision.invoke(messages)
    return response.content

# Generate Summaries of Tables, Text and Images

### Table Summaries

In [None]:
# Summarize every extracted table, printing progress after each one
table_summaries = []
total_tables = len(table_elements)
for position, table_text in enumerate(table_elements, start=1):
    table_summaries.append(summarize_table(table_text))
    print(f"Processed table {position}/{total_tables}.")

### Text Summaries

In [None]:
# Summarize every extracted text chunk, printing progress after each one
text_summaries = []
total_texts = len(text_elements)
for position, text_chunk in enumerate(text_elements, start=1):
    text_summaries.append(summarize_text(text_chunk))
    print(f"Processed text chunk {position}/{total_texts}.")

### Image Summaries

In [None]:
# Summarize every encoded image, printing progress after each one
image_summaries = []
total_images = len(image_elements)
for position, encoded in enumerate(image_elements, start=1):
    image_summaries.append(summarize_image(encoded))
    print(f"Processed image {position}/{total_images}.")

print(image_summaries)

# Save Documents and Summaries to Database (Chroma)

In [None]:
# Metadata key stored on each summary document and passed to the retriever as its id_key
id_key = "doc_id"

# Summaries are embedded and indexed in Chroma; the original contents are kept in an in-memory docstore
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())
store = InMemoryStore()

# Multi-vector retriever wired to the vector store and the docstore via `id_key`
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# Function to add documents to the retriever
def add_documents_to_retriever(summaries, original_contents):
    """Index summaries in the vector store and file their originals in the docstore.

    Each summary gets a fresh UUID, stored under ``id_key`` in its metadata; the
    original content at the same position is saved in the docstore under that UUID.

    Args:
        summaries: Summary strings to embed and index.
        original_contents: Source contents, aligned one-to-one with ``summaries``.

    Raises:
        ValueError: If the two inputs differ in length. Previously the extra items
            were silently dropped by ``zip``, leaving summaries without an original.
    """
    summaries = list(summaries)
    original_contents = list(original_contents)
    if len(summaries) != len(original_contents):
        raise ValueError(
            f"summaries ({len(summaries)}) and original_contents "
            f"({len(original_contents)}) must have the same length"
        )
    if not summaries:
        # Nothing to index (e.g. the PDF had no tables); skip rather than hand
        # the vector store an empty batch.
        return

    doc_ids = [str(uuid.uuid4()) for _ in summaries]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(doc_ids, summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))

In [None]:
# Register every (summaries, originals) pair with the retriever: text, then tables, then images
for summaries, originals in (
    (text_summaries, text_elements),
    (table_summaries, table_elements),
    (image_summaries, image_summaries),  # hopefully real images soon
):
    add_documents_to_retriever(summaries, originals)

# Return top-N contexts from database (by cosine similarity)  

In [None]:
# Show the context documents the retriever returns for a sample question
sample_question = 'What are some methods for electrically controlled nucleotide chain synthesis?'
retriever.get_relevant_documents(sample_question)

# Collect Prompt from User

In [None]:
# Prompt that restricts the model to the retrieved context
template = """Answer the question based only on the following context, which can include text, images and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# The incoming question is routed to the retriever (to fill `context`) and passed through as `question`
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = chain_inputs | prompt | model | StrOutputParser()

# Generate Response from User Input

In [None]:
# Run the full retrieval + generation chain for a sample question
user_question = 'What are some methods for electrically controlled nucleotide chain synthesis?'
chain.invoke(user_question)

# Integrating Backend with Gradio App