#### Installing Dependencies

In [59]:
from typing import Any
import os
from unstructured.partition.pdf import partition_pdf
import pytesseract
import os
import base64
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
import uuid
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
import openai
import gradio


#### Define input and output paths

In [32]:
input_path = os.getcwd()
output_path = 'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/figures'

#### Define and Instantiate LLM

In [57]:
# API Key


OPENAI_API_KEY = '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'
os.environ['OPENAI_API_KEY'] = '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'
openai.api_key = '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'

# OpenAI Model

chain_gpt_35 = ChatOpenAI(model="gpt-4-0125-preview", max_tokens=1024, api_key=OPENAI_API_KEY)
chain_gpt_4_vision = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1024, api_key=OPENAI_API_KEY)

#### Function to encode images for use with GPT-4 Vision Model 

In [34]:
# Function to encode images
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

#### Functions to create summaries from PDF Text, Tables and Images 

In [35]:
# Function for text summaries
def summarize_text(text_element):
    prompt = f"Summarize in detail the following text:\n\n{text_element}\n\nSummary:"
    response = chain_gpt_35.invoke([HumanMessage(content=prompt)])
    return response.content

# Function for table summaries
def summarize_table(table_element):
    prompt = f"Summarize in detail the following table:\n\n{table_element}\n\nSummary:"
    response = chain_gpt_35.invoke([HumanMessage(content=prompt)])
    return response.content

# Function for image summaries
def summarize_image(encoded_image):
    prompt = [
        AIMessage(content="You are a bot that is good at analyzing images."),
        HumanMessage(content=[
            {"type": "text", "text": "Describe the contents of this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded_image}"
                },
            },
        ])
    ]
    response = chain_gpt_4_vision.invoke(prompt)
    return response.content

#### Define retreival algorithm and database

In [None]:
store = InMemoryStore()
id_key = "doc_id"

# Initialize the retriever
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)


#### Function to add documents to retrieval algorithm

In [37]:
# Function to add documents to the retriever
def add_documents_to_retriever(summaries, original_contents):
    id_key = "doc_id"
    doc_ids = [str(uuid.uuid4()) for _ in summaries]
    summary_docs = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))

#### Adding PDF information into database

In [None]:
Patents = ['us10745814', 'us2021229059']

for patent in Patents:

    raw_pdf_elements = partition_pdf(
        filename=os.path.join(input_path, f"{patent}.pdf"),
        extract_images_in_pdf=True, # Get images 
        infer_table_structure=True, # Get tables 
        chunking_strategy="by_title", # Preserves sections (by headings and subheadings, as well as over pages where possible)
        max_characters=4000, # Define chunk size
        new_after_n_chars=3800, # Soft max chunk size 
        extract_image_block_types=['Image'],
        combine_text_under_n_chars=2000, # Min length of chunk size
        extract_image_block_output_dir=f'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/{patent}_images') #Directory to store images


    # Save table and text elements to list
    text_elements = []
    table_elements = []
    image_elements = []

    for element in raw_pdf_elements:
        if 'CompositeElement' in str(type(element)):
            text_elements.append(element)
        elif 'Table' in str(type(element)):
            table_elements.append(element)

    table_elements = [i.text for i in table_elements]
    text_elements = [i.text for i in text_elements]

    # Save image elements to list, converting to format compatible with Vision transformer model

    for image_file in os.listdir(f'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/{patent}_images'):
        if image_file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(f'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/{patent}_images', image_file)
            encoded_image = encode_image(image_path)
            image_elements.append(encoded_image)

    # Generate Table summaries
    table_summaries = []
    
    for i, te in enumerate(table_elements):
        summary = summarize_table(te)
        table_summaries.append(summary)
        print(f"{i + 1}th element of tables processed.")

    # Generate Text element summaries
    text_summaries = []
    for i, te in enumerate(text_elements):
        summary = summarize_text(te)
        text_summaries.append(summary)
        print(f"{i + 1}th element of texts processed.")

    # Generate summaries of images
    image_summaries = []
    for i, ie in enumerate(image_elements):
        try:
            summary = summarize_image(ie)
            image_summaries.append(summary)
        except:
            print(f'{ie} could not be summarised, could be too large')
        print(f"{i + 1}th element of images processed.")

    # Add text, table and image summaries to vector database

    add_documents_to_retriever(text_summaries, text_elements)
    add_documents_to_retriever(table_summaries, table_elements)
    add_documents_to_retriever(image_summaries, image_summaries) 


In [54]:

print(retriever)

vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002916281B820> docstore=<langchain.storage.in_memory.InMemoryBaseStore object at 0x000002916281BF10>


#### Processing user queries

In [44]:
#Define prompt template

template = """Answer the question based only on the following context, which can include text, images and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [49]:
# response = chain.invoke('What are some methods for electrically controlled nucleotide chain synthesis?')

print(len(retriever.get_relevant_documents('What are some methods for electrically controlled nucleotide chain synthesis?')))

# print(response)

4


In [62]:
messages = [
    {"role": "system", "content": "You are a helpful and kind AI Assistant."},
]

def chatbot(input):
    if input:
        messages.append({"role": "user", "content": input})
        chat = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", messages=messages
        )
        reply = chat.choices[0].message.content
        messages.append({"role": "assistant", "content": reply})
        return reply

inputs = gradio.Textbox(lines=7, label="Chat with AI")
outputs = gradio.Textbox(label="Reply")

gradio.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="Your First AI ChatBot",
             description="Ask anything you want",
             theme="compact").launch(share=True)


Sorry, we can't find the page you are looking for.


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://b99a31e870e0963fb5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "c:\Users\eeo21\Anaconda3\lib\site-packages\gradio\queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
  File "c:\Users\eeo21\Anaconda3\lib\site-packages\gradio\route_utils.py", line 231, in call_process_api
    output = await app.get_blocks().process_api(
  File "c:\Users\eeo21\Anaconda3\lib\site-packages\gradio\blocks.py", line 1591, in process_api
    result = await self.call_function(
  File "c:\Users\eeo21\Anaconda3\lib\site-packages\gradio\blocks.py", line 1176, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "c:\Users\eeo21\Anaconda3\lib\site-packages\anyio\to_thread.py", line 28, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(func, *args, cancellable=cancellable,
  File "c:\Users\eeo21\Anaconda3\lib\site-packages\anyio\_backends\_asyncio.py", line 818, in run_sync_in_worker_thread
    return await future
  File "c:\Users\eeo21\Anaconda3\lib\site-pack