#### Installing Dependencies

In [9]:
from typing import Any
import os
from unstructured.partition.pdf import partition_pdf
import pytesseract
import os
import base64
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
import uuid
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
import openai
import gradio
import json

#### Define input and output paths

In [10]:
input_path = os.getcwd()
output_path = 'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/figures'

#### Define and Instantiate LLM

In [11]:
# API Key


OPENAI_API_KEY = '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'
os.environ['OPENAI_API_KEY'] = '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'
openai.api_key = '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'

# OpenAI Model

chain_gpt_35 = ChatOpenAI(model="gpt-4-0125-preview", max_tokens=1024, api_key=OPENAI_API_KEY)
chain_gpt_4_vision = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1024, api_key=OPENAI_API_KEY)

#### Function to encode images for use with GPT-4 Vision Model 

In [12]:
# Function to encode images
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

#### Functions to create summaries from PDF Text, Tables and Images 

In [13]:
# Function for text summaries
def summarize_text(text_element):
    prompt = f"Summarize in detail the following text:\n\n{text_element}\n\nSummary:"
    response = chain_gpt_35.invoke([HumanMessage(content=prompt)])
    return response.content

# Function for table summaries
def summarize_table(table_element):
    prompt = f"Summarize in detail the following table:\n\n{table_element}\n\nSummary:"
    response = chain_gpt_35.invoke([HumanMessage(content=prompt)])
    return response.content

# Function for image summaries
def summarize_image(encoded_image):
    prompt = [
        AIMessage(content="You are a bot that is good at analyzing images."),
        HumanMessage(content=[
            {"type": "text", "text": "Describe the contents of this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded_image}"
                },
            },
        ])
    ]
    response = chain_gpt_4_vision.invoke(prompt)
    return response.content

#### Define retreival algorithm and database

In [14]:
store = InMemoryStore()
id_key = "doc_id"

# Initialize the retriever
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)


#### Function to add documents to retrieval algorithm

In [15]:
# Function to add documents to the retriever
def add_documents_to_retriever(summaries, original_contents):
    id_key = "doc_id"
    doc_ids = [str(uuid.uuid4()) for _ in summaries]
    summary_docs = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, original_contents)))

#### Adding PDF information into database

In [16]:
Patents = ['us10745814', 'us2021229059']

for patent in Patents:

    raw_pdf_elements = partition_pdf(
        filename=os.path.join(input_path, f"{patent}.pdf"),
        extract_images_in_pdf=True, # Get images 
        infer_table_structure=True, # Get tables 
        chunking_strategy="by_title", # Preserves sections (by headings and subheadings, as well as over pages where possible)
        max_characters=4000, # Define chunk size
        new_after_n_chars=3800, # Soft max chunk size 
        extract_image_block_types=['Image'],
        combine_text_under_n_chars=2000, # Min length of chunk size
        extract_image_block_output_dir=f'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/{patent}_images') #Directory to store images


    # Save table and text elements to list
    text_elements = []
    table_elements = []
    image_elements = []

    for element in raw_pdf_elements:
        if 'CompositeElement' in str(type(element)):
            text_elements.append(element)
        elif 'Table' in str(type(element)):
            table_elements.append(element)
    
    patent_id = f'###_### {patent}' 

    table_elements = [i.text for i in table_elements]
    table_elements = [''.join([i, patent_id]) for i in table_elements]

    text_elements = [i.text for i in text_elements]
    text_elements = [''.join([i, patent_id]) for i in text_elements]

    # Save image elements to list, converting to format compatible with Vision transformer model

    for image_file in os.listdir(f'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/{patent}_images'):
        if image_file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(f'C:/Users/eeo21/VSCodeProjects/PatentRAGApplication/{patent}_images', image_file)
            encoded_image = encode_image(image_path)
            image_elements.append(encoded_image)

    # Generate Table summaries
    table_summaries = []
    
    for i, te in enumerate(table_elements):
        summary = summarize_table(te)
        summary = ''.join([summary, patent_id])
        table_summaries.append(summary)
        print(f"{i + 1}th element of tables processed.")

    # Generate Text element summaries
    text_summaries = []
    for i, te in enumerate(text_elements):
        summary = summarize_text(te)
        summary = ''.join([summary, patent_id])
        text_summaries.append(summary)
        print(f"{i + 1}th element of texts processed.")

    # Generate summaries of images
    image_summaries = []
    for i, ie in enumerate(image_elements):
        try:
            summary = summarize_image(ie)
            summary = ''.join([summary, patent_id])
            image_summaries.append(summary)
        except:
            print(f'{ie} could not be summarised, could be too large')
        print(f"{i + 1}th element of images processed.")

    # Add text, table and image summaries to vector database
    add_documents_to_retriever(text_summaries, text_elements)
    add_documents_to_retriever(table_summaries, table_elements)
    add_documents_to_retriever(image_summaries, image_summaries) 

    # Save pdf elements to databases 

    texts =list(zip(text_summaries, text_elements))
    tables = list(zip(table_summaries, table_elements))
    images = list(zip(image_summaries, image_summaries))

    components = [texts, tables, images]
    
    PDFdictionary = {}
    for component in components:
        for pair in component:
            PDFdictionary[pair[0]] = [pair[1]]
    
    with open(f"{patent}.json", "w") as outfile: 
        json.dump(PDFdictionary, outfile)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


1th element of tables processed.
2th element of tables processed.
3th element of tables processed.
4th element of tables processed.
5th element of tables processed.
1th element of texts processed.
2th element of texts processed.
3th element of texts processed.
4th element of texts processed.
5th element of texts processed.
6th element of texts processed.
7th element of texts processed.
8th element of texts processed.
9th element of texts processed.
10th element of texts processed.
11th element of texts processed.
12th element of texts processed.
13th element of texts processed.
14th element of texts processed.
15th element of texts processed.
16th element of texts processed.
17th element of texts processed.
18th element of texts processed.
19th element of texts processed.
20th element of texts processed.
21th element of texts processed.
22th element of texts processed.
23th element of texts processed.
24th element of texts processed.
25th element of texts processed.
26th element of tex

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


1th element of tables processed.
1th element of texts processed.
2th element of texts processed.
3th element of texts processed.
4th element of texts processed.
5th element of texts processed.
6th element of texts processed.
7th element of texts processed.
8th element of texts processed.
9th element of texts processed.
10th element of texts processed.
11th element of texts processed.
12th element of texts processed.
13th element of texts processed.
14th element of texts processed.
15th element of texts processed.
16th element of texts processed.
17th element of texts processed.
18th element of texts processed.
1th element of images processed.
2th element of images processed.
3th element of images processed.
4th element of images processed.
5th element of images processed.
6th element of images processed.
7th element of images processed.
8th element of images processed.
9th element of images processed.


#### Define inference model and prompt

In [17]:
#Define prompt template

template = """Answer the question based only on the following context, which can include text, images and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

Aim = 'Quality' #Change to slow for faster response, perhaps with lower quality

if Aim == 'Quality':
    model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview")
elif Aim == 'Fast':
    model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

chain = ({"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser())

#### Define Gradio App

In [18]:
response = chain.invoke('How can I use pH alternation to activate an enzyme intermittently, in order to add a nucleotide to a polymer chain?')

context = retriever.get_relevant_documents('How can I use pH alternation to activate an enzyme intermittently, in order to add a nucleotide to a polymer chain?')

for x in context:
     print(context)

print(response)

['2019r0040459 Al 2r2019 Pi\'casiicb ei sl\n\n\'" cited by examiner\n\nPrimriri Ear(ruiner Narayan K Bhat (74) driornel;\n\n)gent, or Firm Banner & Witcofi; Ltd.\n\n)gent, or Firm Banner & Witcofi; Ltd.\n\nABSTRACT (57) The present disclosure provides methods of activatin an enzyme. such as error prone or template independent poly- mcrasc, using clcctucity io alter pH of a rcactton zone and reaction site from an inactn aiing pH at wluch tlm enzyme is inactive to an activating pl I at v hich the enzynie is active to add a nucleotide to an initiator or amwing polynier chain \'I\'he activating pl I can then be changed back to an inacti- vating pH and the process repeated as many times as desired to produce a target nucleic acid sequence\n\n48 Claims, 9 Drawing Sheets\n\nSpecilication includes a Sequence Listing.\n\nU.S. Patent\n\nAug. 18,2020\n\ng FIG.1 43 rs & Ss %, % ey oe Re e red\n\n1-Borano-dATP\n\no  o o*  HO  B3eara-dATP  0 5-Propargylamlno-dNP  HO  S~ R~ /X. / 0 0  0  0  0  0  ,54

In [19]:
messages = [
    {"role": "system", "content": "You are a helpful and kind AI Assistant."},
]

def chatbot(input):
    if input:
        chat = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", messages=messages
        )
        reply = chat.choices[0].message.content
        messages.append({"role": "assistant", "content": reply})
        return reply

inputs = gradio.Textbox(lines=7, label="Chat with AI")
outputs = gradio.Textbox(label="Reply")

gradio.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="Your First AI ChatBot",
             description="Ask anything you want",
             theme="compact").launch(share=True)


Sorry, we can't find the page you are looking for.


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://746664bfde196bc668.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


