In [2]:
import yaml
import langchain
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

Load API key.

In [3]:
with open('../api_keys.yaml', 'r') as f:
    keys = yaml.safe_load(f)

openai_api_key = keys['openai']

## Initial exploration of FAISS as a vector store

In [4]:
loader = PyPDFLoader("../example_data/Aviza_et_al_2007.pdf")
pages = loader.load_and_split()

NOTE: Not sure if all patents will be like this, but some "pages" are just headers since the actual content consists of images.

In [7]:
pages[6].page_content

'U.S. Patent Sep. 25, 2007 Sheet 6 0f 8 \n {I 6 \n £3 US 7,272,991 B2'

In [8]:
[len(p.page_content) for p in pages]

[2464, 54, 54, 103, 54, 54, 66, 54, 54, 3963, 3350, 3998, 2179, 2647]

In [9]:
pages = [p for p in pages if len(p.page_content) > 100]

In [10]:
faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings(openai_api_key=openai_api_key))
docs = faiss_index.similarity_search("Summarize the independent claims.", k=2)
for doc in docs:
    print(str(doc.metadata["page"]) + ":", doc.page_content)
docs

3: U.S. Patent Sep. 25, 2007 Sheet 3 0f 8 US 7,272,991 B2 
 m» i 
 swim 
 {.3 G 
 Mm 
 came an 
 an as 
 K
9: cutting edges parallel to each other and spaced from adjacent 
 cutting edges so as to de?ne a shaving surface, connecting 
 the ?rst longitudinal ends to each other and the second 
 longitudinal ends to each by Welding While the cutting edges 
 are maintained parallel to each other. 
 Particular embodiments of the invention may include one 
 or more of the folloWing features. In particular embodiments 
 a ?xture is used to align the blades in parallel planes and to 
 position the cutting edges at desired positions. The ?xture 
 has slots to align the blades and stop surfaces to position the 
 cutting edges. The integral unit of blades is positioned into 
 a recess in a housing. The recess can be open to the top, With, 
 e.g., the integral blade unit being loWered into the recess and 
 held in place by clips or by snap-?tting, or the recess can 
 open to the bottom, With the in

[Document(page_content='U.S. Patent Sep. 25, 2007 Sheet 3 0f 8 US 7,272,991 B2 \n m» i \n swim \n {.3 G \n Mm \n came an \n an as \n K', metadata={'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 3}),
 Document(page_content='cutting edges parallel to each other and spaced from adjacent \n cutting edges so as to de?ne a shaving surface, connecting \n the ?rst longitudinal ends to each other and the second \n longitudinal ends to each by Welding While the cutting edges \n are maintained parallel to each other. \n Particular embodiments of the invention may include one \n or more of the folloWing features. In particular embodiments \n a ?xture is used to align the blades in parallel planes and to \n position the cutting edges at desired positions. The ?xture \n has slots to align the blades and stop surfaces to position the \n cutting edges. The integral unit of blades is positioned into \n a recess in a housing. The recess can be open to the top, With, \n e.g., the integral blade u

In [19]:
type(pages)

list

## Function for ingesting multiple patent PDFs and putting them into a single vector store

In [56]:
def create_faiss_db(doc_paths: list[str], faiss_index: FAISS | None = None):

    all_pages = []
    for doc in doc_paths:
        loader = PyPDFLoader(doc)
        pages = loader.load_and_split()

        # Filter out any pages that are just the header (e.g. because the rest of the content is images.) 
        #  Also the first page might have useful info but has too much special formatting (e.g. specific numbers in front of specific fields)
        pages = [p for p in pages if len(p.page_content) > 150 and p.metadata['page'] > 0]
        all_pages += pages

    tmp_index = FAISS.from_documents(all_pages, OpenAIEmbeddings(openai_api_key=openai_api_key))
    if faiss_index is None:
        faiss_index = tmp_index
    else:
        faiss_index.merge_from(tmp_index)

    return faiss_index
    

Some examination of specific pages to address odd patent formatting.

In [57]:
doc_paths = ["../example_data/Aviza_et_al_2007.pdf", "../example_data/Jessemey_et_al_2011.pdf"]
faiss_index = create_faiss_db(doc_paths)

In [49]:
faiss_index.docstore._dict['12b34936-7858-4810-adf3-863a11d978ed'].metadata

{'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 0}

In [53]:
for doc_id, page in faiss_index.docstore._dict.items():
    if page.metadata['page'] > 0 and page.metadata['source'] == '../example_data/Aviza_et_al_2007.pdf' and len(page.page_content) > 150:
        print(page.page_content)
        break

US 7,272,991 B2 
 1 
 SHAVING RAZORS, AND BLADE 
 SUBASSEMBLIES THEREFOR AND 
 METHODS OF MANUFACTURE 
 The invention relates to shaving razors, and blade subas 
 semblies therefor and methods of manufacture. 
 Shaving razors often include a plurality of blades that are 
 secured in a desired position in a plastic housing. The 
 housing is often provided With a guard With ?ns or other skin 
 engaging structures made of elastomeric material in front of 
 the blades, and a cap on Which the skin can slide behind the 
 blades. A shaving aid (e.g., a lubricant agent dispensing 
 mechanism) can be incorporated into the cap and, in some 
 cases, the guard. The blades can be stationary or movable, 
 and the housing can be ?xed to a handle or movably 
 mounted on the handle, to, e.g., assist in folloWing the 
 contours of the skin during shaving. 
 Examples of some different types of shaving raZors are 
 described in US. Pat. Nos. 5,313,706; 5,369,885; 5,416, 
 974; 5,546,660; 6,032,372; 6,145,

## Use docs in FAISS to answer questions

In [11]:
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
# Below prompt:
"""Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT

In [12]:
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [58]:
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0, openai_api_key=openai_api_key), faiss_index.as_retriever(), return_source_documents=True)

In [59]:
chat_history = []
query = "Summarize the invention in US patent 7,272,991?"
result = qa({"question": query, "chat_history": chat_history})

In [60]:
result

{'question': 'Summarize the invention in US patent 7,272,991?',
 'chat_history': [],
 'answer': ' US patent 8,061,041 B2 is for a safety razor with an electronic control device. The device includes a handle with a neck member, and two contact members extending from the neck member. The contact members are configured to resiliently bend. The electronic control device is electrically coupled to the at least one blade and includes a switch for controlling operation of the device between a normal mode and a low power consumption mode. The device also includes an indicator for producing a signal for indicating to a razor user that the electronic control device is connected to a power source and ready to actuate the electrical device.',
 'source_documents': [Document(page_content='US 8,061,041 B2 \n Page 2 \n U.S. PATENT DOCUMENTS 2008/0289185 A1* 11/2008 Clarke ........................... 30, 41.5 \n 2009 OOO7433 A1* 1/2009 Hawes et al. ... 30.45 \n 6,406,157 B1 6/2002 Audet 2009 OO19701 A1

In [17]:
len(faiss_index.docstore._dict)

7

In [None]:
llm = OpenAI(temperature=0)
# This takes previous questions and answers, along with user input, and combines them into a new standalone question.
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
# map_reduce means that doc_chain is applied to each document and then their contents are combined with another LLM before being fed into question_generator
# llm serves both as map and reduce llm. Promps seem to be defaults for these two tasks.
doc_chain = load_qa_chain(llm, chain_type="map_reduce")

chain = ConversationalRetrievalChain(
    retriever=faiss_index.as_retriever(),
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
)