In [1]:
import yaml
import langchain
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

Load API key.

In [11]:
with open('api_keys.yaml', 'r') as f:
    keys = yaml.safe_load(f)

openai_api_key = keys['openai']

In [3]:
loader = PyPDFLoader("example_data/Aviza_et_al_2007.pdf")
pages = loader.load_and_split()

In [6]:
[len(p.page_content) for p in pages]

[2464, 54, 54, 103, 54, 54, 66, 54, 54, 3963, 3350, 3998, 2179, 2647]

NOTE: Not sure if all patents will be like this, but some "pages" are just headers since the actual content consists of images.

In [8]:
pages[6].page_content

'U.S. Patent Sep. 25, 2007 Sheet 6 0f 8 \n {I 6 \n £3 US 7,272,991 B2'

In [7]:
faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings(openai_api_key=openai_api_key))
docs = faiss_index.similarity_search("Summarize the independent claims.", k=2)
for doc in docs:
    print(str(doc.metadata["page"]) + ":", doc.page_content)
docs

1: U.S. Patent Sep. 25, 2007 Sheet 1 0f 8 US 7,272,991 B2
7: U.S. Patent Sep. 25, 2007 Sheet 7 0f 8 US 7,272,991 B2


[Document(page_content='U.S. Patent Sep. 25, 2007 Sheet 1 0f 8 US 7,272,991 B2', metadata={'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 1}),
 Document(page_content='U.S. Patent Sep. 25, 2007 Sheet 7 0f 8 US 7,272,991 B2', metadata={'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 7})]

In [13]:
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT

In [14]:
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [16]:
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0, openai_api_key=openai_api_key), faiss_index.as_retriever(), return_source_documents=True)

In [19]:
chat_history = []
query = "What are the independent claims of US 7,272,991"
result = qa({"question": query, "chat_history": chat_history})

In [20]:
result

{'question': 'What are the independent claims of US 7,272,991',
 'chat_history': [],
 'answer': " I don't know.",
 'source_documents': [Document(page_content='U.S. Patent Sep. 25, 2007 Sheet 7 0f 8 US 7,272,991 B2', metadata={'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 7}),
  Document(page_content='U.S. Patent Sep. 25, 2007 Sheet 1 0f 8 US 7,272,991 B2', metadata={'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 1}),
  Document(page_content='U.S. Patent Sep. 25, 2007 Sheet 2 0f 8 US 7,272,991 B2', metadata={'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 2}),
  Document(page_content='U.S. Patent Sep. 25, 2007 Sheet 5 0f 8 US 7,272,991 B2', metadata={'source': 'example_data/Aviza_et_al_2007.pdf', 'page': 5})]}

In [1]:
faiss_index.docstore._dict

NameError: name 'faiss_index' is not defined

In [None]:
llm = OpenAI(temperature=0)
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
doc_chain = load_qa_chain(llm, chain_type="map_reduce")

chain = ConversationalRetrievalChain(
    retriever=faiss_index.as_retriever(),
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
)