In [75]:
import yaml
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.document_loaders import BigQueryLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI

## Authentication

Load OpenAI API Key

In [3]:
with open('../api_keys.yaml', 'r') as f:
    keys = yaml.safe_load(f)

openai_api_key = keys['openai']

Authenticate to BigQuery

In [4]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../law-project-service-account.json'

In [44]:
QUERY = (
    'SELECT spif_publication_number as patent_number, t.text as title,  a.text as abstract, c.text as claims '
    'FROM `patents-public-data.patents.publications`, UNNEST(title_localized) as t, UNNEST(abstract_localized) as a,  UNNEST(claims_localized) as c '
    'WHERE spif_publication_number = "US8205344B2" '
    'LIMIT 100')

## BigQuery Doc Loading

In [45]:
loader = BigQueryLoader(QUERY)

data = loader.load()

In [46]:
print(data)

[Document(page_content='patent_number: US8205344B2\ntitle: Safety razor having pivotable blade unit\nabstract: A safety razor having a blade unit has at least one blade and a handle casing. A pivotal connection structure is disposed between the blade unit and the handle casing. A first member is connected to the blade unit and a second member is connected to the handle casing. A joint member connects the first member and the second member and facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one blade.\nclaims: 1. A safety razor comprising:\n a blade unit having at least one blade having a cutting edge; \n a handle casing; \n a pivotal connection structure including: \n a first member connected to the blade unit; \n a second member connected to the handle casing; and \n a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, and wh

# Document(s) QA

## Chunking

### Split query results into chunks

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter 
text_splitter = RecursiveCharacterTextSplitter (
    # Set a really small chunk size, just to show.
    chunk_size = 1000,
    chunk_overlap  = 0
)
texts = text_splitter.split_documents(data)
print(texts[0])
print(len(texts))

page_content='patent_number: US8205344B2\ntitle: Safety razor having pivotable blade unit\nabstract: A safety razor having a blade unit has at least one blade and a handle casing. A pivotal connection structure is disposed between the blade unit and the handle casing. A first member is connected to the blade unit and a second member is connected to the handle casing. A joint member connects the first member and the second member and facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one blade.\nclaims: 1. A safety razor comprising:\n a blade unit having at least one blade having a cutting edge; \n a handle casing; \n a pivotal connection structure including: \n a first member connected to the blade unit; \n a second member connected to the handle casing; and' metadata={}
7


Give a source index to each of the doc chunks.

In [52]:
for i in range(len(texts)):
    texts[i].metadata['source'] = i
texts[0].metadata['source']

0

In [64]:
len(data[0].page_content)

6431

In [66]:
data[0].page_content[0]

'p'

### Construct vector DB out of results

CAREFUL: Don't run next cell too often; costs money

In [48]:
faiss_index = FAISS.from_documents(texts, OpenAIEmbeddings(openai_api_key=openai_api_key))

In [49]:
docs = faiss_index.similarity_search("Summarize the independent claims.", k=6)
docs

[Document(page_content='9. The safety razor of  claim 7  wherein the blade unit includes a frame with a cam surface and the handle unit includes a spring-biased plunger with a rounded distal end that contacts the cam surface at a location spaced from the parallel pivot axis to impart a biasing force to the frame. \n     \n     \n       10. The safety razor of  claim 1  further comprising a shaving cartridge which is detachable from the handle unit, wherein the shaving cartridge includes the blade unit and the pivotal connection structure. \n     \n     \n       11. The safety razor of  claim 1  further comprising a shaving cartridge which is detachable from the handle unit, wherein the shaving cartridge includes the blade unit and the handle unit includes the pivotal connection structure. \n     \n     \n       12. The safety razor of  claim 1  wherein the separated joint elements include a triangular prism shaped base member and an elastic plate member extending from the base member.'

### Use search results against vector DB in LLM QA prompt

CAREFUL: Don't run next cell too often; costs money

In [54]:
template = """You are a patent lawyer. Given the following extracted parts of a patent and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""
PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])

chain = load_qa_with_sources_chain(OpenAI(temperature=0, openai_api_key=openai_api_key), chain_type="stuff", prompt=PROMPT)
query = "How does claim 1 differ from claim 15?"
docs = faiss_index.similarity_search(query, k=6)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

{'output_text': '\nClaim 1 of the safety razor includes a blade unit having at least one blade with a cutting edge, a handle casing, and a pivotal connection structure including a first member connected to the blade unit and a second member connected to the handle casing. Claim 15 of the safety razor includes all of the elements of claim 1, as well as a joint member connecting the first and second members and facilitating movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one blade. Additionally, claim 15 includes further elements such as the joint member being formed by an injection molding process of a thermo plastic material, at least one of the first and second members having a convexly curved face facing the other of the first and second members, a shaving cartridge which is detachable from the handle unit, and the shaving cartridge including the blade unit and the pivotal connection structure. \n\nSOUR

In [61]:
texts[6]

Document(page_content='16. The safety razor according to  claim 15 , wherein the first and second members and the joint member are formed by an injection molding process of a thermo plastic material. \n     \n     \n       17. The safety razor according to  claim 15 , wherein at least one of the first and second members has a convexly curved face facing the other of the first and second members. \n     \n     \n       18. The safety razor according to  claim 15 , further comprising a shaving cartridge which is detachable from the handle unit, wherein the shaving cartridge includes the blade unit and the pivotal connection structure. \n     \n     \n       19. The safety razor according to  claim 15 , further comprising a shaving cartridge which is detachable from the handle unit, wherein the shaving cartridge includes the blade unit and the handle unit includes the pivotal connection structure.', metadata={'source': 6})

## Whole Patent as Input

In [74]:
template = """You are a patent lawyer. Given the following extracted parts of a patent and a question, create a final answer with references ("SOURCES"). Answer concisely in plain English so that a layman could understand.
Do not repeat anything in the patent in your answer. If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""
PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])

chain = load_qa_with_sources_chain(OpenAI(temperature=0, openai_api_key=openai_api_key, model_name='gpt-3.5-turbo'), chain_type="stuff", prompt=PROMPT)
query = "How does claim 1 differ from claim 15?"
data[0].metadata['source'] = 0  # Need to give our singular doc source info that the LLM can cite.
chain({"input_documents": data, "question": query}, return_only_outputs=True)



{'output_text': 'Claim 1 and claim 15 both describe a safety razor with a blade unit and a handle casing connected by a pivotal connection structure that allows movement of the blade unit relative to the handle casing about a hinge axis that is perpendicular to the cutting edge of the blade. However, claim 1 specifies that the joint member of the pivotal connection structure has a thinner wall section toward the hinge axis than toward at least one of the joint portions of the first and second members, while claim 15 does not include this feature. \n\nSOURCES: US8205344B2'}

# Chat over Docs with Chat History

In [79]:
from typing import List

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import BaseRetriever, Document

In [76]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [84]:
# Assumes you ran the BigQuery Doc Loading section and now have patent as a single document in list `data`.
data[0].metadata['source'] = 0
class SimpleRetriever(BaseRetriever):

    def __init__(self) -> None:
        super().__init__()

    def get_relevant_documents(self, query: str) -> List[Document]:
        return data

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        return data
    
simple_retriever = SimpleRetriever()

In [85]:
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0, openai_api_key=openai_api_key), simple_retriever, memory=memory)

In [86]:
query = "How does claim 1 differ from claim 15?"
result = qa({'question': "How does claim 1 differ from claim 15?"})
result['answer']

' Claim 1 describes a safety razor with a blade unit, handle casing, and a pivotal connection structure between the blade unit and the handle casing. The pivotal connection structure includes a first member connected to the blade unit, a second member connected to the handle casing, and a joint member connecting the first and second members that facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one cutting edge. Claim 15 describes a safety razor with a blade unit, handle unit, and a pivotal connection structure between the blade unit and the handle unit. The pivotal connection structure includes a first member connected to the blade unit, a second member connected to the handle unit, and a joint member connecting the first and second members that works as the perpendicular pivot axis. The joint member includes a plurality of separated joint elements which are disposed along the perpendicular piv

In [88]:
result = qa({'question': 'Can you summarize those differences more clearly?'})
result['answer']

' Claim 1 describes a safety razor with a blade unit, handle casing, and a pivotal connection structure between the two. The pivotal connection structure includes a first member connected to the blade unit, a second member connected to the handle casing, and a joint member connecting the two. The joint member has a thinner wall section toward the hinge axis than toward the joint portions of the first and second members. Claim 15 describes a safety razor with a blade unit, handle casing, and a pivotal connection structure between the two. The pivotal connection structure includes a first member connected to the blade unit, a second member connected to the handle casing, and a joint member connecting the two. The joint member has a hinge axis disposed between the joint portions of the first and second members, and includes a plurality of separated joint elements which are disposed along the perpendicular pivot axis. Each of the joint elements includes a bearing and a pivot shaft.'

In [89]:
result = qa({'question': 'Please tell me how the two claims are different.'})
result['answer']

' Claim 1 describes a safety razor with a blade unit, handle casing, and a pivotal connection structure between the two. The pivotal connection structure includes a first member connected to the blade unit, a second member connected to the handle casing, and a joint member connecting the two. Claim 15 describes a safety razor with a blade unit, handle casing, and a pivotal connection structure between the two. The pivotal connection structure includes a first member connected to the blade unit, a second member connected to the handle casing, and a joint member connecting the two. The joint member includes a plurality of separated joint elements which are disposed along the perpendicular pivot axis, and each of the joint elements includes a bearing and a pivot shaft.'

### Improve prompts

In [None]:
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0, openai_api_key=openai_api_key), 
                                           simple_retriever, 
                                           memory=memory,
                                           condense_question_prompt=TODO, qa_prompt=TODO)
# TODO qa_prompt arg might be deprecated?
# See template https://github.com/hwchase17/langchain/blob/239dc108527849e94f447ab4cc2c6e51fc1aab1f/langchain/chains/conversational_retrieval/prompts.py for condense question prompt
# TODO figure out system messages for chatgpt

### Take 1: Chat Messages Without Question Condensation

In [90]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [91]:
chat = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

In [93]:
sys_msg_prompt = SystemMessage(content="You are a helpful assistant that understands patent and legal information.")

human_template = """Given the following question and parts of a patent, create a final answer with references ("SOURCES"). Answer concisely in plain English so that a layman could understand.
Do not repeat anything in the patent in your answer. If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
PATENT: {patent}
=========
FINAL ANSWER:"""

human_msg_prompt = HumanMessagePromptTemplate.from_template(human_template)

In [97]:
chat_prompt = ChatPromptTemplate.from_messages([sys_msg_prompt, human_msg_prompt])

# Get a chat completion from the formatted messages.
# Both page_content and 'source' key of metadata are injected into prompt in document QA. Formatting still unclear. For now just passing page_content because only have single doc.
filled_chat_prompt = chat_prompt.format_prompt(question="How does claim 1 differ from claim 15?", patent=data[0].page_content).to_messages()

In [98]:
filled_chat_prompt

[SystemMessage(content='You are a helpful assistant that understands patent and legal information.', additional_kwargs={}),

In [100]:
ai_answer = chat(filled_chat_prompt)

In [101]:
ai_answer

AIMessage(content='Claim 1 and claim 15 both describe a safety razor with a blade unit and a handle casing connected by a pivotal connection structure that allows movement of the blade unit relative to the handle casing about a hinge axis that is perpendicular to the cutting edge of the blade. However, claim 15 specifies that the joint member of the pivotal connection structure includes a plurality of separated joint elements, each with a bearing fixed to the second member and a pivot shaft fixed to the first member and inserted into the bearing. Claim 1 does not specify this feature. \n\nSOURCES: US8205344B2', additional_kwargs={})

Manually pass in previous query and answer as chat history.

In [103]:
chat_history = [
    sys_msg_prompt,
    HumanMessagePromptTemplate.from_template("How does claim 1 differ from claim 15?"),
    ai_answer,
    human_msg_prompt
]
chat_prompt_2 = ChatPromptTemplate.from_messages(chat_history)
filled_chat_prompt_2 = chat_prompt.format_prompt(question="Are there any other ways in which claim 1 differs from claim 15?", patent=data[0].page_content).to_messages()

In [104]:
ai_answer_2 = chat(filled_chat_prompt_2)
ai_answer_2

AIMessage(content='Without knowing the specific differences between claim 1 and claim 15, it is impossible to answer this question. Claim 1 and claim 15 are both part of the same patent, US8205344B2, which describes a safety razor with a pivotable blade unit. The patent includes various claims related to the construction and operation of the razor, including the use of a joint member with separated joint elements to facilitate movement of the blade unit relative to the handle casing. The patent also describes the use of an injection molding process to form the various components of the razor, as well as the inclusion of a detachable shaving cartridge. \n\nSOURCES: US8205344B2', additional_kwargs={})

Hmmmm. TODO Maybe put prelude of human_template in system message? Also, can langchain's memory utilities store chat history for me? Maybe that is a bad idea since don't want to have patent text in every human message of chat history that LLM processes.

## Take 2: Chat Messages Without Question Condensation

Results from below are definitely better than Take 1, if only because of better prompt design and positioning in system message.

In [105]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [107]:
chat = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

In [120]:
sys_msg_template = """
You are a helpful assistant to a patent lawyer. The lawyer wants your help understanding the patent, which is delimited with triple backticks. 
Answer the lawyer's questions in 300 words or less. Only use information from the patent delimited by triple backticks. Cite text from the patent in quotes in your answer. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Patent: '''{patent}'''"""

sys_msg_prompt = SystemMessagePromptTemplate.from_template(sys_msg_template)

human_template = """The lawyer's question is delimited with triple backticks.

Lawyer's question: '''{question}'''"""

human_msg_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([sys_msg_prompt, human_msg_prompt])

# Get a chat completion from the formatted messages.
# Both page_content and 'source' key of metadata are injected into prompt in document QA. Formatting still unclear. For now just passing page_content because only have single doc.
filled_chat_prompt = chat_prompt.format_prompt(question="How does claim 1 differ from claim 15?", patent=data[0].page_content).to_messages()

In [121]:
filled_chat_prompt

[SystemMessage(content="\nYou are a helpful assistant to a patent lawyer. The lawyer wants your help understanding the patent, which is delimited with triple backticks. \nAnswer the lawyer's questions in 300 words or less. Only use information from the patent delimited by triple backticks. Cite text from the patent in quotes in your answer. \nIf you don't know the answer, just say that you don't know. Don't try to make up an answer.\n\nPatent: '''patent_number: US8205344B2\ntitle: Safety razor having pivotable blade unit\nabstract: A safety razor having a blade unit has at least one blade and a handle casing. A pivotal connection structure is disposed between the blade unit and the handle casing. A first member is connected to the blade unit and a second member is connected to the handle casing. A joint member connects the first member and the second member and facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular 

In [122]:
ai_answer = chat(filled_chat_prompt)
ai_answer

AIMessage(content='Claim 1 and claim 15 both describe a safety razor with a blade unit having at least one blade and a handle casing, and a pivotal connection structure between the blade unit and the handle casing that allows movement of the blade unit relative to the handle casing about a hinge axis that is substantially perpendicular to the cutting edge of the blade. However, claim 1 describes the pivotal connection structure as having a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, while claim 15 describes the joint member as being for jointing, in a hinged manner, the joint portion of the first member with the joint portion of the second member, and constructed such that the joint member has a hinge axis disposed between the joint portions of the first and second members, which works as the perpendicular pivot axis, and includes a plurality of separated joint elements which are disposed along the perpendicular

In [112]:
len(ai_answer.content.split(' '))

150

In [126]:
type(filled_chat_prompt[0])

langchain.schema.SystemMessage

Put the AI's answer in the list of chat messages.

In [127]:
filled_chat_prompt.append(ai_answer)

User enters new message.

In [130]:
filled_chat_prompt.append(human_msg_prompt)

chat_prompt = ChatPromptTemplate.from_messages(filled_chat_prompt)

# Unlike before, don't need patent data since already was injected into chat prompt. Just inject new user message.
filled_chat_prompt = chat_prompt.format_prompt(question="Can you elaborate more on differences between the hinges in claims 1 and 15?").to_messages()
# Observe that using `question` in format_prompt only modified the last message, since it existed as a template whereas the first human message existed as a stati human message.
print(filled_chat_prompt[1])
print(filled_chat_prompt[-1])

content="The lawyer's question is delimited with triple backticks.\n\nLawyer's question: '''How does claim 1 differ from claim 15?'''" additional_kwargs={}
content="The lawyer's question is delimited with triple backticks.\n\nLawyer's question: '''Can you elaborate more on differences between the hinges in claims 1 and 15?'''" additional_kwargs={}


In [134]:
ai_answer = chat(filled_chat_prompt)
ai_answer

AIMessage(content='Both claim 1 and claim 15 describe a safety razor with a pivotal connection structure that allows movement of the blade unit relative to the handle casing about a hinge axis that is substantially perpendicular to the cutting edge of the blade. However, claim 1 describes the joint member as comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, while claim 15 describes the joint member as being for jointing, in a hinged manner, the joint portion of the first member with the joint portion of the second member, and constructed such that the joint member has a hinge axis disposed between the joint portions of the first and second members, which works as the perpendicular pivot axis, and includes a plurality of separated joint elements which are disposed along the perpendicular pivot axis, wherein each of the plurality of separated joint elements includes a bearing having a cylindrical shape fixed to the second member, an

In [135]:
filled_chat_prompt.append(ai_answer)

One option for saving chat history to MongoDB is serializing. Downside is that user later on requires equivalent langchain installation in client.

In [137]:
import pickle
pickled = pickle.dumps(filled_chat_prompt)
restored = pickle.loads(pickled)
restored

[SystemMessage(content="\nYou are a helpful assistant to a patent lawyer. The lawyer wants your help understanding the patent, which is delimited with triple backticks. \nAnswer the lawyer's questions in 300 words or less. Only use information from the patent delimited by triple backticks. Cite text from the patent in quotes in your answer. \nIf you don't know the answer, just say that you don't know. Don't try to make up an answer.\n\nPatent: '''patent_number: US8205344B2\ntitle: Safety razor having pivotable blade unit\nabstract: A safety razor having a blade unit has at least one blade and a handle casing. A pivotal connection structure is disposed between the blade unit and the handle casing. A first member is connected to the blade unit and a second member is connected to the handle casing. A joint member connects the first member and the second member and facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular 

## Take 3: Chat Messages Without Question Condensation

Here, trying to get model to think through its outputs step by step.

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [140]:
chat = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

In [147]:
sys_msg_template = """
You are a helpful assistant to a patent lawyer. The lawyer wants your help understanding a patent, which is delimited with triple backticks. 
Perform the following actions:
1 - Extract the key words and phrases of each indepedent claim, especially those that distinguish the independent claims from each other. Format your answer as a JSON whose keys are independent 
claim numbers and whose values are lists of extracted key words and phrases. You should consider the dependent claims of each independent claim in an independent claim's key words and phrases.
2 - Answer the lawyer's questions about the patent in 300 words or less. Cite text from the patent in quotes in your answer. If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Use the following format:

Patent: 
```
patent the lawyer is interested in
```
Key Words and Phrases: 
```
Distinguishing key words and phrases for each independent claim
```
Answer: 
```
Your answer to the lawyer's question
```

Patent: 
```
{patent}
```
Key Words and Phrases:
"""

sys_msg_prompt_template = SystemMessagePromptTemplate.from_template(sys_msg_template)

human_template = """The lawyer's question is delimited with triple backticks.

Lawyer's question: '''{question}'''"""

human_msg_prompt_template = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_template = ChatPromptTemplate.from_messages([sys_msg_prompt_template, human_msg_prompt_template])

# Get a chat completion from the formatted messages.
# Both page_content and 'source' key of metadata are injected into prompt in document QA. Formatting still unclear. For now just passing page_content because only have single doc.
chat_prompt = chat_prompt_template.format_prompt(question="How does claim 1 differ from claim 15?", patent=data[0].page_content).to_messages()

In [148]:
result = chat.generate([chat_prompt])

In [149]:
result.llm_output

{'token_usage': {'prompt_tokens': 1542,
  'completion_tokens': 448,
  'total_tokens': 1990},
 'model_name': 'gpt-3.5-turbo'}

In [150]:
result.generations[0]

[ChatGeneration(text='Key Words and Phrases:\n\nClaim 1:\n- safety razor\n- blade unit\n- at least one blade\n- cutting edge\n- handle casing\n- pivotal connection structure\n- first member\n- second member\n- joint member\n- hinge axis\n- perpendicular to the cutting edge\n- joint portion\n- thinner wall section\n- convexly curved face\n\nClaim 15:\n- safety razor\n- blade unit\n- at least one blade\n- cutting edge\n- handle unit\n- handle casing\n- pivotal connection structure\n- first member\n- second member\n- joint member\n- hinge axis\n- perpendicular to the cutting edge\n- joint portion\n- bearing\n- cylindrical shape\n- pivot shaft\n- injection molding process\n- thermo plastic material\n- convexly curved face\n- shaving cartridge\n- detachable\n\nAnswer:\n\nClaim 1 and Claim 15 are similar in that they both describe a safety razor with a blade unit having at least one blade with a cutting edge, a handle casing or handle unit, and a pivotal connection structure that allows the 

Well... the detachable thing is wrong. Both claims 1 and 15 allow that. Also the key words and phrases are not formatted as a JSON.

## Take 4: Chat Messages Without Question Condensation

Maybe we can shrink the system prompt a bit. Result: Removing formatting guidelines caused AI to not return keywords.

In [144]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [145]:
chat = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

In [151]:
sys_msg_template = """
You are a helpful assistant to a patent lawyer. The lawyer wants your help understanding a patent, which is delimited with triple backticks. 
In the following messages from the lawyer, you will be asked questions about the patent. Respond to each message by performing the following actions:
1 - Extract the key words and phrases of each indepedent claim, especially those that distinguish the independent claims from each other. Format your answer as a JSON whose keys are independent 
claim numbers and whose values are lists of extracted key words and phrases. You should consider the dependent claims of each independent claim in an independent claim's key words and phrases.
2 - Answer the lawyer's questions about the patent in 300 words or less. Cite text from the patent in quotes in your answer. If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Patent: 
```
{patent}
```
"""

sys_msg_prompt_template = SystemMessagePromptTemplate.from_template(sys_msg_template)

human_template = """The lawyer's question is delimited with triple backticks.

Lawyer's question: '''{question}'''"""

human_msg_prompt_template = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_template = ChatPromptTemplate.from_messages([sys_msg_prompt_template, human_msg_prompt_template])

# Get a chat completion from the formatted messages.
# Both page_content and 'source' key of metadata are injected into prompt in document QA. Formatting still unclear. For now just passing page_content because only have single doc.
chat_prompt = chat_prompt_template.format_prompt(question="How does claim 1 differ from claim 15?", patent=data[0].page_content).to_messages()

In [152]:
result = chat.generate([chat_prompt])
result.llm_output

{'token_usage': {'prompt_tokens': 1499,
  'completion_tokens': 477,
  'total_tokens': 1976},
 'model_name': 'gpt-3.5-turbo'}

In [153]:
result.generations[0][0].text

'Claim 1 and claim 15 are similar in that they both describe a safety razor with a blade unit having at least one blade with a cutting edge, a handle casing, and a pivotal connection structure disposed between the blade unit and the handle casing. The pivotal connection structure allows for movement of the blade unit relative to the handle casing about a hinge axis that is substantially perpendicular to the at least one cutting edge. However, there are some differences between the two claims. \n\nClaim 1 describes the pivotal connection structure as including a first member connected to the blade unit, a second member connected to the handle casing, and a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis and which connect the first member and the second member. The joint member has a thinner wall section toward the hinge axis than toward at least one of the joint portions of the first and second members. Claim 1 also d

Now the darn thing isn't even outputting a JSON of keywords.

## Take 5: Chat Messages Without Condensation

Try manually extracting differences in wording of claims and feeding that into prompt.

### Extracting Word Set Diffs

In [156]:
bq_doc = data[0].page_content
bq_doc

'patent_number: US8205344B2\ntitle: Safety razor having pivotable blade unit\nabstract: A safety razor having a blade unit has at least one blade and a handle casing. A pivotal connection structure is disposed between the blade unit and the handle casing. A first member is connected to the blade unit and a second member is connected to the handle casing. A joint member connects the first member and the second member and facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one blade.\nclaims: 1. A safety razor comprising:\n a blade unit having at least one blade having a cutting edge; \n a handle casing; \n a pivotal connection structure including: \n a first member connected to the blade unit; \n a second member connected to the handle casing; and \n a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, and which connect the first m

Get just the claims section of the biq query result.

In [162]:
import re
result = re.search('(?<=\nclaims: ).+', bq_doc, flags=re.DOTALL)
result

<re.Match object; span=(582, 6431), match='1. A safety razor comprising:\n a blade unit havi>

Get a specific claim. At least in this patent, claims are separated by 3 or 4 new-lines with 5 spaces in-between 

In [167]:
claims = result.group().split('\n     \n     \n       ')
claims

['1. A safety razor comprising:\n a blade unit having at least one blade having a cutting edge; \n a handle casing; \n a pivotal connection structure including: \n a first member connected to the blade unit; \n a second member connected to the handle casing; and \n a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, and which connect the first member and the second member that facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one cutting edge, wherein the first member has a joint portion, the second member has a joint portion, and the joint member has a thinner wall section toward the hinge axis than toward at least one of the joint portions of the first and second members. \n ',
 '2. The safety razor of claim l wherein the joint member includes a plurality of separated joint elements which are disposed along the perpendicular 

Not sure how well this will hold up on other patents, but at least here an independent claim never has the word 'claim' in it.

In [170]:
indep_claims = [c for c in claims if 'claim' not in c]
indep_claim_indices = [i for i, c in enumerate(claims) if 'claim' not in c]
indep_claims

['1. A safety razor comprising:\n a blade unit having at least one blade having a cutting edge; \n a handle casing; \n a pivotal connection structure including: \n a first member connected to the blade unit; \n a second member connected to the handle casing; and \n a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, and which connect the first member and the second member that facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one cutting edge, wherein the first member has a joint portion, the second member has a joint portion, and the joint member has a thinner wall section toward the hinge axis than toward at least one of the joint portions of the first and second members. \n ',
 '15. A safety razor comprising a blade unit having at least one blade having a cutting edge, a handle unit having a handle casing, and a pivotal conn

Group claim text by independent claims.

In [172]:
indep_claims_and_their_dependents = dict()
for i, c in enumerate(claims):
    if i in indep_claim_indices:
        # Is independent, so start an entry in indep_claims_and_their_dependents.
        indep_claims_and_their_dependents[i] = c
    else:
        # Is dependent, so group to most recent independent claim since that is what dependent claim refers to.
        processed_indep_claim_indices = list(indep_claims_and_their_dependents.keys())
        processed_indep_claim_indices.sort()
        latest_indep_claim_index = processed_indep_claim_indices[-1]
        indep_claims_and_their_dependents[latest_indep_claim_index] = indep_claims_and_their_dependents[latest_indep_claim_index] + c
indep_claims_and_their_dependents

{0: '1. A safety razor comprising:\n a blade unit having at least one blade having a cutting edge; \n a handle casing; \n a pivotal connection structure including: \n a first member connected to the blade unit; \n a second member connected to the handle casing; and \n a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, and which connect the first member and the second member that facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one cutting edge, wherein the first member has a joint portion, the second member has a joint portion, and the joint member has a thinner wall section toward the hinge axis than toward at least one of the joint portions of the first and second members. \n 2. The safety razor of claim l wherein the joint member includes a plurality of separated joint elements which are disposed along the perpendicular pi

In [195]:
def get_word_set(multi_word_string: str):
    # Assume words are split by spaces.
    words = multi_word_string.split(' ')

    # Remove punctuation and new-lines.
    def remove_chars(word: str, chars_to_remove: list):
        for char in chars_to_remove:
            word = word.replace(char, '')
        return word
    
    words = [remove_chars(word, ['.', ';', ',', '(', ')', ':', '\n']) for word in words]

    # Get rid of any length-0 or length-1 words (e.g. '', 'a') and any numerics (e.g. '10')
    words = [word for word in words if len(word) > 1 and not word.isnumeric()]
    
    # Remove any duplicates by using set().
    return set(words)

In [196]:
ic1_words = get_word_set(indep_claims_and_their_dependents[0])
ic15_words = get_word_set(indep_claims_and_their_dependents[14])

In [197]:
words_in_1_and_not_15 = ic1_words - ic15_words
words_in_1_and_not_15

{'arms',
 'base',
 'biasing',
 'cam',
 'comprising:',
 'connect',
 'contacts',
 'distal',
 'elastic',
 'extending',
 'facilitates',
 'force',
 'frame',
 'identical',
 'impart',
 'include',
 'including:',
 'latch',
 'leaf',
 'location',
 'metal',
 'pair',
 'parallel',
 'plate',
 'plunger',
 'prism',
 'rounded',
 'section',
 'secure',
 'shaped',
 'spaced',
 'spring',
 'spring-biased',
 'surface',
 'than',
 'thinner',
 'toward',
 'triangular',
 'unitary',
 'wall'}

In [198]:
len(words_in_1_and_not_15)

40

In [199]:
words_in_15_and_not_1 = ic15_words - ic1_words
words_in_15_and_not_1

{'as',
 'being',
 'between',
 'constructed',
 'contours',
 'during',
 'following',
 'hinged',
 'in',
 'including',
 'jointing',
 'manner',
 'skin',
 'such',
 'thereto',
 'through',
 'user',
 'works'}

In [200]:
len(words_in_15_and_not_1)

18

In [204]:
', '.join(words_in_15_and_not_1)

'constructed, in, following, during, including, contours, user, through, works, jointing, hinged, such, between, thereto, as, being, manner, skin'

### Prompt

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [243]:
chat = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

In [215]:
sys_msg_template = """
You are a helpful assistant to a patent lawyer. The lawyer wants your help understanding a patent.
Answer the lawyer's questions about the patent in 300 words or less. Cite text from the patent in quotes in your answer. If you don't know the answer, just say that you don't know. 
Don't try to make up an answer. If you are uncertain about any part of your answer, say so. 

Use the following information in thinking through your answer. First, the patent is delimited by triple backticks. Second, the words that appear in claim 1 but not in claim 15 are
delimited by triple backticks. Third, the words that appear in claim 15 but not in claim 1 are delimited by triple backticks.

Patent: 
```
{patent}
```
Words in Claim 1 But Not in Claim 15:
```
{claim_1_words}
```
Words in Claim 15 but Not in Claim 1:
```
{claim_15_words}
```
"""

sys_msg_prompt_template = SystemMessagePromptTemplate.from_template(sys_msg_template)

human_template = """The lawyer's question is delimited with triple backticks.

Lawyer's question: '''{question}'''"""

human_msg_prompt_template = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_template = ChatPromptTemplate.from_messages([sys_msg_prompt_template, human_msg_prompt_template])

# Get a chat completion from the formatted messages.
# Both page_content and 'source' key of metadata are injected into prompt in document QA. Formatting still unclear. For now just passing page_content because only have single doc.
chat_prompt = chat_prompt_template.format_prompt(question="How does claim 1 differ from claim 15?", 
                                                 patent=data[0].page_content, 
                                                 claim_1_words=', '.join(words_in_1_and_not_15),
                                                 claim_15_words=', '.join(words_in_15_and_not_1)).to_messages()

In [218]:
chat_prompt

[SystemMessage(content="\nYou are a helpful assistant to a patent lawyer. The lawyer wants your help understanding a patent.\nAnswer the lawyer's questions about the patent in 300 words or less. Cite text from the patent in quotes in your answer. If you don't know the answer, just say that you don't know. \nDon't try to make up an answer. If you are uncertain about any part of your answer, say so. \n\nUse the following information in thinking through your answer. First, the patent is delimited by triple backticks. Second, the words that appear in claim 1 but not in claim 15 are\ndelimited by triple backticks. Third, the words that appear in claim 15 but not in claim 1 are delimited by triple backticks.\n\nPatent: \n```\npatent_number: US8205344B2\ntitle: Safety razor having pivotable blade unit\nabstract: A safety razor having a blade unit has at least one blade and a handle casing. A pivotal connection structure is disposed between the blade unit and the handle casing. A first member 

In [216]:
result = chat.generate([chat_prompt])
result.llm_output

{'token_usage': {'prompt_tokens': 1633,
  'completion_tokens': 319,
  'total_tokens': 1952},
 'model_name': 'gpt-3.5-turbo'}

In [217]:
result.generations[0][0].text

"Claim 1 and claim 15 both describe a safety razor with a blade unit having at least one blade and a handle casing, with a pivotal connection structure disposed between the blade unit and the handle casing. However, there are some differences between the two claims.\n\nClaim 1 describes the pivotal connection structure as having a joint member comprising a plurality of separated joint elements which are disposed along the perpendicular pivot axis, and which connect the first member and the second member that facilitates movement of the first member relative to the second member about a hinge axis that is substantially perpendicular to the at least one cutting edge. Claim 1 also describes the separated joint elements as including a triangular prism shaped base member and an elastic plate member extending from the base member.\n\nOn the other hand, claim 15 describes the pivotal connection structure as being constructed such that the joint member has a hinge axis disposed between the joi

In [219]:
chat_prompt = chat_prompt_template.format_prompt(question="What is the primary innovation of this patent over prior art?", 
                                                 patent=data[0].page_content, 
                                                 claim_1_words=', '.join(words_in_1_and_not_15),
                                                 claim_15_words=', '.join(words_in_15_and_not_1)).to_messages()

In [220]:
result = chat.generate([chat_prompt])
result.llm_output

{'token_usage': {'prompt_tokens': 1634,
  'completion_tokens': 224,
  'total_tokens': 1858},
 'model_name': 'gpt-3.5-turbo'}

In [221]:
result.generations[0][0].text

'Based on the language of the patent, the primary innovation of this patent over prior art is the "pivotal connection structure" that allows the blade unit to move relative to the handle casing about a hinge axis that is perpendicular to the cutting edge of the blade. This structure includes a joint member with separated joint elements that connect the first member (connected to the blade unit) and the second member (connected to the handle casing) and facilitate the movement of the blade unit. The joint member has a thinner wall section toward the hinge axis than toward at least one of the joint portions of the first and second members. Additionally, the separated joint elements include a bearing and a pivot shaft that allow for the pivotal movement of the blade unit. This structure allows the razor to follow the contours of a user\'s skin during shaving. \n\nWhile there are other features and elements described in the patent, such as the use of latch arms to secure the pivotal connec

## Take 5.2: Trying another patent

### Load and extract word sets of independent claims

In [224]:
QUERY = (
    'SELECT spif_publication_number as patent_number, t.text as title,  a.text as abstract, c.text as claims '
    'FROM `patents-public-data.patents.publications`, UNNEST(title_localized) as t, UNNEST(abstract_localized) as a,  UNNEST(claims_localized) as c '
    'WHERE spif_publication_number = "US8950002B2" '
    'LIMIT 100')
loader = BigQueryLoader(QUERY)

data = loader.load()

In [228]:
bq_doc = data[0].page_content

import re
result = re.search('(?<=\nclaims: ).+', bq_doc, flags=re.DOTALL)
claims = result.group().split('\n     \n     \n       ')
claims

['What is claimed is: \n     \n       1. An apparatus comprising:\n a memory that stores a plurality of tokens indicating that a user is attempting to access a resource; and \n a processor that:\n determines a related resource that shares a relationship with the resource, wherein:\n the resource is a composite resource; \n the related resource is a sub-resource of the composite resource; and \n the related resource is accessed in conjunction with accessing the resource; \n \n receives a risk token computed based at least in part upon the resource and the related resource; \n determines a numeric authorization level for the user based at least in part upon the plurality of tokens and the risk token, the numeric authorization level indicating whether the user is authorized to access the resource, wherein the composite resource comprises at least one sub-resource that the user is not authorized to access based on the numeric authorization level and at least one sub-resource that the user 

In [232]:
indep_claims = [c for c in claims if re.search('of\s+claim', c) is None]
indep_claim_indices = [i for i, c in enumerate(claims) if re.search('of\s+claim', c) is None]
indep_claims

['What is claimed is: \n     \n       1. An apparatus comprising:\n a memory that stores a plurality of tokens indicating that a user is attempting to access a resource; and \n a processor that:\n determines a related resource that shares a relationship with the resource, wherein:\n the resource is a composite resource; \n the related resource is a sub-resource of the composite resource; and \n the related resource is accessed in conjunction with accessing the resource; \n \n receives a risk token computed based at least in part upon the resource and the related resource; \n determines a numeric authorization level for the user based at least in part upon the plurality of tokens and the risk token, the numeric authorization level indicating whether the user is authorized to access the resource, wherein the composite resource comprises at least one sub-resource that the user is not authorized to access based on the numeric authorization level and at least one sub-resource that the user 

In [239]:
indep_claims_and_their_dependents = dict()
for i, c in enumerate(claims):
    if i in indep_claim_indices:
        # Is independent, so start an entry in indep_claims_and_their_dependents.
        # First numeric appearing in claim should be the claim number.
        claim_num = re.search('\d+', c).group()
        assert claim_num is not None, 'Could not find independent claim number.'
        indep_claims_and_their_dependents[claim_num] = c
    else:
        # Is dependent, so group to most recent independent claim since that is what dependent claim refers to.
        processed_indep_claim_indices = list(indep_claims_and_their_dependents.keys())
        processed_indep_claim_indices.sort()
        latest_indep_claim_index = processed_indep_claim_indices[-1]
        indep_claims_and_their_dependents[latest_indep_claim_index] = indep_claims_and_their_dependents[latest_indep_claim_index] + c
indep_claims_and_their_dependents

{'1': 'What is claimed is: \n     \n       1. An apparatus comprising:\n a memory that stores a plurality of tokens indicating that a user is attempting to access a resource; and \n a processor that:\n determines a related resource that shares a relationship with the resource, wherein:\n the resource is a composite resource; \n the related resource is a sub-resource of the composite resource; and \n the related resource is accessed in conjunction with accessing the resource; \n \n receives a risk token computed based at least in part upon the resource and the related resource; \n determines a numeric authorization level for the user based at least in part upon the plurality of tokens and the risk token, the numeric authorization level indicating whether the user is authorized to access the resource, wherein the composite resource comprises at least one sub-resource that the user is not authorized to access based on the numeric authorization level and at least one sub-resource that the 

In [241]:
word_sets = {ic_num: get_word_set(claims_text) for ic_num, claims_text in indep_claims_and_their_dependents.items()}

### Get words that are unique to each independent claim.

In [245]:
unique_word_lists = dict()
for ic_num, word_set in word_sets.items():
    unique_words = word_set
    for other_ic_num, other_word_set in word_sets.items():
        if ic_num == other_ic_num:
            continue
        unique_words = unique_words - other_word_set
    unique_word_lists[ic_num] = list(unique_words)
unique_word_lists

{'1': ['that:', 'is:', 'An', 'What', 'apparatus', 'memory', 'claimed'],
 '4': ['communicating',
  'comprising',
  'receiving',
  'method',
  'determining',
  'comparing',
  'contexts',
  're-determining',
  'storing'],
 '7': ['executed',
  'more',
  'media',
  'software',
  'embodying',
  'storage',
  'or',
  'executed:',
  'when',
  'One',
  'non-transitory',
  'computer-readable']}

### Prompt

In [246]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
import json

In [247]:
chat = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

In [248]:
sys_msg_template = """
You are a helpful assistant to a patent lawyer. The lawyer wants your help understanding a patent.
Answer the lawyer's questions about the patent in 300 words or less. Cite text from the patent in quotes in your answer. If you don't know the answer, just say that you don't know. 
Don't try to make up an answer. If you are uncertain about any part of your answer, say so. 

Use the following information in thinking through your answer. First, the patent is delimited by triple backticks. Second, the words that appear uniquely in each independent claim are
given in a JSON delimited by triple backticks. For example, independent claim 1 has key '1' in the JSON, and its values are all words that appear only in independent claim 1 and its
dependent claims.

Patent: 
```
{patent}
```
Words Unique to Each Independent Claim and Its Dependent Claims:
```
{unique_words}
```
"""

sys_msg_prompt_template = SystemMessagePromptTemplate.from_template(sys_msg_template)

human_template = """The lawyer's question is delimited with triple backticks.

Lawyer's question: '''{question}'''"""

human_msg_prompt_template = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_template = ChatPromptTemplate.from_messages([sys_msg_prompt_template, human_msg_prompt_template])

# Get a chat completion from the formatted messages.
# Both page_content and 'source' key of metadata are injected into prompt in document QA. Formatting still unclear. For now just passing page_content because only have single doc.
chat_prompt = chat_prompt_template.format_prompt(question="How does claim 1 differ from claim 4?", 
                                                 patent=data[0].page_content, 
                                                 unique_words=json.dumps(unique_word_lists)).to_messages()
chat_prompt

[SystemMessage(content='\nYou are a helpful assistant to a patent lawyer. The lawyer wants your help understanding a patent.\nAnswer the lawyer\'s questions about the patent in 300 words or less. Cite text from the patent in quotes in your answer. If you don\'t know the answer, just say that you don\'t know. \nDon\'t try to make up an answer. If you are uncertain about any part of your answer, say so. \n\nUse the following information in thinking through your answer. First, the patent is delimited by triple backticks. Second, the words that appear uniquely in each independent claim are\ngiven in a JSON delimited by triple backticks. For example, independent claim 1 has key \'1\' in the JSON, and its values are all words that appear only in independent claim 1 and its\ndependent claims.\n\nPatent: \n```\npatent_number: US8950002B2\ntitle: Method and apparatus for token-based access of related resources\nabstract: According to one embodiment, an apparatus may store a plurality of tokens 

In [249]:
result = chat.generate([chat_prompt])
result.llm_output

{'token_usage': {'prompt_tokens': 1809,
  'completion_tokens': 565,
  'total_tokens': 2374},
 'model_name': 'gpt-3.5-turbo'}

In [250]:
result.generations[0][0].text

'Claim 1 is directed to an apparatus comprising a memory that stores a plurality of tokens indicating that a user is attempting to access a resource, and a processor that determines a related resource that shares a relationship with the resource. The related resource is a sub-resource of the composite resource and is accessed in conjunction with accessing the resource. Claim 1 also includes additional steps of receiving a risk token computed based at least in part upon the resource and the related resource, determining a numeric authorization level for the user based at least in part upon the plurality of tokens and the risk token, comparing the numeric authorization level to a numeric threshold, determining, based at least in part on the comparison between the numeric authorization level and the numeric threshold, that the user is authorized to access the related resource, communicating a decision token indicating that the user is authorized to access the resource and the related reso

TBH not sure how well this worked because I haven't read the patent lol.