In [3]:
from operator import itemgetter

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.vectorstores import FAISS

from operator import itemgetter
from langchain.memory import ConversationBufferMemory

from langchain.document_loaders import PyPDFDirectoryLoader


In [6]:
from langchain.schema.runnable import RunnableMap
from langchain.schema import format_document
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter

In [6]:
from langchain.prompts.prompt import PromptTemplate

_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [7]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

In [8]:
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

In [9]:
from typing import Tuple, List
def _format_chat_history(chat_history: List[Tuple]) -> str:
    buffer = ""
    for dialogue_turn in chat_history:
        human = "Human: " + dialogue_turn[0]
        ai = "Assistant: " + dialogue_turn[1]
        buffer += "\n" + "\n".join([human, ai])
    return buffer

In [10]:
_inputs = RunnableMap(
    standalone_question=RunnablePassthrough.assign(
        chat_history=lambda x: _format_chat_history(x['chat_history'])
    ) | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),
)
_context = {
    "context": itemgetter("standalone_question") | retriever | _combine_documents,
    "question": lambda x: x["standalone_question"]
}
conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()

In [11]:
conversational_qa_chain.invoke({
    "question": "where did harrison work?",
    "chat_history": [],
})

AIMessage(content='Harrison was employed at Kensho.')

In [12]:

memory = ConversationBufferMemory(return_messages=True, output_key="answer", input_key="question")

In [13]:
# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)
# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: _format_chat_history(x['chat_history'])
    } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),
}
# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"]
}
# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question")
}
# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | ChatOpenAI(),
    "docs": itemgetter("docs"),
}
# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

In [14]:
inputs = {"question": "where did harrison work?"}
result = final_chain.invoke(inputs)
result

{'answer': AIMessage(content='Harrison was employed at Kensho.'),
 'docs': [Document(page_content='harrison worked at kensho')]}

In [56]:

loader = PyPDFDirectoryLoader("flying_pdfs/", silent_errors=True)
docs = loader.load()




KeyboardInterrupt: 

In [58]:
loader2 = PyPDFDirectoryLoader("flying_pdfs/", silent_errors=True, extract_images=True)
docs2 = loader.load()



In [59]:
docs = docs + docs2

In [12]:
import gzip, json
def compress_json(json_data):
    return gzip.compress(json_data.encode())

jdocs = [json.dumps(doc.__dict__) for doc in docs]
compressed = compress_json(json.dumps(jdocs))
with gzip.open("flying_docs.json.gz", "wb") as f:
    f.write(compressed)


In [None]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_split = text_splitter.split_documents(docs)

In [3]:
hf_token = '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'

In [60]:
import os 
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DeepLake

activeloop_token = 'eyJhbGciOiJIUzUxMiIsImlhdCI6MTY5ODQ3MDIyMCwiZXhwIjoxNzMwMDkyNjEyfQ.eyJpZCI6ImFuYW50In0.AdMpABWtskDwWx4joLZhY9b9sqjNXvudeG2FqirATWFyRRczJJluCxnIt8udI3Bjy2Au672dEmDsmkOK5V0ZLg'
username = 'anant'
os.environ["ACTIVELOOP_TOKEN"] = activeloop_token
embeddings = OpenAIEmbeddings(show_progress_bar=True, model_kwargs={'batch_size': 50})

In [61]:
db = DeepLake(
    dataset_path="./my_deeplake/", embedding=embeddings, overwrite=True
)



In [66]:
s = set(docs)

TypeError: unhashable type: 'Document'

In [63]:
db.add_documents(docs)

creating embeddings:   0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:   6%|▌         | 1/18 [00:06<01:47,  6.34s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  11%|█         | 2/18 [00:13<01:51,  6.97s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  17%|█▋        | 3/18 [00:21<01:47,  7.16s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  22%|██▏       | 4/18 [00:28<01:41,  7.23s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  28%|██▊       | 5/18 [00:46<02:23, 11.05s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  33%|███▎      | 6/18 [01:11<03:11, 15.94s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  39%|███▉      | 7/18 [01:32<03:11, 17.39s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  44%|████▍     | 8/18 [01:51<03:00, 18.07s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  50%|█████     | 9/18 [02:14<02:56, 19.67s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  56%|█████▌    | 10/18 [02:26<02:18, 17.28s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  61%|██████    | 11/18 [02:44<02:03, 17.58s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  67%|██████▋   | 12/18 [03:12<02:03, 20.59s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  72%|███████▏  | 13/18 [03:27<01:34, 18.84s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  78%|███████▊  | 14/18 [03:46<01:16, 19.10s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  83%|████████▎ | 15/18 [04:12<01:03, 21.04s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  89%|████████▉ | 16/18 [04:33<00:41, 20.93s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings:  94%|█████████▍| 17/18 [04:54<00:21, 21.12s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

creating embeddings: 100%|██████████| 18/18 [05:13<00:00, 17.42s/it]


Dataset(path='./my_deeplake/', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
   text       text      (8984, 1)      str     None   
 metadata     json      (8984, 1)      str     None   
 embedding  embedding  (8984, 1536)  float32   None   
    id        text      (8984, 1)      str     None   


['3e082c26-755c-11ee-91c6-a6f9a8ad2293',
 '3e082cb2-755c-11ee-91c6-a6f9a8ad2293',
 '3e082cd0-755c-11ee-91c6-a6f9a8ad2293',
 '3e082cee-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d02-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d16-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d2a-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d3e-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d52-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d70-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d84-755c-11ee-91c6-a6f9a8ad2293',
 '3e082d98-755c-11ee-91c6-a6f9a8ad2293',
 '3e082dac-755c-11ee-91c6-a6f9a8ad2293',
 '3e082dc0-755c-11ee-91c6-a6f9a8ad2293',
 '3e082dd4-755c-11ee-91c6-a6f9a8ad2293',
 '3e082de8-755c-11ee-91c6-a6f9a8ad2293',
 '3e082dfc-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e10-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e24-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e38-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e4c-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e60-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e74-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e88-755c-11ee-91c6-a6f9a8ad2293',
 '3e082e9c-755c-

In [64]:
retriever = db.as_retriever()

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI()
chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | prompt 
    | model 
    | StrOutputParser()
)

In [65]:
db.similarity_search('What are the VFR cloud clearance requirements for Class E airspace?')

  0%|          | 0/1 [00:00<?, ?it/s]

[Document(page_content=' \n AIM 4/20/23 \n5. Class E is more restrictive than Class G airspace. \n3−1−4. Basic VFR Weather Minimums \na. No person may operate an aircraft under basic VFR when the flight visibility is less, or at a distance from \nclouds that is less, than that prescribed for the corresponding altitude and class of airspace. (See TBL 3 −1−1.) \nNOTE− \nStudent pilots must comply with 14 CFR Section 61.89(a) (6) and (7). \nb. Except as provided in 14 CFR Section 91.157, Special VFR Weather Minimums, no person may operate \nan aircraft beneath the ceiling under VFR within the lateral boundaries of controlled airspace designated to the \nsurface for an airport when the ceiling is less than 1,000 feet. (See 14 CFR Section 91.155(c).) \nTBL 3−1−1 \nBasic VFR Weather Minimums \nAirspace Flight Visibility Distance from Clouds \nClass A ........................................ Not Applicable Not Applicable \nClass B ........................................ 3 statute miles Clear

In [67]:
chain.invoke("What are the VFR cloud clearences for class D airspace?")

  0%|          | 0/1 [00:00<?, ?it/s]

'The VFR cloud clearances for Class D airspace are 500 feet below, 1,000 feet above, and 2,000 feet horizontal.'

In [70]:
from langchain.schema.runnable import RunnableMap
from langchain.schema import format_document
from langchain.prompts.prompt import PromptTemplate

_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

from typing import Tuple, List
def _format_chat_history(chat_history: List[Tuple]) -> str:
    buffer = ""
    for dialogue_turn in chat_history:
        human = "Human: " + dialogue_turn[0]
        ai = "Assistant: " + dialogue_turn[1]
        buffer += "\n" + "\n".join([human, ai])
    return buffer

In [71]:
_inputs = RunnableMap(
    standalone_question=RunnablePassthrough.assign(
        chat_history=lambda x: _format_chat_history(x['chat_history'])
    ) | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),
)
_context = {
    "context": itemgetter("standalone_question") | retriever | _combine_documents,
    "question": lambda x: x["standalone_question"]
}
conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()

In [72]:
memory = ConversationBufferMemory(return_messages=True, output_key="answer", input_key="question")

In [73]:
# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)
# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: _format_chat_history(x['chat_history'])
    } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),
}
# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"]
}
# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question")
}
# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | ChatOpenAI(),
    "docs": itemgetter("docs"),
}
# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

In [74]:
inputs = {"question": "What is the maximum approach speed on a Instrument approach for a Cessna 172(Category 1)?"}
result = final_chain.invoke(inputs)
result

  0%|          | 0/1 [00:00<?, ?it/s]

{'answer': AIMessage(content='The maximum approach speed for a Cessna 172 (Category 1) on an Instrument approach is 91 knots.'),
 'docs': [Document(page_content=' \n 4/20/23 AIM \n5−4−7. Instrument Approach Procedures \na. Aircraft approach category means a grouping of aircraft bas ed on a speed of V REF at the maximum certified \nlanding weight, if specified, or if V REF is not specified, 1.3 VSO at the maximum certified landing weight. V REF, \nVSO, and the maximum certified landing weight are those values as established for the aircraft by the certification \nauthority of the country of registry. A pilot must maneuver the aircraft within the circling approach protected \narea (see FIG 5− 4−27) to achieve the obstacle and terrain clearances provided by procedure design criteria. \nb. In addition to pilot techniques for maneuvering, one acceptable method to reduce the risk of flying out of \nthe circling approach protected area is to use either the minima corresponding to the category