In [1]:
from dotenv import load_dotenv
import os
import logging
import sys

In [2]:
level=logging.INFO
logging.basicConfig(stream=sys.stdout, level=level, format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
load_dotenv()

2024-05-01 18:20:36,490 - INFO - Loaded environmental keys for OpenAI.
Loaded environmental keys for OpenAI.


In [4]:
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain_core.prompts import ChatPromptTemplate

In [5]:
import tiktoken

In [6]:
basic_model = "gpt-3.5-turbo"
basic_embedding_model = "text-embedding-3-small"
pdf_doc = "DataRepository/meta10k.pdf"

In [7]:
enc = tiktoken.encoding_for_model(basic_model)
embedding_model = OpenAIEmbeddings(model=basic_embedding_model)

In [8]:
def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model(basic_model).encode(
        text,
    )
    return len(tokens)

In [10]:
openai_chat_model = ChatOpenAI(model=basic_model)

In [12]:
from langchain.document_loaders import PyMuPDFLoader
docs = PyMuPDFLoader(pdf_doc).load()

In [22]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap = 50,
    length_function = tiktoken_len,
)

In [23]:
split_chunks = text_splitter.split_documents(docs)

In [25]:
print(type(split_chunks))

# Print the last 10 items
print("Last X items:")
for item in split_chunks[-20:]:
    print(item)

<class 'list'>
Last 10 items:
page_content='Table of Contents\nPOWER OF ATTORNEY\nKNOW ALL PERSONS BY THESE PRESENTS, that each person whose signature appears below constitutes and appoints Susan Li and Katherine R.\nKelly, and each of them, as his or her true and lawful attorneys-in-fact and agents, with full power of substitution and resubstitution, for him or her and in his or\nher name, place and stead, in any and all capacities, to sign any and all amendments to this Annual Report on Form 10-K, and to file the same, with all exhibits\nthereto, and other documents in connection therewith, with the Securities and Exchange Commission, granting unto said attorneys-in-fact and agents, and each\nof them, full power and authority to do and perform each and every act and thing requisite and necessary to be done in connection therewith, as fully to all\nintents and purposes as he or she might or could do in person, hereby ratifying and confirming that all said attorneys-in-fact and agents,

 DEBUG STUFF

In [26]:
len(split_chunks)
max_chunk_length = 0
for chunk in split_chunks:
  max_chunk_length = max(max_chunk_length, tiktoken_len(chunk.page_content))
print(max_chunk_length)

746


In [27]:
qdrant_vectorstore = Qdrant.from_documents(
    split_chunks,
    embedding_model,
    location=":memory:",
    collection_name="Meta10k",
)

2024-05-01 18:33:26,169 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-01 18:33:28,067 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-01 18:33:30,047 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-01 18:33:31,719 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-01 18:33:33,549 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [28]:
qdrant_retriever = qdrant_vectorstore.as_retriever()

In [29]:
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

Be as specific as possible, within the context. You should use only the context provided by the user in order to answer the question. Do not use other information from outside the context. If you do not know, just answer "I don't know."
"""

In [30]:
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [31]:
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

In [32]:
retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

In [33]:
response = retrieval_augmented_qa_chain.invoke({"question" : "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"})
print('/n/n/n')
print("Response:")
print(response["response"].content)
print("----")
for context in response["context"]:
  print("Context:")
  print(context)
  print("----")

2024-05-01 18:33:53,637 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-01 18:33:56,503 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
/n/n/n
Response:
The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41,862 million.
----
Context:
page_content='Table of Contents\nNote\xa05. Financial Instruments\nInstruments Measured at Fair Value\nWe classify our cash equivalents and marketable debt securities within Level\xa01 or Level\xa02 because we use quoted market prices or alternative pricing\nsources and models utilizing market observable inputs to determine their fair value. Certain other assets are classified within Level\xa03 because factors used to\ndevelop the estimated fair value are unobservable inputs that are not supported

In [35]:
response = retrieval_augmented_qa_chain.invoke({"question" : "What are the names of Meta's 'Directors' (i.e., members of the Board of Directors)?"})
print('/n/n/n')
print("Response:")
print(response["response"].content)
print("----")
for context in response["context"]:
  print("Context:")
  print(context)
  print("----")

2024-05-01 18:34:31,784 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-01 18:34:33,816 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
/n/n/n
Response:
The names of Meta's Directors (members of the Board of Directors) are not explicitly mentioned in the provided context. Therefore, based on the given information, I don't know the specific names of Meta's Directors.
----
Context:
page_content='Table of Contents\nCompensation, Benefits, Health, and Well-being\nWe offer competitive compensation to attract and retain the best people, and we help care for our people so they can focus on our mission. Our\nemployees\' total compensation package includes market-competitive salary, bonuses or sales incentives, and equity. We generally offer full-time employees