In [1]:
# load environment variables from the .env file one level above the notebook
from dotenv import load_dotenv

DOTENV_PATH = "../.env"     # resolved relative to the kernel's working directory
# load_dotenv returns False when the file is missing or defines no variables;
# report that here instead of silently discarding the result.
if not load_dotenv(dotenv_path=DOTENV_PATH):
    print(f"WARNING: no environment variables were loaded from {DOTENV_PATH!r}")

In [2]:
from eval import get_dummy_doc

# sample document record that the rest of the notebook parses and indexes
document = get_dummy_doc()

In [3]:
# inspect the record returned by get_dummy_doc (its text has not been fetched or merged yet)
document

Document(id=UUID('4d24de4e-63ee-4af5-9c97-ccae008ad887'), created_at=datetime.datetime(2024, 2, 13, 3, 56, 11, 322253), updated_at=datetime.datetime(2024, 2, 13, 3, 56, 11, 322253), url='http://llama-app-web-assets-local.s3-website.localhost.localstack.cloud:4566/sec-edgar-filings/0001326801/10-K/0001326801-23-000013/primary-document.pdf', metadata_map={<DocumentMetadataKeysEnum.SEC_DOCUMENT: 'sec_document'>: {'cik': '0001326801', 'year': 2022, 'doc_type': '10-K', 'company_name': 'Meta Platforms, Inc.', 'company_ticker': 'META', 'accession_number': '0001326801-23-000013', 'filed_as_of_date': '2023-02-02T00:00:00', 'date_as_of_change': '2023-02-01T00:00:00', 'period_of_report_date': '2022-12-31T00:00:00'}})

In [4]:
from app.chat.engine import fetch_and_read_document

# This cell rebinds `document` from a single record to the list returned by
# fetch_and_read_document. Re-running it used to pass that list back into the
# loader, so the record is kept under its own name to make the cell re-runnable.
if not isinstance(document, list):
    document_record = document
document = fetch_and_read_document(document_record)     # merge document pages into a single document

LOADING DOCUMENT WITH SimpleDirectoryReader: META 10-K 2022
MERGING DOCUMENT: META 10-K 2022


In [5]:
# sanity-check what fetch_and_read_document returned: container size, container type, element type
for label, value in (
    ("len(document)", len(document)),
    ("type(document)", type(document)),
    ("type(document[0])", type(document[0])),
):
    print(f"{label}: {value}")

len(document): 1
type(document): <class 'list'>
type(document[0]): <class 'llama_index.schema.Document'>


In [6]:
import anyio
from app.chat.messaging import ChatCallbackHandler

# in-memory stream (buffer of 100 items); the handler is given the sending end
# NOTE(review): nothing in this notebook's visible cells reads from recv_chan — confirm that is intended
send_chan, recv_chan = anyio.create_memory_object_stream(max_buffer_size=100)
callback_handler = ChatCallbackHandler(send_chan)

In [7]:
from app.chat.engine import get_tool_service_context

# service context configured with the "original" node parser (the baseline in this comparison)
original_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="original",
)

In [8]:
# display original node parsing
from eval import format_pdf_text

original_node_parser = original_service_context.node_parser
original_nodes = original_node_parser.get_nodes_from_documents(document)
print(f"Total nodes: {len(original_nodes)}")

# show one chunk (index 5) so it can be compared against the other parsers below
banner = "#" * 50
sample_text = format_pdf_text(original_nodes[5].text)
print(f"\n{banner} ORIGINAL NODE {banner}\n{sample_text}")

Total nodes: 325

################################################## ORIGINAL NODE ##################################################
We have based these forward-looking statements largely on our current expectations and projections about future events and trends that we believe may affect our financial condition, results of operations, business strategy, short-term and long-term business operations and objectives, and financial needs. These forward-looking statements are subject to a number of risks, uncertainties and assumptions, including those described in Part I, Item 1A, "Risk Factors" in this Annual Report on Form 10-K. Moreover, we operate in a very competitive and rapidly changing environment. New risks emerge from time to time. It is not possible for our management to predict all risks, nor can we assess the impact of all factors on our business or the extent to which any factor, or combination of factors, may cause actual results to differ materially from those contained in 

In [9]:
# parse the document into sentence-window nodes
# NOTE(review): "setence-window" is kept exactly as written; confirm the key expected by
# get_tool_service_context before correcting the spelling.
sentence_window_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="setence-window",
)
sentence_window_node_parser = sentence_window_service_context.node_parser
sentence_window_nodes = sentence_window_node_parser.get_nodes_from_documents(document)
print(f"Total sentence-window nodes: {len(sentence_window_nodes)}")

# each node carries the single sentence ("original_text") and its surrounding context ("window") in metadata
sample_metadata = sentence_window_nodes[5].metadata
sentence = format_pdf_text(sample_metadata.get("original_text"))
window = format_pdf_text(sample_metadata.get("window"))

print("\nSentence-Window node:")
for heading, body in (("SENTENCE", sentence), ("WINDOW", window)):
    print(f"\n{'#'*50} {heading} {'#'*50}\n{body}")



Total sentence-window nodes: 2184

Sentence-Window node:

################################################## SENTENCE ##################################################
Yes ☐ No ☒ Indicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 (Exchange Act) during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days.

################################################## WINDOW ##################################################
(Exact name of registrant as specified in its charter) __________________________ Delaware 20-1665019 (State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification Number) 1601 Willow Road , Menlo Park , California 94025 (Address of principal executive offices and Zip Code) ( 650 ) 543-4800 (Registrant's telephone n

In [10]:
# parse nodes hierarchically
from llama_index.node_parser import get_leaf_nodes, get_root_nodes

auto_merging_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="hierarchical",
)
hierarchical_node_parser = auto_merging_service_context.node_parser
hierarchical_nodes = hierarchical_node_parser.get_nodes_from_documents(document)

# pull the two ends of the hierarchy out of the full node list
root_nodes = get_root_nodes(hierarchical_nodes)
leaf_nodes = get_leaf_nodes(hierarchical_nodes)

In [11]:
# report how many nodes the hierarchical parser produced, and how many of them are leaves
for label, nodes in (("hierarchical", hierarchical_nodes), ("leaf", leaf_nodes)):
    print(f"Total {label} nodes: {len(nodes)}")

Total hierarchical nodes: 2455
Total leaf nodes: 2017


In [12]:
# function to get parent of a hierarchical node
def get_parent_node(node, all_nodes):
    """Return the node in ``all_nodes`` that ``node`` records as its parent.

    Args:
        node: A node from the hierarchical parser; its ``parent_node``
            relationship supplies the id to look up.
        all_nodes: Iterable of candidate nodes to search.

    Returns:
        The first node in ``all_nodes`` whose ``id_`` equals the parent's id.

    Raises:
        ValueError: If ``node`` has no parent, or the parent is not in ``all_nodes``.
    """
    parent_info = node.parent_node
    if parent_info is None:
        raise ValueError(f"Node {node.id_} has no parent node")
    parent_id = parent_info.node_id
    # default of None avoids leaking a bare StopIteration when nothing matches
    parent = next((candidate for candidate in all_nodes if candidate.id_ == parent_id), None)
    if parent is None:
        raise ValueError(f"Parent {parent_id} of node {node.id_} was not found in all_nodes")
    return parent

# get intermediate & root nodes
leaf_node = leaf_nodes[0]
intermediate_node = get_parent_node(leaf_node, hierarchical_nodes)
root_node = get_parent_node(intermediate_node, hierarchical_nodes)

print("Notice how each node is a subset of its parent:")
print(f"\n{'#'*50} LEAF NODE {'#'*50}\n{format_pdf_text(leaf_node.text)}")
print(f"\n{'#'*50} INTERMEDIATE NODE {'#'*50}\n{format_pdf_text(intermediate_node.text)}")
print(f"\n{'#'*50} ROOT NODE {'#'*50}\n{format_pdf_text(root_node.text)}")



Notice how each node is a subset of its parent:

################################################## LEAF NODE ##################################################
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 __________________________ FORM 10-K __________________________ (Mark One) ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31 ,

################################################## INTERMEDIATE NODE ##################################################
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 __________________________ FORM 10-K __________________________ (Mark One) ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31 , 2022 or ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to Commission File Number: 001-35551 ___

In [13]:
# build indexes
# Each index is created once and persisted; later runs load it from disk instead of re-embedding.
# NOTE(review): the persist dirs are absolute paths tied to one workspace layout — consider a configurable base dir.
import os
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index import StorageContext, load_index_from_storage

ORIGINAL_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/original'   # local dir to persist storage of index
# NOTE(review): unlike the other two, this index is built and loaded without a service_context — confirm that is intended.
if not os.path.exists(ORIGINAL_PERSIST_DIR):                                            # check if storage already exists
    print(f"Creating Original index and saving it at: {ORIGINAL_PERSIST_DIR}")
    original_index = VectorStoreIndex(original_nodes)                                   # create the index
    original_index.storage_context.persist(persist_dir=ORIGINAL_PERSIST_DIR)            # store it for later
else:
    print(f"Original index exists - loading it from: {ORIGINAL_PERSIST_DIR}")
    original_storage_context = StorageContext.from_defaults(persist_dir=ORIGINAL_PERSIST_DIR)   # load the existing index
    original_index = load_index_from_storage(original_storage_context)

SENTENCE_WINDOW_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/setence_window'
if not os.path.exists(SENTENCE_WINDOW_PERSIST_DIR):
    print(f"Creating Sentence-Window index and saving it at: {SENTENCE_WINDOW_PERSIST_DIR}")
    sentence_window_index = VectorStoreIndex.from_documents(
        document,
        service_context=sentence_window_service_context,
    )
    sentence_window_index.storage_context.persist(persist_dir=SENTENCE_WINDOW_PERSIST_DIR)
else:
    print(f"Sentence-Window index exists - loading it from: {SENTENCE_WINDOW_PERSIST_DIR}")
    setence_window_storage_context = StorageContext.from_defaults(persist_dir=SENTENCE_WINDOW_PERSIST_DIR)
    sentence_window_index = load_index_from_storage(
        storage_context=setence_window_storage_context,
        service_context=sentence_window_service_context,
    )

AUTO_MERGING_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/auto_merging'
if not os.path.exists(AUTO_MERGING_PERSIST_DIR):
    print(f"Creating Auto-Merging index and saving it at: {AUTO_MERGING_PERSIST_DIR}")
    # the docstore holds every level of the hierarchy; only the leaves are embedded
    auto_merging_storage_context = StorageContext.from_defaults()
    auto_merging_storage_context.docstore.add_documents(hierarchical_nodes)
    auto_merging_index = VectorStoreIndex(
        leaf_nodes,
        storage_context=auto_merging_storage_context,
        service_context=auto_merging_service_context
    )
    auto_merging_index.storage_context.persist(persist_dir=AUTO_MERGING_PERSIST_DIR)
else:
    print(f"Auto-Merging index exists - loading it from: {AUTO_MERGING_PERSIST_DIR}")
    # Bind auto_merging_storage_context to the persisted docstore so it is the one the loaded
    # index refers to. Previously it was a fresh context filled with this session's re-parsed
    # nodes (assumes node ids are regenerated on each parse — TODO confirm).
    auto_merging_storage_context = StorageContext.from_defaults(persist_dir=AUTO_MERGING_PERSIST_DIR)
    auto_merging_index = load_index_from_storage(
        auto_merging_storage_context,
        service_context=auto_merging_service_context
    )

Creating Original index and saving it at: /workspaces/sec-insights/backend/eval/index_storage/original
Creating Sentence-Window index and saving it at: /workspaces/sec-insights/backend/eval/index_storage/setence_window
Creating Auto-Merging index and saving it at: /workspaces/sec-insights/backend/eval/index_storage/auto_merging


In [14]:
# build original query engine
ORIGINAL_SIMILARITY_TOP_K = 3      # same as original source code
original_query_engine = original_index.as_query_engine(similarity_top_k=ORIGINAL_SIMILARITY_TOP_K)

In [15]:
# build sentence-window query engine
from llama_index.indices.postprocessor import LLMRerank, MetadataReplacementPostProcessor

# postprocessor keyed on the "window" metadata stored by the sentence-window parser
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
# retrieve 8 candidates, then let the LLM reranker keep the best 4
sentence_window_rerank = LLMRerank(service_context=sentence_window_service_context, top_n=4)
sentence_window_query_engine = sentence_window_index.as_query_engine(
    node_postprocessors=[postproc, sentence_window_rerank],
    similarity_top_k=8,
)

In [16]:
# build auto-merging query engine
from llama_index.retrievers import AutoMergingRetriever
from llama_index.query_engine import RetrieverQueryEngine

base_retriever = auto_merging_index.as_retriever(
    similarity_top_k=12,    # number of leaf nodes retrieved before any merging into parents
)
# Resolve parent nodes through the storage context the index itself is bound to.
# When the index was loaded from disk, a separately built context holds nodes parsed
# in this session rather than the persisted ones the index refers to
# (assumes node ids differ between parses — TODO confirm).
auto_merging_retriever = AutoMergingRetriever(
    base_retriever, auto_merging_index.storage_context,
)
# the LLM reranker keeps the 2 best nodes after merging
auto_merging_rerank = LLMRerank(
    top_n=2,
    service_context=auto_merging_service_context,
)
auto_merging_query_engine = RetrieverQueryEngine.from_args(
    auto_merging_retriever, node_postprocessors=[auto_merging_rerank]
)

In [17]:
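# #### build query engines (superseded experiments, kept commented out for reference;
# #### the active engines are built in the three cells above)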
# # original
# original_query_engine = original_index.as_query_engine(
#     similarity_top_k=3                                      # same as original source code
# )

# # sentence-window
# # from llama_index.indices.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
# # postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
# # sentence_window_rerank = SentenceTransformerRerank(
# #     top_n=4,
# #     model="BAAI/bge-reranker-base",       # comment out to use default model (most speed)
# # )
# # sentence_window_query_engine = sentence_window_index.as_query_engine(
# #     similarity_top_k=8, node_postprocessors=[postproc, sentence_window_rerank]
# # )

# from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
# postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
# # sentence_window_rerank = SentenceTransformerRerank(
# #     top_n=4,
# #     model="BAAI/bge-reranker-base",       # comment out to use default model (most speed)
# # )
# sentence_window_query_engine = sentence_window_index.as_query_engine(
#     similarity_top_k=8, node_postprocessors=[postproc]
# )

# # from llama_index.indices.postprocessor import MetadataReplacementPostProcessor, LLMRerank
# # postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
# # sentence_window_rerank = SentenceTransformerRerank(
# #     top_n=4,
# #     model="BAAI/bge-reranker-base",       # comment out to use default model (most speed)
# # )
# # sentence_window_query_engine = sentence_window_index.as_query_engine(
# #     similarity_top_k=8, node_postprocessors=[postproc, sentence_window_rerank]
# # )

# # auto-merging
# from llama_index.retrievers import AutoMergingRetriever
# from llama_index.query_engine import RetrieverQueryEngine
# base_retriever = auto_merging_index.as_retriever(
#     similarity_top_k=12,    # number of nodes that must be retrieved to merge into parent
# )
# auto_merging_retriever = AutoMergingRetriever(
#     base_retriever, auto_merging_storage_context,
# )
# auto_merging_rerank = SentenceTransformerRerank(
#     top_n=2,
#     model="BAAI/bge-reranker-base",
# )
# auto_merging_query_engine = RetrieverQueryEngine.from_args(
#     auto_merging_retriever, node_postprocessors=[auto_merging_rerank]
# )

Compare the Responses of Each Query Engine

In [18]:
prompt = "What is Meta's mission?"
print(f"Prompt: {prompt}")
print("Excerpt from META 2022 10-K Document: Our mission is to give people the power to build community and bring the world closer together.")

# run the same prompt through all three query engines
original_response = original_query_engine.query(prompt)
sentence_window_response = sentence_window_query_engine.query(prompt)
auto_merging_response = auto_merging_query_engine.query(prompt)

# print the answers one after another for comparison against the excerpt above
for title, response in (
    ("ORIGINAL", original_response),
    ("SENTENCE-WINDOW", sentence_window_response),
    ("AUTO-MERGING", auto_merging_response),
):
    print(f"\n{title} QUERY ENGINE RESPONSE:\n{response!s}")

Prompt: What is Meta's mission?
Excerpt from META 2022 10-K Document: Our mission is to give people the power to build community and bring the world closer together.

ORIGINAL QUERY ENGINE RESPONSE:
Meta's mission is to provide a holistic approach to benefits through Life@Meta, offering a wide range of benefits across various areas to help employees and their dependents thrive. Additionally, Meta aims to invest in growing and maintaining a highly skilled and efficient workforce by providing career development opportunities, learning experiences, and employee surveys to understand and improve the overall employee experience. Furthermore, Meta focuses on health and well-being programs designed to give employees flexible benefits to achieve their personal well-being goals.

SENTENCE-WINDOW QUERY ENGINE RESPONSE:
Meta's mission is to give people the power to build community and bring the world closer together.

AUTO-MERGING QUERY ENGINE RESPONSE:
Meta's mission is to give people the power 

Evaluate Each Query Engine

In [24]:
# generate a dataset to be used for evaluation
# The dataset is generated once, saved as JSON, and loaded from that file on later runs,
# so `eval_dataset` is defined whichever branch executes.
import os
import random
from eval import generate_dataset    # NOTE(review): imported but never called in this cell — confirm whether it is still needed
from llama_index.node_parser import SentenceSplitter
from llama_index.evaluation import DatasetGenerator, QueryResponseDataset

file_path="/workspaces/sec-insights/backend/eval/eval_dataset.json"     # path to save evaluation dataset
if not os.path.exists(file_path):
    text_splitter = SentenceSplitter()
    base_nodes = text_splitter.get_nodes_from_documents(document)

    # Use the middle 80% of document context to generate questions in evaluation dataset
    start_index = int(len(base_nodes) * 0.1)
    end_index = int(len(base_nodes) * 0.9)
    candidate_nodes = base_nodes[start_index:end_index]

    num_nodes_eval = 30     #  The number of nodes (randomly sampled from total nodes) to use for generating evaluation questions.
    eval_sample_seed = 42   #  Fixed seed so the same nodes are sampled on every fresh run.
    # random.sample raises ValueError when asked for more items than exist, so cap the sample size
    sample_size = min(num_nodes_eval, len(candidate_nodes))
    sample_eval_nodes = random.Random(eval_sample_seed).sample(candidate_nodes, sample_size)

    dataset_generator = DatasetGenerator(
        nodes=sample_eval_nodes,
        # llm=OpenAI(model="gpt-4"),
        service_context=original_service_context,       # experiment using other service contexts (i.e., sentence-window, auto-merging)
        num_questions_per_chunk=2,
        show_progress=True,
    )

    # Previously the generator was built but never run, leaving `eval_dataset` undefined on a first run.
    # NOTE(review): if this synchronous call fails because the kernel already has a running event
    # loop, use `await dataset_generator.agenerate_dataset_from_nodes()` instead — confirm in your environment.
    eval_dataset = dataset_generator.generate_dataset_from_nodes()
    eval_dataset.save_json(file_path)
    print(f"Evaluation dataset saved to: {file_path}")

else: 
    print(f"Evaluation dataset already exists at: {file_path}")
    eval_dataset = QueryResponseDataset.from_json(file_path)



Evaluation dataset already exists at: /workspaces/sec-insights/backend/eval/eval_dataset.json
