In [1]:
# Load environment variables from the .env file one directory above the notebook.
from dotenv import load_dotenv
_ = load_dotenv(dotenv_path="../.env")  # result is bound to `_` so the cell displays nothing

In [2]:
# Check if document exists. If not, get it and save it.
import os
import requests
from app import schema
from app.chat.engine import fetch_and_read_document
from llama_index.storage.docstore import SimpleDocumentStore

# Local cache of the META 10-K document; the production API is only queried
# when this file does not exist yet.
DOCUMENT_PERSIST_PATH = '/workspaces/sec-insights/backend/eval/document_storage/META_SEC_10-K_2022.json'
if not os.path.exists(DOCUMENT_PERSIST_PATH):

    # Get document from secinsights.ai production API
    base_url = "https://llama-app-backend.onrender.com"     # Production backend base URL
    prod_document_id = "922f4a9f-7bc9-4fb1-b9b8-fa5a84c1cbcc"    # META 10-K 2022

    # A timeout keeps the cell from hanging forever on an unresponsive backend,
    # and raise_for_status() surfaces HTTP errors directly instead of letting an
    # error payload fail later inside schema.Document(**data).
    response = requests.get(f"{base_url}/api/document/{prod_document_id}", timeout=60)
    response.raise_for_status()
    data = response.json()
    document = schema.Document(**data)
    document = fetch_and_read_document(document)     # merge document pages into a single document

    # Persist what was read so later runs can skip the download.
    document_docstore = SimpleDocumentStore()
    document_docstore.add_documents(document)
    document_docstore.persist(DOCUMENT_PERSIST_PATH)
    print(f"Saved document at: {DOCUMENT_PERSIST_PATH}")

else:
    print(f"Document already exists at: {DOCUMENT_PERSIST_PATH}")
    document_docstore = SimpleDocumentStore.from_persist_path(DOCUMENT_PERSIST_PATH)

# Get ID of document in docstore
docs = document_docstore.docs
keys_list = list(docs.keys())
if not keys_list:
    # Fail with an actionable message rather than a bare IndexError below.
    raise RuntimeError(
        f"No documents found in docstore at {DOCUMENT_PERSIST_PATH}; "
        "delete the file and re-run this cell to fetch the document again."
    )
doc_id = keys_list[0]       # Meta document is the only document in the docstore -- so the first key must be its ID

meta_doc = document_docstore.get_document(doc_id=doc_id)


Document already exists at: /workspaces/sec-insights/backend/eval/document_storage/META_SEC_10-K_2022.json


In [3]:
# NOTE(review): this bare expression is not the last line of the cell, so its
# value is computed and discarded -- nothing is displayed for it.
type(meta_doc)
print(meta_doc.id_)  # show the ID of the document taken from the docstore

633d282d-f822-4097-a8f6-a8ed533bc6d7


In [4]:
import anyio
from app.chat.messaging import ChatCallbackHandler

# Create an in-memory anyio object stream (argument 100) and hand its send end
# to the chat callback handler that is passed to every service context below.
# NOTE(review): recv_chan is never read in the visible cells -- confirm the
# handler cannot block once the stream buffer fills up.
send_chan, recv_chan = anyio.create_memory_object_stream(100)
callback_handler = ChatCallbackHandler(send_chan)

Parse Nodes

In [5]:
# get original node parsing
import os
from llama_index.storage.docstore import SimpleDocumentStore
from app.chat.engine import get_tool_service_context

ORIGINAL_NODES_PERSIST_PATH = '/workspaces/sec-insights/backend/eval/node_storage/original_nodes.json'

# Build the service context for the "original" parser type and keep a handle
# on its node parser.
original_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="original",
)
original_node_parser = original_service_context.node_parser

if os.path.exists(ORIGINAL_NODES_PERSIST_PATH):
    # Cached parse available: read the nodes back out of the persisted docstore.
    print(f"Original node parsing exists - loading it from: {ORIGINAL_NODES_PERSIST_PATH}")
    original_docstore = SimpleDocumentStore.from_persist_path(ORIGINAL_NODES_PERSIST_PATH)
    original_document = original_docstore.docs
    original_nodes = list(original_document.values())
else:
    # First run: parse the document into nodes and persist them for next time.
    print(f"Creating original node parsing and saving at: {ORIGINAL_NODES_PERSIST_PATH}")
    original_nodes = original_node_parser.get_nodes_from_documents([meta_doc])
    original_docstore = SimpleDocumentStore()
    original_docstore.add_documents(original_nodes)
    original_docstore.persist(ORIGINAL_NODES_PERSIST_PATH)


######################### CREATING SERVICE CONTEXT USING original NODE PARSER #########################
Original node parsing exists - loading it from: /workspaces/sec-insights/backend/eval/node_storage/original_nodes.json


In [6]:
# How many chunks did the original parser produce?
print(f"Total nodes: {len(original_nodes)}")

from eval import format_pdf_text

# Show one arbitrary node (index 5) as a sample of the parser output.
sample_node = original_nodes[5]
banner = '#' * 50
print(f"\n{banner} ORIGINAL NODE {banner}\n{format_pdf_text(sample_node.text)}")

## Additional Output
# print(f"\ntype(original_nodes): {type(original_nodes)}")
# print(f"type(original_nodes[0]): {type(original_nodes[0])}")

Total nodes: 327

################################################## ORIGINAL NODE ##################################################
We have based these forward-looking statements largely on our current expectations and projections about future events and trends that we believe may affect our financial condition, results of operations, business strategy, short-term and long-term business operations and objectives, and financial needs. These forward-looking statements are subject to a number of risks, uncertainties and assumptions, including those described in Part I, Item 1A, "Risk Factors" in this Annual Report on Form 10-K. Moreover, we operate in a very competitive and rapidly changing environment. New risks emerge from time to time. It is not possible for our management to predict all risks, nor can we assess the impact of all factors on our business or the extent to which any factor, or combination of factors, may cause actual results to differ materially from those contained in 

In [7]:
# Sentence-window node parsing: load the cached parse if present, otherwise create it.

SENTENCE_WINDOW_NODES_PERSIST_PATH = '/workspaces/sec-insights/backend/eval/node_storage/sentence_window_nodes.json'

sentence_window_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="sentence-window",
)
sentence_window_node_parser = sentence_window_service_context.node_parser

if os.path.exists(SENTENCE_WINDOW_NODES_PERSIST_PATH):
    # Cached parse available: read the nodes back out of the persisted docstore.
    print(f"Sentence window node parsing exists - loading it from: {SENTENCE_WINDOW_NODES_PERSIST_PATH}")
    sentence_window_docstore = SimpleDocumentStore.from_persist_path(SENTENCE_WINDOW_NODES_PERSIST_PATH)
    sentence_window_document = sentence_window_docstore.docs
    sentence_window_nodes = list(sentence_window_document.values())
else:
    # First run: parse the document and persist the resulting nodes.
    print(f"Creating sentence window node parsing and saving at: {SENTENCE_WINDOW_NODES_PERSIST_PATH}")
    sentence_window_nodes = sentence_window_node_parser.get_nodes_from_documents([meta_doc])
    sentence_window_docstore = SimpleDocumentStore()
    sentence_window_docstore.add_documents(sentence_window_nodes)
    sentence_window_docstore.persist(SENTENCE_WINDOW_NODES_PERSIST_PATH)

######################### CREATING SERVICE CONTEXT USING sentence-window NODE PARSER #########################
Sentence window node parsing exists - loading it from: /workspaces/sec-insights/backend/eval/node_storage/sentence_window_nodes.json


In [9]:
# Node count and the window size configured on the parser.
print(f"Total sentence-window nodes: {len(sentence_window_nodes)}")
print(f"Window_size = {sentence_window_node_parser.window_size}")

# Read the "original_text" and "window" metadata entries of one sample node (index 5).
sample_metadata = sentence_window_nodes[5].metadata
sentence = format_pdf_text(sample_metadata.get("original_text"))
window = format_pdf_text(sample_metadata.get("window"))

banner = '#' * 50
print("\nSentence-Window node:")
print(f"\n{banner} SENTENCE {banner}\n{sentence}")
print(f"\n{banner} WINDOW {banner}\n{window}")

# ## Additional Output
# print(f"\ntype(sentence_window_nodes): {type(sentence_window_nodes)}")
# print(f"type(sentence_window_nodes[0]): {type(sentence_window_nodes[0])}")

Total sentence-window nodes: 2184
Window_size = 2

Sentence-Window node:

################################################## SENTENCE ##################################################
Yes ☐ No ☒ Indicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 (Exchange Act) during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days.

################################################## WINDOW ##################################################
Employer Identification Number) 1601 Willow Road , Menlo Park , California 94025 (Address of principal executive offices and Zip Code) ( 650 ) 543-4800 (Registrant's telephone number, including area code) __________________________ Securities registered pursuant to Section 12(b) of the Act: Title of each class Trading symbol(s) Name

In [10]:
# Hierarchical node parsing: load the cached parse if present, otherwise create it.
HIERARCHICAL_NODES_PERSIST_PATH = '/workspaces/sec-insights/backend/eval/node_storage/hierarchical_nodes.json'

auto_merging_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="hierarchical",
)
hierarchical_node_parser = auto_merging_service_context.node_parser

if os.path.exists(HIERARCHICAL_NODES_PERSIST_PATH):
    # Cached parse available: read the nodes back out of the persisted docstore.
    print(f"Hierarchical node parsing exists - loading it from: {HIERARCHICAL_NODES_PERSIST_PATH}")
    hierarchical_docstore = SimpleDocumentStore.from_persist_path(HIERARCHICAL_NODES_PERSIST_PATH)
    hierarchical_document = hierarchical_docstore.docs
    hierarchical_nodes = list(hierarchical_document.values())
else:
    # First run: parse the document and persist the resulting nodes.
    print(f"Creating hierarchical node parsing and saving at: {HIERARCHICAL_NODES_PERSIST_PATH}")
    hierarchical_nodes = hierarchical_node_parser.get_nodes_from_documents([meta_doc])
    hierarchical_docstore = SimpleDocumentStore()
    hierarchical_docstore.add_documents(hierarchical_nodes)
    hierarchical_docstore.persist(HIERARCHICAL_NODES_PERSIST_PATH)

######################### CREATING SERVICE CONTEXT USING hierarchical NODE PARSER #########################
Hierarchical node parsing exists - loading it from: /workspaces/sec-insights/backend/eval/node_storage/hierarchical_nodes.json


In [11]:
from llama_index.node_parser import get_leaf_nodes
# Select the leaf nodes out of the full hierarchical node list.
leaf_nodes = get_leaf_nodes(hierarchical_nodes)

# Report the parser's configured chunk sizes and the resulting node counts.
print(f"Node hierarchy (chunk sizes): {hierarchical_node_parser.chunk_sizes}")
print(f"Total hierarchical nodes: {len(hierarchical_nodes)}")
print(f"Total leaf nodes: {len(leaf_nodes)}")

Node hierarchy (chunk sizes): [2048, 512, 128]
Total hierarchical nodes: 2556
Total leaf nodes: 2119


In [12]:
def get_parent_node(node, all_nodes):
    """Return the first item of ``all_nodes`` whose ``id_`` matches ``node.parent_node.node_id``.

    Raises ``StopIteration`` if no item matches. ``node.parent_node`` is only
    dereferenced while scanning, so an empty ``all_nodes`` raises
    ``StopIteration`` without touching it.
    """
    matching_parents = (
        candidate
        for candidate in all_nodes
        if candidate.id_ == node.parent_node.node_id
    )
    return next(matching_parents)

# Walk two levels up from the first leaf: leaf -> intermediate -> root.
leaf_node = leaf_nodes[0]
intermediate_node = get_parent_node(leaf_node, hierarchical_nodes)
root_node = get_parent_node(intermediate_node, hierarchical_nodes)

banner = '#' * 50
print("Notice how each node is a subset of its parent:")
for level_label, level_node in (
    ("LEAF NODE", leaf_node),
    ("INTERMEDIATE NODE", intermediate_node),
):
    print(f"\n{banner} {level_label} {banner}\n{format_pdf_text(level_node.text)}")
# print(f"\n{'#'*50} ROOT NODE {'#'*50}\n{format_pdf_text(root_node.text)}")

Notice how each node is a subset of its parent:

################################################## LEAF NODE ##################################################
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C.

################################################## INTERMEDIATE NODE ##################################################
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 __________________________ FORM 10-K __________________________ (Mark One) ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31 , 2022 or ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to Commission File Number: 001-35551 __________________________ Meta Platforms, Inc. (Exact name of registrant as specified in its charter) __________________________ Delaware 20-1665019 (State or other jurisdiction of incorporation or organization

Build Indexes

In [13]:
import os
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index import StorageContext, load_index_from_storage

# Local directory where the original (baseline) index is persisted.
ORIGINAL_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/original'

if os.path.exists(ORIGINAL_PERSIST_DIR):
    # Storage already on disk: rebuild the index from it.
    print(f"Original index exists - loading it from: {ORIGINAL_PERSIST_DIR}")
    original_storage_context = StorageContext.from_defaults(persist_dir=ORIGINAL_PERSIST_DIR)
    original_index = load_index_from_storage(original_storage_context)
else:
    # Nothing cached: index the pre-parsed nodes and persist for later runs.
    print(f"Creating Original index and saving it at: {ORIGINAL_PERSIST_DIR}")
    original_index = VectorStoreIndex(original_nodes)
    original_index.storage_context.persist(persist_dir=ORIGINAL_PERSIST_DIR)

# NOTE: the directory name is spelled 'setence_window'; it is left as-is because
# the persisted index on disk lives under exactly this name.
SENTENCE_WINDOW_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/setence_window'

if os.path.exists(SENTENCE_WINDOW_PERSIST_DIR):
    # Storage already on disk: rebuild the index with the sentence-window service context.
    print(f"Sentence-Window index exists - loading it from: {SENTENCE_WINDOW_PERSIST_DIR}")
    setence_window_storage_context = StorageContext.from_defaults(persist_dir=SENTENCE_WINDOW_PERSIST_DIR)
    sentence_window_index = load_index_from_storage(
        storage_context=setence_window_storage_context,
        service_context=sentence_window_service_context,
    )
else:
    # Nothing cached: build the index straight from the document and persist it.
    print(f"Creating Sentence-Window index and saving it at: {SENTENCE_WINDOW_PERSIST_DIR}")
    sentence_window_index = VectorStoreIndex.from_documents(
        [meta_doc],
        service_context=sentence_window_service_context,
    )
    sentence_window_index.storage_context.persist(persist_dir=SENTENCE_WINDOW_PERSIST_DIR)

AUTO_MERGING_INDEX_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/auto_merging_2'

if os.path.exists(AUTO_MERGING_INDEX_PERSIST_DIR):
    # Storage already on disk: rebuild the index with the hierarchical service context.
    print(f"Auto-Merging index exists - loading it from: {AUTO_MERGING_INDEX_PERSIST_DIR}")
    persisted_storage_context = StorageContext.from_defaults(persist_dir=AUTO_MERGING_INDEX_PERSIST_DIR)
    auto_merging_index = load_index_from_storage(
        persisted_storage_context,
        service_context=auto_merging_service_context,
    )
else:
    # Nothing cached: the storage context is backed by the full hierarchical
    # docstore, while only the leaf nodes are passed to the vector index.
    print(f"Creating Auto-Merging storage context and saving it at: {AUTO_MERGING_INDEX_PERSIST_DIR}")
    auto_merging_storage_context = StorageContext.from_defaults(docstore=hierarchical_docstore)
    auto_merging_index = VectorStoreIndex(
        leaf_nodes,
        storage_context=auto_merging_storage_context,
        service_context=auto_merging_service_context,
    )
    auto_merging_index.storage_context.persist(AUTO_MERGING_INDEX_PERSIST_DIR)

Original index exists - loading it from: /workspaces/sec-insights/backend/eval/index_storage/original
Sentence-Window index exists - loading it from: /workspaces/sec-insights/backend/eval/index_storage/setence_window
Auto-Merging index exists - loading it from: /workspaces/sec-insights/backend/eval/index_storage/auto_merging_2


Build Query Engines

In [16]:
# build original query engine
# Baseline engine created directly from the original index with similarity_top_k=3.
original_query_engine = original_index.as_query_engine(
    similarity_top_k=3                                      # same as original source code
)

In [17]:
# build sentence-window query engine
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor, LLMRerank
# Post-processor keyed on the "window" metadata entry of each retrieved node.
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
# LLM-based reranker configured with top_n=2, using the sentence-window service context.
sentence_window_rerank = LLMRerank(
    top_n=2,
    service_context=sentence_window_service_context,
)
# Query engine with similarity_top_k=4; post-processors are listed as metadata
# replacement first, then rerank.
sentence_window_query_engine = sentence_window_index.as_query_engine(
    similarity_top_k=4, node_postprocessors=[postproc, sentence_window_rerank]
)

In [18]:
# build auto-merging query engine
from llama_index.retrievers import AutoMergingRetriever
from llama_index.query_engine import RetrieverQueryEngine
# Base retriever over the auto-merging index with similarity_top_k=12.
base_retriever = auto_merging_index.as_retriever(
    similarity_top_k=12,
)
# NOTE(review): the storage context is re-read from AUTO_MERGING_INDEX_PERSIST_DIR
# here rather than reusing auto_merging_index.storage_context -- presumably
# equivalent, but it costs a second load from disk; confirm.
auto_merging_retriever = AutoMergingRetriever(
    base_retriever, StorageContext.from_defaults(persist_dir=AUTO_MERGING_INDEX_PERSIST_DIR),   # base retriever and auto-merging storage context
)
# LLM-based reranker configured with top_n=4, using the hierarchical service context.
auto_merging_rerank = LLMRerank(
    top_n=4,
    service_context=auto_merging_service_context,
)
# NOTE(review): no service_context is passed to from_args, unlike the reranker
# above -- verify which LLM settings this engine uses for response synthesis.
auto_merging_query_engine = RetrieverQueryEngine.from_args(
    auto_merging_retriever, node_postprocessors=[auto_merging_rerank]
)

#### Compare the Responses of Each Query Engine

In [19]:
# Sanity check: ask all three engines one question whose answer appears
# verbatim in the filing, then print the answers one after another.
prompt = "What is Meta's mission?"
print(f"Prompt: {prompt}")
print("Excerpt from META 2022 10-K Document: Our mission is to give people the power to build community and bring the world closer together.")

# All three queries run before anything is printed.
original_response = original_query_engine.query(prompt)
sentence_window_response = sentence_window_query_engine.query(prompt)
auto_merging_response = auto_merging_query_engine.query(prompt)

labelled_responses = (
    ("ORIGINAL", original_response),
    ("SENTENCE-WINDOW", sentence_window_response),
    ("AUTO-MERGING", auto_merging_response),
)
for engine_label, engine_response in labelled_responses:
    print(f"\n{engine_label} QUERY ENGINE RESPONSE:\n{str(engine_response)}")

Prompt: What is Meta's mission?
Excerpt from META 2022 10-K Document: Our mission is to give people the power to build community and bring the world closer together.

ORIGINAL QUERY ENGINE RESPONSE:
Meta's mission is to move their offerings beyond 2D screens towards immersive experiences like augmented and virtual reality to help build the metaverse, which they believe is the next evolution in social technology. Their vision for the metaverse involves creating an entire ecosystem of experiences, devices, and new technologies, aiming for it to become the next computing platform and the future of social interaction.

SENTENCE-WINDOW QUERY ENGINE RESPONSE:
Meta's mission is to give people the power to build community and bring the world closer together.

AUTO-MERGING QUERY ENGINE RESPONSE:
Meta's mission is to give people the power to build community and bring the world closer together.


Generate Evaluation Dataset

In [20]:
import asyncio
import nest_asyncio
nest_asyncio.apply()    # allow asynchronous code to run within the Jupyter notebook

In [21]:
from eval import generate_dataset
from llama_index.node_parser import SentenceSplitter
from llama_index.evaluation import DatasetGenerator, QueryResponseDataset
from llama_index.llms.openai import OpenAI
from llama_index import ServiceContext
import random

file_path="/workspaces/sec-insights/backend/eval/eval_dataset.json"     # path to save evaluation dataset
if not os.path.exists(file_path):
    text_splitter = SentenceSplitter()
    base_nodes = text_splitter.get_nodes_from_documents(document)

    # Use the middle 80% of document context to generate questions in evaluation dataset
    start_index = int(len(base_nodes) * 0.1)
    end_index = int(len(base_nodes) * 0.9)

    num_nodes_eval = 30    #  The number of nodes (randomly sampled from total nodes) to use for generating evaluation questions.
    sample_eval_nodes = random.sample(base_nodes[start_index:end_index], num_nodes_eval)

    dataset_generator_llm = OpenAI(temperature=0, model="gpt-4")
    dataset_generator_service_context = ServiceContext.from_defaults(llm=dataset_generator_llm)
    
    dataset_generator = DatasetGenerator(
        nodes=sample_eval_nodes,
        service_context=original_service_context,
        # service_context=dataset_generator_service_context,        # rate limiting issues using gpt-4
        num_questions_per_chunk=2,
        show_progress=True,
    )
    eval_dataset = await dataset_generator.agenerate_dataset_from_nodes()
    eval_dataset.save_json(file_path)
    print(f"Saved evaluation dataset at: {file_path}")

else: 
    print(f"Evaluation dataset already exists at: {file_path}")
    eval_dataset = QueryResponseDataset.from_json(file_path)


Evaluation dataset already exists at: /workspaces/sec-insights/backend/eval/eval_dataset.json


Evaluate Query Engines

In [22]:
# Free-form note kept as a string; it is bound to `_` so the cell displays no output.
# It records a manual patch applied to the installed llama-index package.
_='''
Note - The following code snippet had to be added to the CorrectnessEvaluator class within the llama-index v"0.9.7" package at
llama_index/evaluation/correctness.py before line: score_str, reasoning_str = eval_response.split("\n", 1)
This resolves an issue where eval_resonse is created beginning with a newline character, resulting in an error trying to convert
an empty str to a float on line: score = float(score_str).

Code snippet:
if eval_response[0] == '\n':
    eval_response = eval_response[1:]
'''

In [23]:
'''
Evaluate responses based on the following metrics:
    Correctness:            The correctness of a response - A score between 1 (worst) and 5 (best).
    Semantic Similarity:    The similarity between embeddings of the generated answer and reference answer.
    Relevance:              The relevance of retrieved context and response to the query. Considers the query string, retrieved context, and response string.
    Faithfulness:           How well the response is supported by the retrieved context (i.e., Is there hallucination?)
'''
from llama_index.evaluation import CorrectnessEvaluator, SemanticSimilarityEvaluator, RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner
from llama_index.evaluation.eval_utils import get_responses

# One evaluator per metric, each constructed with its defaults.
evaluator_c = CorrectnessEvaluator()
evaluator_s = SemanticSimilarityEvaluator()
evaluator_r = RelevancyEvaluator()
evaluator_f = FaithfulnessEvaluator()

# Metric name -> evaluator; these names are the keys used for the results table.
evaluator_dict = dict(
    correctness=evaluator_c,
    faithfulness=evaluator_f,
    relevancy=evaluator_r,
    semantic_similarity=evaluator_s,
)
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)

# Questions plus the reference answer taken from each (question, answer) pair.
eval_qs = eval_dataset.questions
ref_response_strs = [reference for _question, reference in eval_dataset.qr_pairs]
print(f"{len(eval_qs)} questions in dataset")

60 questions in dataset


Get Responses from Each Query Engine for Evaluation

In [None]:
# NOTE(review): this cell has no execution count. The cells further down
# reassign max_samples and rebuild all three *_pred_responses lists with
# explicit loops, so this get_responses() variant appears to be superseded.
max_samples = 20

import time
time.sleep(12)  # pause before each batch (presumably to stay under API rate limits -- confirm)
print(f"Original Query Engine")
original_pred_responses = get_responses(
    eval_qs[:max_samples], original_query_engine, show_progress=True
)
time.sleep(12)
print(f"Sentence-Window Query Engine")
sentence_window_pred_responses = get_responses(
    eval_qs[:max_samples], sentence_window_query_engine, show_progress=True
)
time.sleep(12)
print(f"Auto-Merging Query Engine")
auto_merging_pred_responses = get_responses(
    eval_qs[:max_samples], auto_merging_query_engine, show_progress=True
)

In [24]:
# Number of evaluation questions sent to each query engine.
max_samples = 20
# Question strings indexed by position in the prediction loops.
# NOTE(review): presumably the same list as eval_qs (eval_dataset.questions) -- confirm.
queries = list(eval_dataset.queries.values())

In [25]:
# Collect baseline responses one question at a time; the first failure is
# reported and stops the run, leaving the responses gathered so far.
print("Original Query Engine")
original_pred_responses = []
for i in range(max_samples):
    try:
        print(f"Making prediction {i + 1}/{max_samples}", end='\r', flush=True)
        prompt = queries[i]
        response = original_query_engine.query(prompt)
    except Exception as e:
        # Catch-all to handle exceptions
        print(f"Exception occured: {e}")
        break
    else:
        original_pred_responses.append(response)

Original Query Engine
Making prediction 20/20

In [None]:
# original_pred_responses = get_responses(
#     eval_qs[:max_samples], original_query_engine, show_progress=True
# )

In [28]:
import time
from openai import RateLimitError
# Collect sentence-window responses one question at a time. A rate-limit error
# triggers a 12 s countdown and a retry of the same question; any other error
# is reported and stops the run.
print(f"Sentence-Window Query Engine")
i = 0
sentence_window_pred_responses = []
while i < max_samples:
    try:
        print(f"Making prediction {i + 1}/{max_samples}", end='\r', flush=True)
        prompt = queries[i]
        response = sentence_window_query_engine.query(prompt)
        sentence_window_pred_responses.append(response)
        i +=1

    except RateLimitError as rate_limit_err:
        print(rate_limit_err)
        sleep = 12
        for s in range(sleep):
            print(f"waiting {s}/{sleep}s", end='\r', flush=True)
            # One second per tick so the whole back-off lasts `sleep` seconds.
            # The previous time.sleep(sleep) waited sleep*sleep = 144 s in total.
            time.sleep(1)
        # i is not advanced here, so the same prompt is retried.
    except Exception as e:  # Catch-all to handle exceptions
        print(f"Exception occured: {e}")
        break


Sentence-Window Query Engine
Making prediction 20/20

In [None]:
# print(f"Sentence-Window Query Engine")
# sentence_window_pred_responses = get_responses(
#     eval_qs[:max_samples], sentence_window_query_engine, show_progress=True
# )

In [29]:
# Collect auto-merging responses one question at a time. A rate-limit error
# triggers a 12 s countdown and a retry of the same question; any other error
# is reported and stops the run.
print(f"Auto-Merging Query Engine")    # header, matching the other two prediction loops
i = 0
auto_merging_pred_responses = []
while i < max_samples:
    try:
        print(f"Making prediction {i + 1}/{max_samples}", end='\r', flush=True)
        prompt = queries[i]
        response = auto_merging_query_engine.query(prompt)
        auto_merging_pred_responses.append(response)
        i +=1

    except RateLimitError as rate_limit_err:
        print(rate_limit_err)
        sleep = 12
        for s in range(sleep):
            print(f"waiting {s}/{sleep}s", end='\r', flush=True)
            time.sleep(1)
        # i is not advanced here, so the same prompt is retried.
    except Exception as e:  # Catch-all to handle exceptions
        print(f"Exception occured: {e}")
        break

Making prediction 20/20

In [None]:
# print(f"Auto-Merging Query Engine")
# auto_merging_pred_responses = get_responses(
#     eval_qs[:max_samples], auto_merging_query_engine, show_progress=True
# )

Evaluate Responses from Each Query Engine

In [31]:
# Pause 12 s before the batch (presumably to stay under API rate limits -- confirm).
time.sleep(12)
print(f"Original Query Engine")
# Run the batch runner's evaluators over the baseline responses, pairing the
# first max_samples questions with their reference answers.
# NOTE(review): the response list can be shorter than max_samples if the
# prediction loop stopped early -- confirm lengths match before evaluating.
original_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=original_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)

Original Query Engine


100%|██████████| 80/80 [00:30<00:00,  2.65it/s]


In [32]:
# Pause 12 s before the batch (presumably to stay under API rate limits -- confirm).
time.sleep(12)
print(f"Sentence-Window Query Engine")
# Run the batch runner's evaluators over the sentence-window responses, pairing
# the first max_samples questions with their reference answers.
# NOTE(review): the response list can be shorter than max_samples if the
# prediction loop stopped early -- confirm lengths match before evaluating.
sentence_window_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=sentence_window_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)

Sentence-Window Query Engine


100%|██████████| 80/80 [00:29<00:00,  2.72it/s]


In [34]:
# Pause 12 s before the batch (presumably to stay under API rate limits -- confirm).
time.sleep(12)
print(f"Auto-Merging Query Engine")
# Run the batch runner's evaluators over the auto-merging responses, pairing
# the first max_samples questions with their reference answers.
# NOTE(review): the response list can be shorter than max_samples if the
# prediction loop stopped early -- confirm lengths match before evaluating.
auto_merging_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=auto_merging_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)

Auto-Merging Query Engine


100%|██████████| 80/80 [00:28<00:00,  2.85it/s]


Get Evaluation Results

In [35]:
from llama_index.evaluation.eval_utils import get_results_df
# Combine the three evaluation runs into one table: one display name per
# engine and one column per metric key registered in evaluator_dict.
results_df = get_results_df(
    [original_eval_results, sentence_window_eval_results, auto_merging_eval_results],
    ["Base Retriever", "Sentence-Window Retriever", "Auto-Merging Retriever"],
    ["correctness", "relevancy", "faithfulness", "semantic_similarity"],
)
display(results_df)  # rich display of the comparison table

Unnamed: 0,names,correctness,relevancy,faithfulness,semantic_similarity
0,Base Retriever,4.2,0.95,0.9,0.971983
1,Sentence-Window Retriever,4.2,1.0,1.0,0.971759
2,Auto-Merging Retriever,4.05,0.95,1.0,0.969437


In [37]:
# save results to CSV file
import os

OUTPUT_PATH = '/workspaces/sec-insights/backend/eval/results/results_2024_03_04_samples-20.csv'
# Create the results directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
results_df.to_csv(OUTPUT_PATH, index=False)