In [None]:
# Read the variables defined in ../.env (relative to the working directory) into the environment.
from dotenv import load_dotenv

load_dotenv(dotenv_path="../.env")

In [None]:
from eval import get_dummy_doc

# Starting point for the rest of the notebook: the dummy document provided by the eval helpers.
document = get_dummy_doc()

In [None]:
from app.chat.engine import fetch_and_read_document

# Merge document pages into a single document.
# NOTE: `document` is rebound to the function's own result here, so re-running this cell by
# itself feeds that result back in. Re-run the cell that calls get_dummy_doc() first.
document = fetch_and_read_document(document)

In [None]:
# Quick look at the merged result: how many items it holds and which types are involved.
print(
    f"len(document): {len(document)}",
    f"type(document): {type(document)}",
    f"type(document[0]): {type(document[0])}",
    sep="\n",
)

In [None]:
import anyio
from app.chat.messaging import ChatCallbackHandler

# In-memory object stream with a buffer size of 100; the callback handler receives the send end.
send_chan, recv_chan = anyio.create_memory_object_stream(max_buffer_size=100)
callback_handler = ChatCallbackHandler(send_chan)

In [None]:
from app.chat.engine import get_tool_service_context

# Service context configured with the "original" node parser type.
original_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="original",
)

In [None]:
from eval import format_pdf_text

# Split the document with the original node parser and report how many nodes came out.
original_node_parser = original_service_context.node_parser
original_nodes = original_node_parser.get_nodes_from_documents(document)
print(f"Total nodes: {len(original_nodes)}")

# Show the text of one sample node (index 5, which requires at least six nodes to exist).
print(f"\n{'#'*50} ORIGINAL NODE {'#'*50}\n{format_pdf_text(original_nodes[5].text)}")

In [None]:
# Split the same document using the sentence-window node parser configuration.
# NOTE(review): "setence-window" looks misspelled; presumably it matches the key that
# get_tool_service_context expects - confirm against that function before changing it.
sentence_window_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="setence-window",
)
sentence_window_node_parser = sentence_window_service_context.node_parser
sentence_window_nodes = sentence_window_node_parser.get_nodes_from_documents(document)
print(f"Total sentence-window nodes: {len(sentence_window_nodes)}")

# Read the "original_text" and "window" metadata entries of sample node 5.
sentence = format_pdf_text(sentence_window_nodes[5].metadata.get("original_text"))
window = format_pdf_text(sentence_window_nodes[5].metadata.get("window"))

print("\nSentence-Window node:")
print(f"\n{'#'*50} SENTENCE {'#'*50}\n{sentence}")
print(f"\n{'#'*50} WINDOW {'#'*50}\n{window}")


In [None]:
from llama_index.node_parser import get_leaf_nodes, get_root_nodes

# Split the document with the hierarchical node parser configuration ...
auto_merging_service_context = get_tool_service_context(
    callback_handlers=[callback_handler],
    node_parser_type="hierarchical",
)
hierarchical_node_parser = auto_merging_service_context.node_parser
hierarchical_nodes = hierarchical_node_parser.get_nodes_from_documents(document)

# ... then separate out the two ends of the hierarchy.
root_nodes = get_root_nodes(hierarchical_nodes)
leaf_nodes = get_leaf_nodes(hierarchical_nodes)

In [None]:
# Compare the size of the full hierarchy with the number of leaves.
print(
    f"Total hierarchical nodes: {len(hierarchical_nodes)}",
    f"Total leaf nodes: {len(leaf_nodes)}",
    sep="\n",
)

In [None]:
def get_parent_node(node, all_nodes):
    """Return the parent of a hierarchical node.

    Args:
        node: Object exposing ``parent_node``; that reference exposes ``node_id``.
        all_nodes: Iterable of candidate nodes, each exposing ``id_``.

    Returns:
        The first element of ``all_nodes`` whose ``id_`` equals ``node.parent_node.node_id``.

    Raises:
        ValueError: If ``node`` has no parent reference, or no element of ``all_nodes``
            carries the parent's id.
    """
    parent_ref = node.parent_node
    if parent_ref is None:
        raise ValueError(f"node {getattr(node, 'id_', node)!r} has no parent node")

    # Resolve the id once instead of re-reading it for every candidate.
    parent_id = parent_ref.node_id
    for candidate in all_nodes:
        if candidate.id_ == parent_id:
            return candidate

    # An explicit error is clearer than the bare StopIteration that next() would leak.
    raise ValueError(f"parent node {parent_id!r} not found among the supplied nodes")

# Climb two levels up from the first leaf node.
# (The names assume the hierarchy is three levels deep - TODO confirm against the parser config.)
leaf_node = leaf_nodes[0]
intermediate_node = get_parent_node(leaf_node, hierarchical_nodes)
root_node = get_parent_node(intermediate_node, hierarchical_nodes)

# Print the three texts from the lowest level to the highest.
print("Notice how each node is a subset of its parent:")
print(f"\n{'#'*50} LEAF NODE {'#'*50}\n{format_pdf_text(leaf_node.text)}")
print(f"\n{'#'*50} INTERMEDIATE NODE {'#'*50}\n{format_pdf_text(intermediate_node.text)}")
print(f"\n{'#'*50} ROOT NODE {'#'*50}\n{format_pdf_text(root_node.text)}")


In [None]:
# build indexes
import os
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index import StorageContext, load_index_from_storage

# Every index below follows the same pattern: when its persist directory does not exist yet,
# build the index and persist it there; otherwise load it from that directory.
# Only the directory's existence is checked, so delete the directory to force a rebuild
# (for example after changing the parser or the service-context settings).

# --- original ---
# The index uses original_service_context when it is built and when it is loaded, matching how
# the other two indexes use their own service contexts.
# NOTE(review): an index persisted earlier without this service context may have been built
# with different settings - delete ORIGINAL_PERSIST_DIR once to rebuild it.
ORIGINAL_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/original'   # local dir to persist storage of index
if not os.path.exists(ORIGINAL_PERSIST_DIR):                                            # check if storage already exists
    print(f"Creating Original index and saving it at: {ORIGINAL_PERSIST_DIR}")
    original_index = VectorStoreIndex(                                                  # create the index
        original_nodes,
        service_context=original_service_context,
    )
    original_index.storage_context.persist(persist_dir=ORIGINAL_PERSIST_DIR)            # store it for later
else:
    print(f"Original index exists - loading it from: {ORIGINAL_PERSIST_DIR}")
    original_storage_context = StorageContext.from_defaults(persist_dir=ORIGINAL_PERSIST_DIR)   # load the existing index
    original_index = load_index_from_storage(
        storage_context=original_storage_context,
        service_context=original_service_context,
    )

# --- sentence-window ---
# Built straight from the document; the sentence-window service context is passed to from_documents.
SENTENCE_WINDOW_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/setence_window'
if not os.path.exists(SENTENCE_WINDOW_PERSIST_DIR):
    print(f"Creating Sentence-Window index and saving it at: {SENTENCE_WINDOW_PERSIST_DIR}")
    sentence_window_index = VectorStoreIndex.from_documents(
        document,
        service_context=sentence_window_service_context,
    )
    sentence_window_index.storage_context.persist(persist_dir=SENTENCE_WINDOW_PERSIST_DIR)
else:
    print(f"Sentence-Window index exists - loading it from: {SENTENCE_WINDOW_PERSIST_DIR}")
    setence_window_storage_context = StorageContext.from_defaults(persist_dir=SENTENCE_WINDOW_PERSIST_DIR)
    sentence_window_index = load_index_from_storage(
        storage_context=setence_window_storage_context,
        service_context=sentence_window_service_context,
    )

# --- auto-merging ---
# The docstore receives every hierarchical node, while only the leaf nodes are indexed.
# This storage context is created on every run; within this cell only the build branch uses it.
AUTO_MERGING_PERSIST_DIR = '/workspaces/sec-insights/backend/eval/index_storage/auto_merging'
auto_merging_storage_context = StorageContext.from_defaults()
auto_merging_storage_context.docstore.add_documents(hierarchical_nodes)
if not os.path.exists(AUTO_MERGING_PERSIST_DIR):
    print(f"Creating Auto-Merging index and saving it at: {AUTO_MERGING_PERSIST_DIR}")
    auto_merging_index = VectorStoreIndex(
        leaf_nodes,
        storage_context=auto_merging_storage_context,
        service_context=auto_merging_service_context,
    )
    auto_merging_index.storage_context.persist(persist_dir=AUTO_MERGING_PERSIST_DIR)
else:
    print(f"Auto-Merging index exists - loading it from: {AUTO_MERGING_PERSIST_DIR}")
    auto_merging_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=AUTO_MERGING_PERSIST_DIR),
        service_context=auto_merging_service_context,
    )

In [None]:
#### build query engines
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank

# original: top-3 retrieval, same as original source code
original_query_engine = original_index.as_query_engine(similarity_top_k=3)

# sentence-window: retrieve 8 candidates, then apply the "window" metadata replacement and a
# reranker that keeps the top 4
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
# No `model` argument is given, so the reranker's default model is used (fastest option);
# pass model="BAAI/bge-reranker-base" to choose that model instead.
rerank = SentenceTransformerRerank(top_n=4)
sentence_window_query_engine = sentence_window_index.as_query_engine(
    similarity_top_k=8,
    node_postprocessors=[postproc, rerank],
)

# auto-merging

