Main notebook for creating and updating the Chroma vector database and the sparse indices, for both normal and hierarchical (parent-child) retrieval.

In [None]:
from ipynb.fs.defs.a_preprocess_data import preprocess_data 
from ipynb.fs.defs.a_preprocess_data import preprocess_data_for_parent_child_retriever
from ipynb.fs.defs.a_preprocess_data import update_pdf_documents
from ipynb.fs.defs.a_preprocess_data import store_documents_for_sparse_retrieval
from ipynb.fs.defs.a_preprocess_data import store_documents_for_sparse_retrieval_parent_child
from ipynb.fs.defs.b_build_index import build_or_update_index_vector_db
from ipynb.fs.defs.b_build_index import build_or_update_index_for_parent_child_retriever

In [None]:
# Only call if PDF documents should be updated before creating an index.
# update_pdf_documents()

In [None]:
def create_indexes_dense(embedding_models, chunk_size_overlap_combinations, file_types, append_summary_to_each_docs, dist_functions, all_docs):
    """
    Builds the indices for the desired combinations for dense retrieval.

    For every combination of embedding model, chunking setup, file type,
    summary flag and distance function, the data is preprocessed with
    ``preprocess_data`` and the result is passed to
    ``build_or_update_index_vector_db`` under a collection name derived from
    that combination.

    Args:
        embedding_models: Iterable of dicts with the keys ``"provider"`` and
            ``"model_name"`` (both strings, they become part of the
            collection name).
        chunk_size_overlap_combinations: Iterable of dicts with the keys
            ``"chunk_size"`` and ``"chunk_overlap"``.
        file_types: Iterable of file type strings.
        append_summary_to_each_docs: Iterable of flags forwarded to
            ``preprocess_data``; stored in the metadata under
            ``"title_appended"``.
        dist_functions: Iterable of distance function names (strings).
        all_docs: Forwarded unchanged to ``preprocess_data``.

    Returns:
        None.
    """
    # Imported locally so the cell does not depend on another cell's imports.
    from itertools import product

    # product() visits the combinations in the same order as the equivalent
    # nested loops would (the right-most iterable varies fastest).
    for (
        embedding_model_,
        chunk_combination,
        file_type,
        append_summary_to_each_doc,
        dist_function,
    ) in product(
        embedding_models,
        chunk_size_overlap_combinations,
        file_types,
        append_summary_to_each_docs,
        dist_functions,
    ):
        provider = embedding_model_["provider"]
        model_name = embedding_model_["model_name"]
        chunk_size = chunk_combination["chunk_size"]
        chunk_overlap = chunk_combination["chunk_overlap"]

        # Preprocess the data.
        # NOTE(review): dist_function is not an argument of preprocess_data, so
        # the same preprocessing is repeated for each distance function. Left
        # inside the loop because preprocess_data's side effects are not
        # visible from this notebook - confirm before hoisting.
        chunked_cleaned_documents, _embedding_model, embeddings = preprocess_data(
            chunk_size,
            chunk_overlap,
            provider,
            model_name,
            file_type,
            append_summary_to_each_doc,
            all_docs,
        )

        # Collection name layout:
        # <provider>_<model>_<dist>_<file type>_<chunk size>_<chunk overlap>_<summary flag>
        collection_name = "_".join(
            [
                provider,
                model_name,
                dist_function,
                file_type,
                str(chunk_size),
                str(chunk_overlap),
                str(append_summary_to_each_doc),
            ]
        )

        # Define metadata needed for later retrieval and generation evaluation.
        metadata = {
            "embedding_model_provider": provider,
            "embedding_model_name": model_name,
            "file_type": file_type,
            "chunk_size": str(chunk_size),
            "chunk_overlap": str(chunk_overlap),
            "title_appended": str(append_summary_to_each_doc),
        }

        # Create the vector database; the returned handles are not needed here.
        build_or_update_index_vector_db(chunked_cleaned_documents, embeddings, collection_name, dist_function, metadata)

In [None]:
# Desired combinations
# Each embedding model is identified by its provider and model name; both
# values end up in the collection name and the metadata of the dense index.
embedding_models = [
    {"provider": "Cohere", "model_name": "v3"},
    {"provider": "HuggingFace", "model_name": "all-mpnet-base-v2"},
    {"provider": "HuggingFace", "model_name": "all-MiniLM-L6-v2"},
    {"provider": "HuggingFace", "model_name": "bge-large-en-v1.5"},
    {"provider": "HuggingFace", "model_name": "SecRoBERTa"},
    {"provider": "HuggingFace", "model_name": "Contriever"},
    {"provider": "Voyage", "model_name": "voyage-2"},
    {"provider": "OpenAI", "model_name": "text-embedding-ada"},
    {"provider": "Fine-tuned", "model_name": "finetuned-ISO-27001_1024"},
]

file_types = ["All"]

# Chunking setups used for the (non-hierarchical) dense indices.
chunk_size_overlap_combinations = [
    {"chunk_size": 1536, "chunk_overlap": 264},
]

# NOTE(review): this list is not passed to create_indexes_dense below, and the
# later parent-child cells reassign the name with chunk_size_child 512 instead
# of 264 - confirm whether this definition is still needed.
chunk_size_overlap_combinations_parent_child = [
    {"chunk_size_parent": 1536, "chunk_overlap_parent": 264, "chunk_size_child": 264, "chunk_overlap_child": 0},
]

append_summary_to_each_docs = [False]
dist_functions = ["l2"]
all_docs = True  # False if only new documents

# Build (or update) one dense index for every combination of the settings above.
create_indexes_dense(embedding_models, chunk_size_overlap_combinations, file_types, append_summary_to_each_docs, dist_functions, all_docs)

In [None]:
def build_indexes_sparse(chunk_size_overlap_combinations, file_types, append_summary_to_each_docs):
    """
    Stores the documents for sparse retrieval for every desired combination.

    Calls ``store_documents_for_sparse_retrieval`` once per combination of
    chunking setup (dict with ``"chunk_size"`` and ``"chunk_overlap"``),
    file type and summary flag. Returns nothing.
    """
    # Lazily enumerate the combinations in nested-loop order: the chunking
    # setup varies slowest, the summary flag fastest.
    settings = (
        (chunking, kind, with_summary)
        for chunking in chunk_size_overlap_combinations
        for kind in file_types
        for with_summary in append_summary_to_each_docs
    )

    for chunking, kind, with_summary in settings:
        store_documents_for_sparse_retrieval(
            chunking["chunk_size"],
            chunking["chunk_overlap"],
            kind,
            with_summary,
        )

In [None]:
def build_indexes_sparse_parent_child(chunk_size_overlap_combinations_parent_child, file_types, append_summary_to_each_docs):
    """
    Stores the documents for sparse hierarchical retrieval for every desired combination.

    Calls ``store_documents_for_sparse_retrieval_parent_child`` once per
    combination of parent/child chunking setup (dict with
    ``"chunk_size_parent"``, ``"chunk_overlap_parent"``, ``"chunk_size_child"``
    and ``"chunk_overlap_child"``), file type and summary flag. Returns nothing.
    """
    # Lazily enumerate the combinations in nested-loop order: the chunking
    # setup varies slowest, the summary flag fastest.
    settings = (
        (chunking, kind, with_summary)
        for chunking in chunk_size_overlap_combinations_parent_child
        for kind in file_types
        for with_summary in append_summary_to_each_docs
    )

    for chunking, kind, with_summary in settings:
        store_documents_for_sparse_retrieval_parent_child(
            chunking["chunk_size_parent"],
            chunking["chunk_overlap_parent"],
            chunking["chunk_size_child"],
            chunking["chunk_overlap_child"],
            kind,
            with_summary,
        )

In [None]:
# Settings for the sparse parent-child indices. These assignments rebind the
# notebook-level names, so cells executed afterwards see these values.
chunk_size_overlap_combinations_parent_child = [{"chunk_size_parent": 1536, "chunk_overlap_parent": 264, "chunk_size_child": 512, "chunk_overlap_child": 0}]
file_types = ["All"]
append_summary_to_each_docs = [False]

# Store the documents for sparse hierarchical retrieval for every combination above.
build_indexes_sparse_parent_child(chunk_size_overlap_combinations_parent_child, file_types, append_summary_to_each_docs)

In [None]:
def create_indexes_parent_child_retriever(embedding_models, chunk_size_overlap_combinations_parent_child, file_types, append_summary_to_each_docs, dist_functions, all_docs):
    """
    Builds the indices for the desired combinations for dense, hierarchical retrieval.

    For every combination of embedding model, parent/child chunking setup,
    file type, summary flag and distance function, the data is preprocessed
    with ``preprocess_data_for_parent_child_retriever`` and the result is
    passed to ``build_or_update_index_for_parent_child_retriever`` under a
    collection name derived from that combination.

    Args:
        embedding_models: Iterable of dicts with the keys ``"provider"`` and
            ``"model_name"`` (the model name is a string, it becomes part of
            the collection name).
        chunk_size_overlap_combinations_parent_child: Iterable of dicts with
            the keys ``"chunk_size_parent"``, ``"chunk_overlap_parent"``,
            ``"chunk_size_child"`` and ``"chunk_overlap_child"``.
        file_types: Iterable of file type strings.
        append_summary_to_each_docs: Iterable of flags forwarded to the
            preprocessing step; stored in the metadata under
            ``"title_appended"``.
        dist_functions: Iterable of distance function names (strings).
        all_docs: Forwarded unchanged to the preprocessing step.

    Returns:
        None.
    """
    # Imported locally so the cell does not depend on another cell's imports.
    from itertools import product

    # product() visits the combinations in the same order as the equivalent
    # nested loops would (the right-most iterable varies fastest).
    for (
        embedding_model_,
        chunk_combination,
        file_type,
        append_summary_to_each_doc,
        dist_function,
    ) in product(
        embedding_models,
        chunk_size_overlap_combinations_parent_child,
        file_types,
        append_summary_to_each_docs,
        dist_functions,
    ):
        provider = embedding_model_["provider"]
        model_name = embedding_model_["model_name"]
        chunk_size_parent = chunk_combination["chunk_size_parent"]
        chunk_overlap_parent = chunk_combination["chunk_overlap_parent"]
        chunk_size_child = chunk_combination["chunk_size_child"]
        chunk_overlap_child = chunk_combination["chunk_overlap_child"]

        # NOTE(review): dist_function is not an argument of the preprocessing
        # call, so the same preprocessing is repeated for each distance
        # function. Left inside the loop because its side effects are not
        # visible from this notebook - confirm before hoisting.
        parent_full_docs, child_doc_list, child_embeddings = preprocess_data_for_parent_child_retriever(
            chunk_size_parent,
            chunk_overlap_parent,
            chunk_size_child,
            chunk_overlap_child,
            provider,
            model_name,
            file_type,
            append_summary_to_each_doc,
            all_docs,
        )

        # Collection name layout:
        # <model>_<dist>_<file type>_PC_<parent size>_<parent overlap>_<child size>_<child overlap>_<summary flag>
        # NOTE(review): unlike create_indexes_dense, the provider is not part
        # of this name. Kept unchanged because other code may look collections
        # up by this exact name - confirm.
        collection_name = "_".join(
            [
                model_name,
                dist_function,
                file_type,
                "PC",
                str(chunk_size_parent),
                str(chunk_overlap_parent),
                str(chunk_size_child),
                str(chunk_overlap_child),
                str(append_summary_to_each_doc),
            ]
        )

        # Metadata stored alongside the collection.
        metadata = {
            "embedding_model_provider": provider,
            "embedding_model_name": model_name,
            "file_type": file_type,
            "chunk_size_parent": str(chunk_size_parent),
            "chunk_overlap_parent": str(chunk_overlap_parent),
            "chunk_size_child": str(chunk_size_child),
            "chunk_overlap_child": str(chunk_overlap_child),
            "title_appended": str(append_summary_to_each_doc),
        }

        # Build or update the index; the returned handles are not needed here.
        build_or_update_index_for_parent_child_retriever(parent_full_docs, child_doc_list, child_embeddings, collection_name, dist_function, metadata)

In [None]:
# Settings for the dense parent-child indices. These assignments rebind the
# notebook-level names used by the earlier cells; in particular,
# embedding_models is narrowed to the two models listed here.
chunk_size_overlap_combinations_parent_child = [{"chunk_size_parent": 1536, "chunk_overlap_parent": 264, "chunk_size_child": 512, "chunk_overlap_child": 0}]
embedding_models = [
    {"provider": "Fine-tuned", "model_name": "finetuned-ISO-27001_1024"},
    {"provider": "OpenAI", "model_name": "text-embedding-ada"}
]
file_types = ["All"]
append_summary_to_each_docs = [False]
dist_functions = ["l2"]
all_docs = True

# Build (or update) one dense parent-child index for every combination above.
create_indexes_parent_child_retriever(embedding_models, chunk_size_overlap_combinations_parent_child, file_types, append_summary_to_each_docs, dist_functions, all_docs)