Notebook for indexing the data into a data store

In [None]:
import chromadb
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from typing import List
import os

# Builds the index vector DB from the documents, using the specified precomputed embeddings and collection_name.
# If the collection already exists, it is updated with the new documents instead.


def build_or_update_index_vector_db(documents: List[Document], embeddings, collection_name: str, dist_function: str, collection_metadata: dict):
    """
    Creates the vector database with the specified documents and embeddings, if it does not exist yet. Otherwise updates it with them.

    The embeddings are passed in precomputed and inserted directly into Chroma, so that
    building an index does not recompute them (e.g. through a paid embedding API).

    Args:
        documents: Documents whose ``page_content`` and ``metadata`` are stored.
        embeddings: Precomputed embeddings, one per document and in the same order.
        collection_name: Name of the Chroma collection to create or extend.
        dist_function: Distance function of the index ("l2", "ip" or "cosine").
            Only used when the collection is created.
        collection_metadata: Description of the index. Must contain the keys
            "embedding_model_provider", "embedding_model_name", "file_type",
            "chunk_size", "chunk_overlap" and "title_appended". Only read when
            the collection is created.

    Returns:
        Tuple ``(client, vectordb)``: the persistent Chroma client and the
        LangChain ``Chroma`` wrapper around the collection.

    Raises:
        ValueError: If the number of documents and embeddings differ.
        KeyError: If the collection is new and ``collection_metadata`` lacks a required key.
    """
    # Fail before touching the database: the batches below are sliced by position,
    # so a length mismatch would otherwise drop documents or fail half-way through.
    if len(documents) != len(embeddings):
        raise ValueError(
            f"Number of documents ({len(documents)}) does not match number of embeddings ({len(embeddings)})."
        )

    new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))

    print("Starting to build index for: ")
    print(collection_metadata)

    # Check if collection already exists; keep the handle so both branches can add to it
    try:
        collection = new_client.get_collection(collection_name)
        collection_exists = True
    except ValueError:
        collection = None
        collection_exists = False

    if not collection_exists:
        print("Collection is new")
        chroma_metadata = {
            "embedding_model_provider": collection_metadata["embedding_model_provider"],
            "embedding_model_name": collection_metadata["embedding_model_name"],
            "file_type": collection_metadata["file_type"],
            "chunk_size": collection_metadata["chunk_size"],
            "chunk_overlap": collection_metadata["chunk_overlap"],
            "title_appended": collection_metadata["title_appended"],
            "hnsw:space": dist_function,  # either "l2" or "ip" or "cosine"
        }
        # The metadata has to be supplied at creation time: the distance function of the
        # index is fixed when the collection is created, before any embedding is added.
        collection = new_client.create_collection(collection_name, metadata=chroma_metadata)
        first_id = 1
    else:
        print("Collection already exists")
        chroma_metadata = None
        collection_count = collection.count()
        print("There are", collection_count,
              "entries in the collection prior to updating." + collection_name)
        # Continue the IDs from the last ID
        first_id = collection_count + 1

    # Each document needs an ID
    ids = [str(i) for i in range(first_id, first_id + len(documents))]

    # Store the text of the document and metadata separately in order to insert it into Chroma
    texts = [document.page_content for document in documents]
    metadata_docs = [document.metadata for document in documents]

    # Add them in batches (otherwise Chroma error)
    batch_size = 1000
    total = len(embeddings)
    for start_idx in range(0, total, batch_size):
        # Ensure not to go out of bounds
        end_idx = min(start_idx + batch_size, total)

        # Add embeddings to Chroma
        collection.add(
            embeddings=embeddings[start_idx:end_idx],
            documents=texts[start_idx:end_idx],
            ids=ids[start_idx:end_idx],
            metadatas=metadata_docs[start_idx:end_idx],
        )
        print(f"Added embeddings from {start_idx} to {end_idx-1}")

    if not collection_exists:
        vectordb = Chroma(
            client=new_client,
            collection_name=collection_name,
            collection_metadata=chroma_metadata,
        )
        print(f"Collection {collection_name} successfully created.")
        print("There are", vectordb._collection.count(), "entries in the collection " + collection_name)
    else:
        vectordb = Chroma(
            client=new_client,
            collection_name=collection_name,
        )
        print("There are", vectordb._collection.count(),
              "entries in the collection after updating." + collection_name)

    return new_client, vectordb

In [None]:
from langchain.storage._lc_store import create_kv_docstore
from langchain.storage.file_system import LocalFileStore
from typing import Sequence, Tuple

def build_or_update_index_for_parent_child_retriever(parent_documents: Sequence[Tuple[str, Document]], child_documents: List[Document], child_embeddings, collection_name: str, dist_function: str, collection_metadata: dict):
    """
    Creates the vector database with the specified documents and embeddings in the case of hierarchical retrieval, if it does not exist yet. Otherwise updates it with them.

    The child documents and their precomputed embeddings go into the Chroma collection;
    the parent documents are written to a local file store in a directory named after
    the collection below ``PARENT_DOC_PATH``.

    Args:
        parent_documents: ``(key, document)`` pairs written to the parent file store.
        child_documents: Child documents whose ``page_content`` and ``metadata`` are stored in Chroma.
        child_embeddings: Precomputed embeddings, one per child document and in the same order.
        collection_name: Name of the Chroma collection to create or extend.
        dist_function: Distance function of the index ("l2", "ip" or "cosine").
            Only used when the collection is created.
        collection_metadata: Description of the index. Must contain the keys
            "embedding_model_provider", "embedding_model_name", "file_type",
            "chunk_size_parent", "chunk_overlap_parent", "chunk_size_child",
            "chunk_overlap_child" and "title_appended". Only read when the
            collection is created.

    Returns:
        Tuple ``(client, vectordb, store)``: the persistent Chroma client, the
        LangChain ``Chroma`` wrapper around the collection and the parent document store.

    Raises:
        ValueError: If the number of child documents and child embeddings differ.
        KeyError: If the collection is new and ``collection_metadata`` lacks a required key.
    """
    # Fail before touching the database: the batches below are sliced by position,
    # so a length mismatch would otherwise drop documents or fail half-way through.
    if len(child_documents) != len(child_embeddings):
        raise ValueError(
            f"Number of child documents ({len(child_documents)}) does not match number of child embeddings ({len(child_embeddings)})."
        )

    new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))

    # Check if collection already exists; keep the handle so both branches can add to it
    try:
        collection = new_client.get_collection(collection_name)
        collection_exists = True
    except ValueError:
        collection = None
        collection_exists = False

    if not collection_exists:
        chroma_metadata = {
            "embedding_model_provider": collection_metadata["embedding_model_provider"],
            "embedding_model_name": collection_metadata["embedding_model_name"],
            "file_type": collection_metadata["file_type"],
            "chunk_size_parent": collection_metadata["chunk_size_parent"],
            "chunk_overlap_parent": collection_metadata["chunk_overlap_parent"],
            "chunk_size_child": collection_metadata["chunk_size_child"],
            "chunk_overlap_child": collection_metadata["chunk_overlap_child"],
            "title_appended": collection_metadata["title_appended"],
            "hnsw:space": dist_function,  # either "l2" or "ip" or "cosine"
        }
        # The metadata has to be supplied at creation time: the distance function of the
        # index is fixed when the collection is created, before any embedding is added.
        collection = new_client.create_collection(collection_name, metadata=chroma_metadata)
        first_id = 1
    else:
        chroma_metadata = None
        print("Collection already exists.")
        collection_count = collection.count()
        print("There are", collection_count,
              "entries in the collection prior to updating." + collection_name)
        # Continue the IDs from the last ID
        first_id = collection_count + 1

    # Each document needs an ID
    ids = [str(i) for i in range(first_id, first_id + len(child_documents))]

    # Store the text of the document and metadata separately in order to insert it into Chroma
    texts = [document.page_content for document in child_documents]
    metadata_docs = [document.metadata for document in child_documents]

    # Add them in batches (otherwise Chroma error)
    batch_size = 1000
    total = len(child_embeddings)
    for start_idx in range(0, total, batch_size):
        # Ensure not to go out of bounds
        end_idx = min(start_idx + batch_size, total)

        # Add embeddings to Chroma
        collection.add(
            embeddings=child_embeddings[start_idx:end_idx],
            documents=texts[start_idx:end_idx],
            ids=ids[start_idx:end_idx],
            metadatas=metadata_docs[start_idx:end_idx],
        )
        print(f"Added embeddings from {start_idx} to {end_idx-1}")

    if not collection_exists:
        vectordb = Chroma(
            client=new_client,
            collection_name=collection_name,
            collection_metadata=chroma_metadata,
        )
        print(f"Collection {collection_name} successfully created.")
        print("There are", vectordb._collection.count(),
              "entries in the collection " + collection_name)
    else:
        vectordb = Chroma(
            client=new_client,
            collection_name=collection_name,
        )
        print("There are", vectordb._collection.count(),
              "entries in the collection after updating." + collection_name)

    # Create (or reopen) the local file store for referencing the parent docs.
    # Both the create and the update case use the same per-collection directory,
    # so parents added by an update end up next to the ones stored at creation.
    # NOTE(review): the "\\" separator is Windows-style; it is kept unchanged because
    # code reading this store presumably builds the same path - confirm before switching
    # to os.path.join.
    fs = LocalFileStore(os.environ.get("PARENT_DOC_PATH") + f"\\{collection_name}")
    store = create_kv_docstore(fs)
    store.mset(parent_documents)
    print("Successfully created local file store for parent docs. There are", len(parent_documents), "parent documents in the file store.")

    return new_client, vectordb, store

In [None]:
def get_index_vector_db(collection_name: str):
    """
    Looks up an existing collection and returns it wrapped as a vector database.

    Args:
        collection_name: Name of the Chroma collection to open.

    Returns:
        Tuple ``(client, vectordb)``: the persistent Chroma client and the
        LangChain ``Chroma`` wrapper around the collection.

    Raises:
        Exception: If the client reports (via ``ValueError``) that the collection does not exist.
    """
    client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))

    # Probe for the collection; the lookup result itself is not needed
    try:
        client.get_collection(collection_name)
    except ValueError:
        found = False
    else:
        found = True

    if found:
        return client, Chroma(client=client, collection_name=collection_name)

    # Raised outside the except block so the lookup error is not chained to it
    raise Exception("Error, raised exception: Collection does not exist.")

In [None]:
def delete_collection(collection_name: str):
    """
    Removes the named collection from the local Chroma folder.

    A ``ValueError`` from the client is not propagated; a short message is
    printed instead.

    Args:
        collection_name: Name of the Chroma collection to delete.
    """
    chroma_path = os.environ.get("CHROMA_PATH")
    client = chromadb.PersistentClient(path=chroma_path)

    try:
        client.delete_collection(collection_name)
    except ValueError:
        # Best effort: report the failure and carry on
        print("Collection could not be deleted.")

In [None]:
def return_collections():
    """
    Lists every collection stored in the local Chroma folder.

    Returns:
        Whatever ``list_collections()`` of the persistent client opened at
        ``CHROMA_PATH`` returns.
    """
    chroma_path = os.environ.get("CHROMA_PATH")
    return chromadb.PersistentClient(path=chroma_path).list_collections()