## BM25 and Vector DB

For this example you will need a running instance of Milvus. I am using Milvus standalone with Docker Compose.
Get the docker-compose.yml from here - https://github.com/milvus-io/milvus/releases/download/v2.4.0-rc.1/milvus-standalone-docker-compose.yml
You need to have pymilvus installed. In this version (2.4.0), the sparse vector functionality is still in beta.

In [1]:
# run if needed
# ! pip install pymilvus==2.4

In [2]:
from yeabm25 import YeaBM25
from pymilvus import MilvusClient, DataType

In [3]:
# Toy corpus: six short sentences, each one treated as a separate document.
documents = (
    "the quick brown fox jumped over the lazy dog",
    "the fast fox jumped over the lazy dog",
    "the dog sat there and did nothing",
    "brown fox leaped over the lazy dog",
    "another fox leaped over the dog",
    "the other animals sat there watching",
)

# Whitespace tokenization: every document becomes a list of word tokens.
corpus = [text.split() for text in documents]

# Fit the YeaBM25 model on the tokenized corpus.
yeabm = YeaBM25()
yeabm.fit(corpus)

YeaBM25(k1=1.5, b=0.75, epsilon=0.25)

In [4]:
# Create a MilvusClient instance pointing at the Milvus server on localhost:19530
client = MilvusClient(uri="http://localhost:19530")

# Create a collection schema with a user-supplied primary key and a sparse vector field.
# NOTE: the schema option is spelled `enable_dynamic_field` (singular); the plural
# spelling is not an option pymilvus reads, so dynamic fields would not be enabled.
schema = client.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)

# String primary key; values are supplied by us at insert time because auto_id is False
schema.add_field(field_name="pk", datatype=DataType.VARCHAR, is_primary=True, max_length=100)
# For sparse vector, no need to specify dimension
schema.add_field(field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR)

# Drop any existing collection with the same name so re-running this cell starts clean
if client.has_collection("bm_sparse_vector"):
    client.drop_collection("bm_sparse_vector")
client.create_collection(collection_name="bm_sparse_vector", schema=schema)

Failed to create new connection using: cf5e9fc5b6254f06809fec84dc655fa7


MilvusException: <MilvusException: (code=2, message=Fail connecting to server on localhost:19530, illegal connection params or server unavailable)>

In [None]:
# Build an index on the sparse vector field of the collection.

# Index settings: inverted index over "sparse_vector", scored with the "IP" metric
sparse_index_options = {
    "field_name": "sparse_vector",
    "index_name": "sparse_inverted_index",
    "index_type": "SPARSE_INVERTED_INDEX",
    "metric_type": "IP",
}

# Register the settings on a fresh index-params object, then create the index
index_params = client.prepare_index_params()
index_params.add_index(**sparse_index_options)
client.create_index(collection_name="bm_sparse_vector", index_params=index_params)

In [None]:
# One entity per document: the position in the corpus (as a string) is the primary key.
#
# A single sparse vector row can be given to Milvus as any of:
# - Dict[int, float] <--- This is currently the sparse format in YeaBM25
# - Any of the scipy.sparse sparse matrices class family with shape[0] == 1
# - Iterable[Tuple[int, float]]
entities = [
    {"pk": str(doc_id), "sparse_vector": doc_vector}
    for doc_id, doc_vector in enumerate(yeabm.iter_document_vectors_sparse())
]

# Insert all entities into the collection in one call
client.insert(collection_name="bm_sparse_vector", data=entities)

{'insert_count': 6, 'ids': ['0', '1', '2', '3', '4', '5']}

To search, load the collection and query it with a sparse vector encoded by YeaBM25:

In [7]:
# The collection has to be loaded into memory before it can be searched
client.load_collection(collection_name="bm_sparse_vector")

# Encode the query tokens with the fitted YeaBM25 model
query_vector = yeabm.encode_query(['dog','did', 'nothing'])

# Request the 3 best matches for the single query vector, returning each hit's "pk"
search_res = client.search(collection_name="bm_sparse_vector", data=[query_vector], limit=3, output_fields=["pk"])

# Results are nested: walk the outer sequence, then print every hit inside it
for hits in search_res:
    for hit in hits:
        print(f"hit: {hit}")

hit: {'id': '4', 'distance': 0.28194865584373474, 'entity': {'pk': '4'}}
hit: {'id': '2', 'distance': 0.2640576958656311, 'entity': {'pk': '2'}}
hit: {'id': '3', 'distance': 0.2640576958656311, 'entity': {'pk': '3'}}
