<h4>Setup Collection</h4>

In [1]:
# Open the configuration file
import yaml

# Parse the YAML credentials file; the handle is only needed while parsing.
with open("credentials.yaml") as credentials_file:
    credentials = yaml.safe_load(credentials_file)

# Connection settings used by every MilvusClient created in this notebook.
CLUSTER_ENDPOINT = credentials["CLUSTER_ENDPOINT"]
TOKEN = credentials["TOKEN"]

In [2]:
from pprint import pprint

def print_results(results):
    """Pretty-print each element of ``results`` in iteration order.

    Args:
        results: Any iterable; every element is handed to ``pprint`` on its own.
    """
    for item in results:
        pprint(item)

<h4>JSON Fields</h4>

In [3]:
import random, time
from pymilvus import connections, MilvusClient, DataType

# Connect to the cluster with the endpoint and token loaded from the credentials file.
client = MilvusClient(uri=CLUSTER_ENDPOINT, token=TOKEN)

# Schema with caller-supplied primary keys and the dynamic field switched off.
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=False)

# Fields: INT64 primary key, 5-dimensional float vector, and a JSON field named "color".
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=5)
schema.add_field(field_name="color", datatype=DataType.JSON)

# Index the primary key with STL_SORT and the vector with AUTOINDEX using the L2 metric.
index_params = MilvusClient.prepare_index_params()
index_params.add_index(field_name="id", index_type="STL_SORT")
index_params.add_index(field_name="vector", index_type="AUTOINDEX", metric_type="L2")

# Create the collection with its schema and index parameters in a single call.
client.create_collection(
    collection_name="json_collection",
    schema=schema,
    index_params=index_params,
)

# Report the load state of the new collection.
state = client.get_load_state(collection_name="json_collection")
print(state)

{'state': <LoadState: Loaded>}


<h4>Generate Random Data</h4>

In [4]:
colors = [
    "green", "blue", "yellow", "red", "black", "white",
    "purple", "pink", "orange", "brown", "grey",
]

# Build 1000 entities; each one carries a nested JSON "color" payload made of
# a label, a 4-digit tag, three coordinates and a 3x3 grid of color names.
data = []
for i in range(1000):
    label = random.choice(colors)
    tag = random.randint(1000, 9999)
    coord = [random.randint(0, 40) for _ in range(3)]
    ref = [[random.choice(colors) for _ in range(3)] for _ in range(3)]
    vector = [random.uniform(-1, 1) for _ in range(5)]
    data.append({
        "id": i,
        "vector": vector,
        "color": {"label": label, "tag": tag, "coord": coord, "ref": ref},
    })

# Show the first entity to check the generated structure.
pprint(data[0])

{'color': {'coord': [17, 15, 3],
           'label': 'yellow',
           'ref': [['purple', 'purple', 'grey'],
                   ['yellow', 'white', 'brown'],
                   ['black', 'orange', 'purple']],
           'tag': 5687},
 'id': 0,
 'vector': [-0.5758152593862647,
            -0.30164269432450963,
            0.3132158109403198,
            -0.43354101194051364,
            -0.7455765648551225]}


In [5]:
# Insert the generated entities and print the insert count from the response.
insert_result = client.insert(collection_name="json_collection", data=data)
print(insert_result["insert_count"])

1000


<h4>Basic Search with JSON Field</h4>

In [6]:
# Basic search with a JSON field: one random 5-dimensional query vector,
# restricted to entities whose color["label"] is "red".
query_vectors = [[random.uniform(-1, 1) for _ in range(5)]]

basic_search_result = client.search(
    collection_name="json_collection",
    data=query_vectors,
    filter='color["label"] in ["red"]',
    search_params={"metric_type": "L2", "params": {"nprobe": 16}},
    output_fields=["id", "color"],
    limit=3,
)

print_results(basic_search_result)

[{'distance': 0.6245933771133423,
  'entity': {'color': {'coord': [31, 3, 12],
                       'label': 'red',
                       'ref': [['yellow', 'brown', 'black'],
                               ['grey', 'green', 'black'],
                               ['blue', 'yellow', 'grey']],
                       'tag': 5571},
             'id': 241},
  'id': 241},
 {'distance': 0.6451491117477417,
  'entity': {'color': {'coord': [9, 33, 22],
                       'label': 'red',
                       'ref': [['brown', 'orange', 'purple'],
                               ['white', 'blue', 'purple'],
                               ['orange', 'brown', 'green']],
                       'tag': 2181},
             'id': 414},
  'id': 414},
 {'distance': 0.8163830041885376,
  'entity': {'color': {'coord': [37, 34, 1],
                       'label': 'red',
                       'ref': [['orange', 'black', 'brown'],
                               ['orange', 'brown', 'orange'],
       

<h4>Advanced Query with JSON Field</h4>

In [7]:
# Advanced query within a JSON field: return entities whose color["ref"] list
# contains the sub-list ["black", "red", "pink"] as one of its elements.
# This is a scalar-filter query, not a vector search, so no query vectors
# are passed (the `data` argument belongs to client.search()).
advanced_search_result = client.query(
    collection_name = "json_collection",
    filter = 'JSON_CONTAINS(color["ref"], ["black", "red", "pink"])',
    output_fields = ["id", "color"],
    limit = 3
)

print_results(advanced_search_result)

{'color': {'coord': [34, 8, 40],
           'label': 'red',
           'ref': [['yellow', 'purple', 'grey'],
                   ['black', 'red', 'pink'],
                   ['purple', 'purple', 'green']],
           'tag': 4107},
 'id': 16}
{'color': {'coord': [17, 23, 33],
           'label': 'purple',
           'ref': [['red', 'green', 'orange'],
                   ['black', 'red', 'pink'],
                   ['green', 'yellow', 'blue']],
           'tag': 4079},
 'id': 56}
{'color': {'coord': [29, 17, 1],
           'label': 'purple',
           'ref': [['brown', 'blue', 'grey'],
                   ['black', 'red', 'pink'],
                   ['blue', 'white', 'blue']],
           'tag': 3733},
 'id': 283}


In [8]:
# Return entities whose color["coord"] list contains both 4 and 5.
# Scalar-filter query only: no query vectors are passed (the `data`
# argument belongs to client.search()).
advanced_search_result = client.query(
    collection_name = "json_collection",
    filter = 'JSON_CONTAINS_ALL(color["coord"], [4, 5])',
    output_fields = ["id", "color"],
    limit = 3
)

print_results(advanced_search_result)

{'color': {'coord': [5, 4, 3],
           'label': 'purple',
           'ref': [['black', 'yellow', 'brown'],
                   ['yellow', 'white', 'green'],
                   ['red', 'pink', 'black']],
           'tag': 5515},
 'id': 5}
{'color': {'coord': [23, 5, 4],
           'label': 'black',
           'ref': [['red', 'purple', 'purple'],
                   ['green', 'green', 'black'],
                   ['pink', 'pink', 'grey']],
           'tag': 9221},
 'id': 585}
{'color': {'coord': [5, 4, 25],
           'label': 'red',
           'ref': [['orange', 'blue', 'white'],
                   ['yellow', 'blue', 'red'],
                   ['pink', 'pink', 'white']],
           'tag': 5805},
 'id': 711}


In [None]:
# Filter color["coord"] with JSON_CONTAINS_ANY against the values 4 and 5
# (presumably matching entities that hold at least one of them).
# Scalar-filter query only: no query vectors are passed (the `data`
# argument belongs to client.search()).
any_query_result = client.query(
    collection_name = "json_collection",
    filter = 'JSON_CONTAINS_ANY(color["coord"], [4, 5])',
    output_fields = ["id", "color"],
    limit = 3
)

print_results(any_query_result)

In [None]:
# Clean up: drop the collection used by the JSON field examples.
client.drop_collection(collection_name="json_collection")

<h4>Array Fields</h4>

In [None]:
# Connect to the cluster with the stored endpoint and token.
client = MilvusClient(uri=CLUSTER_ENDPOINT, token=TOKEN)

# Create a collection: caller-supplied primary keys, dynamic field switched off.
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=False)

# Scalar and vector fields, plus an INT64 array field holding at most 5 elements.
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=5)
schema.add_field(field_name="color", datatype=DataType.VARCHAR, max_length=512)
schema.add_field(field_name="color_tag", datatype=DataType.INT64)
schema.add_field(
    field_name="color_coord",
    datatype=DataType.ARRAY,
    element_type=DataType.INT64,
    max_capacity=5,
)

# Index the primary key with STL_SORT and the vector with AUTOINDEX using the L2 metric.
index_params = MilvusClient.prepare_index_params()
index_params.add_index(field_name="id", index_type="STL_SORT")
index_params.add_index(field_name="vector", index_type="AUTOINDEX", metric_type="L2")

client.create_collection(
    collection_name="array_collection",
    schema=schema,
    index_params=index_params,
)

# Report the load state of the new collection.
state = client.get_load_state(collection_name="array_collection")
print(state)

<h4>Generate Random Data</h4>

In [None]:
colors = [
    "green", "blue", "yellow", "red", "black", "white",
    "purple", "pink", "orange", "brown", "grey",
]

# Build 1000 entities; color_coord gets between 3 and 5 random integers so it
# stays within the array field's max_capacity of 5.
data = []
for i in range(1000):
    color = random.choice(colors)
    tag = random.randint(1000, 9999)
    coord = [random.randint(0, 40) for _ in range(random.randint(3, 5))]
    vector = [random.uniform(-1, 1) for _ in range(5)]
    data.append({
        "id": i,
        "vector": vector,
        "color": color,
        "color_tag": tag,
        "color_coord": coord,
    })

# Show the first entity to check the generated structure.
pprint(data[0])

In [None]:
# Insert the generated entities and print the insert count from the response.
insert_result = client.insert(collection_name="array_collection", data=data)
print(insert_result["insert_count"])

<h4>Basic Search with Array Field</h4>

In [None]:
# Basic search: one random 5-dimensional query vector, restricted to entities
# whose first color_coord element is below 10.
query_vectors = [[random.uniform(-1, 1) for _ in range(5)]]

basic_search_result = client.search(
    collection_name="array_collection",
    data=query_vectors,
    filter="color_coord[0] < 10",
    search_params={"metric_type": "L2", "params": {"nprobe": 16}},
    output_fields=["id", "color", "color_tag", "color_coord"],
    limit=3,
)

print_results(basic_search_result)

<h4>Advanced Queries with Array Field</h4>

In [None]:
# Advanced query: filter color_coord with ARRAY_CONTAINS for the value 10.
advanced_search_result = client.query(
    collection_name="array_collection",
    filter="ARRAY_CONTAINS(color_coord, 10)",
    output_fields=["id", "color", "color_tag", "color_coord"],
    limit=3,
)

print_results(advanced_search_result)

In [None]:
# Filter color_coord with ARRAY_CONTAINS_ALL for the values 7 and 8.
advanced_search_result = client.query(
    collection_name="array_collection",
    filter="ARRAY_CONTAINS_ALL(color_coord, [7, 8])",
    output_fields=["id", "color", "color_tag", "color_coord"],
    limit=3,
)

print_results(advanced_search_result)

In [None]:
# Filter color_coord with ARRAY_CONTAINS_ANY for the values 7, 8 and 9.
advanced_search_result = client.query(
    collection_name="array_collection",
    filter="ARRAY_CONTAINS_ANY(color_coord, [7, 8, 9])",
    output_fields=["id", "color", "color_tag", "color_coord"],
    limit=3,
)

print_results(advanced_search_result)

In [None]:
# Filter on the array size: ARRAY_LENGTH(color_coord) must equal 4.
advanced_search_result = client.query(
    collection_name="array_collection",
    filter="ARRAY_LENGTH(color_coord) == 4",
    output_fields=["id", "color", "color_tag", "color_coord"],
    limit=3,
)

print_results(advanced_search_result)

In [None]:
# Clean up: drop the collection used by the array field examples.
client.drop_collection(collection_name="array_collection")

<h4>Sparse Vectors</h4>

In [None]:
client = MilvusClient(
    uri = CLUSTER_ENDPOINT,
    token = TOKEN
)

# Create a collection with a sparse vector field.
# The option is spelled `enable_dynamic_field` (singular), matching the other
# schemas in this notebook.
schema = client.create_schema(
    auto_id = True,
    enable_dynamic_field = True,
)

# VARCHAR primary key (up to 100 characters); auto_id is enabled in the schema
# above, so the entities inserted later do not supply "pk" themselves.
schema.add_field(field_name = "pk", datatype = DataType.VARCHAR, is_primary = True, max_length = 100)
schema.add_field(field_name = "scalar_field", datatype = DataType.DOUBLE)

# For sparse vector, no need to specify dimension
schema.add_field(field_name="sparse_vector", datatype = DataType.SPARSE_FLOAT_VECTOR)

# No index parameters are passed here; the index is built in a later cell.
client.create_collection(collection_name = "sparse_vector_collection", schema = schema)

<h4>Generate Random Data</h4>

In [None]:
# Prepare entities with sparse vector representation
import numpy as np
import random

rng = np.random.default_rng()

num_entities, dim = 10000, 10000

# Generate random sparse rows with an average of 25 non-zero elements per row
# (each row gets between 20 and 30 distinct indices drawn from range(dim)).
#
# To represent a single sparse vector row, you can use:
# - Any of the scipy.sparse sparse matrices class family with shape[0] == 1
# - Dict[int, float]
# - Iterable[Tuple[int, float]]
# The Dict[int, float] form is used below.
entities = []
for _ in range(num_entities):
    scalar_value = rng.random()
    nonzero_indices = random.sample(range(dim), random.randint(20, 30))
    entities.append({
        "scalar_field": scalar_value,
        "sparse_vector": {index: rng.random() for index in nonzero_indices},
    })

# print the first entity to check the representation
pprint(entities[0])

In [None]:
from IPython.display import clear_output

# Insert every generated entity in one call, then clear the cell's output.
client.insert(
    collection_name="sparse_vector_collection",
    data=entities,
)
clear_output(wait=True)

In [None]:
# Index the collection
index_params = client.prepare_index_params()

index_params.add_index(
    field_name="sparse_vector",
    index_name="sparse_inverted_index",
    index_type="AUTOINDEX",
    # the metric type to be used for the index. Currently, only `IP` (Inner Product) is supported.
    metric_type="IP",
    # the ratio of small vector values to be dropped during indexing.
    params={"drop_ratio_build": 0.2},
)

# Create index
client.create_index(
    collection_name="sparse_vector_collection",
    index_params=index_params,
)

<h4>Search with Sparse Vectors</h4>

In [None]:
# Load the collection into memory
client.load_collection(collection_name="sparse_vector_collection")

# For demo purposes, reuse the sparse vector of the last generated entity as the query.
query_vector = entities[-1]["sparse_vector"]

search_params = {
    "metric_type": "IP",
    # the ratio of small vector values to be dropped during search.
    "params": {"drop_ratio_search": 0.2},
}

search_res = client.search(
    collection_name="sparse_vector_collection",
    data=[query_vector],
    search_params=search_params,
    output_fields=["pk", "scalar_field"],
    limit=3,
)

# The result is nested: walk each group of hits, then each hit inside it.
for hit_group in search_res:
    for hit in hit_group:
        pprint(hit)

In [None]:
# Perform a query with a scalar filter expression only, then show the first two results.
filter_query_res = client.query(collection_name="sparse_vector_collection", filter="scalar_field > 0.999")

pprint(filter_query_res[:2])

In [None]:
# Collect the primary keys returned by the previous filter query.
pks = [row["pk"] for row in filter_query_res]

# Perform a query by primary key, using the first key collected above.
pk_query_res = client.query(
    collection_name="sparse_vector_collection",
    filter=f"pk == '{pks[0]}'",
)

print_results(pk_query_res)

In [None]:
# Clean up: drop the collection used by the sparse vector examples.
client.drop_collection(collection_name="sparse_vector_collection")

<h4>Binary Vector</h4>

In [None]:
# Connect to the cluster with the stored endpoint and token.
client = MilvusClient(uri=CLUSTER_ENDPOINT, token=TOKEN)

# Define schema for the collection: caller-supplied ids, dynamic field enabled.
schema = client.create_schema(auto_id=False, enable_dynamic_field=True)

# INT64 primary key plus a 128-dimensional binary vector.
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="binary_vector", datatype=DataType.BINARY_VECTOR, dim=128)

# Additional fields for categorization (strings of up to 64 characters).
schema.add_field(field_name="color", datatype=DataType.VARCHAR, max_length=64)
schema.add_field(field_name="color_tag", datatype=DataType.VARCHAR, max_length=64)

In [None]:
# Prepare index parameters for the binary vector field
index_params = client.prepare_index_params()

# Use HAMMING or JACCARD metric type for binary vectors
index_params.add_index(field_name="binary_vector", index_type="AUTOINDEX", metric_type="HAMMING")

In [None]:
# Create the collection from the schema and index parameters prepared above.
client.create_collection(collection_name="binary_vector_collection", schema=schema, index_params=index_params)

<h4>Generate Random Data</h4>

In [None]:
import numpy as np
import random
import string

# Sample set of colors for categorization
colors = [
    "green", "blue", "yellow", "red", "black", "white",
    "purple", "pink", "orange", "brown", "grey",
]

# Filled with one dict per entity further down in this cell.
entities = []

# Function to generate and pack binary vectors
def gen_binary_vectors(num, dim):
    """Generate ``num`` random bit vectors of length ``dim`` plus their packed form.

    Args:
        num: Number of vectors to generate.
        dim: Number of bits (0/1 integers) in each vector.

    Returns:
        A tuple ``(raw_vectors, binary_vectors)``: the lists of 0/1 integers,
        and for each of them a ``bytes`` object built from ``np.packbits``.
    """
    # Draw all the bits first; packing consumes no randomness, so the random
    # sequence is the same as generating and packing vector by vector.
    raw_vectors = [[random.randint(0, 1) for _ in range(dim)] for _ in range(num)]
    binary_vectors = [
        bytes(np.packbits(bits, axis=-1).tolist()) for bits in raw_vectors
    ]
    return raw_vectors, binary_vectors

# Generate and pack the random binary vectors; only the packed form is used below.
num_entities = 5000
dim = 128
_, binary_vectors = gen_binary_vectors(num_entities, dim)

# Create one entity per packed vector, tagged with a random color and a
# "<color>_<4-digit number>" string.
for i, packed_vector in enumerate(binary_vectors):
    current_color = random.choice(colors)
    entities.append({
        "id": i,
        "binary_vector": packed_vector,
        "color": current_color,
        "color_tag": f"{current_color}_{str(random.randint(1000, 9999))}",
    })

# Display the first entity to verify
print(entities[0])

In [None]:
# Insert every generated entity in one call, then clear the cell's output.
client.insert(
    collection_name="binary_vector_collection",
    data=entities,
)
clear_output(wait=True)

<h4>Search with Binary Vectors</h4>

In [None]:
# Load the collection into memory
client.load_collection(collection_name = "binary_vector_collection")

# Perform ANN search on binary vectors.
# For demo purposes, the packed vector of the last generated entity is the query.
query_binary_vector = entities[-1]["binary_vector"]

search_params = {
    "metric_type": "HAMMING", # must match the metric type given in the index parameters for binary_vector
    "params": {"level": 1}
}

search_res = client.search(
    collection_name = "binary_vector_collection",
    data = [query_binary_vector],
    limit = 3,
    # The primary key field in this collection's schema is named "id".
    output_fields = ["id", "color"],
    search_params = search_params,
)

# The result is nested: walk each group of hits, then each hit inside it.
for hits in search_res:
    for hit in hits:
        print(hit)

In [None]:
# Perform filtered search on binary vector index

# Keep only results whose `color_tag` is prefixed with "white"
res = client.search(
    collection_name="binary_vector_collection",
    data=[query_binary_vector],
    filter='color_tag like "white%"',
    search_params={"metric_type": "HAMMING", "params": {"level": 1}},
    output_fields=["color_tag"],
    limit=20,
)

# The result is nested: walk each group of hits, then each hit inside it.
for hit_group in res:
    for hit in hit_group:
        print(hit)

In [None]:
# Clean up: drop the collection used by the binary vector examples.
client.drop_collection(collection_name="binary_vector_collection")