In [1]:
from IPython.display import clear_output

# Install pymilvus compatible with Milvus v2.3.x
!python3 -m pip install pymilvus==2.3.7

# Install pymilvus compatible with Milvus v2.4.x
!python3 -m pip install pymilvus==2.4.4

# Update PyMilvus to the newest version
!python3 -m pip install --upgrade pymilvus

# Verify installation success
!python3 -m pip list | grep pymilvus

!python3 -m pip uninstall -y setuptools

# Required for pymilvus
!python3 -m pip install setuptools==69.0.3

Collecting pymilvus==2.3.7
  Using cached pymilvus-2.3.7-py3-none-any.whl.metadata (4.4 kB)
Using cached pymilvus-2.3.7-py3-none-any.whl (179 kB)
Installing collected packages: pymilvus
  Attempting uninstall: pymilvus
    Found existing installation: pymilvus 2.5.0
    Uninstalling pymilvus-2.5.0:
      Successfully uninstalled pymilvus-2.5.0
Successfully installed pymilvus-2.3.7
Collecting pymilvus==2.4.4
  Using cached pymilvus-2.4.4-py3-none-any.whl.metadata (5.4 kB)
Using cached pymilvus-2.4.4-py3-none-any.whl (196 kB)
Installing collected packages: pymilvus
  Attempting uninstall: pymilvus
    Found existing installation: pymilvus 2.3.7
    Uninstalling pymilvus-2.3.7:
      Successfully uninstalled pymilvus-2.3.7
Successfully installed pymilvus-2.4.4
Collecting pymilvus
  Using cached pymilvus-2.5.0-py3-none-any.whl.metadata (5.7 kB)
Using cached pymilvus-2.5.0-py3-none-any.whl (212 kB)
Installing collected packages: pymilvus
  Attempting uninstall: pymilvus
    Found existing i

In [2]:
from pprint import pprint

def print_results(results):

    for el in results:
        pprint(el)

In [3]:
# Open the configuration file
import yaml

with open("credentials.yaml") as f:
    credentials = yaml.safe_load(f)

    CLUSTER_ENDPOINT = credentials["CLUSTER_ENDPOINT"]
    TOKEN = credentials["TOKEN"]

In [4]:
from pymilvus import MilvusClient

# Initialize a MilvusClient instance
client = MilvusClient(
    uri = CLUSTER_ENDPOINT,
    token = TOKEN
)

In [5]:
from pymilvus import DataType

# Create a schema
schema = MilvusClient.create_schema(
    auto_id = False,
    enable_dynamic_field = True,
)

# Add the following fields to the schema
# - id (primary, INT64)
# - pet_vector (FLOAT_VECTOR, 5 dimensions)
# - breed_vector (FLOAT_VECTOR, 10 dimensions)
# - species (VARCHAR)
# - eye_color (VARCHAR)
schema.add_field(field_name = "id", datatype = DataType.INT64, is_primary = True)
schema.add_field(field_name = "pet_vector", datatype = DataType.FLOAT_VECTOR, dim = 5)
schema.add_field(field_name = "breed_vector", datatype = DataType.FLOAT_VECTOR, dim = 10)
schema.add_field(field_name = "species", datatype = DataType.VARCHAR, max_length = 1000)
schema.add_field(field_name = "pet_name", datatype = DataType.VARCHAR, max_length = 1000)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'pet_vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 5}}, {'name': 'breed_vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 10}}, {'name': 'species', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000}}, {'name': 'pet_name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000}}], 'enable_dynamic_field': True}

In [6]:
# Add index to id, pet_vector, breed_vector, and species
index_params = client.prepare_index_params()

index_params.add_index(
    field_name = "id",
    index_type = "STL_SORT"
)

index_params.add_index(
    field_name = "pet_vector",
    index_type = "AUTOINDEX",
    metric_type = "IP",
    params = { "nlist": 128 }
)

index_params.add_index(
    field_name = "breed_vector",
    index_type = "AUTOINDEX",
    metric_type = "IP",
    params = { "nlist": 128 }
)

index_params.add_index(
    field_name = "pet_name",
    index_type = "TRIE"
)

In [7]:
# Create a collection with the index
client.create_collection(
    collection_name = "pet_collection",
    schema = schema,
    index_params = index_params
)

# Get the collection load state
load_state = client.get_load_state(
    collection_name = "pet_collection"
)

print(load_state)

{'state': <LoadState: Loaded>}


In [8]:
# Add an alias zoo_collection to the collection
# Create alias
client.create_alias(
    collection_name = "pet_collection",
    alias = "zoo_collection"
)

In [9]:
# Create two partitions: savannah and artic using the alias zoo_collection
client.create_partition(
    collection_name = "zoo_collection",
    partition_name = "savannah"
)

client.create_partition(
    collection_name = "zoo_collection",
    partition_name = "artic"
)

# Collect the list of partitions
partitions = client.list_partitions(collection_name = "zoo_collection")

print(partitions)

['_default', 'savannah', 'artic']


In [10]:
# Create 10k data for the main collection, using proper animal names based on the biome
import random

# List 10 pets to be inserted in the DB
pet_names = ["cat", "dog", "penguin", "lion", "zebra", "cow", "jaguar", "whale", "snake", "spider"]
# List three random species
species = ["feline", "canine", "insect"]

data = []

# Generate 10'000 animals
for i in range(10000):
    data.append({
        "id": i,
        "pet_vector": [ random.uniform(-1, 1) for _ in range(5) ],
        "breed_vector": [ random.uniform(-1, 1) for _ in range(10) ],
        "species": random.choice(species),
        "pet_name": random.choice(pet_names)
    })

In [11]:
# Add data to the collection and return the number of loaded elements
insert_result = client.insert(
    collection_name = "zoo_collection",
    data = data,
)

print(insert_result["insert_count"])

10000


In [12]:
# Create 5k data each for the partitions, using proper animal names based on the biome
import random

# List savannah animals
savannah_pets = ["lion", "zebra", "snake"]
# List artic animals
artic_pets = ["penguin", "whale", "sea lion"]
# List three random species
species = ["feline", "canine", "insect"]

savannah_biome_data = []
artic_biome_data = []

# Generate 10'000 animals
for i in range(10000):
    savannah_biome_data.append({
        "id": i,
        "pet_vector": [ random.uniform(-1, 1) for _ in range(5) ],
        "breed_vector": [ random.uniform(-1, 1) for _ in range(10) ],
        "species": random.choice(species),
        "pet_name": random.choice(savannah_pets)
    })
    
    artic_biome_data.append({
        "id": i,
        "pet_vector": [ random.uniform(-1, 1) for _ in range(5) ],
        "breed_vector": [ random.uniform(-1, 1) for _ in range(10) ],
        "species": random.choice(species),
        "pet_name": random.choice(artic_pets)
    })

In [13]:
# Upsert the data in both collections and print the number of elements upserted
upsert_partition_status_savannah = client.upsert(
    collection_name = "zoo_collection",
    data = savannah_biome_data,
    partition_name = "savannah"
)

print(upsert_partition_status_savannah["upsert_count"])

upsert_partition_status_artic = client.upsert(
    collection_name = "zoo_collection",
    data = artic_biome_data,
    partition_name = "artic"
)

print(upsert_partition_status_artic["upsert_count"])

10000
10000


In [14]:
# Delete the first 1k entities in each partition
id_to_delete = [i for i in range(1000)]

delete_status_savannah = client.delete(
    collection_name = "zoo_collection",
    ids = id_to_delete,
    partition_name = "savannah"
)

delete_status_artic = client.delete(
    collection_name = "zoo_collection",
    ids = id_to_delete,
    partition_name = "artic"
)

print(delete_status_savannah)
print(delete_status_artic)

{'delete_count': 1000, 'cost': 1000}
{'delete_count': 1000, 'cost': 1000}


In [15]:
# Search for the top 3 elements that are close to a randomly generated breed_vector using IP as the metric
breed_vector = [ random.uniform(-1, 1) for _ in range(10) ]

search_status = client.search(
    collection_name = "zoo_collection",
    data = [breed_vector],
    anns_field = "breed_vector",
    limit = 3,
    search_params = {"metric_type": "IP", "params": {"level": 1}}
)

print_results(search_status)

[{'distance': 3.664642572402954, 'entity': {}, 'id': 3778},
 {'distance': 3.5751614570617676, 'entity': {}, 'id': 6471},
 {'distance': 3.4979183673858643, 'entity': {}, 'id': 8509}]


In [16]:
# Search for the top 5 elements that are close to 3 randomly generated breed_vector using IP as the metric and return their species
breed_vector = [[ random.uniform(-1, 1) for _ in range(10) ] for j in range(3)]

search_status = client.search(
    collection_name = "zoo_collection",
    data = breed_vector,
    anns_field = "breed_vector",
    output_fields = ["species"],
    limit = 5,
    search_params = {"metric_type": "IP", "params": {"level": 1}}
)

print_results(search_status)

[{'distance': 4.191392421722412, 'entity': {'species': 'canine'}, 'id': 2749},
 {'distance': 4.148306846618652, 'entity': {'species': 'canine'}, 'id': 9191},
 {'distance': 4.121095657348633, 'entity': {'species': 'feline'}, 'id': 9023},
 {'distance': 4.038865089416504, 'entity': {'species': 'insect'}, 'id': 5759},
 {'distance': 3.9527387619018555, 'entity': {'species': 'insect'}, 'id': 5252}]
[{'distance': 5.270816802978516, 'entity': {'species': 'feline'}, 'id': 7447},
 {'distance': 4.989281177520752, 'entity': {'species': 'feline'}, 'id': 6582},
 {'distance': 4.941664695739746, 'entity': {'species': 'canine'}, 'id': 9559},
 {'distance': 4.854238033294678, 'entity': {'species': 'feline'}, 'id': 3463},
 {'distance': 4.750955581665039, 'entity': {'species': 'insect'}, 'id': 8557}]
[{'distance': 4.193144798278809, 'entity': {'species': 'insect'}, 'id': 4864},
 {'distance': 3.7797577381134033, 'entity': {'species': 'feline'}, 'id': 7522},
 {'distance': 3.6700568199157715, 'entity': {'spec

In [17]:
# Search the top 10 elements whose pet_name contains the letter i that are close to a randomly generated pet_vector and return their names
pet_vector = [ random.uniform(-1, 1) for _ in range(5) ]

search_status = client.search(
    collection_name = "pet_collection",
    data = [pet_vector],
    anns_field = "pet_vector",
    output_fields = ["pet_name"],
    filter = 'pet_name like "%i%"',
    limit = 10,
    search_params = {"metric_type": "IP", "params": {"level": 1}}
)

print_results(search_status)

[{'distance': 1.0070264339447021,
  'entity': {'pet_name': 'penguin'},
  'id': 8900},
 {'distance': 0.9696456789970398,
  'entity': {'pet_name': 'sea lion'},
  'id': 8077},
 {'distance': 0.922271192073822, 'entity': {'pet_name': 'lion'}, 'id': 5306},
 {'distance': 0.9179901480674744, 'entity': {'pet_name': 'lion'}, 'id': 2237},
 {'distance': 0.9049426317214966, 'entity': {'pet_name': 'lion'}, 'id': 8786},
 {'distance': 0.8988766670227051,
  'entity': {'pet_name': 'penguin'},
  'id': 3814},
 {'distance': 0.8940078020095825,
  'entity': {'pet_name': 'sea lion'},
  'id': 5421},
 {'distance': 0.8882008194923401, 'entity': {'pet_name': 'lion'}, 'id': 2617},
 {'distance': 0.8848559260368347, 'entity': {'pet_name': 'lion'}, 'id': 5163},
 {'distance': 0.8777058720588684,
  'entity': {'pet_name': 'penguin'},
  'id': 9160}]


In [18]:
# Perform a group seach for the elements whose pet_name contains the letter i that are close to a randomly generated pet_vector
search_status = client.search(
    collection_name = "pet_collection",
    data = [pet_vector],
    anns_field = "pet_vector",
    output_fields = ["pet_name"],
    filter = 'pet_name like "%i%"',
    group_by_field = "id",
    search_params = {"metric_type": "IP", "params": {"level": 1}}
)

pet_names = [result['entity']['pet_name'] for result in search_status[0]]
print(pet_names)

['whale', 'sea lion', 'lion', 'lion', 'lion', 'penguin', 'whale', 'lion', 'lion', 'whale']


In [19]:
# Get the entities with id from 1200 to 1205 for each partition
savannah_elements = client.get(
    collection_name = "zoo_collection",
    ids = [ (i+1200) for i in range(5) ],
    partition = "savannah"
)

print_results(savannah_elements)

artic_elements = client.get(
    collection_name = "zoo_collection",
    ids = [ (i+1200) for i in range(5) ],
    partition = "savannah"
)

print_results(artic_elements)

{'breed_vector': [0.3188581,
                  0.349546,
                  -0.28148818,
                  -0.20696737,
                  -0.7356126,
                  0.84210247,
                  0.0026325332,
                  -0.22894552,
                  0.08939071,
                  -0.16814397],
 'id': 1200,
 'pet_name': 'whale',
 'pet_vector': [0.14852327, -0.43242413, 0.7232335, 0.55974966, -0.27489674],
 'species': 'insect'}
{'breed_vector': [0.62408036,
                  -0.4675496,
                  0.94817597,
                  0.52966684,
                  -0.3527832,
                  -0.6271419,
                  0.5591908,
                  -0.43926305,
                  -0.39425373,
                  0.99569786],
 'id': 1201,
 'pet_name': 'sea lion',
 'pet_vector': [0.6612107, -0.12784503, 0.35511193, 0.35368064, 0.91896844],
 'species': 'canine'}
{'breed_vector': [0.49948183,
                  -0.5147771,
                  -0.83766973,
                  -0.112088025,

In [20]:
# Perform a query to collect all the elements whose pet_name is spider and return their name
spider_result = client.query(
    collection_name = "pet_collection",
    filter = 'pet_name == "spider"',
    output_fields = ["pet_name"],
)

print_results(spider_result)

{'id': 52, 'pet_name': 'spider'}
{'id': 59, 'pet_name': 'spider'}
{'id': 63, 'pet_name': 'spider'}
{'id': 80, 'pet_name': 'spider'}
{'id': 114, 'pet_name': 'spider'}
{'id': 121, 'pet_name': 'spider'}
{'id': 133, 'pet_name': 'spider'}
{'id': 142, 'pet_name': 'spider'}
{'id': 145, 'pet_name': 'spider'}
{'id': 178, 'pet_name': 'spider'}
{'id': 193, 'pet_name': 'spider'}
{'id': 199, 'pet_name': 'spider'}
{'id': 211, 'pet_name': 'spider'}
{'id': 212, 'pet_name': 'spider'}
{'id': 222, 'pet_name': 'spider'}
{'id': 229, 'pet_name': 'spider'}
{'id': 255, 'pet_name': 'spider'}
{'id': 268, 'pet_name': 'spider'}
{'id': 290, 'pet_name': 'spider'}
{'id': 292, 'pet_name': 'spider'}
{'id': 296, 'pet_name': 'spider'}
{'id': 305, 'pet_name': 'spider'}
{'id': 328, 'pet_name': 'spider'}
{'id': 349, 'pet_name': 'spider'}
{'id': 350, 'pet_name': 'spider'}
{'id': 379, 'pet_name': 'spider'}
{'id': 384, 'pet_name': 'spider'}
{'id': 394, 'pet_name': 'spider'}
{'id': 397, 'pet_name': 'spider'}
{'id': 419, 'pet_n

In [21]:
# Compute the ratio of artic pets and savannah pets (based on the arrays created to fill the collections) compared to the number of pets in the collection
savannah_result = client.query(
    collection_name = "zoo_collection",
    filter = 'pet_name in ["lion", "zebra", "snake"]',
    output_fields = ["count(*)"],
)

artic_result = client.query(
    collection_name = "zoo_collection",
    filter = 'pet_name in ["penguin", "whale", "sea lion"]',
    output_fields = ["count(*)"],
)

all_result = client.query(
    collection_name = "zoo_collection",
    output_fields = ["count(*)"],
)


print(str(round(savannah_result[0]["count(*)"]/all_result[0]["count(*)"], 4) * 100) + "%")
print(str(round(artic_result[0]["count(*)"]/all_result[0]["count(*)"], 4) * 100) + "%")

42.95%
39.22%


In [22]:
# Count the number of entities whose pet_name is penguin or lion
advanced_operator_result = client.query(
    collection_name = "zoo_collection",
    filter = 'pet_name == "penguin" or pet_name == "lion"',
    output_fields = ["count(*)"],
)

print_results(advanced_operator_result)

{'count(*)': 7972}


In [23]:
# Drop the collection and alias
client.drop_collection(
  collection_name="pet_collection"
)

client.drop_alias(
    collection_name = "pet_collection",
    alias = "zoo_collection"
)

In [24]:
# Create a new collection with a json field "pet_features" with the species and pet_name fields and an array of randomly generated ages
# Also, remove breeding_vector

# Create a schema
schema = MilvusClient.create_schema(
    auto_id = False,
    enable_dynamic_field = True,
)

schema.add_field(field_name = "id", datatype = DataType.INT64, is_primary = True)
schema.add_field(field_name = "pet_vector", datatype = DataType.FLOAT_VECTOR, dim = 5)
schema.add_field(field_name = "ages", datatype = DataType.ARRAY, element_type = DataType.INT64, max_capacity = 10)
schema.add_field(field_name = "pet_features", datatype = DataType.JSON)

# Add index to id, pet_vector, breed_vector, and species
index_params = client.prepare_index_params()

index_params.add_index(
    field_name = "id",
    index_type = "STL_SORT"
)

index_params.add_index(
    field_name = "pet_vector",
    index_type = "AUTOINDEX",
    metric_type = "IP",
    params = { "nlist": 128 }
)

# Create a collection with the index
client.create_collection(
    collection_name = "pet_collection",
    schema = schema,
    index_params = index_params
)

# Get the collection load state
load_state = client.get_load_state(
    collection_name = "pet_collection"
)

print(load_state)

{'state': <LoadState: Loaded>}


In [25]:
# Create 10k data for the main collection, using proper animal names based on the biome
import random

# List 10 pets to be inserted in the DB
pet_names = ["cat", "dog", "penguin", "lion", "zebra", "cow", "jaguar", "whale", "snake", "spider"]
# List three random species
species = ["feline", "canine", "insect"]

data = []

# Generate 10'000 animals
for i in range(10000):
    data.append({
        "id": i,
        "pet_vector": [ random.uniform(-1, 1) for _ in range(5) ],
        "ages": [ (j+i) for j in range(10) ],
        "pet_features": {
            "pet_name": random.choice(pet_names),
            "species": random.choice(species)
        }
    })

In [26]:
# Add data to the collection and return the number of loaded elements
insert_result = client.insert(
    collection_name = "pet_collection",
    data = data,
)

print(insert_result["insert_count"])

10000


In [27]:
# Look for all the pets whose name is penguin, cat, or dog and return their pet name that are close to a random pet_vector
pet_vector = [ random.uniform(-1, 1) for _ in range(5) ]

advanced_search_result = client.query(
    collection_name = "pet_collection",
    data = [pet_vector],
    filter = 'pet_features["pet_name"] == "penguin" or pet_features["pet_name"] == "cat" or pet_features["pet_name"] == "dog"',
    output_fields = ["pet_name"],
)

print(advanced_search_result)

data: ["{'id': 2}", "{'id': 9}", "{'id': 19}", "{'id': 22}", "{'id': 26}", "{'id': 37}", "{'id': 39}", "{'id': 43}", "{'id': 45}", "{'id': 47}"] ..., extra_info: {'cost': 6}


In [28]:
# Look for all the pets whose ages array contains the numbers 11 and 12
advanced_search_result = client.query(
    collection_name = "pet_collection",
    filter = "ARRAY_CONTAINS_ALL(ages, [11, 12])",
    output_fields = ["id"],
    limit = 3
)

print_results(advanced_search_result)

{'id': 3}
{'id': 4}
{'id': 5}


In [29]:
# Drop collection
client.drop_collection(
    collection_name = "pet_collection",
)

In [30]:
client.drop_alias(
    alias = "zoo_collection"
)