In [None]:
# NOTE(review): clear_output is imported here but is not used in the visible cells.
from IPython.display import clear_output

# NOTE(review): the three pymilvus installs below run one after another in the
# same environment, so every later command replaces the version installed by
# the previous one and only the result of the final `--upgrade` remains.
# They read as alternatives; consider keeping only the one that matches the
# target Milvus server version.

# Install pymilvus compatible with Milvus v2.3.x
!python -m pip install pymilvus==2.3.7

# Install pymilvus compatible with Milvus v2.4.x
!python -m pip install pymilvus==2.4.4

# Update PyMilvus to the newest version
!python -m pip install --upgrade pymilvus

# Verify installation success
!python -m pip list | grep pymilvus

# Remove whichever setuptools version is present so the pinned one below is
# installed from a clean state
!python -m pip uninstall -y setuptools

# Required for pymilvus
!python -m pip install setuptools==69.0.3

In [None]:
from pprint import pprint

def print_results(results):
    """Pretty-print each entry of ``results`` on its own.

    Args:
        results: Any iterable; every item is passed to ``pprint`` in
            iteration order.

    Returns:
        None. The output is written by ``pprint``.
    """
    for entry in results:
        pprint(entry)

In [None]:
# Read the cluster endpoint and the access token from credentials.yaml
import yaml

with open("credentials.yaml") as f:
    credentials = yaml.safe_load(f)

# The parsed mapping is all that is needed once the file has been read
CLUSTER_ENDPOINT = credentials["CLUSTER_ENDPOINT"]
TOKEN = credentials["TOKEN"]

In [None]:
from pymilvus import MilvusClient

# Connect to the cluster with the endpoint and token read from the credentials file
client = MilvusClient(uri=CLUSTER_ENDPOINT, token=TOKEN)

In [None]:
from pymilvus import DataType

# Schema with caller-supplied primary keys (auto_id off) and dynamic fields enabled
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True)

# Fields added to the schema, in this order:
# - id           (primary key, INT64)
# - pet_vector   (FLOAT_VECTOR, 5 dimensions)
# - breed_vector (FLOAT_VECTOR, 10 dimensions)
# - species      (VARCHAR, max_length 1000)
# - pet_name     (VARCHAR, max_length 1000)
for field_spec in (
    {"field_name": "id", "datatype": DataType.INT64, "is_primary": True},
    {"field_name": "pet_vector", "datatype": DataType.FLOAT_VECTOR, "dim": 5},
    {"field_name": "breed_vector", "datatype": DataType.FLOAT_VECTOR, "dim": 10},
    {"field_name": "species", "datatype": DataType.VARCHAR, "max_length": 1000},
    {"field_name": "pet_name", "datatype": DataType.VARCHAR, "max_length": 1000},
):
    schema.add_field(**field_spec)

In [None]:
# Declare indexes for id, pet_vector, breed_vector, and pet_name
index_params = client.prepare_index_params()

# STL_SORT index on the primary key
index_params.add_index(field_name="id", index_type="STL_SORT")

# Both vector fields share the same AUTOINDEX / IP settings
# NOTE(review): "nlist" is passed along with AUTOINDEX — confirm the server uses or ignores it
for vector_field in ("pet_vector", "breed_vector"):
    index_params.add_index(
        field_name=vector_field,
        index_type="AUTOINDEX",
        metric_type="IP",
        params={"nlist": 128},
    )

# TRIE index on the pet_name VARCHAR field
index_params.add_index(field_name="pet_name", index_type="TRIE")

In [None]:
# Build pet_collection from the schema and the index definitions
client.create_collection(
    collection_name="pet_collection",
    schema=schema,
    index_params=index_params,
)

# Ask the server for the collection's load state and show it
load_state = client.get_load_state(collection_name="pet_collection")
print(load_state)

In [None]:
# Make pet_collection reachable under the alias zoo_collection
client.create_alias(collection_name="pet_collection", alias="zoo_collection")

In [None]:
# Create the savannah and artic partitions, addressing the collection through its alias
for partition_name in ("savannah", "artic"):
    client.create_partition(
        collection_name="zoo_collection",
        partition_name=partition_name,
    )

# List the partitions that now exist on the collection
partitions = client.list_partitions(collection_name="zoo_collection")
print(partitions)

In [None]:
# Build 10k random entities for the main collection
import random

# The 10 pet names an entity can receive
pet_names = ["cat", "dog", "penguin", "lion", "zebra", "cow", "jaguar", "whale", "snake", "spider"]
# The three species an entity can receive
species = ["feline", "canine", "insect"]

# One dict per entity, ids 0..9999; vector components are drawn uniformly from [-1, 1]
data = [
    {
        "id": i,
        "pet_vector": [random.uniform(-1, 1) for _ in range(5)],
        "breed_vector": [random.uniform(-1, 1) for _ in range(10)],
        "species": random.choice(species),
        "pet_name": random.choice(pet_names),
    }
    for i in range(10000)
]

In [None]:
# Insert the generated entities through the alias and report how many were written
insert_result = client.insert(collection_name="zoo_collection", data=data)

print(insert_result["insert_count"])

In [None]:
# Create the data for the two partitions, using animal names that fit each biome.
# NOTE(review): the original heading asked for 5k entities per partition, but the
# loop has always been written for 10'000; the count is kept as coded — confirm
# which one is intended.
import random

# List savannah animals
savannah_pets = ["lion", "zebra", "snake"]
# List artic animals
artic_pets = ["penguin", "whale", "sea lion"]
# List three random species
species = ["feline", "canine", "insect"]

savannah_biome_data = []
artic_biome_data = []

# Generate 10'000 animals per biome; both lists use the same ids 0..9999
for i in range(10000):
    # The loop body must be indented: without it the cell fails with an
    # IndentationError (and `i` would be undefined for the appends).
    savannah_biome_data.append({
        "id": i,
        "pet_vector": [ random.uniform(-1, 1) for _ in range(5) ],
        "breed_vector": [ random.uniform(-1, 1) for _ in range(10) ],
        "species": random.choice(species),
        "pet_name": random.choice(savannah_pets)
    })

    artic_biome_data.append({
        "id": i,
        "pet_vector": [ random.uniform(-1, 1) for _ in range(5) ],
        "breed_vector": [ random.uniform(-1, 1) for _ in range(10) ],
        "species": random.choice(species),
        "pet_name": random.choice(artic_pets)
    })

In [None]:
# Upsert each biome's data into its partition and print how many entities were upserted

# savannah partition
upsert_partition_status_savannah = client.upsert(
    collection_name="zoo_collection",
    data=savannah_biome_data,
    partition_name="savannah",
)
print(upsert_partition_status_savannah["upsert_count"])

# artic partition
upsert_partition_status_artic = client.upsert(
    collection_name="zoo_collection",
    data=artic_biome_data,
    partition_name="artic",
)
print(upsert_partition_status_artic["upsert_count"])

In [None]:
# Remove the entities with ids 0..999 from each of the two partitions
id_to_delete = list(range(1000))

delete_status_savannah = client.delete(
    collection_name="zoo_collection",
    ids=id_to_delete,
    partition_name="savannah",
)
delete_status_artic = client.delete(
    collection_name="zoo_collection",
    ids=id_to_delete,
    partition_name="artic",
)

# Show what each delete call returned
print(delete_status_savannah)
print(delete_status_artic)

In [None]:
# Top-3 nearest neighbours (IP metric) of one random 10-dimensional breed_vector
breed_vector = [random.uniform(-1, 1) for _ in range(10)]

search_status = client.search(
    collection_name="zoo_collection",
    data=[breed_vector],  # a single query vector, wrapped in a list
    anns_field="breed_vector",
    limit=3,
    search_params={"metric_type": "IP", "params": {"level": 1}},
)

print_results(search_status)

In [None]:
# Top-5 nearest neighbours (IP metric) for each of 3 random breed vectors, returning the species
# Note: breed_vector now holds a list of three query vectors rather than a single one
breed_vector = [[random.uniform(-1, 1) for _ in range(10)] for _ in range(3)]

search_status = client.search(
    collection_name="zoo_collection",
    data=breed_vector,
    anns_field="breed_vector",
    output_fields=["species"],
    limit=5,
    search_params={"metric_type": "IP", "params": {"level": 1}},
)

print_results(search_status)

In [None]:
# Top-10 neighbours of a random pet_vector, restricted to pet names containing the letter "i";
# the pet names are returned with the hits
pet_vector = [random.uniform(-1, 1) for _ in range(5)]

search_status = client.search(
    collection_name="zoo_collection",
    data=[pet_vector],
    anns_field="pet_vector",
    output_fields=["pet_name"],
    filter='pet_name like "%i%"',
    limit=10,
    search_params={"metric_type": "IP", "params": {"level": 1}},
)

print_results(search_status)

In [None]:
# Grouping search: same filter (pet_name contains "i") and the same pet_vector as the
# previous search, with the hits grouped by pet_name
search_status = client.search(
    collection_name="zoo_collection",
    data=[pet_vector],
    anns_field="pet_vector",
    output_fields=["pet_name"],
    filter='pet_name like "%i%"',
    group_by_field="pet_name",
    search_params={"metric_type": "IP", "params": {"level": 1}},
)

# Only one query vector was sent, so the hits of interest are in search_status[0].
# Note: this rebinds pet_names to the names found by the search.
pet_names = [hit["entity"]["pet_name"] for hit in search_status[0]]
print(pet_names)

In [None]:
# Get the entities with ids 1200 through 1204 (the half-open range 1200..1205) from each partition
ids_to_get = list(range(1200, 1205))

# MilvusClient.get takes the partitions to read as a list via `partition_names`;
# an unknown `partition` keyword does not restrict the lookup to a partition.
savannah_elements = client.get(
    collection_name = "zoo_collection",
    ids = ids_to_get,
    partition_names = ["savannah"]
)

print_results(savannah_elements)

# The artic lookup must read the artic partition (it previously re-read savannah)
artic_elements = client.get(
    collection_name = "zoo_collection",
    ids = ids_to_get,
    partition_names = ["artic"]
)

print_results(artic_elements)

In [None]:
# Scalar query: every entity named "spider", returning the pet_name field
spider_result = client.query(
    collection_name="pet_collection",
    filter='pet_name == "spider"',
    output_fields=["pet_name"],
)

print_results(spider_result)

In [None]:
# Compute the share of savannah pets and of artic pets among all entities in the collection.
# The name lists in the filters mirror savannah_pets / artic_pets used to fill the partitions.
savannah_result = client.query(
    collection_name = "zoo_collection",
    filter = 'pet_name in ["lion", "zebra", "snake"]',
    output_fields = ["count(*)"],
)

artic_result = client.query(
    collection_name = "zoo_collection",
    filter = 'pet_name in ["penguin", "whale", "sea lion"]',
    output_fields = ["count(*)"],
)

all_result = client.query(
    collection_name = "zoo_collection",
    output_fields = ["count(*)"],
)

# Each count query returns a single row keyed by "count(*)"
savannah_count = savannah_result[0]["count(*)"]
artic_count = artic_result[0]["count(*)"]
total_count = all_result[0]["count(*)"]

# Format with the percent specifier instead of round(x, 4) * 100, which prints
# floating-point noise such as "56.769999999999996%"
print(f"{savannah_count / total_count:.2%}")
print(f"{artic_count / total_count:.2%}")

In [None]:
# How many entities are named either "penguin" or "lion"?
advanced_operator_result = client.query(
    collection_name="zoo_collection",
    filter='pet_name == "penguin" or pet_name == "lion"',
    output_fields=["count(*)"],
)

print_results(advanced_operator_result)

In [None]:
# Drop the alias and then the collection.
# The alias is removed first: once the collection is gone the alias would point at
# nothing, and dropping a collection that still has aliases may be rejected by the
# server (NOTE(review): version dependent — confirm against the target Milvus release).
client.drop_alias(
    alias = "zoo_collection"
)

client.drop_collection(
    collection_name = "pet_collection"
)

In [None]:
# Re-create pet_collection with a JSON field "pet_features" (holding species and pet_name)
# and an array field "ages"; breed_vector is no longer part of the schema

# Schema: same options as before (explicit ids, dynamic fields enabled)
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True)

# Fields, in order: id (primary INT64), pet_vector (5-dim FLOAT_VECTOR),
# ages (ARRAY of INT64, max_capacity 10), pet_features (JSON)
for field_spec in (
    {"field_name": "id", "datatype": DataType.INT64, "is_primary": True},
    {"field_name": "pet_vector", "datatype": DataType.FLOAT_VECTOR, "dim": 5},
    {"field_name": "ages", "datatype": DataType.ARRAY, "element_type": DataType.INT64, "max_capacity": 10},
    {"field_name": "pet_features", "datatype": DataType.JSON},
):
    schema.add_field(**field_spec)

# Only id and pet_vector are indexed in this collection
index_params = client.prepare_index_params()
index_params.add_index(field_name="id", index_type="STL_SORT")
index_params.add_index(
    field_name="pet_vector",
    index_type="AUTOINDEX",
    metric_type="IP",
    params={"nlist": 128},
)

# Create the collection from the new schema and index definitions
client.create_collection(
    collection_name="pet_collection",
    schema=schema,
    index_params=index_params,
)

# Ask the server for the collection's load state and show it
load_state = client.get_load_state(collection_name="pet_collection")
print(load_state)

In [None]:
# Build 10k random entities matching the JSON/array schema
import random

# The 10 pet names an entity can receive
pet_names = ["cat", "dog", "penguin", "lion", "zebra", "cow", "jaguar", "whale", "snake", "spider"]
# The three species an entity can receive
species = ["feline", "canine", "insect"]

# One dict per entity, ids 0..9999. "ages" is deterministic: the ten consecutive
# integers starting at the entity's id. pet_name and species live inside pet_features.
data = [
    {
        "id": i,
        "pet_vector": [random.uniform(-1, 1) for _ in range(5)],
        "ages": list(range(i, i + 10)),
        "pet_features": {
            "pet_name": random.choice(pet_names),
            "species": random.choice(species),
        },
    }
    for i in range(10000)
]

In [None]:
# Insert the new entities into pet_collection and report how many were written
insert_result = client.insert(collection_name="pet_collection", data=data)

print(insert_result["insert_count"])

In [None]:
# Find the pets named penguin, cat, or dog that are closest to a random pet_vector
# and return their pet name
pet_vector = [ random.uniform(-1, 1) for _ in range(5) ]

# "Close to a vector" needs a vector search: client.query only applies the scalar
# filter and does not use a `data` argument, so client.search is called here with
# the same search parameters as the earlier searches in this notebook.
advanced_search_result = client.search(
    collection_name = "pet_collection",
    data = [pet_vector],
    anns_field = "pet_vector",
    filter = 'pet_features["pet_name"] == "penguin" or pet_features["pet_name"] == "cat" or pet_features["pet_name"] == "dog"',
    # pet_name only exists as a key inside the pet_features JSON field, so that
    # field is what has to be requested to get the name back
    output_fields = ["pet_features"],
    limit = 10,
    search_params = {"metric_type": "IP", "params": {"level": 1}}
)

print(advanced_search_result)

In [None]:
# Entities whose ages array contains both 11 and 12; at most 3 ids are returned
advanced_search_result = client.query(
    collection_name="pet_collection",
    filter="ARRAY_CONTAINS_ALL(ages, [11, 12])",
    output_fields=["id"],
    limit=3,
)

print_results(advanced_search_result)