In [1]:
import h5py
import os
import numpy as np

def concatenate_hdf5_files(directory_path, dataset_path):
    """Read one dataset from an HDF5 file, or from every HDF5 file in a directory.

    Args:
        directory_path: Path to a single HDF5 file, or to a directory whose
            HDF5 files should all be read.
        dataset_path: Path of the dataset inside each file (e.g. "/train").

    Returns:
        A NumPy array combining the dataset from every file that was read, or
        ``None`` when the directory contains no HDF5 files. 1-D datasets are
        stacked along a new first axis; datasets with more dimensions are
        concatenated along their last axis.

    Raises:
        KeyError: if ``dataset_path`` is missing from one of the files
            (raised by h5py).
        OSError: if ``directory_path`` is a file that h5py cannot open.
    """
    if os.path.isfile(directory_path):
        file_paths = [directory_path]
    else:
        file_paths = []
        # os.listdir returns entries in arbitrary order; sort them so the
        # layout of the combined array is reproducible between runs.
        for filename in sorted(os.listdir(directory_path)):
            file_path = os.path.join(directory_path, filename)
            # Skip sub-directories and stray non-HDF5 files (e.g. .DS_Store),
            # which would otherwise make h5py.File raise and abort the load.
            if os.path.isfile(file_path) and h5py.is_hdf5(file_path):
                file_paths.append(file_path)

    data_list = []
    for file_path in file_paths:
        with h5py.File(file_path, 'r') as h5file:
            # [:] copies the dataset into memory so it outlives the file handle.
            data_list.append(h5file[dataset_path][:])

    if not data_list:
        return None  # no HDF5 files were found in the directory

    # The concatenation axis depends on the dimensionality of the first array.
    data_dimensionality = data_list[0].ndim
    if data_dimensionality == 1:
        return np.stack(data_list, axis=0)
    # NOTE(review): N-D data is joined along the LAST axis; if files hold
    # row-wise chunks of the same table, axis 0 may be intended -- confirm.
    return np.concatenate(data_list, axis=data_dimensionality - 1)

def load_data(db_data_file_dir, db_data_dataset,  pattern_data_file_dir, pattern_data_dataset):
    """Load the database vectors and the query ("pattern") vectors.

    Each (location, dataset) pair is read with ``concatenate_hdf5_files``;
    the shape of every array that was loaded is printed.

    Returns:
        A ``(db_data, db_pattern)`` tuple; either element is ``None`` when
        nothing could be loaded for it.
    """
    def _load_and_report(location, dataset):
        # Load one dataset and print its shape when something was loaded.
        loaded = concatenate_hdf5_files(location, dataset)
        if loaded is not None:
            print("Concatenated data shape:", loaded.shape)
        return loaded

    db_data = _load_and_report(db_data_file_dir, db_data_dataset)
    db_pattern = _load_and_report(pattern_data_file_dir, pattern_data_dataset)
    return db_data, db_pattern

#test code
# db_data, db_pattern = load_data(db_data_file_dir = "/Users/dbin/work/TensorSearch/db-data-1d", db_data_dataset = "/testg/testd",  pattern_data_file_dir = "/Users/dbin/work/TensorSearch/db-pattern-1d", pattern_data_dataset =  "/testg/testd");
# print(db_data)
# print(db_pattern)

# db_data, db_pattern = load_data(db_data_file_dir = "./deep-image-96-angular.hdf5", db_data_dataset = "/train",  pattern_data_file_dir = "./deep-image-96-angular.hdf5", pattern_data_dataset =  "/test");
# print(db_data)
# print(db_pattern)

In [None]:
""" example.py based from pymilvus
"""


import pandas as pd

import random

from milvus import default_server



from pymilvus import (
    connections,
    FieldSchema, CollectionSchema, DataType,
    Collection,
    utility
)

# This example shows how to:
#   1. connect to the Milvus server
#   2. create a collection
#   3. insert entities
#   4. create an index
#   5. search

# Optional: store all server data under a specific directory.
# By default it uses %APPDATA%/milvus-io/milvus-server.
default_server.set_base_dir('test_milvus')
# NOTE(review): presumably stops any instance left over from a previous run
# of this cell before its data is cleaned up -- confirm.
default_server.stop()
# Optional: clean up data left by previous runs.
default_server.cleanup()
# Start the Milvus server.
default_server.start()

_HOST = '127.0.0.1'
# The port may be changed; by default it's 19530.
_PORT = default_server.listen_port

# Constant names
_COLLECTION_NAME = 'demo'
_ID_FIELD_NAME = 'id_field'
_VECTOR_FIELD_NAME = 'float_vector_field'

# Vector parameters
_DIM = 96  # dimension of the float-vector field (used by create_collection)
#_INDEX_FILE_SIZE = 32  # max file size of stored index

# Index parameters
_METRIC_TYPE = 'IP'  # used both when creating the index and when searching
_INDEX_TYPE = 'FLAT'
# NOTE(review): nlist/nprobe are usually IVF-style parameters; confirm they
# are needed with a FLAT index.
_NLIST = 1024  # passed as "nlist" when the index is created
_NPROBE = 16  # passed as "nprobe" in the search parameters
_TOPK = 10  # passed as the search "limit"


# Create a Milvus connection
def create_connection():
    """Connect to the Milvus server at ``_HOST``:``_PORT`` and print the known connections."""
    print(f"\nCreate connection...")
    connections.connect(host=_HOST, port=_PORT)

    print(f"\nList connections:")
    registered = connections.list_connections()
    print(registered)


# Create a collection named 'demo'
def create_collection(name, id_field, vector_field):
    """Create a collection with an INT64 primary key and a float-vector field.

    Args:
        name: Name of the collection to create.
        id_field: Name of the INT64 primary-key field.
        vector_field: Name of the FLOAT_VECTOR field (dimension ``_DIM``).

    Returns:
        The newly constructed ``Collection`` object.
    """
    primary_key = FieldSchema(name=id_field, dtype=DataType.INT64, description="int64", is_primary=True)
    embedding = FieldSchema(
        name=vector_field,
        dtype=DataType.FLOAT_VECTOR,
        description="float vector",
        dim=_DIM,
        is_primary=False,
    )
    schema = CollectionSchema(fields=[primary_key, embedding], description="collection description")

    # The collection starts with a 15-second TTL property.
    ttl_properties = {"collection.ttl.seconds": 15}
    collection = Collection(name=name, data=None, schema=schema, properties=ttl_properties)
    print("\ncollection created:", name)
    return collection


def has_collection(name):
    """Return the result of ``utility.has_collection`` for the collection ``name``."""
    exists = utility.has_collection(name)
    return exists


# Drop a collection in Milvus
def drop_collection(name):
    """Drop the collection called ``name`` and print a confirmation line."""
    Collection(name).drop()
    message = "\nDrop collection: {}".format(name)
    print(message)


# List all collections in Milvus
def list_collections():
    """Print a header followed by the list returned by ``utility.list_collections``."""
    print("\nlist collections:")
    names = utility.list_collections()
    print(names)


def insert(collection, db_data, batch_size=10_000):
    """Insert every row of ``db_data`` into ``collection`` in batches.

    Row ``i`` of ``db_data`` is stored with primary key ``i``. The rows are
    sent in slices of at most ``batch_size`` so that a large array is never
    converted to Python lists, or sent to the server, in one piece. A single
    request carrying millions of vectors is likely to exceed the server's
    message-size limit.

    Args:
        collection: Object exposing ``insert(data)``, where ``data`` is a
            two-column list ``[ids, vectors]``.
        db_data: 2-D array-like supporting ``.shape``, slicing and
            ``.tolist()`` (e.g. a NumPy array), one vector per row.
        batch_size: Maximum number of rows per ``collection.insert`` call.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total = db_data.shape[0]
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        collection.insert([
            list(range(start, stop)),      # primary keys for this slice
            db_data[start:stop].tolist(),  # vectors for this slice
        ])
    


def get_entity_num(collection):
    """Print a header line followed by ``collection.num_entities``."""
    print("\nThe number of entity:")
    # Read the attribute only after the header is printed, as before.
    entity_count = collection.num_entities
    print(entity_count)


def create_index(collection, filed_name):
    """Create an index on ``filed_name`` and print the resulting index parameters.

    The index type, ``nlist`` and metric type come from the module-level
    ``_INDEX_TYPE``, ``_NLIST`` and ``_METRIC_TYPE`` constants.
    """
    build_params = {"nlist": _NLIST}
    index_param = {
        "index_type": _INDEX_TYPE,
        "params": build_params,
        "metric_type": _METRIC_TYPE,
    }
    collection.create_index(filed_name, index_param)

    created_params = collection.index().params
    print("\nCreated index:\n{}".format(created_params))


def drop_index(collection):
    """Drop the index of ``collection`` and print a confirmation line."""
    collection.drop_index()
    confirmation = "\nDrop index sucessfully"
    print(confirmation)


def load_collection(collection):
    """Call ``collection.load()``; the result is discarded."""
    collection.load()


def release_collection(collection):
    """Call ``collection.release()``; the result is discarded."""
    collection.release()


def search(collection, vector_field, id_field, search_vectors):
    """Run a vector search and print the hits for every query vector.

    Args:
        collection: Object exposing ``search(**kwargs)``.
        vector_field: Name of the vector field to search (``anns_field``).
        id_field: Name of the primary-key field; used to build the
            ``"<id_field> >= 0"`` filter expression.
        search_vectors: Query vectors passed through as ``data``.

    The metric type, ``nprobe`` and result limit come from the module-level
    ``_METRIC_TYPE``, ``_NPROBE`` and ``_TOPK`` constants.
    """
    search_param = {
        "data": search_vectors,
        "anns_field": vector_field,
        "param": {"metric_type": _METRIC_TYPE, "params": {"nprobe": _NPROBE}},
        "limit": _TOPK,
        # Build the filter from the id_field argument. The expression used to
        # be the literal "id_field >= 0", which only matched by coincidence
        # when the primary key happened to be named "id_field".
        "expr": f"{id_field} >= 0",
    }
    results = collection.search(**search_param)
    for i, result in enumerate(results):
        print("\nSearch result for {}th vector: ".format(i))
        for j, res in enumerate(result):
            print("Top {}: {}".format(j, res))


def set_properties(collection):
    """Set the collection-level TTL property to 1800 seconds."""
    ttl_properties = {"collection.ttl.seconds": 1800}
    collection.set_properties(properties=ttl_properties)


def main(data_file="./deep-image-96-angular.hdf5", db_dataset="/train", pattern_dataset="/test"):
    """Run the end-to-end demo: create, fill, index, search and drop a collection.

    Args:
        data_file: HDF5 file (or directory of HDF5 files) holding the vectors.
        db_dataset: Dataset path of the vectors to insert.
        pattern_dataset: Dataset path of the query vectors.

    Raises:
        ValueError: if either dataset could not be loaded.
    """
    # create a connection
    create_connection()

    # drop collection if the collection exists
    if has_collection(_COLLECTION_NAME):
        drop_collection(_COLLECTION_NAME)

    # create collection
    collection = create_collection(_COLLECTION_NAME, _ID_FIELD_NAME, _VECTOR_FIELD_NAME)

    # From here on the collection exists on the server, so make sure it is
    # dropped again even when one of the steps below fails.
    try:
        # alter ttl properties of collection level
        set_properties(collection)

        # show collections
        list_collections()

        db_data, db_pattern = load_data(
            db_data_file_dir=data_file,
            db_data_dataset=db_dataset,
            pattern_data_file_dir=data_file,
            pattern_data_dataset=pattern_dataset,
        )
        # load_data yields None when no HDF5 file was found; fail with a clear
        # message instead of an AttributeError deep inside insert().
        if db_data is None or db_pattern is None:
            raise ValueError("no HDF5 data could be loaded from {!r}".format(data_file))

        # insert every loaded vector (one entity per row of db_data)
        insert(collection, db_data)
        collection.flush()
        # get the number of entities
        get_entity_num(collection)

        # create index
        create_index(collection, _VECTOR_FIELD_NAME)

        # load data to memory
        load_collection(collection)

        # search
        search(collection, _VECTOR_FIELD_NAME, _ID_FIELD_NAME, db_pattern)

        # release memory
        release_collection(collection)

        # drop collection index
        # drop_index(collection)
    finally:
        # drop collection
        drop_collection(_COLLECTION_NAME)

# Always shut the embedded Milvus server down, even when the demo raises
# part-way through; otherwise the started server process keeps running.
try:
    main()
finally:
    default_server.stop()




Create connection...

List connections:
[('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x11e82bbb0>)]

collection created: demo

list collections:
['demo']
Concatenated data shape: (9990000, 96)
Concatenated data shape: (10000, 96)


In [3]:
def insert(num, dim):
    """Build and print a sample ``[ids, vectors]`` payload with random values.

    ``ids`` is ``0 .. num-1`` and ``vectors`` holds ``num`` lists of ``dim``
    floats drawn from ``random.random()``. Nothing is sent to Milvus.

    NOTE: this reuses the name ``insert``, so running this cell replaces the
    ``insert(collection, db_data)`` helper defined earlier in the notebook.
    """
    ids = list(range(num))
    vectors = []
    for _ in range(num):
        vectors.append([random.random() for _ in range(dim)])

    data = [ids, vectors]
    print(type(data))
    print(data)


insert(2, 4)


<class 'list'>
[[0, 1], [[0.3515893227098098, 0.014051220768119932, 0.5188800444922963, 0.24591186833394352], [0.19807924413544042, 0.6490144610433105, 0.0538598294335535, 0.7194652749658635]]]


In [4]:
import numpy as np

# Two example 4-dimensional vectors, kept as plain Python lists.
V1 = [7.5, 49.5, 73.5, 58.5]
V2 = [1.81, 2.81, 3.81, 4.81]

# Inner product of the two vectors: the sum of the element-wise products.
dot_product = np.dot(V1, V2)

# Alternatively, np.inner(V1, V2) gives the same result for 1-D inputs.
# dot_product = np.inner(V1, V2)

print("Dot product of V1 and V2:", dot_product)

Dot product of V1 and V2: 714.09
