In [1]:
from min_vec import MinVectorDB

In [2]:
# Initialize the database handle; 'my_vec_db' is used as the database root path
# (the log line below shows the resolved absolute location).
my_db = MinVectorDB('my_vec_db')

MinVectorDB - INFO - Successful initialization of MinVectorDB in root_path: /projects/MinVectorDB/tutorials/my_vec_db


## Use FLAT index mode

FLAT mode is a brute-force search: each query is compared against every stored vector, so query time grows linearly with the size of the collection.

In [3]:
# Request a 128-dimensional collection that uses the FLAT index.
# drop_if_exists=True drops an existing 'test_vec' collection first
# (see the "already exists. Dropped." log line below).
collection = my_db.require_collection(
    collection='test_vec',
    dim=128,
    index_mode='FLAT',
    drop_if_exists=True,
)

MinVectorDB - INFO - Creating collection test_vec with: 
//    dim=128, collection='test_vec', 
//    n_clusters=16, chunk_size=100000,
//    distance='cosine', index_mode='FLAT', 
//    dtypes='float32', use_cache=True, 
//    scaler_bits=8, n_threads=10
MinVectorDB - INFO - Collection 'test_vec' already exists. Dropped.


In [4]:
from tqdm import trange

import numpy as np

# Add 100,000 random 128-dimensional vectors inside one insert session.
# The very first vector is also kept as `query`, so the search in the next
# cell has a known match stored under id 0.
with collection.insert_session():
    for i in trange(100000, unit="vectors"):
        vec = np.random.random(128)
        if i == 0:
            query = vec
        # Every run of 1,000 consecutive ids shares the same "test" field value.
        item_field = {"test":f"test_{i // 1000}"}
        collection.add_item(vec, id=i, field=item_field)

100%|██████████| 100000/100000 [00:00<00:00, 136726.36vectors/s]


In [5]:
# Search for the 5 nearest neighbours of `query`. The displayed result is a
# tuple of (ids, similarity scores); `query` was inserted with id 0, so id 0
# is expected to rank first.
collection.query(vector=query, k=5)

(array([    0, 83623, 27364, 63772, 95421]),
 Array([0.9964056 , 0.84585243, 0.84466696, 0.8444241 , 0.84396076],      dtype=float32))

In [6]:
# Show the report of the most recent query: database shape, query time,
# distance metric, k, and the top-k ids with their similarities.
print(collection.query_report_)


* - MOST RECENT QUERY REPORT -
| - Database Shape: (100000, 128)
| - Query Time: 0.12887 s
| - Query Distance: cosine
| - Query K: 5
| - Top 5 Results ID: [    0 83623 27364 63772 95421]
| - Top 5 Results Similarity: [0.996406 0.845852 0.844667 0.844424 0.843961]
* - END OF REPORT -



## Use IVF-FLAT index mode

IVF-FLAT uses inverted indexes for significant speedups on large-scale data.


Note that IVF-FLAT mode only takes effect once the number of rows added to the collection has reached 100,000; with fewer than 100,000 rows, it falls back to FLAT mode.

In [7]:
# Request a second 128-dimensional collection, this time with the IVF-FLAT
# index and a chunk size of 10,000. drop_if_exists=True drops an existing
# 'test_vec2' collection first (see the log output below).
collection = my_db.require_collection(
    collection='test_vec2',
    dim=128,
    index_mode='IVF-FLAT',
    drop_if_exists=True,
    chunk_size=10000,
)

MinVectorDB - INFO - Creating collection test_vec2 with: 
//    dim=128, collection='test_vec2', 
//    n_clusters=16, chunk_size=10000,
//    distance='cosine', index_mode='IVF-FLAT', 
//    dtypes='float32', use_cache=True, 
//    scaler_bits=8, n_threads=10
MinVectorDB - INFO - Collection 'test_vec2' already exists. Dropped.


In [8]:
from tqdm import trange

import numpy as np

# Fill the IVF-FLAT collection with 100,000 random 128-dimensional vectors
# inside one insert session. The first vector generated is stored under id 0
# and reused as `query`, replacing the query vector from the FLAT section.
with collection.insert_session():
    for i in trange(100000, unit="vectors"):
        vec = np.random.random(128)
        if i == 0:
            query = vec
        # Every run of 1,000 consecutive ids shares the same "test" field value.
        item_field = {"test":f"test_{i // 1000}"}
        collection.add_item(vec, id=i, field=item_field)

100%|██████████| 100000/100000 [00:00<00:00, 138672.42vectors/s]


In [9]:
# Search the IVF-FLAT collection for the 5 nearest neighbours of `query`.
# The displayed result is a tuple of (ids, similarity scores); `query` was
# inserted with id 0, so id 0 is expected to rank first.
collection.query(vector=query, k=5)

(array([    0, 25346, 66497, 26745, 69704]),
 Array([0.9972331 , 0.8270122 , 0.8229687 , 0.8146472 , 0.81384647],      dtype=float32))

In [10]:
# Show the report of the most recent query on the IVF-FLAT collection; its
# "Query Time" line can be compared with the FLAT report earlier in the notebook.
print(collection.query_report_)


* - MOST RECENT QUERY REPORT -
| - Database Shape: (100000, 128)
| - Query Time: 0.06724 s
| - Query Distance: cosine
| - Query K: 5
| - Top 5 Results ID: [    0 25346 66497 26745 69704]
| - Top 5 Results Similarity: [0.997233 0.827012 0.822969 0.814647 0.813846]
* - END OF REPORT -

