### Initialize the PoC

In [46]:
import os
import sys
import logging
import numpy as np
import faiss
from dotenv import load_dotenv
import hnsw_utils 
# Send INFO-and-above log records to stdout so that messages logged by the
# helper module appear in the notebook's cell output.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Load settings from a .env file into the process environment. Keeping the
# call as the cell's last expression displays its boolean result.
load_dotenv()

True

### Load vectors from database

In [47]:
# Pull the stored vectors and their database ids into memory; the helper logs
# its progress as it goes. Later cells index `ids` with faiss row positions,
# so the two results are presumably aligned row-for-row — TODO confirm in hnsw_utils.
ids, vectors = hnsw_utils.load_vectors_from_db()

2024-04-08 01:34:41,268 [INFO] Loaded 10000 rows, total rows 10000, total memory footprint 58 MB
2024-04-08 01:34:55,070 [INFO] Loaded 10000 rows, total rows 20000, total memory footprint 117 MB
2024-04-08 01:35:02,208 [INFO] Loaded 5000 rows, total rows 25000, total memory footprint 146 MB
2024-04-08 01:35:02,273 [INFO] Done


### Create index

In [48]:
# faiss only accepts C-contiguous float32 matrices, so coerce the loaded
# vectors explicitly instead of relying on whatever dtype/layout the loader
# produced (this is a no-op when the data already has the right form).
nvp = np.ascontiguousarray(vectors, dtype=np.float32)
if nvp.ndim != 2 or nvp.shape[0] == 0:
    raise ValueError(
        f"expected a non-empty 2-D array of vectors, got shape {nvp.shape}"
    )

# Vector dimensionality, taken from the data itself.
d = nvp.shape[1]

# "HNSW32" asks the factory for an HNSW graph index with M=32.
index = faiss.index_factory(d, "HNSW32")
index.add(nvp)

### Test the index

Use a reference vector to search for the nearest neighbors

In [49]:
# Database id of the reference vector and how many neighbours to retrieve.
query_db_id = 11193
n_neighbors = 10

# Translate the database id into the row position used inside the index,
# failing with an explicit message if that id was never loaded.
matches = np.where(ids == query_db_id)[0]
if matches.size == 0:
    raise ValueError(f"database id {query_db_id} was not found among the loaded ids")
qp = matches[0]

# index.search takes a 2-D batch of queries, so wrap the single row in a batch of one.
qv = np.asarray([nvp[qp]])
dist, idx = index.search(qv, n_neighbors)

# Handle to the index's underlying HNSW graph, inspected by the following cells.
hnsw = index.hnsw

List vectors (by their internal id) that are close to the reference vector

In [50]:
# Row 0 holds the hits for the single query vector; the values are
# faiss-internal row positions, not database ids.
idx[0]

array([ 3692, 20306,  7434,  3426,  2162, 16764,  6720, 10448,  3591,
        3706])

List vectors by their *database* ids

In [51]:
# Index `ids` with the internal positions to recover the matching database ids.
ids[idx[0]]

array([11193, 78034, 23984, 10193,  7342, 63964, 21230, 37075, 10915,
       11219])

Return the maximum graph level

In [52]:
# Copy the graph's per-vector level data into a NumPy array and display its maximum.
# NOTE(review): faiss documents these levels as 1-based (base level = 1) — confirm
# how the value maps onto layer numbers before interpreting it.
levels = faiss.vector_to_array(hnsw.levels)
levels.max()

4

Show the content (as internal vector ids) of a specific graph level

In [53]:
# Internal ids of the vectors whose entry in `levels` equals 4.
g = np.flatnonzero(levels == 4)
g

array([12586, 20288])

Show database ids of vectors in a specific graph level

In [54]:
# Map the internal ids in `g` to database ids, dropping any -1 entries first.
ids[g[g != -1]]

array([46308, 77943])

Show all the neighbors of a specific vector, at any level

In [55]:
# Fetch the link lists of internal id 12586.
# The helper appears to return one array per graph level, with -1 filling
# unused link slots — TODO confirm against hnsw_utils.get_hnsw_links.
neighbors = hnsw_utils.get_hnsw_links(hnsw, 12586)
neighbors

[array([ 6057, 17442,  6775, 24354, 21027, 22156, 12559, 21162, 16636,
        15808, 22469,   595, 16570, 21038, 20929, 21858, 20919,  3038,
         6704, 21035,  7736,   598, 11301, 24053, 23534, 16097, 23847,
        17455, 14228, 23985, 23994, 20942, 22864, 21851, 15303, 22132,
        22067, 10128,  2266,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1], dtype=int32),
 array([20288, 13907,  9991, 23475,  9192, 23338, 13815, 20502,  6057,
        24970,  1988,  7830,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1], dtype=int32),
 array([20288,   592,  3913,  8169, 23593,  3123, 11537,  5033, 10189,
        18411, 20513, 22509,  1751,  8947, 13212, 19782, 24582, 11548,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    

Save graph back to the database

In [None]:
# Persist the graph through the helper module. `ids` is passed along, presumably so
# links can be stored under database ids rather than internal positions — TODO confirm.
hnsw_utils.save_hnsw_graph(ids, hnsw)     