In [2]:
import os
import sys
import pyodbc
import logging
import json
import numpy as np
import faiss
from dotenv import load_dotenv
from sqlext.utils import Buffer, VectorSet, NpEncoder 
from test import load_vectors_from_db, vector_to_array, get_hnsw_links

In [3]:
# Route log records (INFO and above) to stdout so they show up in the cell output.
log_format = "%(asctime)s [%(levelname)s] %(message)s"
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(level=logging.INFO, format=log_format, handlers=[stdout_handler])

# Pull settings from a .env file into the process environment (os.environ["MSSQL"]
# is read further down); the returned flag is displayed as this cell's output.
load_dotenv()

True

In [4]:
# Confirm the connection string was loaded, but mask the password so the
# credential is never stored in the notebook's saved output.
# NOTE: splits on ';' — a brace-quoted value that itself contains ';' would not
# be fully masked; keep such secrets out of this cell entirely.
print(
    ";".join(
        f"{part.split('=', 1)[0]}=***"
        if part.split("=", 1)[0].strip().lower() in ("pwd", "password")
        else part
        for part in os.environ["MSSQL"].split(";")
    )
)

Driver={ODBC Driver 18 for SQL Server};Server=host.docker.internal;Database=wikipedia;Uid=vectordb_user;Pwd=***;Encrypt=no;Connection Timeout=30;


In [5]:
# Load the database ids and their embedding vectors via the project helper.
# `ids` is used below as a NumPy array (boolean masks / fancy indexing) and
# `vectors` is converted into the matrix that FAISS indexes.
# NOTE(review): the helper presumably reads the MSSQL connection string from the
# environment — confirm in the `test` module.
ids, vectors = load_vectors_from_db()

2024-04-07 00:20:36,473 [INFO] Loaded 10000 rows, total rows 10000, total memory footprint 58 MB
2024-04-07 00:20:56,979 [INFO] Loaded 10000 rows, total rows 20000, total memory footprint 117 MB
2024-04-07 00:21:06,905 [INFO] Loaded 5000 rows, total rows 25000, total memory footprint 146 MB
2024-04-07 00:21:07,004 [INFO] Done


In [6]:
# Build an in-memory FAISS HNSW index (32 links per node) over all loaded vectors.
# FAISS only works with float32 matrices, so make the dtype explicit instead of
# relying on whatever dtype the loader happened to produce.
nvp = np.asarray(vectors, dtype=np.float32)

# Row i of nvp must correspond to ids[i]; later cells map index positions back to ids.
assert len(ids) == nvp.shape[0], "ids and vectors are misaligned"

d = nvp.shape[1]  # vector dimensionality
index = faiss.index_factory(d, "HNSW32")
index.add(nvp)

In [30]:
# Find the row position of the vector whose database id is 3451.
matching_rows = np.where(ids == 3451)[0]
qp = matching_rows[0]

# The search call takes a batch of queries, so wrap the single vector as a 1-row matrix.
query_vector = nvp[qp]
qv = np.asarray([query_vector])

# Ask for the 10 nearest neighbors: distances and index positions.
dist, idx = index.search(qv, 10)

In [31]:
# Index positions of the 10 neighbors returned for the single query row.
idx[0]

array([ 780,  294, 2241, 5761, 4717, 2833, 4546, 1760, 5054, 4574])

In [32]:
# Translate the neighbor index positions back to database vector ids.
ids[idx[0]]

array([ 3451,   566,  7469, 18659, 14855,  8934, 14199,  5894, 15966,
       14317])

In [10]:
# Grab the HNSW graph structure that backs the index.
hnsw = index.hnsw

# Copy the per-vector level assignments out of FAISS into a NumPy array.
levels = faiss.vector_to_array(hnsw.levels)

# Lowest and highest level assigned to any vector; shown as the cell output.
lmin = levels.min()
lmax = levels.max()
(lmin, lmax)

(1, 4)

In [11]:
# Index positions of the vectors that sit on the highest HNSW level.
# Uses lmax from the previous cell rather than a hard-coded level, so the cell
# stays correct when the data (and therefore the level distribution) changes.
g = np.where(levels == lmax)[0]
g

array([12586, 20288])

In [12]:
# Database ids of the top-level vectors. `g` comes from np.where, so it only
# holds valid (non-negative) positions and can be used to index directly.
ids[g]

array([46308, 77943])

In [17]:
# Export the HNSW graph to SQL Server: for every indexed vector, write one row
# per link as (id, id_neighbor, l) into [$vector].faiss_hnsw, where `l` is the
# position of the neighbor list within the helper's result.
INSERT_LINK_SQL = 'INSERT INTO [$vector].faiss_hnsw (id, id_neighbor, l) VALUES (?, ?, ?)'

conn = pyodbc.connect(os.environ["MSSQL"])
try:
    # One cursor for the whole export instead of a new one per (vector, level).
    cursor = conn.cursor()
    cursor.fast_executemany = True

    for i, vector_id in enumerate(ids):
        # Project helper; presumably returns one neighbor list per level, with
        # -1 marking unused slots (hence the filter below) — TODO confirm.
        neighbors = get_hnsw_links(hnsw, i)

        # Collect the links of this vector across all levels into one batch.
        rows = []
        for lidx, nl in enumerate(neighbors):
            # Map neighbor index positions to database ids, skipping -1 padding.
            nl_ids = ids[[v for v in nl if v != -1]]
            # Convert NumPy integers to plain ints for parameter binding.
            rows.extend((int(vector_id), int(nv_id), lidx) for nv_id in nl_ids)

        # pyodbc rejects an empty parameter sequence, so skip vectors without links.
        if not rows:
            continue

        cursor.executemany(INSERT_LINK_SQL, rows)
        # Commit per vector so all links of a vector are stored atomically.
        conn.commit()
finally:
    # Always release the connection, even if an insert fails part-way through
    # (closing discards any uncommitted work).
    conn.close()