### Initialize PoC

In [1]:
import os
import sys
import logging
import numpy as np
import faiss
import json
from openai import AzureOpenAI
from dotenv import load_dotenv
import hnsw_utils 

# Load key/value pairs from a .env file into the process environment before
# anything below reads os.environ (e.g. OPENAI_KEY / OPENAI_ENDPOINT).
load_dotenv()

# Route log records of level INFO and above to stdout so that progress
# messages appear inline as cell output.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[stdout_handler],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)


### Load vectors from database

In [2]:
# Where the embeddings live and how much of them to pull into memory.
SOURCE_TABLE = "dbo.wikipedia_articles_title_embeddings_native"
EMBEDDING_DIM = 1536   # passed as vector_size: components per stored vector
MAX_ROWS = 25000       # passed as top_n: presumably the maximum number of rows to load

# Later cells treat ids[i] as the database id of the i-th loaded vector.
ids, vectors = hnsw_utils.load_vectors_from_db(
    table_name=SOURCE_TABLE,
    id_column="id",
    vector_column="title_vector_native",
    vector_size=EMBEDDING_DIM,
    top_n=MAX_ROWS,
)

2024-04-27 20:30:24,562 [INFO] Loaded 10000 rows, total rows 10000, total memory footprint 58 MB
2024-04-27 20:30:26,921 [INFO] Loaded 10000 rows, total rows 20000, total memory footprint 117 MB
2024-04-27 20:30:28,009 [INFO] Loaded 5000 rows, total rows 25000, total memory footprint 146 MB
2024-04-27 20:30:28,064 [INFO] Done


### Create index

In [3]:
# FAISS operates on C-contiguous float32 matrices of shape (n_vectors, d).
# Converting explicitly keeps index.add() independent of whatever dtype or
# container load_vectors_from_db returned; no copy is made when the data is
# already a contiguous float32 array.
nvp = np.ascontiguousarray(vectors, dtype=np.float32)
d = nvp.shape[1]  # vector dimensionality

# HNSW index built through the factory string "HNSW", scored by inner product.
index = faiss.index_factory(d, "HNSW", faiss.METRIC_INNER_PRODUCT)
index.add(nvp)

# Keep a handle on the underlying graph structure; later cells inspect it.
hnsw = index.hnsw

# Quick summary of what was built (concrete index class, metric id, efSearch).
{"type": type(index), "metric": index.metric_type, "efSearch": hnsw.efSearch}

{'type': faiss.swigfaiss_avx2.IndexHNSWFlat, 'metric': 0, 'efSearch': 16}

### Test the index

Use a reference vector to search for the nearest neighbors

In [4]:
# Database id of the article whose own vector is used as the query.
reference_id = 11193

# Position of that id inside the loaded arrays (raises IndexError if absent).
qp = np.flatnonzero(ids == reference_id)[0]

# search() takes a batch of queries, so keep the vector as a 1 x d matrix.
qv = nvp[qp].reshape(1, -1)

# Ask for the 10 nearest neighbours: `dist` holds the scores, `idx` the
# internal positions of the hits, one row per query.
dist, idx = index.search(qv, 10)

List vectors (by their internal id) that are close to the reference vector

In [5]:
# Row 0 of `idx` belongs to the single query vector: the internal ids
# (positions in the index) of its nearest neighbours.
idx[0]

array([ 3692, 20306,  7434,  3426,  2162, 16764,  6720, 10448,  3591,
        3706])

List vectors by their *database* ids

In [6]:
# Translate internal positions into database ids. FAISS fills unused result
# slots with -1 when it finds fewer than k neighbours; drop those first,
# otherwise -1 would silently select the last entry of `ids`.
ids[idx[0][idx[0] >= 0]]

array([11193, 78034, 23984, 10193,  7342, 63964, 21230, 37075, 10915,
       11219])

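List the vectors by their database id along with their scores against the reference vector. Because the index uses `METRIC_INNER_PRODUCT`, the values FAISS returns as "distances" are inner-product similarities: higher means closer.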

In [7]:
# Pair every hit's database id with its score, keeping the result order.
# Slots labelled -1 are FAISS padding for "no result"; skipping them avoids
# ids[-1] silently reporting the last loaded id as a match.
r = {str(ids[i]): score for i, score in zip(idx[0], dist[0]) if i != -1}

# The scores are NumPy scalars, hence the custom encoder from hnsw_utils.
json.dumps(r, cls=hnsw_utils.NpEncoder)

'{"11193": 0.9999999403953552, "78034": 0.8929316997528076, "23984": 0.883037805557251, "10193": 0.8578137159347534, "7342": 0.8558316826820374, "63964": 0.8552420139312744, "21230": 0.8529642820358276, "37075": 0.8517205715179443, "10915": 0.8511642813682556, "11219": 0.8493160605430603}'

Return the maximum graph level (FAISS numbers levels from 1, so the base layer counts as level 1)

In [8]:
# Copy the per-vector level information out of the FAISS graph into a NumPy array.
levels = faiss.vector_to_array(hnsw.levels)

# Highest level value found across all vectors.
np.max(levels)

4

Show content (as vector internal ids) of a specific graph level

In [9]:
# Level to inspect; 4 is the maximum reported by the previous cell.
level = 4

# Internal ids (positions) of the vectors whose level value equals it.
g = np.flatnonzero(levels == level)
g

array([12586, 20288])

Show database ids of vectors in a specific graph level

In [10]:
# Map internal positions to database ids, skipping any -1 placeholder entries.
ids[g[g != -1]]

array([46308, 77943])

Show all the neighbors of a specific vector, at any level

In [11]:
# Internal id of the vector to inspect (one of the vectors listed for level 4).
vector_id = 12586

# The helper returns a list of int32 arrays of neighbour ids; in the output
# the unused slots show up as -1.
neighbors = hnsw_utils.get_hnsw_links(hnsw, vector_id)
neighbors

[array([ 6057, 17442,  6775, 24354, 22156, 22469, 16636, 12559, 21858,
        21038, 20919,  3038,  6704, 21035,  7736,   598, 11301, 24053,
        23847, 22605, 23534, 16097, 17455, 23985, 14228, 20942, 21851,
        23994, 15303, 22132, 22864, 22067, 10128,  2266,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1], dtype=int32),
 array([20502, 23475,  9991,  2627, 13815,  6057,  1988,  7830,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1], dtype=int32),
 array([20288,   592,  3913,  8169, 23593,  3123, 11537,  5033, 10189,
        18411, 20513, 22509,  1751,  8947, 13212, 19782, 24582, 11548,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    

Save graph back to the database

In [12]:
# Persist the HNSW graph to the database through the project helper.
# `ids` is passed together with the graph, presumably so that links can be
# stored by database id instead of internal position (confirm in hnsw_utils).
# Progress is logged in batches ("Inserting N records...").
hnsw_utils.save_hnsw_graph_to_db(ids, hnsw)     

2024-04-27 20:30:57,017 [INFO] Inserting 10007 records...
2024-04-27 20:30:57,329 [INFO] Inserting 10009 records...
2024-04-27 20:30:57,501 [INFO] Inserting 10040 records...
2024-04-27 20:30:57,799 [INFO] Inserting 10038 records...
2024-04-27 20:30:57,990 [INFO] Inserting 10008 records...
2024-04-27 20:30:58,197 [INFO] Inserting 10016 records...
2024-04-27 20:30:58,416 [INFO] Inserting 10011 records...
2024-04-27 20:30:58,631 [INFO] Inserting 10010 records...
2024-04-27 20:30:58,851 [INFO] Inserting 10021 records...
2024-04-27 20:30:59,033 [INFO] Inserting 10037 records...
2024-04-27 20:30:59,213 [INFO] Inserting 10047 records...
2024-04-27 20:30:59,396 [INFO] Inserting 10019 records...
2024-04-27 20:30:59,571 [INFO] Inserting 10062 records...
2024-04-27 20:30:59,750 [INFO] Inserting 10017 records...
2024-04-27 20:30:59,927 [INFO] Inserting 10028 records...
2024-04-27 20:31:00,124 [INFO] Inserting 10036 records...
2024-04-27 20:31:00,306 [INFO] Inserting 10017 records...
2024-04-27 20:

### Test using a calculated embedding

Get the embeddings

In [None]:
# Azure OpenAI client. The key and the endpoint are read from environment
# variables, so a missing variable raises KeyError right here.
client = AzureOpenAI(
    api_key=os.environ["OPENAI_KEY"],
    api_version="2024-02-01",
    azure_endpoint=os.environ["OPENAI_ENDPOINT"],
)

In [None]:
# Text to embed and use as the search query.
text = "Pasta"

# Request one embedding; "embeddings" is the model name passed to the API.
response = client.embeddings.create(input=[text], model="embeddings")

# A single input was sent, so the first data item carries its vector.
e = response.data[0].embedding

In [None]:
# Serialize the embedding to a JSON string; the whole vector is shown as the
# cell output.
json.dumps(e)

Find the closest items along with their scores (inner-product similarities: higher means closer)

In [None]:
# One query row, explicitly float32: FAISS operates on float32, while
# np.asarray on a list of Python floats would default to float64.
qv = np.asarray([e], dtype=np.float32)

# 10 nearest neighbours of the computed embedding.
dist, idx = index.search(qv, 10)

# Database id -> score, in result order. Slots labelled -1 are FAISS padding
# for "no result"; skipping them avoids ids[-1] reporting a bogus match.
r = {str(ids[i]): score for i, score in zip(idx[0], dist[0]) if i != -1}

# The scores are NumPy scalars, hence the custom encoder from hnsw_utils.
json.dumps(r, cls=hnsw_utils.NpEncoder)
