### Initialize the PoC

In [1]:
import os
import sys
import logging
import numpy as np
import faiss
import json
from openai import AzureOpenAI
from dotenv import load_dotenv
import hnsw_utils 

# Load settings from a local .env file into the process environment; the
# OPENAI_* variables read later in this notebook are expected to come from there.
load_dotenv()

# Route INFO-and-above records to stdout so they show up in the cell output.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[stdout_handler],
)


### Load vectors from database

In [5]:
# Fetch every stored embedding together with its database id. The helper logs
# progress per batch (see the output below).
# `ids` is indexed later with positions returned by FAISS, so it is presumably
# a 1-D NumPy array aligned row-for-row with `vectors` — verify in hnsw_utils.
ids, vectors = hnsw_utils.load_vectors_from_db()

2024-04-09 17:44:09,637 [INFO] Loaded 10000 rows, total rows 10000, total memory footprint 58 MB
2024-04-09 17:44:23,310 [INFO] Loaded 10000 rows, total rows 20000, total memory footprint 117 MB
2024-04-09 17:44:30,195 [INFO] Loaded 5000 rows, total rows 25000, total memory footprint 146 MB
2024-04-09 17:44:30,265 [INFO] Done


### Create index

In [74]:
# FAISS works on C-contiguous float32 matrices. Convert explicitly instead of
# relying on whatever dtype/layout the loader happened to return.
nvp = np.ascontiguousarray(vectors, dtype=np.float32)

# Every later lookup of the form ids[<faiss position>] assumes one row per id.
assert nvp.ndim == 2, "expected a 2-D (n_vectors, dim) matrix"
assert nvp.shape[0] == len(ids), "expected exactly one vector row per database id"

d = nvp.shape[1]  # embedding dimensionality

# The bare "HNSW" factory string builds the index with FAISS defaults; the
# following cell displays the resulting index type, metric and efSearch.
index = faiss.index_factory(d, "HNSW")
index.add(nvp)

In [75]:
# Summarise what the factory built: concrete index class, metric code and
# the efSearch setting of the underlying HNSW structure.
dict(type=type(index), metric=index.metric_type, efSearch=index.hnsw.efSearch)

{'type': faiss.swigfaiss_avx2.IndexHNSWFlat, 'metric': 1, 'efSearch': 16}

### Test the index

Use a reference vector to search for the nearest neighbors

In [77]:
# Locate the row position (FAISS-internal id) of the vector whose database id is 11193.
(hits,) = np.where(ids == 11193)
qp = hits[0]

# index.search takes a batch of queries, so wrap the single vector in a one-row matrix.
qv = np.asarray([nvp[qp]])
dist, idx = index.search(qv, 10)

# Keep a handle on the HNSW graph structure; the cells below inspect it.
hnsw = index.hnsw

List vectors (by their internal id) that are close to the reference vector

In [62]:
# Row 0 holds the neighbour positions for the single query vector searched above.
idx[0]

array([ 3692, 20306,  7434,  3426,  2162, 16764,  6720, 10448,  3591,
        3706])

List vectors by their *database* ids

In [63]:
# Translate FAISS-internal positions into database ids via fancy indexing.
# NOTE(review): a -1 position (if FAISS ever returns one) would silently map to ids[-1].
ids[idx[0]]

array([11193, 78034, 23984, 10193,  7342, 63964, 21230, 37075, 10915,
       11219])

Return max graph level

In [64]:
# Copy the per-vector level assignments out of the FAISS vector into a NumPy array.
# NOTE(review): FAISS documents `levels` as 1-based (base level = 1) — confirm
# before treating the value below as a 0-based layer index.
levels = faiss.vector_to_array(hnsw.levels)
np.max(levels)

4

Show the contents (as internal vector ids) of a specific graph level

In [65]:
# Internal ids of the vectors whose recorded level equals 4 (the maximum found above).
g = np.flatnonzero(levels == 4)
g

array([12586, 20288])

Show database ids of vectors in a specific graph level

In [66]:
# Drop any -1 placeholders with a boolean mask, then map the remaining
# internal ids to database ids.
ids[g[g != -1]]

array([46308, 77943])

Show all the neighbors of a specific vector, at any level

In [67]:
# 12586 is the first of the two internal ids found at level 4 above.
# The helper returns a list of arrays; presumably one array per graph level,
# with -1 marking unused neighbour slots — verify in hnsw_utils.get_hnsw_links.
neighbors = hnsw_utils.get_hnsw_links(hnsw, 12586)
neighbors

[array([ 6057,  9646, 17442,  6775, 24354, 21027, 22156, 12559, 16636,
        21162, 15808, 22469,   595, 16570, 21038, 20929, 21858, 20919,
         3038,  6704, 21035,   598, 11301,  7736, 24053, 17455, 23985,
        23534, 16097, 23847, 23994, 14228, 20942, 15303, 21851, 22864,
        22067, 22132,  2266, 10128,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1], dtype=int32),
 array([13907, 23475,  5109, 20502, 13815,  2006,  6057,  1988,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1], dtype=int32),
 array([20288,   592,  3913,  8169, 23593,  3123, 11537,  5033, 10189,
        18411, 20513, 22509,  1751,  8947, 13212, 19782, 24582, 11548,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    

Save graph back to the database

In [78]:
# Persist the graph, passing `ids` alongside the HNSW structure.
# NOTE(review): this writes to the database on every run of the notebook —
# confirm that save_hnsw_graph is safe to re-run.
hnsw_utils.save_hnsw_graph(ids, hnsw)     

### Test using a calculated embedding

Get the embeddings

In [79]:
# Build the Azure OpenAI client from environment variables (never hard-code the key).
# The endpoint's trailing slash is stripped: with it, the request URL logged by
# the embedding call contains a double slash ("...azure.com//openai/...").
client = AzureOpenAI(
    api_key=os.environ["OPENAI_KEY"],
    api_version="2024-02-01",
    azure_endpoint=os.environ["OPENAI_ENDPOINT"].rstrip("/"),
)

In [90]:
text = "Pasta"

# "embeddings" is the Azure deployment name (it appears in the request URL logged below).
# A single input is sent, so the only embedding is at data[0].
response = client.embeddings.create(model="embeddings", input=[text])
e = response.data[0].embedding

2024-04-09 18:42:15,083 [INFO] HTTP Request: POST https://dm-open-ai-3.openai.azure.com//openai/deployments/embeddings/embeddings?api-version=2024-02-01 "HTTP/1.1 200 OK"


In [91]:
# Serialise the full embedding as a JSON array string.
# NOTE(review): this prints every component of the vector; if only a preview
# is needed, consider showing a short slice instead.
json.dumps(e)

'[0.02129378356039524, -0.002485408913344145, 0.004123273305594921, -0.009836466051638126, -0.013313254341483116, 0.009143582545220852, -0.01884394697844982, -0.019462592899799347, -0.0031798386480659246, -0.03432245925068855, 0.021231919527053833, 0.007250526919960976, -0.00781968142837286, 0.01711173914372921, 0.004488273989409208, 0.00695976335555315, 0.03531229496002197, 0.00013194551866035908, 0.02578515000641346, -0.031971607357263565, -0.01753241755068302, -0.01379579771310091, 0.012756473384797573, -0.006081286817789078, -0.011605792678892612, -0.005617302376776934, -0.00781968142837286, -0.016406482085585594, 0.014760885387659073, -0.0219371747225523, 0.03946959227323532, -0.026057355105876923, -0.00223949714563787, -0.006829848047345877, 0.0037582723889499903, -0.0063782366923987865, -0.0036190771497786045, -0.008623920381069183, 0.005249208305031061, 0.006526711396872997, 0.0034303900320082903, 0.008933243341743946, -0.006427728105336428, -0.017383942380547523, -0.0268492214

Find closest items along with their distances

In [92]:
# The API returns plain Python floats; FAISS expects a (n_queries, dim) float32
# matrix, so build the one-row query with an explicit dtype.
qv = np.asarray([e], dtype=np.float32)
dist, idx = index.search(qv, 10)

# FAISS pads the result with label -1 when fewer than k neighbours are found.
# Skip those entries; otherwise ids[-1] (the last database id) would be
# reported as a match by accident.
r = {
    str(ids[pos]): distance
    for pos, distance in zip(idx[0], dist[0])
    if pos != -1
}

# NpEncoder (from hnsw_utils) is needed because the distances are NumPy scalars.
json.dumps(r, cls=hnsw_utils.NpEncoder)


'{"47141": 2.7279957066639327e-06, "19931": 0.13070416450500488, "77682": 0.17061400413513184, "42307": 0.2040543258190155, "19360": 0.20772784948349, "29673": 0.21760308742523193, "4475": 0.22775641083717346, "52423": 0.2293991595506668, "60337": 0.23269997537136078, "12725": 0.2374797761440277}'