# Notebook 8: Add HNSW Edges

This notebook constructs an HNSW index from node embeddings to quickly approximate the K-nearest neighbors (KNN) for each node.

Each node is then linked with its **8 nearest neighbors**. Neighbors with a **similarity â‰¤ 0.6** are skipped.

## HNSW Hyperparameters
- **M** = 32  
- **efConstruction** = 200  
- **efSearch** = 64  

## Output
- The resulting **HNSW index** is saved.  
- The resulting graph is saved as **G5 - semantically enriched graph**.

In [1]:
import os
dir_path = os.getcwd()
print("The directory of this script is:", dir_path)
root_path = os.path.dirname(dir_path)
print("The root directory is:", root_path)

The directory of this script is: c:\Users\HP\Desktop\Projects\NodeRAG\graphs
The root directory is: c:\Users\HP\Desktop\Projects\NodeRAG


In [2]:
import sys
sys.path.append(root_path)
from graphs.Node import Node

In [3]:
import pickle
with open(f"{root_path}/graphs/data/graphs/G4_text_inserted_graph.pkl", "rb") as f:
    medical_g4 = pickle.load(f)

In [4]:
#sanity check
for node_id in medical_g4:
    node = medical_g4[node_id]
    for edge in node.edges:
        if node.edges[edge] > 1:
            print(node_id,"-",edge,"-",node.edges[edge])
print("-"*40)
for node_id in medical_g4:
    node = medical_g4[node_id]
    for edge in node.edges:
        if edge == node_id:
            print(node_id)
print("-"*40)
#calculate total edge weight
sum_edge = 0
for node_id in medical_g4:
    node = medical_g4[node_id]
    for edge in node.edges:
        sum_edge += node.edges[edge]
print("Total edge weight in G4:", sum_edge)
print("-"*40)

----------------------------------------
----------------------------------------
Total edge weight in G4: 145760
----------------------------------------


In [5]:
import json
import faiss
import numpy as np
#embeddings of S,A,H,T nodes

#load faiss index file
index = faiss.read_index("data/embedding/medical_index.faiss")
with open("data/embedding/medical_ids.json", "r") as f:
    medical_embedding_ids = json.load(f)
index.ntotal, len(medical_embedding_ids)

#reconstruct vectors from faiss index
num_vectors = index.ntotal
dimension = index.d
embeddings = np.zeros((num_vectors, dimension), dtype='float32')
for i in range(num_vectors):
    embeddings[i] = index.reconstruct(i)

print("Embeddings shape:", embeddings.shape)
embeddings

Embeddings shape: (4016, 768)


array([[-0.07040366, -0.0460059 , -0.03309077, ..., -0.04978694,
        -0.03295664, -0.00608065],
       [-0.03963271, -0.0726626 , -0.06257651, ..., -0.0454145 ,
        -0.03274148, -0.00346303],
       [ 0.00135392,  0.0302959 ,  0.02910572, ...,  0.01070352,
        -0.02824946, -0.05189522],
       ...,
       [ 0.04048249, -0.0287718 , -0.03389297, ..., -0.00295302,
         0.0417009 ,  0.00099732],
       [ 0.00630151, -0.01392328, -0.01267361, ..., -0.09236691,
         0.01060563, -0.01778928],
       [ 0.01245843, -0.01854508, -0.01106029, ..., -0.08220403,
         0.01130273,  0.05532239]], dtype=float32)

In [6]:
import time
import random
#build hnsw index
start_time = time.time()
M = 32
hnsw = faiss.IndexHNSWFlat(embeddings.shape[1], M, faiss.METRIC_INNER_PRODUCT)
hnsw.hnsw.efConstruction = 200
hnsw.hnsw.efSearch = 64
hnsw.add(embeddings)
faiss.write_index(hnsw, "data/embedding/medical_index_hnsw.faiss")
end_time = time.time()
print(f"HNSW index built in {end_time - start_time:.2f} seconds.")

#test

k_test = 8
q = embeddings[random.randint(0,embeddings.shape[0]-1)].reshape(1, -1)
distance, idx = hnsw.search(q, k_test)
for i in range(k_test):
    print(f"Neighbor {medical_embedding_ids[idx[0][i]]}: Index={idx[0][i]}, Similarity={distance[0][i]}")

HNSW index built in 0.16 seconds.
Neighbor medical-178: Index=3640, Similarity=1.0
Neighbor medical-178-S-0: Index=868, Similarity=0.8330248594284058
Neighbor medical-177-S-3: Index=867, Similarity=0.7988085150718689
Neighbor medical-528: Index=3990, Similarity=0.790191650390625
Neighbor medical-178-S-2: Index=870, Similarity=0.7583250999450684
Neighbor medical-538: Index=4000, Similarity=0.746252179145813
Neighbor medical-526: Index=3988, Similarity=0.7439181804656982
Neighbor medical-526-S-3: Index=2592, Similarity=0.7323269248008728


In [7]:
#addd hnsw edges to graph
for i in range(len(medical_embedding_ids)):
    node = medical_g4[medical_embedding_ids[i]]
    node_embedding = embeddings[i].reshape(1,-1)
    k = 16
    similarities, indices = hnsw.search(node_embedding, k+1)
    for j in range(k+1):
        neighbor_id = medical_embedding_ids[indices[0][j]]
        if neighbor_id == medical_embedding_ids[i]:
            continue
        neighbor = medical_g4[neighbor_id]
        similarity = round(similarities[0][j], 3)
        if similarity <= 0.6:
            continue
        node.link(neighbor, similarity)
        neighbor.link(node, similarity)

In [8]:
#calculate total edge weight after adding hnsw edges
sum_edge = 0
for node_id in medical_g4:
    node = medical_g4[node_id]
    for edge in node.edges:
        sum_edge += node.edges[edge]
print("Total edge weight:", sum_edge)
print("-"*40)
for node_id in medical_g4:
    node = medical_g4[node_id]
    for edge in node.edges:
        if edge == node_id:
            print(node_id)

Total edge weight: 223226.0760332346
----------------------------------------


In [9]:
with open(f"{root_path}/graphs/data/graphs/G5_semantically_enriched_graph.pkl", "wb") as f:
    pickle.dump(medical_g4, f)
with open(f"{root_path}/graphs/data/graphs/G5_semantically_enriched_graph.pkl", "rb") as f:
    medical_g5 = pickle.load(f)

In [10]:
#check
keys = list(medical_g5.keys())
for i in range(999):
    node_id = keys[i]
    node = medical_g5[node_id]
    for j in range(i+1, 1000):
        neighbor_id = keys[j]
        if neighbor_id in node.edges:
            print(node_id, "-", neighbor_id, "-", node.edges[neighbor_id])

medical-0-S-0 - medical-0-S-1 - 1.24
medical-0-S-0 - medical-1-S-2 - 0.611
medical-0-S-0 - medical-2-S-2 - 1.418
medical-0-S-0 - medical-13-S-0 - 1.344
medical-0-S-0 - medical-20-S-3 - 1.234
medical-0-S-1 - medical-1-S-2 - 1.436
medical-0-S-1 - medical-1-S-3 - 1.574
medical-0-S-1 - medical-2-S-0 - 1.558
medical-0-S-1 - medical-2-S-2 - 0.623
medical-0-S-1 - medical-3-S-0 - 1.274
medical-0-S-1 - medical-13-S-1 - 1.236
medical-0-S-1 - medical-13-S-4 - 1.282
medical-0-S-1 - medical-20-S-4 - 0.61
medical-0-S-2 - medical-1-S-0 - 1.426
medical-0-S-2 - medical-13-S-3 - 1.812
medical-0-S-2 - medical-13-S-4 - 1.284
medical-0-S-2 - medical-14-S-0 - 1.22
medical-1-S-0 - medical-13-S-3 - 1.402
medical-1-S-0 - medical-13-S-4 - 1.502
medical-1-S-0 - medical-14-S-0 - 1.688
medical-1-S-1 - medical-3-S-1 - 0.609
medical-1-S-1 - medical-3-S-2 - 0.602
medical-1-S-2 - medical-1-S-3 - 1.308
medical-1-S-2 - medical-2-S-0 - 1.304
medical-1-S-2 - medical-2-S-1 - 1.326
medical-1-S-2 - medical-2-S-2 - 0.638
medi