In [3]:
import json
import numpy as np
import hnswlib

In [4]:
# Size of the toy problem: length of each vector and how many vectors to index.
dimension, numElements = 16, 100

In [6]:
# Draw the toy dataset from a seeded generator so that "Restart & Run All"
# reproduces the same vectors (and therefore the same neighbour lists).
# Samples are uniform in [0, 1) and produced directly as float32, with shape
# (numElements, dimension).
rng = np.random.default_rng(42)
data = rng.random((numElements, dimension), dtype=np.float32)

In [8]:
# Cut the vectors at the midpoint: rows before it go to data1, the rest to data2.
data1, data2 = data[:numElements // 2], data[numElements // 2:]

In [13]:
# Inspect the generated float32 vectors (NumPy abbreviates the repr with "...").
data

array([[0.31966615, 0.4631508 , 0.9576948 , ..., 0.65229934, 0.94523656,
        0.8806226 ],
       [0.2290776 , 0.82432044, 0.23329155, ..., 0.7190801 , 0.37603498,
        0.15270254],
       [0.17584477, 0.32184175, 0.22162323, ..., 0.357019  , 0.9055558 ,
        0.8372053 ],
       ...,
       [0.85544354, 0.9588192 , 0.21561944, ..., 0.82639086, 0.2895264 ,
        0.8791019 ],
       [0.33412936, 0.9085951 , 0.7280573 , ..., 0.08073778, 0.9071431 ,
        0.38801873],
       [0.32107872, 0.733728  , 0.99954027, ..., 0.8352785 , 0.7639304 ,
        0.83419204]], dtype=float32)

In [15]:
# Integer labels 0 .. numElements-1, one per vector.
data_labels = np.arange(0, numElements)

In [16]:
# Show the labels created with np.arange(numElements).
data_labels

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [20]:
# Small hand-written nested list (3 rows of 4 integers) used for a quick
# list -> array experiment.
lis = [
    [1, 2, 3, 5],
    [3, 4, 5, 7],
    [5, 6, 7, 8],
]

In [21]:
# Convert the nested list `lis` into a NumPy array.
arr = np.asarray(lis)

In [32]:
# Indexing with a single integer returns the first row of the array.
arr[0]

array([1, 2, 3, 5])

In [23]:
# Build an HNSW index over the toy vectors and look up the 3 nearest
# neighbours of every vector.
p = hnswlib.Index(space='l2', dim=dimension)  # possible options are l2, cosine or ip

# Initing index - the maximum number of elements should be known beforehand
p.init_index(max_elements=numElements, ef_construction=200, M=7)

# Element insertion (can be called several times):
p.add_items(data, data_labels)

# Controlling the recall by setting ef (it must not be smaller than k):
p.set_ef(50)

# Query dataset, k - number of closest elements (returns 2 numpy arrays).
# The distances are bound to a real name instead of `_`: in IPython `_` is the
# "last output" variable, and assigning to it stops the shell from updating it.
labels, distances = p.knn_query(data, k=3)

In [29]:
# Convert the neighbour labels to plain nested Python lists.
# Going through np.asarray keeps this cell idempotent: on a second run `labels`
# is already a list, and a bare labels.tolist() would raise AttributeError.
labels = np.asarray(labels).tolist()

In [30]:
# Show the neighbour labels: one row per query vector, k = 3 labels per row.
# In the recorded output every row starts with the query vector's own label.
labels

[[0, 85, 8],
 [1, 77, 58],
 [2, 14, 69],
 [3, 15, 69],
 [4, 64, 22],
 [5, 96, 60],
 [6, 93, 52],
 [7, 95, 89],
 [8, 0, 80],
 [9, 31, 98],
 [10, 60, 1],
 [11, 42, 41],
 [12, 43, 77],
 [13, 27, 83],
 [14, 2, 44],
 [15, 3, 81],
 [16, 86, 26],
 [17, 20, 51],
 [18, 69, 19],
 [19, 48, 32],
 [20, 17, 30],
 [21, 51, 28],
 [22, 49, 72],
 [23, 83, 77],
 [24, 41, 3],
 [25, 68, 58],
 [26, 56, 47],
 [27, 13, 95],
 [28, 21, 92],
 [29, 50, 92],
 [30, 77, 58],
 [31, 9, 48],
 [32, 19, 1],
 [33, 31, 48],
 [34, 93, 22],
 [35, 84, 91],
 [36, 74, 99],
 [37, 73, 75],
 [38, 83, 37],
 [39, 83, 52],
 [40, 98, 31],
 [41, 42, 66],
 [42, 41, 62],
 [43, 12, 94],
 [44, 79, 15],
 [45, 74, 83],
 [46, 92, 62],
 [47, 49, 22],
 [48, 61, 86],
 [49, 22, 64],
 [50, 92, 29],
 [51, 55, 21],
 [52, 6, 59],
 [53, 86, 48],
 [54, 51, 61],
 [55, 51, 78],
 [56, 26, 52],
 [57, 59, 96],
 [58, 48, 30],
 [59, 75, 61],
 [60, 62, 10],
 [61, 48, 86],
 [62, 60, 87],
 [63, 51, 62],
 [64, 49, 22],
 [65, 49, 10],
 [66, 99, 93],
 [67, 77, 83],

In [34]:
# Load paper ids and their embeddings from a JSON-lines file: each line is one
# JSON object providing an "id" and an "embedding" entry.
IDList = []
NNList = []  # stays empty in this cell
embeddings = []
# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open('dblpAbstractFTEmbeddings.json', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue
        # Parsed into `record` so the NumPy array named `data` from the toy
        # example is not silently replaced by a dict.
        record = json.loads(line)
        paperID = record['id']
        embedding = record['embedding']
        IDList.append(paperID)
        embeddings.append(embedding)

In [35]:
# Derive the index parameters from the loaded records.
embeddings = np.asarray(embeddings)   # stack the per-paper vectors into one array
numElements = len(IDList)             # one element per loaded paper id
dimension = len(embeddings[0])        # length of a single embedding vector
data_labels = np.arange(numElements)  # integer labels 0 .. numElements-1

In [36]:
# Build an HNSW index over the loaded embeddings (cosine space) and look up the
# 5 nearest neighbours of every embedding.
p = hnswlib.Index(space='cosine', dim=dimension)  # possible options are l2, cosine or ip

# Initing index - the maximum number of elements should be known beforehand
p.init_index(max_elements=numElements, ef_construction=200, M=20)

# Element insertion (can be called several times):
p.add_items(embeddings, data_labels)

# Controlling the recall by setting ef (it must not be smaller than k):
p.set_ef(50)

# Query dataset, k - number of closest elements (returns 2 numpy arrays).
# The distances are bound to a real name instead of `_`: in IPython `_` is the
# "last output" variable, and assigning to it stops the shell from updating it.
labels, distances = p.knn_query(embeddings, k=5)

In [37]:
# Persist the index to disk, then drop the in-memory object.
index_path = 'fastTexthnswlib.bin'
print("Saving index to '%s'" % index_path)
# Save to the same variable that was just printed, so the logged path and the
# file actually written cannot drift apart.
p.save_index(index_path)
del p

Saving index to 'fastTexthnswlib.bin'


In [38]:
# Neighbour labels returned for the first embedding (the query used k = 5).
# NOTE(review): in the recorded output the first label is not 0, i.e. the
# vector is not its own top hit - possibly duplicate embeddings or
# approximate-search recall; verify.
labels[0]

array([ 136396,  408376, 1222837,  485560,  586113], dtype=uint64)