In [1]:
import numpy 
import sys 
import nmslib 
import time 
import math 
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
# Record the interpreter and NMSLIB versions alongside the results below.
print(sys.version)
print("NMSLIB version:", nmslib.__version__)


3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]
NMSLIB version: 2.0.6


In [2]:
# Load the base (indexed) vectors from an .fvecs file.
# Each record is a 4-byte int32 dimension header followed by that many float32
# components, so the file is read as one flat float32 buffer and the header
# column (meaningless when viewed as float) is sliced off afterwards.
# Small-sample alternative kept for reference:
# all_data_matrix = numpy.loadtxt('../../sample_data/sift_10k.txt')
base_path = r'C:\Users\t-bizhao\source\repos\sift\sift_base.fvecs'
base_raw = numpy.fromfile(base_path, dtype=numpy.float32, sep='')
if base_raw.size == 0:
    raise ValueError('No data read from %s' % base_path)
# Reinterpret the first header as int32 to get the vector dimension instead of
# hard-coding 128 (+1 header) columns.
base_dim = int(base_raw[:1].view(numpy.int32)[0])
if base_dim <= 0 or base_raw.size % (base_dim + 1) != 0:
    raise ValueError('Malformed .fvecs file %s (header dimension %d, %d values)'
                     % (base_path, base_dim, base_raw.size))
# One row per vector; column 0 is the header, the rest are the components.
data_matrix = base_raw.reshape((-1, base_dim + 1), order='C')[:, 1:]
print(data_matrix.shape)

(1000000, 128)


In [3]:
# Load the query vectors from an .fvecs file (same layout as the base file:
# an int32 dimension header followed by that many float32 components per record).
# The queries come from a separate file rather than a random split of the data:
# (data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)
query_path = r'C:\Users\t-bizhao\source\repos\sift\sift_query.fvecs'
query_raw = numpy.fromfile(query_path, dtype=numpy.float32, sep='')
if query_raw.size == 0:
    raise ValueError('No data read from %s' % query_path)
# Reinterpret the first header as int32 to get the vector dimension instead of
# hard-coding 128 (+1 header) columns.
query_dim = int(query_raw[:1].view(numpy.int32)[0])
if query_dim <= 0 or query_raw.size % (query_dim + 1) != 0:
    raise ValueError('Malformed .fvecs file %s (header dimension %d, %d values)'
                     % (query_path, query_dim, query_raw.size))
# One row per query; column 0 is the header, the rest are the components.
query_matrix = query_raw.reshape((-1, query_dim + 1), order='C')[:, 1:]
print(query_matrix.shape)

(10000, 128)


In [4]:
print("# of queries %d, # of data points %d"  % (query_matrix.shape[0], data_matrix.shape[0]) )

# of queries 10000, # of data points 1000000


In [5]:
# Index-time (construction) parameters.
# M and efConstruction are the most important ones.
M = 10
efC = 200

# Number of threads; reused for both index construction and batch querying.
num_threads = 1

index_time_params = dict(
    M=M,
    indexThreadQty=num_threads,
    efConstruction=efC,
    post=0,
    skip_optimized_index=1,  # using non-optimized index!
)

In [6]:
# Number of nearest neighbours requested per query; the recall cells also
# compare against the first K ground-truth entries.
K = 1

In [7]:
# Name of the distance space passed to nmslib.init.
# It should correspond to the space used for the brute-force (ground-truth) search.
space_name = "l2"

In [8]:
# Initialize the library: choose the method, the space and the vector type.
# Data points are added in a later cell.
# NOTE(review): an earlier version of this comment said SIFT data should use
# DENSE_UINT8_VECTOR with DistType.INT; this cell actually uses DENSE_VECTOR
# with DistType.FLOAT, which matches the float32 matrices loaded above.
index = nmslib.init(
    method='hnsw',
    space=space_name,
    data_type=nmslib.DataType.DENSE_VECTOR,
    dtype=nmslib.DistType.FLOAT,
)

In [9]:
# Add every base vector to the (not yet built) index. The call's return value is
# the cell output; in the recorded run it equals the number of rows in data_matrix.
index.addDataPointBatch(data_matrix) 

1000000

In [10]:
# Build the index and report how long construction took.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas the
# wall clock can jump if the system time is adjusted during this long build.
start = time.perf_counter()
index.createIndex(index_time_params)
end = time.perf_counter()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 10, 'indexThreadQty': 1, 'efConstruction': 200, 'post': 0, 'skip_optimized_index': 1}
Indexing time = 783.351550


In [11]:
# Save a meta index and the data.
# save_data=True is paired with load_data=True when the index is re-loaded
# from 'sift1m_float.bin' later in this notebook.
index.saveIndex('sift1m_float.bin', save_data=True)

In [12]:
# Query-time parameters: efSearch is applied to the index before querying.
# The same dict is reused for the re-loaded index further down.
efS = 1280
query_time_params = dict(efSearch=efS)
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 1280}


In [13]:
# Run all queries in one batch and report timing.
# nbrs[i][0] is used by the recall cell as the neighbour ids returned for query i.
query_qty = query_matrix.shape[0]
# time.perf_counter() is a monotonic, high-resolution clock intended for
# interval timing; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.perf_counter()
elapsed = end - start
# The last figure multiplies by num_threads to give a per-query cost that is
# comparable across different thread counts.
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
      (elapsed, elapsed/query_qty, num_threads*elapsed/query_qty))

kNN time total=44.465681 (sec), per query=0.004447 (sec), per query adjusted for thread number=0.004447 (sec)


In [14]:
# Quick sanity check of the first query's result.
# NOTE(review): nbrs[0][1] is presumably the distance array for query 0 (the
# recall cell treats nbrs[i][0] as the ids) - confirm against the nmslib docs.
print(K)
print(nbrs[0][1])

1
[232.87122]


In [15]:
# Load the ground-truth nearest neighbours for the query set from an .ivecs file.
# Each record is an int32 count followed by that many int32 neighbour ids, so the
# file is read as a flat int32 buffer and the count column is dropped.
# NOTE(review): the recall cells take the first K ids of each row, which assumes
# rows are ordered nearest-first - confirm against the dataset documentation.
truth_path = r'C:\Users\t-bizhao\source\repos\sift\sift_groundtruth.ivecs'
truth_raw = numpy.fromfile(truth_path, dtype=numpy.int32, sep='')
if truth_raw.size == 0:
    raise ValueError('No data read from %s' % truth_path)
# Read the per-record neighbour count from the first header instead of
# hard-coding 100 (+1 header) columns.
truth_k = int(truth_raw[0])
if truth_k <= 0 or truth_raw.size % (truth_k + 1) != 0:
    raise ValueError('Malformed .ivecs file %s (header count %d, %d values)'
                     % (truth_path, truth_k, truth_raw.size))
# One row per query; column 0 is the header, the rest are neighbour ids.
truth_matrix = truth_raw.reshape((-1, truth_k + 1), order='C')[:, 1:]
print(truth_matrix.shape)

(10000, 100)


In [24]:
# Recall@K: for every query, the share of its first K ground-truth ids that
# appear among the ids returned by the index, averaged over all queries.
recall = 0.0
for q in range(query_qty):
    expected = set(truth_matrix[q][:K])
    found = set(nbrs[q][0])
    recall += float(len(expected & found)) / len(expected)
recall /= query_qty
print('kNN recall %f' % recall)

kNN recall 0.992400


In [25]:
# Re-initialize the library with the same method, space and vector type, giving
# an empty index object into which the saved index is loaded in the next cell.
newIndex = nmslib.init(
    method='hnsw',
    space=space_name,
    data_type=nmslib.DataType.DENSE_VECTOR,
    dtype=nmslib.DistType.FLOAT,
)

In [26]:
# Re-load the index saved earlier to 'sift1m_float.bin'; the queries are re-run
# in the next cell. load_data=True mirrors the save_data=True used when saving.
newIndex.loadIndex('sift1m_float.bin', load_data=True)

In [27]:
# Apply the same query-time parameters to the re-loaded index and re-run the
# whole query batch, so results and timing can be compared with the original index.
print('Setting query-time parameters', query_time_params)
newIndex.setQueryTimeParams(query_time_params)

query_qty = query_matrix.shape[0]
# time.perf_counter() is a monotonic, high-resolution clock intended for
# interval timing; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.perf_counter()
elapsed = end - start
# The last figure multiplies by num_threads to give a per-query cost that is
# comparable across different thread counts.
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
      (elapsed, elapsed/query_qty, num_threads*elapsed/query_qty))

Setting query-time parameters {'efSearch': 1280}
kNN time total=38.905756 (sec), per query=0.003891 (sec), per query adjusted for thread number=0.003891 (sec)


In [29]:
# Recall@K for the re-loaded index: for every query, the share of its first K
# ground-truth ids that appear among the returned ids, averaged over all queries.
recall = 0.0
for q in range(query_qty):
    expected = set(truth_matrix[q][:K])
    found = set(new_nbrs[q][0])
    recall += float(len(expected & found)) / len(expected)
recall /= query_qty
print('kNN recall %f' % recall)

kNN recall 0.992400
