<a href="https://colab.research.google.com/github/GUDDUemsec/Implementation-with-Faiss-IndexIVFPQ-HNSW/blob/main/Implementation_with_Faiss_IndexIVFPQ_%2B_HNSW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Read Vector Data** 

In [1]:
import shutil
import urllib.request as request
from contextlib import closing

# first we download the Sift1M dataset
with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as r:
    with open('sift.tar.gz', 'wb') as f:
        shutil.copyfileobj(r, f)

In [2]:
import tarfile

# the download leaves us with a tar.gz file, we unzip it
tar = tarfile.open('sift.tar.gz', "r:gz")
tar.extractall()
tar

<tarfile.TarFile at 0x7f689026cfa0>

In [6]:
import numpy as np

# now define a function to read the fvecs file format of Sift1M dataset
def read_fvecs(fp):
    a = np.fromfile(fp, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy().view('float32')
# data we will search through
xb = read_fvecs('./sift/sift_base.fvecs')[:500]
xp = read_fvecs('./sift/sift_base.fvecs')[500:700]  # 1M samples
# also get some query vectors to search with
xq = read_fvecs('./sift/sift_query.fvecs')
# take just one query (there are many in sift_learn.fvecs)
xq = xq[0].reshape(1, xq.shape[1])

In [4]:
xq

array([[  1.,   3.,  11., 110.,  62.,  22.,   4.,   0.,  43.,  21.,  22.,
         18.,   6.,  28.,  64.,   9.,  11.,   1.,   0.,   0.,   1.,  40.,
        101.,  21.,  20.,   2.,   4.,   2.,   2.,   9.,  18.,  35.,   1.,
          1.,   7.,  25., 108., 116.,  63.,   2.,   0.,   0.,  11.,  74.,
         40., 101., 116.,   3.,  33.,   1.,   1.,  11.,  14.,  18., 116.,
        116.,  68.,  12.,   5.,   4.,   2.,   2.,   9., 102.,  17.,   3.,
         10.,  18.,   8.,  15.,  67.,  63.,  15.,   0.,  14., 116.,  80.,
          0.,   2.,  22.,  96.,  37.,  28.,  88.,  43.,   1.,   4.,  18.,
        116.,  51.,   5.,  11.,  32.,  14.,   8.,  23.,  44.,  17.,  12.,
          9.,   0.,   0.,  19.,  37.,  85.,  18.,  16., 104.,  22.,   6.,
          2.,  26.,  12.,  58.,  67.,  82.,  25.,  12.,   2.,   2.,  25.,
         18.,   8.,   2.,  19.,  42.,  48.,  11.]], dtype=float32)

In [3]:
a=xb.shape[0]
a

500

In [6]:
import numpy as np


def fvecs_read(filename, c_contiguous=True):
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        return np.zeros((0, 0))
    dim = fv.view(np.int32)[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    if not all(fv.view(np.int32)[:, 0] == dim):
        raise IOError("Non-uniform vector sizes in " + filename)
    fv = fv[:, 1:]
    if c_contiguous:
        fv = fv.copy()
    return fv

In [7]:
fvecs_read('./sift/sift_base.fvecs',c_contiguous=True)

array([[  0.,  16.,  35., ...,  25.,  23.,   1.],
       [ 14.,  35.,  19., ...,  11.,  21.,  33.],
       [  0.,   1.,   5., ...,   4.,  23.,  10.],
       ...,
       [ 30.,  12.,  12., ...,  50.,  10.,   0.],
       [  0.,   5.,  12., ...,   1.,   2.,  13.],
       [114.,  31.,   0., ...,  25.,  16.,   0.]], dtype=float32)

In [8]:
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Implementation with Faiss: IndexIVFPQ + HNSW**

In [5]:
import faiss
d = xb.shape[1]        
M = 8         
nlist = xb.shape[0]  
nsegment = 16  
nbit = 8       
k=10
coarse_quantizer = faiss.IndexHNSWFlat(d, M)
index = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, nsegment, nbit)      



In [7]:
index.is_trained

False

In [8]:
index.train(xb)

In [9]:
index.add(xb)  

In [10]:
index.nprobe = 5     
D, I = index.search(xq, k)  

In [11]:
D

array([[8.5812000e+04, 9.8494000e+04, 9.8910000e+04, 1.0359000e+05,
        1.0662100e+05, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38, 3.4028235e+38]], dtype=float32)

# **Implementation with Faiss: IndexHNSWFlat**

In [12]:
import faiss
d = xb.shape[1]      
M = 32       
k=20
index = faiss.IndexHNSWFlat(d, M)            
index.hnsw.efConstruction = 40         
index.add(xb)                  
Dis, I = index.search(xq, k) 

In [13]:
Dis

array([[ 85812.,  98494.,  98910., 103590., 106621., 106803., 111889.,
        112211., 113230., 116449., 117906., 117968., 118334., 119525.,
        121834., 122188., 122880., 123308., 124210., 124576.]],
      dtype=float32)

In [14]:
I

array([[190, 224, 292, 492, 146, 107, 370, 121, 124, 287, 123,  97, 420,
        149, 348, 106, 286, 396, 145, 174]])