# IVF Implementation 

### Depnedncies

In [8]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pickle
import joblib
import vec_db

### IVF Class

In [9]:
class IVF_Index:
    def __init__(self, n_clusters, file_name,random_state=4):
        self.n_clusters = n_clusters
        self.cluster_centers = None
        self.inverted_index = {}
        self.random_state=random_state
        self.file_name=file_name
    #! get the index clusters
    def fit(self, data):
        #! fit the data to the kmeans
        kmeans=KMeans(n_clusters=self.n_clusters,random_state=self.random_state)
        labels=kmeans.fit_predict(data)
        self.cluster_centers=kmeans.cluster_centers_
        #! create the inverted index
        for i,label in enumerate(labels):
            if label not in self.inverted_index:
                self.inverted_index[label]=[]
            self.inverted_index[label].append(i)
        #! convert the lists to numpy arrays
        for key in self.inverted_index:
            self.inverted_index[key]=np.array(self.inverted_index[key])
        
    #! save index to a file
    def save(self):
        with open(self.file_name, 'wb') as f:
            np.save(f, self.cluster_centers)
            pickle.dump(self.inverted_index, f)
    def load(self):
        with open(self.file_name, 'rb') as f:
            self.cluster_centers = np.load(f)
            self.inverted_index = pickle.load(f)
            self.n_clusters=self.cluster_centers.shape[0]
    #? To be edited instead of flat search
    def retrieve(self,query_vector,n_clusters,n_arrays,cosine_similarity,get_row):
        #! calculate the similarities between the query vector and the cluster centers
        similarities = np.array([cosine_similarity(query_vector, center) for center in self.cluster_centers])
        #! get the n nearest clusters
        nearest_clusters = np.argpartition(similarities, -n_clusters)[-n_clusters:]
        #! get nearest n arrays within nearest k clusters 
        vectors_indices = [self.inverted_index[cluster] for cluster in nearest_clusters]
        all_vectors_indices = np.concatenate(vectors_indices)
        vectors = np.array([get_row(i) for i in all_vectors_indices])
        similarities = np.array([cosine_similarity(query_vector, vector) for vector in vectors])
        #! get nearest n arrays overall
        nearest_arrays = np.argpartition(similarities, -n_arrays)[-n_arrays:]
        return vectors[nearest_arrays]


## Testing the Implementation

## generating index file 

In [10]:
VecDB=vec_db.VecDB(db_size=1000000)
index = IVF_Index(n_clusters=100,file_name=VecDB.index_path)
data = VecDB.get_all_rows()
index.fit(data)
index.save()

## getting closest vectors using linear search

In [11]:
#! generating 10 random query vectors each of lengh 70
query_vectors = np.random.rand(5, 70)
query_vectors=np.append(query_vectors,data[:5],axis=0)
#! retreive closest vector to each query using linear search
similars_linear = []
for query_vector in query_vectors:
    similarities=np.array([VecDB._cal_score(query_vector,i) for i in data])
    nearest_arrays=np.argpartition(similarities,-1)[-1:]
    similars_linear.append(data[nearest_arrays])
similars_linear=np.array(similars_linear)
similars_linear=similars_linear.reshape(similars_linear.shape[0],similars_linear.shape[2])
print(similars_linear.shape)
for i in range(query_vectors.shape[0]):
    print(f"Query {i+1} linear search:\n {data[i]}")
    print(f"Similar to:\n {similars_linear[i]}")
    print("Similarity: ",VecDB._cal_score(query_vectors[i],similars_linear[i]))
    

(10, 70)
Query 1 linear search:
 [0.08925092 0.773956   0.6545715  0.43887842 0.43301523 0.8585979
 0.08594561 0.697368   0.20146948 0.09417731 0.52647895 0.9756223
 0.73575234 0.7611397  0.71747726 0.78606427 0.51322657 0.12811363
 0.8397482  0.45038593 0.5003519  0.370798   0.1825496  0.92676497
 0.78156745 0.6438651  0.40241432 0.8227616  0.5454291  0.44341415
 0.45045954 0.22723871 0.09213591 0.55458474 0.8878898  0.0638172
 0.85829127 0.8276311  0.27675968 0.6316644  0.16522902 0.7580877
 0.70052296 0.35452592 0.06791997 0.970698   0.44568747 0.89312106
 0.677919   0.7783835  0.75989944 0.19463867 0.36390603 0.466721
 0.49779153 0.04380375 0.54656947 0.15428948 0.7433759  0.6830489
 0.9225278  0.7447621  0.36664265 0.9675097  0.41085035 0.32582533
 0.90553576 0.37045968 0.07634318 0.4695558 ]
Similar to:
 [0.18514574 0.5500325  0.22704232 0.41559786 0.30438542 0.8566631
 0.3655637  0.2924654  0.41802967 0.8783871  0.93414015 0.48053372
 0.16647786 0.919113   0.9423215  0.9925107  

## Getting closest vectors using the index

In [12]:
index.load()

In [13]:
similar_index=[]
for query_vector in query_vectors:
    similar_index.append(index.retrieve(query_vector,10,1,VecDB._cal_score,VecDB.get_one_row))
similar_index=np.array(similar_index)
similar_index=similar_index.reshape(similar_index.shape[0],similar_index.shape[2])
for i in range(query_vectors.shape[0]):
    print(f"Query {i+1} linear search:\n {data[i]}")
    print(f"Similar to:\n {similar_index[i]}")
    print("Similarity: ",VecDB._cal_score(query_vectors[i],similar_index[i]))

Query 1 linear search:
 [0.08925092 0.773956   0.6545715  0.43887842 0.43301523 0.8585979
 0.08594561 0.697368   0.20146948 0.09417731 0.52647895 0.9756223
 0.73575234 0.7611397  0.71747726 0.78606427 0.51322657 0.12811363
 0.8397482  0.45038593 0.5003519  0.370798   0.1825496  0.92676497
 0.78156745 0.6438651  0.40241432 0.8227616  0.5454291  0.44341415
 0.45045954 0.22723871 0.09213591 0.55458474 0.8878898  0.0638172
 0.85829127 0.8276311  0.27675968 0.6316644  0.16522902 0.7580877
 0.70052296 0.35452592 0.06791997 0.970698   0.44568747 0.89312106
 0.677919   0.7783835  0.75989944 0.19463867 0.36390603 0.466721
 0.49779153 0.04380375 0.54656947 0.15428948 0.7433759  0.6830489
 0.9225278  0.7447621  0.36664265 0.9675097  0.41085035 0.32582533
 0.90553576 0.37045968 0.07634318 0.4695558 ]
Similar to:
 [0.8181798  0.4115106  0.78124744 0.6848927  0.80186135 0.86441666
 0.14514613 0.9333077  0.5203274  0.81529963 0.9613186  0.38059413
 0.5101823  0.3984353  0.45803607 0.27626038 0.605345

In [14]:
print(np.all(similar_index == similars_linear, axis=1))

[False  True  True False  True  True  True  True  True  True]
