## KNN Model Implementation in python using FAISS library 
### source code reference
https://github.com/facebookresearch/faiss

### KNN Wrapper Class for Predicting K nearest neighors 

In [1]:
import numpy as np 
from collections import Counter 
import os 
import faiss
import time 
 
class FaissKNNImpl:
    
    def __init__(self,k,faiss):
        self.k = k # k nearest neighbor value
        self.faissIns = faiss # FAISS instance
        self.index = 0  
        self.train_labels = []  
        self.test_label_faiss_output = [] 
        
    def fitModel(self,train_features,train_labels): 
        self.train_labels = train_labels
        self.index = self.faissIns.IndexFlatL2(train_features.shape[1])   # build the index 
        self.index.add(train_features)       # add vectors to the index
        
    def predict(self,test_features): 
        distance, test_features_faiss_Index = self.index.search(test_features, self.k) 
        self.test_label_faiss_output = stats.mode(self.train_labels[test_features_faiss_Index],axis=1)[0]
        self.test_label_faiss_output = np.array(self.test_label_faiss_output.ravel())
        return self.test_label_faiss_output
      
    def getAccuracy(self,test_labels):
        accuracy = (self.test_label_faiss_output == test_labels).mean() * 100 
        return round(accuracy) 

## Data can be dowloaded from here
https://drive.google.com/file/d/1txt-EhdUPXC7w28s4scKVXco9Xla36O2/view?usp=sharing

In [11]:
def calculate_matrix(size, all_points, is_sklearn=False):
    # if is_sklearn:
    # distance_matrix = pairwise_distances(all_points, metric='euclidean')
    # else:
    distance_matrix = np.zeros((size, size))
    for i in range(size):
        distance_matrix[i, :] = np.sqrt(np.sum((all_points[i] - all_points) ** 2, axis=1))

    return distance_matrix

def read_data(path):
    all_points = []
    with open(path, 'r') as f:
        numbers = f.readline().split()
        numbers = [int(x) for x in numbers]
        # print(numbers)
        for line in f:
            all_points.append([float(x) for x in line.split()])

        start = time.time()

        # distance_matrix = calculate_matrix(numbers[0], np.array(all_points), is_sklearn=False)
        end = time.time()
        print("Time for calculating distance matrix: ", end - start)
        # print(distance_matrix)

        # save distance matrix to file
        #np.savetxt('distance_matrix.txt', distance_matrix, fmt='%f')
    return np.asarray(all_points), numbers[0]

all_points, m = read_data(r"C:\Users\Sergey\Documents\GitHub\gpu-clusterize\data\my_data_10.txt")

Time for calculating distance matrix:  0.0


In [12]:
all_points.shape

(10, 2)

In [13]:
raw_data = all_points

## Prepare Training and Test Data using 58k datapoints and 54 features

In [14]:
np.random.seed(0)
np.random.shuffle(raw_data)
train_size = int(0.9 * raw_data.shape[0])
train_features = raw_data[:train_size, :-1].astype('float32')
train_labels = raw_data[:train_size, -1].astype('float32')
test_features = raw_data[train_size:, :-1].astype('float32')
test_labels = raw_data[train_size:, -1].astype('float32')

In [15]:
train_features.shape

(9, 1)

In [16]:
test_features.shape

(1, 1)

In [17]:
import faiss_knn as fbknn

## Train FAISS KNN model with k = 5

In [21]:
k = 1
start_time = time.time()
faissobj = fbknn.FaissKNNImpl(k,faiss)
faissobj.fitModel(train_features,train_labels)
run_time = time.time() - start_time
print('time required for training %d data points at k = %d: %.2f seconds' % (train_features.shape[0], k , run_time))

time required for training 9 data points at k = 1: 0.00 seconds


In [22]:
# start_time = time.time()
predictions = faissobj.predict(test_features)
run_time = time.time() - start_time
print('time required for predicting %d data point at k = %d: %.2f seconds' % (test_features.shape[0], k, run_time))

time required for predicting 1 data point at k = 1: 0.22 seconds


In [23]:
accuracy = faissobj.getAccuracy(test_labels) 
print('Accuracy for K = %d : %d ' % (k, accuracy),'%')

Accuracy for K = 1 : 0  %


### Accuracy for K = 5 : 97  % 

In [24]:
from sklearn.metrics import classification_report
y_true = test_labels
y_pred = predictions
target_names = ['class 1', 'class 2', 'class 3','class 4', 'class 5', 'class 6','class 7']
print(classification_report(y_true, y_pred, target_names=target_names)) 

ValueError: Number of classes, 2, does not match size of target_names, 7. Try specifying the labels parameter

## Train FAISS KNN model with k = 10

In [19]:
k = 10
start_time = time.time()
faissobj = fbknn.FaissKNNImpl(k,faiss)
faissobj.fitModel(train_features,train_labels)
run_time = time.time() - start_time
print('time required for training %d data point at k = %d: %.2f seconds' % (train_features.shape[0], k, run_time))

time required for training 522910 data point at k = 10: 0.05 seconds


In [20]:
start_time = time.time()
predictions = faissobj.predict(test_features)
run_time = time.time() - start_time
print('time required for predicting %d data point at k = %d: %.2f seconds' % (test_features.shape[0], k, run_time))

time required for predicting 58102 data point at k = 10: 42.15 seconds


In [21]:
accuracy = faissobj.getAccuracy(test_labels) 
print('Accuracy for K = %d : %d ' % (k, accuracy),'%')

Accuracy for K = 10 : 96  %


### Accuracy for K = 10 : 96  % 