In [185]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import math
from collections import Counter

In [74]:
def find_dist(row , test_df):
    '''
    This method takes the complete list of features and the values of the dataframe for prediction and calculates
    the result based on those.
    '''
    for col in row.index:
        if col not in ['distance', 'group']:
            row['distance'] += abs(row[col] - test_df[col])  
    return row['distance']

In [179]:
class KNN:
    '''
    Implementing KNN algorith based on Eucledean distance as the distance matrix
    '''
    training_df = None
    result_df = None
    test_df = None
    predict_df = None
    
    
    def __init__(self, N = 1):
        self.N = N
        self.predictions = list()
        
    def fit(self, training_df , result_df):
        '''
        This method will set the value of the training dataframe and the result values and set up the prediction parameters
        '''
        self.training_df = pd.DataFrame(data = training_df)
        self.result_df = result_df
        self.predict_df = self.training_df.copy()
        self.predict_df['distance'] = 0
        self.predict_df['group'] = self.result_df
        
    def predict(self , test_df):
        for row in test_df:
            self.with_pandas_apply(row)
        return self.predictions
        
    def with_pandas_apply(self, test_row):
        #Specify axis = 1 to return the complete row to the function, else axis = 0 to send the complete column.
                
        self.predict_df['distance'] = self.predict_df.apply(find_dist ,axis = 1 ,args = (test_row,))
        self.calculate_result()
        
    def calculate_result(self):
        '''
        This method will sort the dataframe based on the distance between the points. Fetch the top N results and
        return the most occured group value.
        '''
        self.predict_df.sort_values(by = 'distance', ascending=True , inplace= True)
        result_list = self.predict_df.head(self.N)['group'].value_counts().idxmax()
        
        self.predictions.append(result_list) 

In [187]:
if __name__ == "__main__":
    knn = KNN(N=1)
    x , y = load_iris(return_X_y= True)
    x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.3 , random_state = 400)
    knn.fit(x_train,y_train)
    output = knn.predict(x_test)
    print("Output generated by the algo  : {}".format(output))
    print("Actual output of the problem : {}".format(y_test))
    get_accuracy(output , y_test)

Output generated by the algo  : [1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Actual output of the problem : [1 2 0 0 2 2 2 0 2 0 1 0 1 1 1 0 1 2 1 2 0 1 2 2 0 0 0 0 1 2 0 1 0 1 1 2 2
 2 0 2 0 1 0 2 2]
Accuracy acheived : 37.78%


In [186]:
def get_accuracy(predicted , actual):
    accuracy = 0
    for i in range(len(predicted)):
        if predicted[i] == actual[i]:
            accuracy += 1
    print("Accuracy acheived : {}%".format(round( (accuracy / len(predicted)) * 100 , 2)))