In [1]:
import sklearn
from sklearn import datasets
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

Implementing a KNN Classifier on the Iris Dataset

In [4]:
# Load the Iris dataset from scikit-learn and assemble one frame holding the
# feature columns plus the integer class label in a 'target' column.
iris = datasets.load_iris()
flowers = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
flowers

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [5]:
# Separate the class labels from the predictor columns.
flowers_labels = flowers['target']
flowers_features = flowers.drop(columns=['target'])

In [8]:
def minkowski_distance(a,b,p=1):
    dim = len(a)
    distance = 0
    
    for d in range(dim):
        distance += abs(a[d] - b[d])**p
    distance = distance**(1/p)
    return distance    

In [10]:
# Define an arbitrary test point (one value per feature column, in column order).
test_pt = [4.8, 2.7, 2.5, 0.7]

# Iterate over the rows positionally as plain arrays so the loop is correct for
# any DataFrame index (not only a default 0..n-1 RangeIndex) and so the distance
# helper never has to do integer-key lookups on a label-indexed Series.
distances = [minkowski_distance(test_pt, row) for row in flowers_features.to_numpy()]

# Keep the original row labels so neighbours can be mapped back to their class labels.
df_dists = pd.DataFrame(data=distances, index=flowers_features.index, columns=['dist'])
df_dists

Unnamed: 0,dist
0,2.7
1,2.0
2,2.3
3,2.1
4,2.7
...,...
145,6.5
146,5.4
147,6.0
148,6.6


In [11]:
# Order the rows by distance to the test point and keep the five closest.
df_nn = df_dists.sort_values('dist').head(5)
df_nn

Unnamed: 0,dist
98,1.4
57,1.5
93,1.7
24,1.8
30,1.8


In [14]:
from collections import Counter

# Tally the class labels of the selected neighbours (looked up by row label)
# and display the label with the highest count.
counter = Counter(flowers_labels.loc[df_nn.index])
counter.most_common(1)[0][0]

1

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 25% of the rows for testing; random_state is fixed so the split is reproducible.
(
    flowers_features_train,
    flowers_features_test,
    flowers_labels_train,
    flowers_labels_test,
) = train_test_split(flowers_features, flowers_labels, test_size=0.25, random_state=1)

scale = StandardScaler()
# Fit the scaler on the training data only, then apply those same statistics to
# the test data. Re-fitting on the test set would standardise the two sets
# differently and leak test-set information into the evaluation.
flowers_features_train = scale.fit_transform(flowers_features_train)
flowers_features_test = scale.transform(flowers_features_test)

In [16]:
def knn_predict(flowers_features_train,flowers_features_test,flowers_labels_train,flowers_labels_test,k,p):
    """Predict a label for every test point by k-nearest-neighbour majority vote.

    For each test point the Minkowski distance (order ``p``) to every training
    point is computed with ``minkowski_distance``; the ``k`` closest training
    points vote and the most frequent label wins.

    Training points and labels are matched purely by position, so the labels
    may be a pandas Series (with any index, including duplicate labels), a
    NumPy array, or a plain list.

    Tie handling: training points at equal distance keep their training order
    (the sort is stable); if several labels receive the same number of votes,
    the one encountered first among the nearest neighbours is returned.

    Parameters
    ----------
    flowers_features_train : iterable of points
        Training points; iterating must yield one point per training sample.
    flowers_features_test : iterable of points
        Points to classify; iterated the same way.
    flowers_labels_train : sequence
        One label per training point, in the same order.
    flowers_labels_test : any
        Unused; kept so existing calls keep working.
    k : int
        Number of neighbours that vote. If it exceeds the number of training
        points, all training points vote.
    p : int or float
        Order of the Minkowski distance, passed to ``minkowski_distance``.

    Returns
    -------
    list
        Predicted label for each test point, in test order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or the number
        of training points and training labels differ.
    """
    from collections import Counter

    if k < 1:
        raise ValueError(f"k must be at least 1, got {k!r}")

    train_points = list(flowers_features_train)
    # .tolist() (pandas / NumPy) yields plain Python scalars; fall back to
    # list() for ordinary sequences.
    if hasattr(flowers_labels_train, "tolist"):
        train_labels = flowers_labels_train.tolist()
    else:
        train_labels = list(flowers_labels_train)

    if not train_points:
        raise ValueError("the training set is empty")
    if len(train_points) != len(train_labels):
        raise ValueError(
            f"got {len(train_points)} training points but {len(train_labels)} training labels"
        )

    y_hat_test = []
    for test_point in flowers_features_test:
        distances = [
            minkowski_distance(test_point, train_point, p=p)
            for train_point in train_points
        ]
        # Positions of the k smallest distances, closest first.
        nearest = sorted(range(len(distances)), key=distances.__getitem__)[:k]
        votes = Counter(train_labels[i] for i in nearest)
        y_hat_test.append(votes.most_common(1)[0][0])
    return y_hat_test

# Classify the held-out points with the hand-written KNN: 5 neighbours, Minkowski order 1.
y_hat_test = knn_predict(
    flowers_features_train,
    flowers_features_test,
    flowers_labels_train,
    flowers_labels_test,
    k=5,
    p=1,
)
print(y_hat_test)

[0, 1, 1, 0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 2, 0, 2, 1, 0, 0, 1, 2, 2, 2, 1, 2, 2, 0, 1, 0]


In [17]:
from sklearn.metrics import accuracy_score

# Share of test labels predicted correctly by the hand-written KNN, as a percentage.
print(100 * accuracy_score(flowers_labels_test, y_hat_test))

92.10526315789474


In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Reference model from scikit-learn with 5 neighbours and Minkowski order 1;
# fit() returns the estimator itself, so construction and fitting are chained.
clf = KNeighborsClassifier(n_neighbors=5, p=1).fit(flowers_features_train, flowers_labels_train)
y_pred_test = clf.predict(flowers_features_test)

print(f"Sklearn KNN Accuracy: {accuracy_score(flowers_labels_test, y_pred_test)}")

Sklearn KNN Accuracy: 0.9210526315789473
