# k-nearest neigbour (KNN)

In [4]:
import numpy as np

## Implementierung eines KNN-Klassifikators in Python

Implementierung mit Euklid-Abstand

In [110]:
class MyKNN:
    
    def fit(self, X, Y, k=3):
        self.Xtr = X
        self.Ytr = Y
        self.classes = list(np.unique(Y))
        self.trainExamples, self.featDim = np.asarray(X).shape
        self.k = k
        
    # compute the euclidean distance of two vektors
    @staticmethod
    def euclidean_distance(a, b):
        return np.sqrt(np.sum(np.power(np.subtract(a,b),2)))
    
    def predict_vec(self, x):
        # initialice the distance matrix
        dist = np.zeros((self.trainExamples,2))

        # compute distance from input feature vector to every feature vector in the trainings-set
        for t in range(self.trainExamples):
            # save the computed distance and the associated label
            dist[t] = np.array([MyKNN.euclidean_distance(self.Xtr[t], x), self.Ytr[t]])

        # sort the distances by shortest distance first
        dist = dist[dist[:,0].argsort()]
        # only keep the classes of those k-nearest nabours
        knn = dist[:self.k,1]
        # count how many unique classes are there
        class_counter = np.bincount(knn.astype(int))
        # get the class with the most frequency
        predicted_class = np.argmax(class_counter)
        
        return predicted_class
        
    def predict(self, X):
        out= []
        for vec in X:
            out.append(self.predict_vec(vec))
        return (np.asarray(out))

Testen des Klassifikators auf dem Iris Datensatz aus sklearn

In [115]:
from sklearn.datasets import load_iris

dataset = load_iris()
print(dataset.keys())
print(dataset.DESCR)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Cr

In [125]:
Xtr = dataset.data
Ytr = dataset.target
print(dataset.target_names)

#print(np.concatenate((dataset.data, dataset.target.reshape((150,1))), axis = 1))

# example for virginica
x = np.array([[5.9, 3.0, 5.1, 1.8]])

knn = MyKNN()
knn.fit(Xtr, Ytr)
c= knn.predict(x)

print(dataset.target_names[c])

['setosa' 'versicolor' 'virginica']
['virginica']


Testen des Klassifikators auf den gesamten Daten

In [128]:
iris_dataset = load_iris()

X=iris_dataset['data']
y=iris_dataset['target']

knn = MyKNN()
knn.fit(X,y,3)
o=knn.predict(X)

In [129]:
import sklearn.metrics as eval

acc=eval.accuracy_score(y,o)

print(acc)
print("Confusion Matrix")
print(eval.confusion_matrix(y,o))
print(eval.classification_report(y, o))

0.96
Confusion Matrix
[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.94      0.94      0.94        50
           2       0.94      0.94      0.94        50

    accuracy                           0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150



Zerlegung der Daten in ein Training- und Testset

In [130]:
from sklearn.model_selection import train_test_split

(Xtr,Xte,ytr,yte)=train_test_split(X,y)

print(Xtr.shape)
print(Xte.shape)

(112, 4)
(38, 4)


In [132]:
import sklearn.metrics as eval

knn = MyKNN()
knn.fit(Xtr,ytr,3)
o=knn.predict(Xte)

acc=eval.accuracy_score(yte,o)

print(acc)
print("Confusion Matrix")
print(eval.confusion_matrix(yte,o))


print(eval.classification_report(yte, o))

0.9736842105263158
Confusion Matrix
[[12  0  0]
 [ 0 14  1]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      0.93      0.97        15
           2       0.92      1.00      0.96        11

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38



Bestimmung eines geeigneten Parameters k

In [135]:
(Xtr_small,Xval,ytr_small,yval)=train_test_split(Xtr,ytr,test_size=0.15)

print(Xtr_small.shape)
print(Xval.shape)


kVals = range(1, 30, 2) #start, stop,step
accuracies=[]
highest_idx=0
highest_acc=0
idx=0
for k in kVals:
    myModel = MyKNN()
    myModel.fit(Xtr_small,ytr_small,k)
    val_res=myModel.predict(Xval)
    score_val = eval.accuracy_score(yval,val_res)
    test_res=myModel.predict(Xte)
    score_test = eval.accuracy_score(yte,test_res)
    if highest_acc <= score_test:
        highest_acc=score_test
        highest_idx=idx
    idx=idx+1
    print("k=%d, accuracy=%.2f%% %.2f%%" % (k, score_val * 100, score_test *100))
    accuracies.append(score_val)

(95, 4)
(17, 4)
k=1, accuracy=88.24% 97.37%
k=3, accuracy=88.24% 94.74%
k=5, accuracy=88.24% 97.37%
k=7, accuracy=94.12% 97.37%
k=9, accuracy=88.24% 97.37%
k=11, accuracy=88.24% 97.37%
k=13, accuracy=88.24% 97.37%
k=15, accuracy=88.24% 97.37%
k=17, accuracy=88.24% 97.37%
k=19, accuracy=88.24% 97.37%
k=21, accuracy=88.24% 94.74%
k=23, accuracy=94.12% 94.74%
k=25, accuracy=88.24% 97.37%
k=27, accuracy=88.24% 97.37%
k=29, accuracy=88.24% 100.00%


Verwenden Sie eine Kreuzvalidierung zur Bestimmung von k



In [136]:
from sklearn.neighbors import KNeighborsClassifier

#model.fit(trainData, trainLabels)

from sklearn.model_selection import cross_validate


kVals = range(1, 30, 2) #start, stop,step
for k in kVals:
    model = KNeighborsClassifier(n_neighbors=k)
    cv_results = cross_validate(model, Xtr, ytr, cv=10)
    avg_score = np.average(cv_results['test_score'])
    print("k=%d, avg_score=%.2f%% " % (k, avg_score * 100,))

k=1, avg_score=95.53% 
k=3, avg_score=95.53% 
k=5, avg_score=96.36% 
k=7, avg_score=96.44% 
k=9, avg_score=97.27% 
k=11, avg_score=97.27% 
k=13, avg_score=96.44% 
k=15, avg_score=95.45% 
k=17, avg_score=94.55% 
k=19, avg_score=94.62% 
k=21, avg_score=95.61% 
k=23, avg_score=94.70% 
k=25, avg_score=95.53% 
k=27, avg_score=94.70% 
k=29, avg_score=96.44% 
