In [11]:
import numpy as np

# Load the comma-separated data set. The context manager guarantees the file
# handle is closed even when np.loadtxt raises while parsing.
with open("iris.csv", "r") as fFloat:
    dataset = np.loadtxt(fFloat, delimiter=",")

# Columns 0-3 are used as features, column 4 as the target label.
x = dataset[:,0:4]
y = dataset[:,4]

# Reproducible random split: 80 % of the rows are drawn (without replacement)
# for training, every remaining row index forms the test set.
percentTrainingset = 0.8
np.random.seed(42)
TrainSet = np.random.choice(x.shape[0], int(x.shape[0]*percentTrainingset), replace=False)
XTrain = x[TrainSet,:]
YTrain = y[TrainSet]
TestSet = np.delete(np.arange(0, len(y)), TrainSet)
XTest = x[TestSet,:]
YTest = y[TestSet]

In [2]:
def plainKNNclassification(xTrain, yTrain, xQuery, k, normOrd=None):
    """Classify one query point by majority vote among its k nearest neighbours.

    All features are min-max scaled with the per-column minimum and range of
    the training data before distances are computed, so every column
    contributes on a comparable scale.

    Parameters
    ----------
    xTrain : array of shape (n_samples, n_features)
        Training features.
    yTrain : array of shape (n_samples,)
        Training labels, aligned with the rows of ``xTrain``.
    xQuery : array of shape (n_features,)
        The single point to classify.
    k : int
        Number of nearest neighbours that take part in the vote.
    normOrd : optional
        Passed as ``ord`` to ``np.linalg.norm``; ``None`` selects the
        Euclidean norm.

    Returns
    -------
    The label that occurs most often among the k nearest neighbours. On a
    tie, the smallest of the tied labels is returned (``np.unique`` sorts the
    labels and ``np.argmax`` picks the first maximum).
    """
    xTrain = np.asarray(xTrain)
    yTrain = np.asarray(yTrain)

    xMin = xTrain.min(axis=0)
    xRange = xTrain.max(axis=0) - xMin
    # A constant column has range 0; dividing by it would turn every distance
    # into NaN. Such a column carries no information, so leave it unscaled.
    xRange = np.where(xRange == 0, 1, xRange)

    xTrain = (xTrain - xMin) / xRange
    xQuery = (xQuery - xMin) / xRange

    # One distance per training row.
    dist = np.linalg.norm(xTrain - xQuery, axis=1, ord=normOrd)
    knearest = np.argsort(dist)[:k]

    # Vote on the labels that were passed in (yTrain), not on a module-level
    # array that merely happens to have a similar name.
    (classification, counts) = np.unique(yTrain[knearest], return_counts=True)
    return classification[np.argmax(counts)]

In [3]:
# Classify every test sample with k = 3, count the misclassifications and
# report each one as "<features> was classified as <predicted> instead of <true>".
errors = 0
for i, expected in enumerate(YTest):
    myClass = plainKNNclassification(XTrain, YTrain, XTest[i,:], 3)
    if myClass == expected:
        continue
    errors += 1
    print('%s wurde als %d statt %d klassifiziert' % (str(XTest[i,:]), myClass, expected))

[4.9 2.5 4.5 1.7] wurde als 2 statt 3 klassifiziert
[7.2 3.  5.8 1.6] wurde als 2 statt 3 klassifiziert


In [4]:
# Motivate weighted average of closest nodes using two moon problems
def twoMoonsProblems(SamplesPerMoon=240, pNoise=2):
    """Generate the two interleaving half-circles ("two moons") toy data set.

    Moon 0 is the upper half of the unit circle around the origin; moon 1 is
    a lower half-circle of radius 1 around (1, 0.5). Gaussian noise with a
    standard deviation of ``pNoise`` percent is added to every coordinate
    using numpy's global random state.

    Returns a tuple ``(X, Y)`` where ``X`` has shape (2*SamplesPerMoon, 2) and
    ``Y`` holds 0.0 for the points of moon 0 followed by 1.0 for moon 1.
    """
    angles = np.linspace(0, np.pi, SamplesPerMoon)

    # Each moon is built as a (2, SamplesPerMoon) array: row 0 = x, row 1 = y.
    upperMoon = np.vstack((np.cos(angles), np.sin(angles)))
    lowerMoon = np.vstack((1 - np.cos(angles), 0.5 - np.sin(angles)))

    # Put the moons side by side and transpose to one point per row.
    points = np.hstack((upperMoon, lowerMoon)).T
    jitter = np.random.normal(size=points.shape)
    points = points + pNoise/100*jitter

    labels = np.hstack([np.zeros(SamplesPerMoon), np.ones(SamplesPerMoon)])
    return points, labels

In [7]:
# Further, we utilize a kd tree to reduce the lookup time
from scipy.spatial import KDTree

class knnRegression:
    """k-nearest-neighbour regression with inverse-distance weighting.

    Features are min-max scaled with the range of the training data and the
    scaled training points are stored in a KD-tree so that neighbour lookups
    do not need a full scan.
    """

    def fit(self, X, Y):
        """Store the training data and build the KD-tree.

        X : array of shape (n_samples, n_features), training features.
        Y : array of shape (n_samples,), training targets aligned with X.
        """
        X = np.asarray(X)
        self.xMin = X.min(axis=0)
        # Scale with the maximum of the data passed in, not of a module-level
        # array that merely happens to be called ``x``.
        self.xMax = X.max(axis=0)
        self.XTrain = self._scale(X)
        self.kdTree = KDTree(self.XTrain)
        self.YTrain = np.asarray(Y)

    def _scale(self, X):
        """Min-max scale X with the training minimum and range."""
        span = self.xMax - self.xMin
        # A constant training column has range 0; leave it unscaled instead of
        # producing NaN through 0/0.
        span = np.where(span == 0, 1, span)
        return (X - self.xMin) / span

    def predict(self, X, k=3, smear = 1):
        """Predict targets as the weighted mean of the k nearest neighbours.

        Each neighbour gets the weight ``1 / (distance + smear/k)``; the
        weights of one query are normalised to sum to 1. A larger ``smear``
        flattens the weights towards a plain average.

        X : array of shape (n_queries, n_features), or a single point of
            shape (n_features,).
        k : number of neighbours to average over.
        smear : smoothing term; with ``smear=0`` a neighbour at distance 0
            causes a division by zero.

        Returns an array of shape (n_queries,).
        """
        X = self._scale(np.atleast_2d(np.asarray(X)))
        (dist, neighbours) = self.kdTree.query(X, k)
        # KDTree.query drops the neighbour axis for k == 1; restore it so the
        # row-wise operations below work for every k.
        dist = dist.reshape(len(X), -1)
        neighbours = neighbours.reshape(len(X), -1)

        weights = 1/(dist + smear/k)
        weights = weights / weights.sum(axis=1, keepdims=True)
        # Use the targets stored by fit(), not a module-level YTrain.
        return np.sum(weights * self.YTrain[neighbours], axis=1)

In [8]:
# Experiment settings: number of points, relative noise in percent,
# neighbour count and smear for the regressor, share of training rows.
samples, pNoise = 5000, 1
myK, mysmear = 3, 0.5
percentTrainingset = 0.8

# Synthetic data: uniform points in the unit square. The target is close to
# +1 inside the circle of radius 1/4 around (0.5, 0.5) and close to -1
# outside of it, with a steep tanh transition on the circle itself.
np.random.seed(42)
x = np.random.rand(samples, 2)
y = np.tanh(500 * ((1/16) - (x[:,0]-0.5)**2 - (x[:,1]-0.5)**2))
# Multiplicative Gaussian noise of pNoise percent.
Noise = np.random.normal(size=y.shape[0])
y = y * (1 + Noise*pNoise/100)

# Random split without replacement; rows not drawn for training form the test set.
TrainSet = np.random.choice(samples, int(samples*percentTrainingset), replace=False)
TestSet = np.delete(np.arange(len(y)), TrainSet)
XTrain, YTrain = x[TrainSet, :], y[TrainSet]
XTest, YTest = x[TestSet,:], y[TestSet]

# Fit, predict on the held-out rows and report the mean absolute error.
myRegression = knnRegression()
myRegression.fit(XTrain, YTrain)
yP = myRegression.predict(XTest, k=myK, smear=mysmear)
diff = yP - YTest
MAE = np.abs(diff).mean()
print(MAE) # Mean absolute error

0.024800797417177438


In [23]:
# TODO: Adapt classification into a similar class format using a weighted distance choice

class knnClassification:
    """k-nearest-neighbour classification with an inverse-distance weighted vote.

    Features are min-max scaled with the range of the training data and the
    scaled training points are stored in a KD-tree for fast neighbour lookup.
    """

    def fit(self, X, Y):
        """Store the training data and build the KD-tree.

        X : array of shape (n_samples, n_features), training features.
        Y : array of shape (n_samples,), class labels aligned with X.
        """
        X = np.asarray(X)
        self.xMin = X.min(axis=0)
        # Scale with the maximum of the data passed in, not of a module-level
        # array that merely happens to be called ``x``.
        self.xMax = X.max(axis=0)
        self.XTrain = self._scale(X)
        self.kdTree = KDTree(self.XTrain)
        self.YTrain = np.asarray(Y)

    def _scale(self, X):
        """Min-max scale X with the training minimum and range."""
        span = self.xMax - self.xMin
        # A constant training column has range 0; leave it unscaled instead of
        # producing NaN through 0/0.
        span = np.where(span == 0, 1, span)
        return (X - self.xMin) / span

    @staticmethod
    def _weightedVote(labels, weights):
        """Return the label with the largest summed weight.

        Labels are accumulated nearest-first, so on an exact tie the class of
        the closer neighbour wins.
        """
        totals = {}
        for label, weight in zip(labels, weights):
            totals[label] = totals.get(label, 0.0) + weight
        return max(totals, key=totals.get)

    def predict(self, X, k=3, smear=1):
        """Predict the class of one point or of a batch of points.

        Each of the k nearest neighbours votes for its class with the weight
        ``1 / (distance + smear/k)`` (normalised per query); the class with
        the largest total weight wins. A larger ``smear`` moves the decision
        towards a plain majority vote.

        X : a single point of shape (n_features,) or a batch of shape
            (n_queries, n_features).
        k : number of neighbours that vote.
        smear : non-negative smoothing term; with ``smear=0`` a neighbour at
            distance 0 causes a division by zero.

        Returns a single label for a 1-D ``X`` and an array of shape
        (n_queries,) for a 2-D ``X``.
        """
        X = np.asarray(X)
        singleQuery = X.ndim == 1
        X = self._scale(np.atleast_2d(X))

        (dist, neighbours) = self.kdTree.query(X, k)
        # KDTree.query drops the neighbour axis for k == 1; restore it so that
        # every query owns exactly one row of distances and indices.
        dist = dist.reshape(len(X), -1)
        neighbours = neighbours.reshape(len(X), -1)

        # Normalise the weights per query (row-wise), not over the whole batch.
        weights = 1/(dist + smear/k)
        weights = weights / weights.sum(axis=1, keepdims=True)

        # Look the labels up in the data stored by fit(), not in a
        # module-level YTrain.
        neighbourLabels = self.YTrain[neighbours]
        predictions = np.array(
            [self._weightedVote(row, w) for row, w in zip(neighbourLabels, weights)],
            dtype=self.YTrain.dtype,
        )
        return predictions[0] if singleQuery else predictions
        
        

In [24]:
# Fit the class-based classifier and report every misclassified test sample as
# "<features> was classified as <predicted> instead of <true>".
# NOTE(review): XTrain/YTrain/XTest/YTest are rebound to the synthetic
# regression data by the regression experiment cell further up; for an iris
# evaluation the iris loading cell has to be the last one that assigned them.
errors = 0
knn = knnClassification()
knn.fit(XTrain, YTrain)
for i, expected in enumerate(YTest):
    myClass = knn.predict(XTest[i,:])
    if myClass == expected:
        continue
    errors += 1
    print('%s wurde als %d statt %d klassifiziert' % (str(XTest[i,:]), myClass, expected))

[4.9 2.5 4.5 1.7] wurde als 2 statt 3 klassifiziert
[7.2 3.  5.8 1.6] wurde als 2 statt 3 klassifiziert
