In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Task 01
# 1.a: load data frame from file, store into df
df = pd.read_csv('KNNAlgorithmDataset.csv')
#print(df)

#1.c: shuffle the samples
df = df.sample(frac=1)
# frac = [0,1], incase frac = 0.5, shuffle 50% of the rows
#print(df)

# convert data frame into array
data_array = df.values[:, 1:]  #unselect 1st column / label y

# 1.b: set column 'diagnosis' as target variable y / the label
y = df["diagnosis"].values
#print(y) 

In [3]:
# Task 01: normalizing the data with z-score normalization

mean = data_array.mean(axis=0)
standard_deviation = data_array.std(axis=0)

normalized_data = (data_array - mean) / standard_deviation

print(normalized_data)

[[-1.12108689 -0.40948202 -1.10591706 ... -0.89618001  0.24953342
   0.22300338]
 [ 0.09165373  0.21649904  0.10383912 ...  1.05358578  2.9965249
   0.96169623]
 [ 0.85565191 -0.6724406   0.98984033 ...  2.13771951  1.88510962
   1.21660899]
 ...
 [-0.09011536  1.03795373 -0.01684817 ...  1.32005124  2.47721732
   1.36623169]
 [ 3.29533386 -0.42577149  3.38710998 ...  2.45138742  1.27682411
   0.23297823]
 [ 2.87499285  0.2118449   3.05758838 ...  1.67787627  0.51970278
  -0.21367326]]


In [4]:
# Task 02: seperate data into training(70%), validation(20%) and test(10%) sets by slicing
da_len = len(data_array)   #number of rows / length of data_array

training_set = data_array[:int(da_len * 0.7)]
training_label = y[:int(da_len * 0.7)]

#print(training_set)
validation_set = data_array[int(da_len * 0.7): int(da_len * 0.9)]
validation_label = y[int(da_len * 0.7): int(da_len * 0.9)]

test_set = data_array[int(da_len* 0.9):]
test_label = y[int(da_len* 0.9):]
#print(len(training_set) + len(validation_set) + len(test_set) == da_len) # the whole data array included in the 3 sets: True 

In [5]:
# 3.c distance algorithms with input of two 1D-Vectors, return distance (real number) of 2 vectors(x1,...,xn)
# Manhattan distance
def manhattan_distance(vector01, vector02):
    # raise error if data points are not of the same length
    if (len(vector01) != len(vector02)):
        raise ValueError("Unequal length of inputed data points")
    # return sum(\xi_m - xj_m\)
    return np.sum(np.abs(vector01 - vector02))    # np.abs(): absolute value of each elements
    # a variante: use np.fabs(v1, v2), return np.sum(np.fabs(vector01 - vector02))

# Euclidean distance
def euclidean_distance(vector01, vector02):
    if (len(vector01) != len(vector02)):
        raise ValueError("Unequal length of inputed data points")
    # use np library: calculate the norm of the new vector (difference of two old vectors)
    return np.linalg.norm(vector01 - vector02)

# Chebyshev distance
def chebyshev_distance(vector01, vector02):
    if (len(vector01) != len(vector02)):
        raise ValueError("Unequal length of inputed data points")
    # return max(\xi_01 - xi_02\)
    absolute_array = [np.abs(vector01[i] - vector02[i]) for i in range(len(vector01))]
    return max(absolute_array)


# test algo with data: 
#t_p01 = training_set[0]
#t_p02 = training_set[1]
#t_short01 = [1.1,2,4.43,8]
#t_short02 = [3,4.7,6,9.69]

#print(manhattan_distance(t_p01, t_p02))    # test (compared with result by online calculator): True
#print(euclidean_distance(t_p01, t_p02))    # definition from numpy library
#print(chebyshev_distance(t_short01, t_short02))    # test: True

In [6]:
# task 3.2: check if user inputs are valid
def check_predictor_input(k, d):
    valid_dist_algos = ['manhattan', 'euclidean', 'chebyshev']
    # there are k training set rows / neighbours for the new data point
    if k > len(training_set):
        raise ValueError(f"Given k too large! Give a number not larger than {len(training_set)}")
    if d.lower() not in valid_dist_algos:
        raise ValueError(f"Given algorithm is not valid, please choose from {valid_dist_algos}!")
        
#test: passed
#check_predictor_input(100, 'manha')
#check_predictor_input(400, 'chebyshev')

In [7]:
# Task 3: implement the kNN algorithm
# accroding to 2.2 kNN algo is capsuled in a kNN predictor
class kNN_predictor:
    #constructor: define attribiutes
    def __init__(self, k, distance_func):    #q: if the k and dist-func is given as param in constructor or later in predict func?
        check_predictor_input(k, distance_func)
        self.trained = False   #default: not trained
        self.k = k  # k neighbours
        if (distance_func == 'manhattan'):
            self.dname = 'Manhattan distance'
            self.d = manhattan_distance
        elif (distance_func == 'euclidean'):
            self.dname = 'Euclidean distance'
            self.d = euclidean_distance
        elif (distance_func == 'chebyshev'):
            self.dname = 'Chebyshev distance'
            self.d = chebyshev_distance
        else:
            raise ValueError("Invalid function")
        # X(data set) and Y(classes / target values) will be provided in fit() as parameters
        self.X = None
        self.Y = None
        # for the normalization
        self.mean = None
        self.std = None
    
    def normalize(self, X):
        if self.trained == False:
            raise("Predictor is not trained! Train it first by calling fit()!")
        return (X - self.mean) / self.std
    
    # functions
    def fit(self, training_X, training_Y):
        #feed the model with training data
        # TODO: normalize X before storing
        self.X = training_X
        self.Y = training_Y
        self.mean = self.X.mean(axis=0)
        self.std = self.X.std(axis=0)
        self.trained = True   #set trained

        self.X = self.normalize(self.X)
        

    def predict(self, X, thresh=0.5):
        # check if predictor is trained
        if self.trained == False:
            raise("Predictor is not trained! Train it first by calling fit()!")
        
        # TODO: call normalize() to normalize the data
        X = self.normalize(X)
        # TODO: calculate the distance with all the neighbours
        kNN = []
        for x1 in X:
            dists = []
            for x2 in self.X:
                dist = self.d(x1, x2)
                dists.append(dist)
            nearest_indexes = np.argsort(dists)
            nearest_labels = self.Y[nearest_indexes]
            prob = np.mean(nearest_labels[1:self.k+1])
            kNN.append(prob)
        return np.array(kNN) > thresh

    def confusionMatrix(self, X, Y): 
        #compute confusion matrix, input: X (user input)
        '''
        if not self.Y:
            raise error?
        # and validate X and Y shape
        # should be Y given or taken from fit?
            '''
        X = np.array(X)
        Y = np.array(Y)
        pY = np.array(self.predict(X))

        size = len(X)
        # calculate entries of the confusion matrix by counting entries where the corresponding condition is given and make it relative to the whole set
        TP = len(np.where((Y == 1) & (pY == 1))[0])/size
        TN = len(np.where((Y == 0) & (pY == 0))[0])/size
        FP = len(np.where((Y == 0) & (pY == 1))[0])/size
        FN = len(np.where((Y == 1) & (pY == 0))[0])/size
        return [[TP, FN], [FP, TN]]
  

In [9]:
pred = kNN_predictor(4, 'manhattan')
print(training_set[:,0],training_set[:,1:])
pred.fit(training_set[:,0], training_set[:,1:])
pred.predict([data_array[0]])

[10.18  14.45  17.14  12.31   9.333 12.05  10.26  15.53  12.27  13.61
 12.4   11.5   12.21  14.22  10.16  11.04  20.57   9.676 15.75  16.35
 21.71  18.46  12.9   23.27  18.03  12.77  14.06  13.54  13.15  12.34
 19.59  11.68  14.59  11.61  13.85  13.96  11.54  15.49  11.71  17.85
 15.12  14.04  15.46  17.2   15.78  17.06  20.51  12.32   9.755 17.29
 10.66  14.58  15.28  12.95  12.88  14.86  15.    10.44  23.29  12.91
 13.    14.64  14.29  18.08  13.21  20.92  14.86  18.22  12.72  11.71
 12.03  20.34  16.3   12.8    9.904 15.75  19.55  11.45  10.82  11.89
 10.48  13.82  16.13  15.05  10.88  11.93  11.06  12.36  15.85  12.46
 13.65  12.1   19.79  12.18  11.13   8.598  8.95  11.84  20.55  11.47
 11.08  10.08  13.46  13.53   8.618 13.68   8.734 13.88  13.48  13.94
 11.94  18.49  12.    20.58  12.3    9.668 16.6   17.57  13.51  14.97
 10.26   9.787 13.28  15.78  10.65  12.89  10.94  11.93  14.53  13.05
 11.95  21.56  18.25  12.85  17.6    8.219 13.27  11.22  13.05  14.26
 12.47  27.22  14.74

TypeError: object of type 'numpy.float64' has no len()

In [None]:
# Task 4: test the data with user inputs

In [None]:
# Task 5: visualization