In [None]:
%autosave 15
%matplotlib inline

import numpy as np
import scipy as sp
import pandas as pd
import math
import random

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
def printAllPoints(data, classes):
    """Scatter-plot 2-D points, coloured by their class label.

    Parameters
    ----------
    data : iterable of (x, y) pairs
        Points to draw.
    classes : mapping
        Maps each ``(x, y)`` tuple to its label. Points whose label equals 0
        are drawn as green dots, every other point as a blue dot.
    """
    zero_xs, zero_ys = [], []
    other_xs, other_ys = [], []
    for px, py in data:
        # Pick the coordinate buckets for this point's class, then fill them.
        if classes[(px, py)] == 0:
            xs, ys = zero_xs, zero_ys
        else:
            xs, ys = other_xs, other_ys
        xs.append(px)
        ys.append(py)
    plt.plot(zero_xs, zero_ys, 'g.', other_xs, other_ys, 'b.')

In [None]:
def get_data(transform, name, path='chips.txt'):
    """Load labelled 2-D points from a comma-separated file and transform them.

    Each non-blank line of the file must hold three comma-separated numbers:
    ``x, y, label``.

    Parameters
    ----------
    transform : callable
        Called as ``transform(x, y)``; must return the pair of transformed
        coordinates ``(a, b)``.
    name : str
        Human-readable name of the transformation; returned unchanged.
    path : str, optional
        File to read. Defaults to ``'chips.txt'``.

    Returns
    -------
    tuple
        ``(data, classes, name)`` where ``data`` is the list of transformed
        ``(a, b)`` tuples in file order and ``classes`` maps each ``(a, b)``
        tuple to its label. If several input points map to the same ``(a, b)``
        the label of the last one wins in ``classes``.
    """
    data = list()
    classes = dict()

    # The context manager closes the file even if parsing fails part-way.
    with open(path, 'r') as fin:
        for line in fin:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at EOF
            x, y, z = map(float, line.split(','))
            a, b = transform(x, y)
            data.append((a, b))
            classes[(a, b)] = z

    return (data, classes, name)

In [None]:
def toPolar(x, y, x0, y0):
    """Convert the point (x, y) to polar coordinates around the origin (x0, y0).

    Returns
    -------
    tuple
        ``(r, a)`` where ``r`` is the Euclidean distance from ``(x0, y0)`` and
        ``a`` is the angle in radians, in ``[-pi, pi]``, measured
        counter-clockwise from the positive x axis.
    """
    dx, dy = x - x0, y - y0
    r = math.hypot(dx, dy)
    # math.atan2 takes (y, x): passing (x, y) would measure the angle from the
    # y axis instead of the conventional polar angle.
    a = math.atan2(dy, dx)
    return (r, a)

def getCenter(path='chips.txt'):
    """Return the centroid of the points stored in a comma-separated file.

    Each non-blank line must hold three comma-separated numbers
    ``x, y, label``; the label is parsed but not used.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'chips.txt'``.

    Returns
    -------
    tuple
        ``(x0, y0)``: the mean x and mean y over all data rows.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    """
    sum_x, sum_y = 0, 0
    count = 0

    # The context manager closes the file even if parsing fails part-way.
    with open(path, 'r') as fin:
        for line in fin:
            if not line.strip():
                continue  # blank lines are not data rows and must not be counted
            x, y, _label = map(float, line.split(','))
            sum_x += x
            sum_y += y
            count += 1

    if count == 0:
        raise ValueError('no data rows found in %r' % (path,))

    return (sum_x / count, sum_y / count)

In [None]:
# Centroid of the raw points; used as the origin of the second polar transform.
center = getCenter()

# Every dataset is the (data, classes, name) triple returned by get_data.
simpleData = get_data(lambda px, py: (px, py), '-')
sumData = get_data(lambda px, py: (px, px + py), 'x, y -> x, x + y')
polarData = get_data(lambda px, py: toPolar(px, py, 0, 0), 'polar(0, 0)')
polarData2 = get_data(
    lambda px, py: toPolar(px, py, *center),
    'polar(center_x, center_y)',
)

# All feature-space variants that the kNN search iterates over.
data = [simpleData, sumData, polarData, polarData2]

In [None]:
def _wide_transform(x, y):
    """Map (x, y) to (5 * r, r + a), with (r, a) the polar coordinates around `center`."""
    # Convert once instead of recomputing the polar form for every component.
    r, a = toPolar(x, y, center[0], center[1])
    return (r * 5, r + a)


d = get_data(_wide_transform, 'wide')
printAllPoints(d[0], d[1])

# Replace any previous 'wide' entry (in place) instead of appending blindly,
# so re-running this cell does not add duplicate datasets to `data`.
data[:] = [entry for entry in data if entry[2] != 'wide'] + [d]

In [None]:
# Untransformed points: the first two items of the triple are (data, classes).
printAllPoints(*simpleData[:2])

In [None]:
# Points after the (x, y) -> (x, x + y) transform.
printAllPoints(*sumData[:2])

In [None]:
# Points in polar coordinates around (0, 0).
printAllPoints(*polarData[:2])

In [None]:
# Points in polar coordinates around the centroid returned by getCenter().
printAllPoints(*polarData2[:2])

In [None]:
def minkowskiDistance(x, y, p):
    """Minkowski distance of order ``p`` between two coordinate sequences.

    Computes ``(sum_i |x[i] - y[i]| ** p) ** (1 / p)`` over the indices of
    ``x``; ``y`` is indexed with the same positions.
    """
    total = 0
    for idx, x_i in enumerate(x):
        total += abs(x_i - y[idx]) ** p
    return total ** (1 / p)

# https://en.wikipedia.org/wiki/Cosine_similarity
def cosineSimilarity(x, y):
    """Cosine similarity of two coordinate sequences.

    Returns the dot product of ``x`` and ``y`` divided by the Euclidean norm
    of each, iterating over the indices of ``x``.
    """
    dot = 0
    sq_norm_x = 0
    sq_norm_y = 0
    for idx, x_i in enumerate(x):
        y_i = y[idx]
        dot += x_i * y_i
        sq_norm_x += x_i ** 2
        sq_norm_y += y_i ** 2
    norm_x = sq_norm_x ** (1 / 2)
    norm_y = sq_norm_y ** (1 / 2)
    return dot / norm_x / norm_y
    

# Distance functions tried by the kNN search: Minkowski of order 1 and order 2.
# cosineSimilarity is currently not registered here.
metrics = [
    lambda x, y: minkowskiDistance(x, y, 1),
    lambda x, y: minkowskiDistance(x, y, 2),
]

# Display name for every metric, keyed by the function object itself.
metric_names = {
    metrics[0]: 'minkowski with p = 1',
    metrics[1]: 'minkowski with p = 2',
}

In [None]:
def k_fold_cv(k, length):
    """Randomly split the indices ``0 .. length - 1`` into ``k`` folds.

    The indices are shuffled once and cut into ``k`` consecutive slices. The
    first ``length % k`` folds receive one extra index, so fold sizes differ
    by at most one.

    Parameters
    ----------
    k : int
        Number of folds.
    length : int
        Number of samples to split.

    Returns
    -------
    list of tuple
        One ``(rest, held_out)`` pair per fold. ``held_out`` lists the indices
        of the fold itself (in shuffled order); ``rest`` lists every other
        index in ascending order.
    """
    # One shuffle replaces repeated random draws plus list.remove calls.
    shuffled = np.random.permutation(length).tolist()
    base_size, extra = divmod(length, k)

    result = list()
    start = 0
    for fold_no in range(k):
        size = base_size + (1 if fold_no < extra else 0)
        held_out = shuffled[start:start + size]
        start += size

        # Set membership keeps building the complement linear in `length`.
        held_out_set = set(held_out)
        rest = [j for j in range(length) if j not in held_out_set]

        result.append((rest, held_out))
    return result

In [None]:
def predict_class(k, metric, learn_suit, classes, point, kernel):
    """Classify ``point`` by a kernel-weighted vote of its ``k`` nearest neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. If ``learn_suit`` holds fewer than
        ``k`` points, all of them vote.
    metric : callable
        ``metric(a, b)`` returns the distance between two points.
    learn_suit : sequence
        Labelled points to search for neighbours.
    classes : mapping
        Maps each point of ``learn_suit`` to its label; ``int(label)`` must
        be 0 or 1.
    point
        The point to classify.
    kernel : callable
        Weight function applied to ``distance / window_width``.

    Returns
    -------
    int
        0 if class 0 collected strictly more weight, otherwise 1 (ties go to 1).

    Raises
    ------
    ValueError
        If ``learn_suit`` is empty.
    """
    distances = [(metric(l_point, point), l_point) for l_point in learn_suit]
    if not distances:
        raise ValueError('learn_suit must contain at least one point')
    distances.sort()

    # Window width is the distance to the (k + 1)-th nearest point. When fewer
    # than k + 1 points exist, fall back to the farthest one instead of
    # raising IndexError.
    d = distances[min(k, len(distances) - 1)][0]
    if (d == 0):
        d = 1  # avoid dividing by zero when the neighbours coincide with `point`

    voters = max(0, min(k, len(distances)))
    s = [0, 0]
    for dist, p in distances[:voters]:
        s[int(classes[p])] += kernel(dist / d)

    return 0 if s[0] > s[1] else 1

In [None]:
def kNN(metric, data, classes, kernel, cv_params = (1, 10), k_values = range(2, 13)):
    """Select the neighbour count ``k`` with the best cross-validated accuracy.

    Parameters
    ----------
    metric : callable
        Distance function forwarded to ``predict_class``.
    data : sequence
        The points; each must be a key of ``classes``.
    classes : mapping
        Maps each point to its label.
    kernel : callable
        Weight function forwarded to ``predict_class``.
    cv_params : tuple, optional
        ``(tfold, kfold)``: the cross-validation is repeated ``tfold`` times,
        each time with a fresh random split into ``kfold`` folds.
    k_values : iterable of int, optional
        Candidate values of ``k``. Defaults to ``range(2, 13)``.

    Returns
    -------
    tuple
        ``(best_k, best_score)`` where ``best_score`` is the mean accuracy of
        ``best_k``. The first candidate reaching the highest score wins.
        ``(0, 0)`` is returned when no candidate scores above zero.

    Raises
    ------
    ValueError
        If every validation fold is empty (e.g. ``data`` is empty).
    """

    def compute_score(k, fit_points, val_points):
        """Accuracy of predict_class on ``val_points`` when fitted on ``fit_points``."""
        hits = 0
        for point in val_points:
            predicted = predict_class(k, metric, fit_points, classes, point, kernel)
            if predicted == classes[point]:
                hits += 1
        return hits / len(val_points)

    tfold, kfold = cv_params
    max_k = 0
    max_score = 0
    for k in k_values:
        average_score = 0
        for _ in range(tfold):
            # k_fold_cv yields (rest, held_out) index lists. Folds with an
            # empty held-out part (kfold > len(data)) carry no information
            # and would divide by zero, so they are skipped.
            folds = [(fit_idx, val_idx)
                     for fit_idx, val_idx in k_fold_cv(kfold, len(data))
                     if val_idx]
            if not folds:
                raise ValueError('data must contain at least one point')

            score = 0
            for fit_idx, val_idx in folds:
                fit_points = [data[i] for i in fit_idx]
                val_points = [data[i] for i in val_idx]
                score += compute_score(k, fit_points, val_points)
            average_score += score / len(folds)

        average_score /= tfold

        if average_score > max_score:
            max_k = k
            max_score = average_score

    return (max_k, max_score)

In [None]:
# Weighting kernels as (function, display name) pairs. Each function receives
# the neighbour distance divided by the window width (see predict_class).
kernels = list(zip(
    (
        lambda u: u,
        lambda u: 1 - abs(u),
        lambda u: 3 / 4 * (1 - u * u),
        lambda u: (1 - u ** 2) ** 2 * 15 / 16,
    ),
    ('x -> x', 'triangular', 'parabolic', 'quartic'),
))

In [None]:
# Evaluate every (kernel, metric, transformation) combination. The rows are
# collected in a plain list and the frame is built once: DataFrame.append was
# removed in pandas 2.0 and growing a frame row by row is quadratic.
records = []

for kernel in kernels:
    for metric in metrics:
        for input_data, classes, transform_name in data:
            k, accuracy = kNN(metric, input_data, classes, kernel[0])
            records.append({
                'k': k,
                'metric': metric_names[metric],
                'accuracy': accuracy,
                'transformation name': transform_name,
                'kernel': kernel[1],
            })

results = pd.DataFrame(
    records,
    columns=['k', 'metric', 'accuracy', 'transformation name', 'kernel'],
)

display(results)

In [None]:
# The fourth argument of kNN is the weighting kernel, called with one number.
# Passing predict_class here raised TypeError, so use a kernel from `kernels`.
# NOTE(review): the triangular kernel is an assumption - pick the intended one.
kNN(metrics[0], data[0][0], data[0][1], kernels[1][0], cv_params=(1, 10))