In [None]:
%autosave 15
%matplotlib inline

import numpy as np
import scipy as sp
import math
import random

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

In [None]:
def printAllPoints(data, classes):
    """Plot 2-D points, class 0 as red dots and every other class as blue dots.

    Parameters
    ----------
    data : iterable of (x, y) pairs
        Points to draw.
    classes : mapping
        Maps each ``(x, y)`` tuple to its label; a label equal to 0 is drawn
        red, anything else blue.
    """
    zero_xs, zero_ys = [], []
    other_xs, other_ys = [], []
    for px, py in data:
        # Choose the pair of coordinate lists matching this point's label.
        if classes[(px, py)] == 0:
            xs, ys = zero_xs, zero_ys
        else:
            xs, ys = other_xs, other_ys
        xs.append(px)
        ys.append(py)
    plt.plot(zero_xs, zero_ys, 'r.', other_xs, other_ys, 'b.')

In [None]:
def get_data(path='chips.txt'):
    """Load labelled 2-D points from a comma-separated text file.

    Every non-blank line must hold three comma-separated numbers
    ``x,y,label``; blank lines are skipped.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'chips.txt'``.

    Returns
    -------
    tuple
        ``(data, classes)``: ``data`` is the list of ``(x, y)`` float tuples
        in file order, ``classes`` maps each ``(x, y)`` tuple to its label as
        a float. If a point occurs more than once, the label of its last
        occurrence wins.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly three numbers.
    """
    data = []
    classes = {}

    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r') as fin:
        for line in fin:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            x, y, z = map(float, line.split(','))
            data.append((x, y))
            classes[(x, y)] = z

    return (data, classes)

# Load the raw points and plot them as-is.
simpleData = get_data()
printAllPoints(*simpleData)

In [None]:
def get_sum_data(path='chips.txt'):
    """Load labelled points and replace the second coordinate by ``x + y``.

    Every non-blank line must hold three comma-separated numbers
    ``x,y,label``; blank lines are skipped.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'chips.txt'``.

    Returns
    -------
    tuple
        ``(data, classes)``: ``data`` is the list of ``(x, x + y)`` float
        tuples in file order, ``classes`` maps each of those tuples to its
        label as a float (last occurrence wins for repeated points).

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly three numbers.
    """
    data = []
    classes = {}

    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r') as fin:
        for line in fin:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            x, y, z = map(float, line.split(','))
            transformed = (x, x + y)
            data.append(transformed)
            classes[transformed] = z

    return (data, classes)

# Load the (x, x + y) variant of the points and plot it.
sumData = get_sum_data()
printAllPoints(*sumData)

In [None]:
def get_polar_data(x0, y0, path='chips.txt'):
    """Load labelled points and convert them to (radius, angle) around ``(x0, y0)``.

    Every non-blank line must hold three comma-separated numbers
    ``x,y,label``; blank lines are skipped.

    Parameters
    ----------
    x0, y0 : float
        Origin that is subtracted from every point before the conversion.
    path : str, optional
        File to read. Defaults to ``'chips.txt'``.

    Returns
    -------
    tuple
        ``(data, classes)``: ``data`` is the list of ``(r, a)`` float tuples
        in file order, ``classes`` maps each ``(r, a)`` tuple to its label as
        a float (last occurrence wins for repeated points).

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly three numbers.
    """
    data = []
    classes = {}

    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r') as fin:
        for line in fin:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            x, y, z = map(float, line.split(','))
            dx = x - x0
            dy = y - y0
            r = math.hypot(dx, dy)
            # NOTE(review): the arguments are (dx, dy), so the angle is
            # measured from the +y axis; the conventional polar angle would be
            # math.atan2(dy, dx). Kept unchanged to preserve existing output -
            # confirm which one is intended.
            a = math.atan2(dx, dy)
            data.append((r, a))
            classes[(r, a)] = z

    return (data, classes)

# Polar view of the points around the fixed origin (0.2, 0.2).
polarData = get_polar_data(x0=0.2, y0=0.2)
printAllPoints(*polarData)

In [None]:
def get_polar_data2(path='chips.txt'):
    """Load labelled points and convert them to (radius, angle) around their centroid.

    Every non-blank line must hold three comma-separated numbers
    ``x,y,label``; blank lines are skipped. The centroid (mean x, mean y) is
    printed before the conversion.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'chips.txt'``.

    Returns
    -------
    tuple
        ``(data, classes)``: ``data`` is the list of ``(r, a)`` float tuples
        in file order, ``classes`` maps each ``(r, a)`` tuple to its label as
        a float (last occurrence wins for repeated points). A file without
        any points yields ``([], {})`` and nothing is printed.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly three numbers.
    """
    # Parse once; both the centroid and the conversion reuse these rows.
    with open(path, 'r') as fin:
        rows = [tuple(map(float, line.split(','))) for line in fin if line.strip()]
    for row in rows:
        if len(row) != 3:
            raise ValueError('expected 3 comma-separated values, got %d' % len(row))

    data = []
    classes = {}
    if not rows:
        return (data, classes)  # no centroid exists for an empty data set

    # Divide by the number of parsed points, not the number of raw lines.
    x0 = sum(x for x, _, _ in rows) / len(rows)
    y0 = sum(y for _, y, _ in rows) / len(rows)

    print(x0, y0)

    for x, y, z in rows:
        dx = x - x0
        dy = y - y0
        r = math.hypot(dx, dy)
        # NOTE(review): the arguments are (dx, dy), so the angle is measured
        # from the +y axis; the conventional polar angle would be
        # math.atan2(dy, dx). Kept unchanged to preserve existing output -
        # confirm which one is intended.
        a = math.atan2(dx, dy)
        data.append((r, a))
        classes[(r, a)] = z

    return (data, classes)

# Polar view of the points around their own centroid.
polarData2 = get_polar_data2()
printAllPoints(*polarData2)

In [None]:
def minkowskiDistance(x, y, p):
    """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

    Computes ``(sum_i |x[i] - y[i]| ** p) ** (1 / p)`` over the indices of
    ``x``; ``p = 1`` gives the Manhattan distance and ``p = 2`` the Euclidean
    distance. ``y`` is indexed with the positions of ``x``, so it must be at
    least as long as ``x``.
    """
    total = sum(abs(xi - y[i]) ** p for i, xi in enumerate(x))
    return total ** (1 / p)

# https://en.wikipedia.org/wiki/Cosine_similarity
def cosineSimilarity(x, y):
    """Return the cosine of the angle between vectors ``x`` and ``y``.

    The value is ``dot(x, y) / (|x| * |y|)``, accumulated over the indices of
    ``x``. A zero-length vector makes the division fail with
    ``ZeroDivisionError``.
    """
    dot = 0
    norm_sq_x = 0
    norm_sq_y = 0
    for i, xi in enumerate(x):
        yi = y[i]
        dot += xi * yi
        norm_sq_x += xi ** 2
        norm_sq_y += yi ** 2
    # Divide by each norm in turn, in the same order as the two-step original.
    return (dot / (norm_sq_x ** 0.5)) / (norm_sq_y ** 0.5)
    

# Distance functions for the k-NN code: a SMALLER value must mean "closer",
# because neighbours are chosen by sorting these values in ascending order.
metrics = [lambda x, y: minkowskiDistance(x, y, 1),   # Manhattan (L1)
           lambda x, y: minkowskiDistance(x, y, 2),   # Euclidean (L2)
           # Cosine distance = 1 - cosine similarity. The raw similarity grows
           # as points get more alike, so using it directly would rank the
           # least similar points as the nearest neighbours.
           lambda x, y: 1 - cosineSimilarity(x, y)]

#for metric in metrics:
#    print(metric((1, 1), (2, 2)))

In [None]:
def k_fold_cv(k, length):
    """Randomly split the indices ``0 .. length - 1`` into ``k`` folds.

    The indices are shuffled once and cut into ``k`` consecutive slices; the
    first ``length % k`` folds receive one extra index so every index lands in
    exactly one fold.

    Parameters
    ----------
    k : int
        Number of folds.
    length : int
        Number of samples to split.

    Returns
    -------
    list of (list of int, list of int)
        One ``(rest, held_out)`` pair per fold: ``held_out`` holds the fold's
        own indices (in random order) and ``rest`` every other index in
        ascending order. ``kNN`` fits on ``rest`` and evaluates on
        ``held_out``.

    Raises
    ------
    ZeroDivisionError
        If ``k`` is 0.
    """
    base_size, extra = divmod(length, k)
    # One permutation replaces repeated "draw an index and remove it from the
    # list", which cost O(length) per draw.
    shuffled = np.random.permutation(length).tolist()

    result = []
    start = 0
    for fold in range(k):
        size = base_size + (1 if fold < extra else 0)
        held_out = shuffled[start:start + size]
        start += size

        # A set gives O(1) membership tests while building the complement.
        held_out_lookup = set(held_out)
        rest = [j for j in range(length) if j not in held_out_lookup]

        result.append((rest, held_out))
    return result


def predict_class(k, metric, learn_suit, classes, point):
    """Classify ``point`` by majority vote among its ``k`` nearest neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours to consult. If fewer than ``k`` learning points
        exist, all of them vote.
    metric : callable
        ``metric(a, b)`` returns a value that is smaller for closer points.
    learn_suit : iterable
        Labelled points the neighbours are drawn from.
    classes : mapping
        Maps each learning point to its label; a label equal to 0 votes for
        class 0, anything else for class 1.
    point
        The point to classify.

    Returns
    -------
    int
        0 if strictly more neighbours vote for class 0, otherwise 1 (so ties
        and an empty neighbourhood yield 1).
    """
    # Pairs sort by distance first; equal distances fall back to comparing the
    # points themselves.
    ranked = sorted((metric(l_point, point), l_point) for l_point in learn_suit)
    # Slicing never runs past the end of the list, unlike indexing range(k).
    nearest = ranked[:max(k, 0)]

    count_zero = 0
    count_one = 0
    for _, neighbour in nearest:
        if classes[neighbour] == 0:
            count_zero += 1
        else:
            count_one += 1

    return 0 if count_zero > count_one else 1


# def leave_one_out(k):
#     result = 0
#     for i in range(len(test)):
#         value = test[i]
#         del test[i]
#         color = predict_class(k, metric, test, colors, value)
#         test.insert(i, value)
            
#         if (color != classes[test[i]])
#             result += 1
#     return result

def kNN(metric, data, classes, cv_params = (10, 10), k_candidates = None):
    """Estimate the best neighbour count and accuracy of k-NN by repeated k-fold CV.

    For every fold of every repetition, each candidate ``k`` is scored by its
    accuracy on the held-out fold and the best one is recorded. The recorded
    ``k`` values and accuracies are then averaged over all folds.

    Relies on ``k_fold_cv`` and ``predict_class`` defined in this file.

    Parameters
    ----------
    metric : callable
        ``metric(a, b)``; passed through to ``predict_class``.
    data : sequence
        The points; each one must be a key of ``classes``.
    classes : mapping
        Maps each point to its label.
    cv_params : tuple of (int, int), optional
        ``(repetitions, folds)``. Defaults to ``(10, 10)``.
    k_candidates : iterable of int, optional
        Neighbour counts to try. Defaults to the odd numbers 1, 3, ..., 19.

    Returns
    -------
    tuple
        ``(average_k, average_score)`` over ``repetitions * folds`` folds.

    Raises
    ------
    ValueError
        If a fold has no points to evaluate on (more folds than points).
    """
    if k_candidates is None:
        k_candidates = range(1, 20 + 1, 2)
    else:
        k_candidates = list(k_candidates)  # iterated once per fold

    def compute_score(k, learn_suit, train_suit):
        """Accuracy of k-NN fitted on ``learn_suit``, measured on ``train_suit``."""
        if not train_suit:
            raise ValueError('cannot score an empty evaluation fold')
        correct = 0
        for point in train_suit:
            predicted = predict_class(k, metric, learn_suit, classes, point)
            if predicted == classes[point]:
                correct += 1
        # Return only after ALL points were judged; returning from inside the
        # loop would score the fold on its first point alone.
        return correct / len(train_suit)

    def learn_parameter_k(learn_suit, train_suit):
        """Return ``(best_k, best_score)``; the first best candidate wins ties."""
        current_k = -1
        current_score = -1
        for k in k_candidates:
            score = compute_score(k, learn_suit, train_suit)
            if score > current_score:
                current_score = score
                current_k = k
        return current_k, current_score

    tfold, kfold = cv_params
    accumulator_k = 0
    accumulator_score = 0
    for _ in range(tfold):
        # k_fold_cv yields (complement, fold): fit on the complement and
        # evaluate on the fold itself.
        for learn_index, eval_index in k_fold_cv(kfold, len(data)):
            learning_suit = [data[i] for i in learn_index]
            evaluation_suit = [data[i] for i in eval_index]
            k, score = learn_parameter_k(learning_suit, evaluation_suit)
            accumulator_k += k
            accumulator_score += score

    fold_count = tfold * kfold
    return (accumulator_k / fold_count, accumulator_score / fold_count)


In [None]:
# Cross-validate k-NN on the raw points with the first metric in `metrics`.
data, classes = get_data()

print(kNN(metric=metrics[0], data=data, classes=classes))