In [None]:
%autosave 15
%matplotlib inline

import numpy as np
import scipy as sp
import pandas as pd
import math
import random

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
def getData(path='prices.txt'):
    """Load the housing samples from a comma-separated text file.

    The first line is treated as a header and skipped. Every following
    non-blank line must contain exactly three integers: ``area,rooms,price``.

    Args:
        path: Location of the input file. Defaults to ``prices.txt`` in the
            current working directory, which is what the notebook uses.

    Returns:
        A ``(data, actualPrice)`` tuple. ``data`` is a list of
        ``(area, rooms)`` tuples in file order; ``actualPrice`` maps each
        ``(area, rooms)`` tuple to its price. When the same pair appears more
        than once, ``data`` keeps every occurrence and the mapping keeps the
        price of the last one.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank row does not hold exactly three integers.
    """
    actualPrice = dict()
    data = list()

    # The context manager guarantees the handle is closed even when reading
    # or parsing fails part-way through.
    with open(path, 'r') as fin:
        next(fin, None)  # drop the header line (no-op for an empty file)
        for line in fin:
            if not line.strip():
                # Blank lines (typically a trailing newline at EOF) carry no
                # sample; int('') would otherwise raise ValueError.
                continue
            area, rooms, price = map(int, line.split(','))
            data.append((area, rooms))
            actualPrice[(area, rooms)] = price

    return (data, actualPrice)

# Load the samples once at notebook level: `data` is the list of (area, rooms)
# pairs and `actualPrice` the lookup from each pair to its price.
data, actualPrice = getData()

In [None]:
def k_fold_cv(k, length):
    """Randomly partition ``range(length)`` into ``k`` cross-validation folds.

    Every index lands in exactly one test fold. Fold sizes differ by at most
    one: the first ``length % k`` folds receive one extra index.

    Args:
        k: Number of folds; must be at least 1.
        length: Number of samples to split.

    Returns:
        A list of ``k`` tuples ``(train_suit, test_suit)``. ``test_suit``
        holds the indices drawn for the fold (in draw order) and
        ``train_suit`` holds all remaining indices in ascending order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got %r' % (k,))

    one_fold_length = length // k
    others = length % k  # leftover indices, handed out one per fold
    remaining = list(range(length))
    result = list()
    for _ in range(k):
        fold_size = one_fold_length
        if others > 0:
            others -= 1
            fold_size += 1

        test_suit = list()
        for _ in range(fold_size):
            # The modulo guards against uniform() rounding up to the
            # (exclusive) upper bound.
            position = int(np.random.uniform(0, len(remaining))) % len(remaining)
            # pop() by position avoids the linear value search of remove().
            test_suit.append(remaining.pop(position))

        # Set membership keeps building the training side linear in `length`.
        held_out = set(test_suit)
        train_suit = [j for j in range(length) if j not in held_out]

        result.append((train_suit, test_suit))
    return result

In [None]:
def standardDeviation(data, w, actualPrice):
    """Return the root-mean-square error of the linear model over ``data``.

    Args:
        data: Sized collection of ``(area, rooms)`` pairs.
        w: Coefficient vector ``[area, rooms, free]``; the model predicts
            ``area * w[0] + rooms * w[1] + w[2]``.
        actualPrice: Mapping from ``(area, rooms)`` to the observed price.

    Returns:
        The square root of the mean squared difference between predicted and
        observed prices.
    """
    squared_error_total = 0
    for area, rooms in data:
        predicted = area * w[0] + rooms * w[1] + w[2]
        observed = actualPrice[(area, rooms)]
        residual = predicted - observed
        squared_error_total += residual ** 2
    return (squared_error_total / len(data)) ** 0.5

In [None]:
def gradient(w0, step, eps, points=None, prices=None):
    """Fit the linear price model by batch gradient descent on the mean squared error.

    The model is ``price ~ area * w[0] + rooms * w[1] + w[2]``. Each iteration
    moves the coefficients against the gradient of the mean squared error
    computed over all samples.

    Args:
        w0: Vector of initial coefficients ``[area, rooms, free]``.
        step: Step size. A small step converges slowly; a large step may stall
            before reaching the minimum, or even move away from it.
        eps: Stopping criterion - the Euclidean distance between the
            coefficient vectors produced by two consecutive steps.
            TODO (kept from the original notes): verify this stopping criterion.
        points: Optional collection of ``(area, rooms)`` pairs. Defaults to
            the notebook-level ``data``.
        prices: Optional mapping from ``(area, rooms)`` to price. Defaults to
            the notebook-level ``actualPrice``.

    Returns:
        A list with the three fitted coefficients: the first iterate whose
        distance from its predecessor is below ``eps``.

    Raises:
        ValueError: If there are no samples to fit.
        OverflowError: If the iteration diverges (the step is too large).
    """
    if points is None:
        points = data
    if prices is None:
        prices = actualPrice
    if len(points) == 0:
        raise ValueError('gradient descent needs at least one sample')

    sample_count = len(points)
    w = list(w0)
    while True:
        # Sum of residual * feature over all samples; the constant feature 1
        # belongs to the free term.
        grad = [0.0, 0.0, 0.0]
        for area, rooms in points:
            residual = area * w[0] + rooms * w[1] + w[2] - prices[(area, rooms)]
            grad[0] += residual * area
            grad[1] += residual * rooms
            grad[2] += residual

        # d/dw of mean((prediction - price) ** 2) is 2 / n * sum(residual * x).
        w1 = [w[i] - step * 2.0 * grad[i] / sample_count for i in range(len(grad))]
        diff = sum([(w1[i] - w[i]) ** 2 for i in range(len(w1))]) ** 0.5
        if (diff < eps):
            return w1
        if not math.isfinite(diff):
            # inf/nan never compares below eps, so without this guard a
            # diverging run would spin forever.
            raise OverflowError('gradient descent diverged; try a smaller step')
        w = w1

In [None]:
def predict_class(k, metric, learn_suit, classes, point, kernel):
    """Classify ``point`` as 0 or 1 by a kernel-weighted vote of its ``k`` nearest neighbours.

    Args:
        k: Number of neighbours that vote.
        metric: Callable ``metric(a, b)`` returning the distance between two points.
        learn_suit: Collection of training points.
        classes: Mapping from a training point to its class label (0 or 1
            after ``int()`` conversion).
        point: The point to classify.
        kernel: Callable applied to each neighbour's distance divided by the
            window width.

    Returns:
        0 when class 0 collects strictly more weight than class 1, otherwise 1
        (ties go to class 1).
    """
    ranked = sorted((metric(candidate, point), candidate) for candidate in learn_suit)

    # The window width is the distance to the neighbour at sorted position k,
    # i.e. the first one that does not vote. A zero width is replaced by 1 to
    # keep the division below defined.
    width = ranked[k][0]
    window = 1 if width == 0 else width

    votes = [0, 0]
    for rank in range(k):
        distance, neighbour = ranked[rank]
        label = int(classes[neighbour])
        votes[label] += kernel(distance / window)

    return 0 if votes[0] > votes[1] else 1

In [None]:
def kNN(metric, data, classes, kernel, cv_params = (1, 10)):
    """Pick the neighbour count with the best cross-validated F1 score.

    Candidate neighbour counts run from 2 up to (but excluding)
    ``int(sqrt(len(data)))``. Each candidate is scored by repeated k-fold
    cross-validation and the score is averaged over every fold of every
    repetition.

    Args:
        metric: Distance callable forwarded to ``predict_class``.
        data: Indexable collection of points.
        classes: Mapping from a point to its class label.
        kernel: Kernel callable forwarded to ``predict_class``.
        cv_params: Tuple ``(repetitions, folds)`` for the cross-validation.

    Returns:
        A tuple ``(best_neighbors, best_score)``; ``(0, 0)`` when no candidate
        scores above zero.
    """

    def compute_score(k, train_suit, test_suit):
        """Return the F1 measure of class 1 on ``test_suit`` (0 if class 1 is never hit)."""
        hits = [0, 0]    # correct predictions, indexed by predicted class
        misses = [0, 0]  # wrong predictions, indexed by predicted class
        totals = [0, 0]  # test points, indexed by real class

        for point in test_suit:
            guess = predict_class(k, metric, train_suit, classes, point, kernel)
            truth = int(classes[point])

            bucket = hits if guess == truth else misses
            bucket[guess] += 1
            totals[truth] += 1

        if hits[1] <= 0:
            return 0

        recall = hits[1] / totals[1]
        precision = hits[1] / (hits[1] + misses[1])
        # F1 measure
        return 2 * (precision * recall) / (precision + recall)

    repetitions, folds = cv_params
    best_neighbors, best_score = 0, 0
    candidate_limit = int(np.sqrt(len(data)))

    for neighbors in range(2, candidate_limit):
        score_total = 0
        for _ in range(repetitions):
            # A fresh random split is drawn for every repetition.
            repetition_total = 0
            for train_idx, test_idx in k_fold_cv(folds, len(data)):
                train_points = [data[idx] for idx in train_idx]
                test_points = [data[idx] for idx in test_idx]
                repetition_total += compute_score(neighbors, train_points, test_points)
            score_total += repetition_total

        mean_score = score_total / (repetitions * folds)

        if mean_score > best_score:
            best_neighbors, best_score = neighbors, mean_score

    return (best_neighbors, best_score)

In [None]:
# Grid search over the fold count, kernel, metric and data transformation,
# keeping every evaluated combination in `results` and the best one in the
# `best_*` variables.
# NOTE(review): `kernels`, `metrics`, `metric_names` and the 4-tuple layout of
# `data` iterated below are not defined in the cells shown here - confirm they
# are set up by an earlier cell before running this one.

# Only `display.max_rows` is set: the legacy `display.height` option is no
# longer registered in current pandas and setting it raises OptionError.
pd.set_option('display.max_rows', 250)

best_accuracy = 0
best_kernel   = None
best_metric   = None
best_data     = None
best_neighbor = None
best_kfold = None
best_string = None  # stays None when no combination scores above 0

# Rows are collected in a plain list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
records = list()

for kfold in range(5, 11):
    for kernel in kernels:
        for metric in metrics:
            for input_data, classes, transform_name, transform in data:
                # kernel is a (callable, name) pair; `accuracy` is the
                # cross-validated score returned by kNN.
                k, accuracy = kNN(metric, input_data, classes, kernel[0], (1, kfold))
                records.append({
                    'folds': kfold,
                    'k': k,
                    'metric': metric_names[metric],
                    'transformation name': transform_name,
                    'accuracy': accuracy,
                    'kernel': kernel[1],
                })

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_kernel = kernel[0]
                    best_metric = metric
                    best_data = (input_data, classes, transform)
                    best_neighbor = k
                    best_kfold = kfold

                    best_string = str(accuracy) + ', ' + str(k) + ', ' + kernel[1] + ', ' + metric_names[metric] + ", " + transform_name + " folds: " + str(best_kfold)

results = pd.DataFrame(records, columns=['folds', 'k', 'metric', 'transformation name', 'accuracy', 'kernel'])

display(results)