# Context

The paper "Distance and Similarity Measures Effect on the Performance of K-Nearest Neighbor Classifier" showed how effective the Hassanat distance can be when compared with a broad range of other metrics for a 1-NN classifier.
Original paper: https://arxiv.org/pdf/1708.04321.pdf

## Observation
The formula given in the paper for the Hassanat metric contradicts the author's earlier definition (https://arxiv.org/pdf/1501.00687.pdf) because a 1 is missing from the denominator. We use the version with the 1, since it is the one used in the earlier papers and the one that has the desired properties.

# Hypothesis

My hypothesis is that the significance of the results is due to the fact that the data was not standardized. If so, we should not expect similar results once the data is standardized.

# Imports

In [155]:

import math
import time
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine

# Raise pandas' display.max_columns option to 50 for this session.
pd.set_option('display.max_columns', 50)

# Load datasets

In [66]:
# Benchmark datasets, keyed by the column label they get in the results table.
# NOTE(review): scikit-learn documents load_diabetes as a regression dataset;
# confirm that treating its target as class labels for 1-NN is intended.
datasets = {"cancer": load_breast_cancer(), "diabetes": load_diabetes(), "wine": load_wine()}

# Create the different metric functions

In [22]:
v1 = [5.1, 3.5, 1.4, 0.3]
v2 = [5.4, 3.4, 1.7, 0.2]
def test_func(func, num, tol=0.0001):
    if not (num-tol < func(v1, v2) < num+tol):
        raise ValueError("Error on function!")

In [23]:
def HasD(x, y):
    """Return the Hassanat-style distance between two vectors.

    Each coordinate pair contributes ``1 - (1 + lo) / (1 + hi)`` when the
    smaller value ``lo`` is non-negative, and ``1 - 1 / (1 + hi + |lo|)``
    when it is negative. This is the variant that keeps the 1 in the
    denominator. Pairs are taken with ``zip``, so extra trailing elements of
    the longer input are ignored.

    x, y: iterables of numbers.
    Returns the sum of the per-coordinate contributions (0 for empty input).
    """
    acc = 0
    for a, b in zip(x, y):
        lo, hi = min(a, b), max(a, b)
        if lo >= 0:
            ratio = (1 + lo) / (1 + hi)
        else:
            # Shifting both values by |lo| makes lo + |lo| == 0, so only the
            # leading 1 is left in the numerator.
            ratio = 1 / (1 + hi + abs(lo))
        # Add the 1 first, then subtract the ratio, for either branch.
        acc = acc + 1 - ratio
    return acc

# Sanity check on the reference vectors v1/v2: raises ValueError unless
# HasD(v1, v2) is within the default tolerance of 0.2572.
test_func(HasD, 0.2572)

In [24]:
def LD(x, y):
    """Return the sum of ``log(1 + |xi - yi|)`` over paired coordinates.

    Presumably the Lorentzian distance ("LD") from the referenced paper.
    Pairs are taken with ``zip``, so extra trailing elements of the longer
    input are ignored.

    x, y: iterables of numbers.
    Returns the accumulated natural-log terms (0 for empty input).
    """
    acc = 0
    for a, b in zip(x, y):
        gap = abs(a - b)
        acc = acc + math.log(1 + gap)
    return acc

# Sanity check on the reference vectors v1/v2: raises ValueError unless
# LD(v1, v2) is within the default tolerance of 0.7153.
test_func(LD, 0.7153)

In [25]:
# Metrics under comparison, keyed by the row label used in the results table.
# Each value is handed unchanged to KNeighborsClassifier(metric=...): the two
# callables defined in this notebook, plus two metrics given by string name.
funcs = {
    "HasD": HasD,
    "LD": LD,
    "CanD": "canberra",
    "L2": "euclidean"
}

In [144]:
def run_experiment(metric_func, data, testing=False, random_state=None):
    """Train a 1-NN classifier on standardized data and return test accuracy.

    metric_func:  metric name or callable, passed unchanged as ``metric`` to
                  ``KNeighborsClassifier``.
    data:         object exposing ``data`` (features) and ``target`` (labels).
    testing:      if true, keep only a random 10% of the samples before the
                  train/test split, for quick runs.
    random_state: optional seed forwarded to both ``train_test_split`` calls
                  so a run can be reproduced. The default ``None`` keeps the
                  splits random, as before.

    Returns the accuracy of the predictions on the held-out 34% of samples.
    """
    if testing:
        # test_size=0.90 discards 90% of the samples; the 10% "train" part
        # is the subsample that the experiment runs on.
        x, _, y, _ = train_test_split(
            data.data, data.target, test_size=0.90, random_state=random_state
        )
    else:
        x, y = data.data, data.target

    # Create standardized train/test
    train_data, test_data, train_y, test_y = train_test_split(
        x, y, test_size=0.34, random_state=random_state
    )
    # The scaler is fitted on the training part only and then applied to the
    # test part, so no test statistics enter the scaling.
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    clf = KNeighborsClassifier(n_neighbors=1, metric=metric_func, n_jobs=3)
    clf.fit(train_data, train_y)
    preds = clf.predict(test_data)
    return accuracy_score(test_y, preds)

In [156]:
%%time
# Evaluate every metric on every dataset and record the mean accuracy of
# 10 independent runs in `results` (rows: metrics, columns: datasets).
start_time = time.time()
results = pd.DataFrame(index=funcs.keys(), columns=datasets.keys())
for metric_name, metric_func in funcs.items():
    for data_name, data in datasets.items():
        # Not sure why cancer isn't working
        # NOTE(review): cancer is skipped for every metric, so its column in
        # `results` is never filled in; the root cause is still unknown.
        if data_name == "cancer":
            continue
        avg_score = 0
        for _ in range(10):
            # testing=True: each run uses run_experiment's 10% subsample path.
            avg_score += run_experiment(metric_func, data, testing=True)
        # Divide by the number of runs above to get the mean accuracy.
        avg_score /= 10
        results.loc[metric_name, data_name] = avg_score

end_time = time.time()

CPU times: user 1.75 s, sys: 192 ms, total: 1.94 s
Wall time: 8.96 s


In [157]:
# Wall-clock duration of the experiment loop, in seconds.
print(end_time - start_time)

8.956639766693115


In [158]:
# Write the accuracy table (rows: metrics, columns: datasets) to results.csv
# in the current working directory.
results.to_csv("results.csv")