In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer as breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
bunch = breast_cancer()
bunch['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [3]:
X = bunch.data
y = bunch.target

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
a = np.array([1,1])
b = np.array([2,2])

long = math.sqrt((a[0]-b[0])**2 + (a[1] - b[1])**2)
short = np.linalg.norm(a-b)

long, short

(1.4142135623730951, 1.4142135623730951)

In [6]:
indices = [20, 32, 45]
order = [1, 2, 0]

sort_obj = list(zip(indices, order))
composite_array = np.array(sort_obj, dtype=[('index', 'i4'), ('rank', 'i4')])

order = np.argsort(composite_array, order='rank')

ordered_indices = [n[0] for n in composite_array[order]]
ordered_indices

[45, 20, 32]

In [7]:
composite_array[order]

array([(45, 0), (20, 1), (32, 2)],
      dtype=[('index', '<i4'), ('rank', '<i4')])

In [8]:
X_train.shape, y_train.shape

((426, 30), (426,))

In [9]:
a = X_train[9]
b = X_train[11]

In [10]:
c = np.linalg.norm(a-b)
c

677.5206464669267

In [11]:
type(c)

numpy.float64

In [12]:
d = pd.DataFrame(X_train)

In [13]:
d.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,20.26,23.03,132.4,1264.0,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,...,24.22,31.59,156.1,1750.0,0.119,0.3539,0.4098,0.1573,0.3689,0.08368
1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
2,15.71,13.93,102.0,761.7,0.09462,0.09462,0.07135,0.05933,0.1816,0.05723,...,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071
3,13.71,18.68,88.73,571.0,0.09916,0.107,0.05385,0.03783,0.1714,0.06843,...,15.11,25.63,99.43,701.9,0.1425,0.2566,0.1935,0.1284,0.2849,0.09031
4,13.81,23.75,91.56,597.8,0.1323,0.1768,0.1558,0.09176,0.2251,0.07421,...,19.2,41.85,128.5,1153.0,0.2226,0.5209,0.4646,0.2013,0.4432,0.1086


In [23]:
def euclidean_classifier(X_target, X_train=X_train, y_train=y_train, n = 3):
    # given a set of X samples with y lables return a list of predictions
    # list will be based on the average of the labels of
    # the n closest values in the sample X to each sample in the target

    # each sample from target
        # compared to each sample from known
        # capture known sample distance to target
        # capture class of known sample
        # rank order the known (distance, class) tuples by ascending distance
        # average the class value for the first n members of the ordered distances

    predictions = []

    for i in range(len(X_target)):
        vector = X_target[i]
        vector_values = []

        for j in range(len(X_train)):
            known_vector = X_train[j]
            distance = np.linalg.norm(vector - known_vector)
            known_class = y_train[j]
            vector_values.append((distance, known_class))

        distance_vector = np.array(vector_values, dtype=[('distance', np.float64), ('class', 'i2')])
        order = np.argsort(distance_vector, order='distance')
        distance_vector = distance_vector[order]
        nn = distance_vector[:n]
        class_sum = sum([i[1] for i in nn])
        unk_prediction = float(class_sum)/n        
        unk_prediction = 1 if unk_prediction >= 0.5 else 0
        predictions.append(unk_prediction)

    return predictions

In [27]:
euclidean_classifier(X_test, n=13)

[1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1]

In [16]:
y_test

array([1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1])