## Initialization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing, model_selection, neighbors
from bqplot import pyplot as bplt
import random
from collections import Counter, defaultdict
%matplotlib inline

## Loading Data

In [2]:
# Read the breast-cancer table from the local data directory.
df = pd.read_csv('../data/breast-cancer-wisconsin.data.txt')
# '?' entries (presumably the file's missing-value marker) are swapped for a
# large sentinel so the rows are kept instead of being dropped.
df = df.replace('?', -99999)
# The id column is not used as a feature. Use the `columns=` keyword: passing
# the axis positionally (`drop(labels, 1)`) raises TypeError on pandas >= 2.0.
df = df.drop(columns=['id'])

In [3]:
# Feature matrix: every column except the target. `columns=` replaces the
# positional axis argument, which pandas >= 2.0 no longer accepts.
X = np.array(df.drop(columns=['class']))
# Target vector: the 'class' column.
y = np.array(df['class'])

## Split Data

In [4]:
# Hold out 20% of the samples for evaluation. No random_state is passed, so
# the partition (and the accuracy reported below) changes from run to run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
)

## Train the Classifier

In [5]:
# Build a k-nearest-neighbours classifier with its default settings and fit
# it on the training split in one step (fit() returns the estimator itself).
clf = neighbors.KNeighborsClassifier().fit(X_train, y_train)

## Get Accuracy

In [6]:
# Score the fitted classifier on the held-out split and display the result.
accuracy = clf.score(X_test, y_test)
accuracy

0.9642857142857143

## Predict

In [7]:
# One hand-written sample as a (1, 9) array: a single row of nine feature
# values, which is the 2-D shape predict() expects.
example = np.array([[10, 10, 1, 1, 1, 2, 3, 2, 1]])
clf.predict(example)[0]

4

## Manual KNN on generated dataset

## Euclidean Distance

In [8]:
def euclidean_distance(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        p1: Iterable of numeric coordinates.
        p2: Iterable of numeric coordinates. It is paired with ``p1`` via
            ``zip``, so extra trailing coordinates on the longer input are
            ignored.

    Returns:
        The square root of the summed squared coordinate differences
        (``0.0`` when there are no coordinates).
    """
    squared_sum = sum((c1 - c2) ** 2 for c1, c2 in zip(p1, p2))
    # The root is always a square root, whatever the dimension. The previous
    # ``1 / len(p1)`` exponent was only correct for 2-D points and divided by
    # zero on empty input.
    return squared_sum ** 0.5

def euclidean_distance_numpy(p1, p2):
    """Return the Euclidean (L2) distance between two points using NumPy.

    Args:
        p1: Array-like of numeric coordinates.
        p2: Array-like of numeric coordinates, broadcast-compatible with
            ``p1``.

    Returns:
        A NumPy float: the square root of the summed squared differences.
    """
    delta = np.asarray(p1) - np.asarray(p2)
    # Always take the square root; the previous ``1 / len(p1)`` exponent only
    # matched the Euclidean distance for 2-D points.
    return np.sqrt((delta ** 2).sum())

In [132]:
def k_neares_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest points.

    Args:
        data: Mapping of class label -> list of feature vectors.
        predict: Feature vector to classify.
        k: Number of nearest neighbours that get a vote.

    Returns:
        The label that occurs most often among the ``k`` closest points.
    """
    # A notice is printed when k does not exceed the number of classes.
    if not len(data) < k:
        print('You may change k')
    target = np.array(predict)
    # [distance, label] pairs in ascending order; equal distances fall back
    # to comparing the labels, as list comparison does.
    ranked = sorted(
        [np.linalg.norm(np.array(point) - target), label]
        for label in data
        for point in data[label]
    )
    nearest_labels = [pair[1] for pair in ranked[:k]]
    winner, _count = Counter(nearest_labels).most_common(1)[0]
    return winner

## Manual datasets

In [133]:
# Toy two-class training set: each key is a class label (also used as the
# marker colour when plotting) mapped to that class's 2-D points.
dataset = dict(
    black=[[1, 2], [2, 3], [3, 1]],
    green=[[6, 5], [7, 7], [8, 6]],
)

## Interactive Plot

In [137]:
def add_point(target):
    """Classify the most recently added scatter point and draw it.

    Reads the last x/y values of the module-level ``scat`` mark, classifies
    that point against ``dataset_`` with ``k_neares_neighbors``, appends it to
    the winning class, and plots it in that class's colour.

    Args:
        target: Change notification passed by ``observe``; not used.
    """
    newest_x = scat.x[-1]
    newest_y = scat.y[-1]
    label = k_neares_neighbors(dataset_, [newest_x, newest_y])
    # The classified point joins the training data, so it votes on later clicks.
    dataset_[label].append([newest_x, newest_y])
    bplt.scatter([newest_x], [newest_y], enable_move=True, colors=[label])

In [141]:
bplt.clear() # BQplot code
figure = bplt.figure(title='K nearest neighbors - Click to add points', animation_duration=500) # BQplot code
# Copy the per-class point lists, not just the dict. dict.copy() is shallow,
# so clicked points would be appended into `dataset` itself and every re-run
# of this cell would start from an already-modified training set.
dataset_ = {color: [list(point) for point in points] for color, points in dataset.items()}
# White marker at the origin that carries the click-to-add interaction.
scat = bplt.scatter([0], [0], interactions={'click': 'add'}, colors=['white'])
# Draw each class's starting points in the colour named by its label.
for color in dataset_:
    xs, ys = zip(*dataset_[color])
    bplt.scatter(xs, ys, colors=[color])
# Re-classify whenever the x values of the clickable scatter change.
scat.observe(add_point, names=['x'])
bplt.show()

## Manual KNN on Breast Cancer Dataset

In [15]:
# Reload the breast-cancer table for the hand-rolled KNN.
df = pd.read_csv('../data/breast-cancer-wisconsin.data.txt')
# '?' entries (presumably the file's missing-value marker) are swapped for a
# large sentinel so the rows are kept instead of being dropped.
df = df.replace('?', -99999)
# Use the `columns=` keyword: passing the axis positionally
# (`drop(labels, 1)`) raises TypeError on pandas >= 2.0.
df = df.drop(columns=['id'])

## Manual Train/Test Split

In [52]:
# Cast every column to float and flatten the frame into a list of row lists,
# then shuffle the rows in place (unseeded, so the order differs per run).
rows_as_float = df.astype(float)
full_data = rows_as_float.values.tolist()
random.shuffle(full_data)

In [53]:
def split_features_labels(data):
    """Group rows by their last element.

    Args:
        data: Iterable of rows; the final item of each row is treated as the
            label and everything before it as the feature list.

    Returns:
        A ``defaultdict(list)`` mapping each label to the list of feature
        lists that carried it, in input order.
    """
    grouped = defaultdict(list)
    for *attributes, target in data:
        grouped[target].append(attributes)
    return grouped

In [54]:
test_size = 0.2
# Index of the first test row, counted from the front. A negative slice bound
# breaks when the test count rounds down to 0: `[:-0]` would leave the
# training set empty and put every row in the test set.
limit = len(full_data) - int(len(full_data) * test_size)
train_data = full_data[:limit]
test_data = full_data[limit:]
# Regroup each partition as {label: [feature lists]}, the layout that
# k_neares_neighbors expects.
train_set = split_features_labels(train_data)
test_set = split_features_labels(test_data)

## Manual Score

In [55]:
# Score the hand-rolled classifier: count the held-out samples whose k=5 vote
# matches their true class, then divide by the test-set size.
correct = sum(
    1 if group == k_neares_neighbors(train_set, sample, k=5) else 0
    for group, samples in test_set.items()
    for sample in samples
)
correct / len(test_data)

0.9640287769784173