In [1]:
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

In [2]:
def k_nearest_neighbors(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups, baka!')
        
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
            distances.append([euclidean_distance, group])
            
    votes = [i[1] for i in sorted(distances)[:k]]
    #print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    confidence = Counter(votes).most_common(1)[0][1] / k
    
    #print(vote_result, confidence)
    
    return vote_result, confidence

In [3]:
accuracies = []

for i in range(25):
    df = pd.read_csv("breast-cancer.data.txt")
    df.replace('?', -99999, inplace=True)
    #df.drop(['id'], 1, inplace=True)
    
    #9 labels and 1 feature (10 variables) is too much for a sample size of 300
    #also certain features aren't as important as others, so I dropped less significant ones
    
    #df.drop(['breast'], 1, inplace=False)
    #df.drop(['breast_quad'], 1, inplace=False)
    df.drop(['irradiat'], 1, inplace=False)
    
    full_data = df.astype(float).values.tolist()

    random.shuffle(full_data)
    
    test_size = 0.2
    train_set = {0:[], 1:[]}
    test_set = {0:[], 1:[]}

    train_data = full_data[:-int(test_size*len(full_data))]
    test_data = full_data[-int(test_size*len(full_data)):]

    for i in train_data:
        train_set[i[-1]].append(i[:-1])
    for i in test_data:
        test_set[i[-1]].append(i[:-1])
        
    correct = 0
    total = 0
    
    for group in train_set:
        for data in test_set[group]:
            vote, confidence = k_nearest_neighbors(train_set, data, k=5)
            if group == vote:
                correct += 1
            else:
                print(confidence)
            total += 1 
    print('Accuracy:', correct/total)
    accuracies.append(correct/total)

print(sum(accuracies)/len(accuracies))

0.6
0.8
0.8
0.6
0.8
0.6
0.6
0.6
0.6
0.8
1.0
0.8
1.0
0.8
1.0
1.0
0.6
1.0
Accuracy: 0.6842105263157895
0.8
1.0
1.0
1.0
0.6
1.0
0.8
0.8
1.0
1.0
0.6
1.0
Accuracy: 0.7894736842105263
0.6
0.8
1.0
1.0
0.8
1.0
1.0
0.8
0.8
1.0
0.6
0.8
0.8
1.0
0.8
Accuracy: 0.7368421052631579
0.6
0.6
0.8
0.8
0.6
0.6
0.6
0.8
0.6
1.0
0.8
0.6
1.0
Accuracy: 0.7719298245614035
0.6
0.8
0.6
0.6
0.6
0.6
1.0
0.8
0.6
1.0
0.6
1.0
0.6
0.8
0.6
0.8
Accuracy: 0.7192982456140351
0.6
0.8
0.6
0.8
0.6
0.8
0.8
0.6
0.6
1.0
1.0
0.6
0.6
0.6
0.8
0.6
1.0
0.8
Accuracy: 0.6842105263157895
0.6
0.6
0.6
0.8
0.6
0.8
0.8
1.0
0.8
0.8
0.8
0.8
0.6
1.0
Accuracy: 0.7543859649122807
0.6
0.8
1.0
0.6
0.8
0.6
1.0
Accuracy: 0.8771929824561403
0.6
0.8
1.0
1.0
0.8
1.0
1.0
1.0
1.0
1.0
1.0
1.0
Accuracy: 0.7894736842105263
0.6
0.8
0.6
1.0
0.8
0.6
1.0
1.0
1.0
0.6
0.8
0.8
Accuracy: 0.7894736842105263
0.8
1.0
0.6
0.8
0.8
1.0
0.8
0.8
1.0
1.0
0.6
0.8
1.0
1.0
0.6
1.0
Accuracy: 0.7192982456140351
1.0
0.8
0.8
1.0
0.8
0.8
0.8
0.8
1.0
0.6
1.0
0.6
Accuracy: 0.789473684

SVM would have 70% accuracy; which is as good as an adjusted random

KNN would have 75% accuracy; which is an okay improvement, but is still flawed due to the way I am tackling the problem

Dropping breast, breast_quad, and irradiat in KNN would take down to 74%

Dropping only irradiat would have an accuracy that varies between 68% and 88%, but overall 76%. An insignificant improvement.