In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plots

import time

%matplotlib inline

pd.options.display.max_rows = 10

In [2]:
# read data from csv file
# drop the patien id column

patient_data =  pd.read_csv('../data/breast-cancer-wisconsin.csv')

patient_data = patient_data.drop('id', axis=1)

patient_data = patient_data[~patient_data["Bare Nuclei"].str.contains('\?')]

patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [235]:
# class value: 2 is benign, 4 is malignant, change to: 0 is benign (no cancer), 1 is malignant (cancer)

# change class 2 -> 0, benign, no cancer
mask = patient_data.Class == 2
column_name = 'Class'
patient_data.loc[mask, column_name] = 0

# change class 4 -> 1, malignant, cancer
mask = patient_data.Class == 4
column_name = 'Class'
patient_data.loc[mask, column_name] = 1

# change dtype column ['Bare Nuclei'] from string to numeric
patient_data['Bare Nuclei'] = pd.to_numeric(patient_data['Bare Nuclei'])

# print data frame
patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
5,8,10,10,8,7,10,9,7,1,1
6,1,1,1,1,2,10,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
692,3,1,1,1,2,1,1,1,1,0
693,3,1,1,1,2,1,2,1,2,0


In [17]:
def distance(point1, point2):
    """Return Euclidien distance between point1 and point2,
    each point is an array of coordinates"""
    
    return np.sqrt(np.sum((point1 - point2)**2))

def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table"""
    return distance(np.array(row1), np.array(row2))

In [4]:
def all_distances(training, new_point):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes)"""
    attributes = training.drop('Class', axis=1)
    #new_point = np.delete(new_point, -1)
    
    all_distance = []
    for row in attributes.itertuples(index=False):
        #print(type(row))
        d = row_distance(row, new_point)
        all_distance.append(d)
        
    return all_distance

In [5]:
def table_with_distances(training, new_point):
    """Augments the training table 
    with a column of distances from new_point"""
    #training.loc[:,'Distance'] = all_distances(training, new_point)
    training['Distance'] = all_distances(training, new_point)
    return training


In [6]:
def closest(training, new_point, k):
    """Returns a data frame of the k rows 
    corresponding to the k smallest distances"""
    with_dists = table_with_distances(training, new_point)
    
    sorted_by_distance = with_dists.sort_values(by='Distance')
    
    topk = sorted_by_distance.take(np.arange(k))
    return topk

In [7]:
patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [16]:
new_point = patient_data.take([1]).drop('Class', axis=1)
new_point

#a = closest(patient_data, new_point, 5)
#print(a)

#patient_data = patient_data.drop('Distance', axis = 1)
#patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
1,5,4,4,5,7,10,3,2,1


In [10]:
def majority(topkclasses):
    """Return majority of neighbour labels 
    """

    # convert 'Class' column to numpy array
    topK = topkclasses.values
    
    # count number of '1' class
    ones = np.count_nonzero( topK == 1 )
    
    # count number of '0' class
    zeros = np.count_nonzero( topK == 0 )
    
    if ones > zeros:
        return 1
    else:
        return 0

In [11]:
def classify(training, new_point, k):
    closestk = closest(training, new_point, k)
    topkclasses = closestk['Class']
    return majority(topkclasses)


In [12]:
new_point = patient_data.take([3]).drop('Class', axis=1)

classify(patient_data, new_point.values, 5)

TypeError: unsupported operand type(s) for -: 'str' and 'int'

In [278]:
shuffled_patients = patient_data.sample(683, replace=False) 
training_set = shuffled_patients.take(np.arange(486))
test_set  = shuffled_patients.take(np.arange(486, 683))

In [270]:
patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
5,8,10,10,8,7,10,9,7,1,1
6,1,1,1,1,2,10,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
692,3,1,1,1,2,1,1,1,1,0
693,3,1,1,1,2,1,2,1,2,0


In [277]:
def evaluate_accuracy(training, test, k):

    test_attributes = test.drop('Class', axis = 1)
    num_correct = 0
    
    for i in np.arange(test.shape[0]):
        test_patient = test_attributes.iloc[i]
        
        if 'Distance' in training.columns:
            training = training.drop('Distance', axis=1)

        c = classify(training, test_patient, k)
        
        if c == test['Class'].iloc[i]:
            num_correct = num_correct + 1
    
    return num_correct / test.shape[0]


In [279]:
training_set
#test_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
128,8,3,5,4,5,10,1,6,2,1
453,4,5,5,8,6,10,10,7,1,1
390,1,1,1,2,2,1,2,1,1,0
656,5,1,1,1,2,1,2,1,1,0
616,3,1,1,1,2,1,2,1,1,0
191,7,5,10,10,10,10,4,10,3,1
681,5,10,10,10,4,10,5,6,3,1
...,...,...,...,...,...,...,...,...,...,...
576,5,1,1,1,2,1,2,1,1,0
194,3,1,1,1,2,1,3,1,1,0


In [292]:
%timeit evaluate_accuracy(training_set, test_set, 7)

4.07 s ± 283 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [296]:
start = time.time()
evaluate_accuracy(training_set, test_set, 3)
time.time() - start

3.909313201904297

In [283]:
test_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
106,10,10,10,8,2,10,4,1,1,1
102,4,1,2,1,2,1,3,1,1,0
662,1,1,3,1,2,1,2,1,1,0
130,5,1,3,1,2,1,2,1,1,0
680,10,10,10,10,5,10,10,10,7,1
509,2,1,1,1,2,1,1,1,1,0
86,3,3,6,4,5,8,4,4,1,1
...,...,...,...,...,...,...,...,...,...,...
568,8,4,4,1,6,10,2,5,2,1
555,4,3,1,1,2,1,4,8,1,0


In [284]:
training_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class,Distance
128,8,3,5,4,5,10,1,6,2,1,11.747340
453,4,5,5,8,6,10,10,7,1,1,13.190906
390,1,1,1,2,2,1,2,1,1,0,19.078784
656,5,1,1,1,2,1,2,1,1,0,17.916473
616,3,1,1,1,2,1,2,1,1,0,18.574176
191,7,5,10,10,10,10,4,10,3,1,13.674794
681,5,10,10,10,4,10,5,6,3,1,7.937254
...,...,...,...,...,...,...,...,...,...,...,...
576,5,1,1,1,2,1,2,1,1,0,17.916473
194,3,1,1,1,2,1,3,1,1,0,18.493242
