In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plots
%matplotlib inline



In [593]:
# read data from csv file
# drop the patien id column

patient_data =  pd.read_csv('../data/breast-cancer-wisconsin.csv')

patient_data.drop( ['id'], axis=1, inplace=True )
patient_data = patient_data[~patient_data["Bare Nuclei"].str.contains('\?')]

patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
5,8,10,10,8,7,10,9,7,1,4
6,1,1,1,1,2,10,3,1,1,2
7,2,1,2,1,2,1,3,1,1,2
8,2,1,1,1,2,1,1,1,5,2
9,4,2,1,1,2,1,2,1,1,2


In [594]:
# class value: 2 is benign, 4 is malignant, change to: 0 is benign (no cancer), 1 is malignant (cancer)

# change class 2 -> 0, benign, no cancer
mask = patient_data.Class == 2
column_name = 'Class'
patient_data.loc[mask, column_name] = 0

# change class 4 -> 1, malignant, cancer
mask = patient_data.Class == 4
column_name = 'Class'
patient_data.loc[mask, column_name] = 1

# change dtype column ['Bare Nuclei'] from string to numeric
patient_data['Bare Nuclei'] = pd.to_numeric(patient_data['Bare Nuclei'])

# print data frame
patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
5,8,10,10,8,7,10,9,7,1,1
6,1,1,1,1,2,10,3,1,1,0
7,2,1,2,1,2,1,3,1,1,0
8,2,1,1,1,2,1,1,1,5,0
9,4,2,1,1,2,1,2,1,1,0


In [541]:
def distance(point1, point2):
    """Return Euclidien distance between point1 and point2,
    each point is an array of coordinates"""
    
    return np.sqrt(np.sum((point1 - point2)**2))

def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table"""
    return distance(np.array(row1), np.array(row2))

In [143]:
# test distance
p1 = np.array([1,2])
p2 = np.array([0,10])

p1 = patient_data.take([0])
p2 = patient_data.iloc[[1]]

p1 = p1.values
p2 = p2.values
print(p1)

type(p1)
distance(p1, p2)



[[5 1 1 1 2 1 3 1 1 0]]


11.874342087037917

In [542]:
def all_distances(training, new_point):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes)"""
    attributes = training.drop('Class', axis=1)
    #new_point = np.delete(new_point, -1)
    
    all_distance = []
    for row in attributes.itertuples(index=False):
        #print(type(row))
        d = row_distance(row, new_point)
        all_distance.append(d)
        
    return all_distance

In [543]:
def table_with_distances(training, new_point):
    """Augments the training table 
    with a column of distances from new_point"""
    training.loc[:,'Distance'] = all_distances(training, new_point)
    return training


In [544]:
def closest(training, new_point, k):
    """Returns a data frame of the k rows 
    corresponding to the k smallest distances"""
    with_dists = table_with_distances(training, new_point)
    
    print(type(with_dists))
    sorted_by_distance = with_dists.sort_values(by='Distance')
    
    topk = sorted_by_distance.take(np.arange(k))
    return topk

In [545]:
new_point = patient_data.take([0]).drop('Class', axis=1)

closest(patient_data, new_point, 5)


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class,Distance
0,5,1,1,1,2,1,3,1,1,0,0.0
561,5,1,1,1,2,1,3,1,1,0,0.0
560,5,1,1,1,2,1,3,1,1,0,0.0
536,5,1,1,1,2,1,3,1,1,0,0.0
203,5,1,1,1,2,1,3,1,1,0,0.0


In [546]:
def majority(topkclasses):
    """Return majority of neighbour labels 
    """

    # convert 'Class' column to numpy array
    topK = topkclasses.values
    
    # count number of '1' class
    ones = np.count_nonzero( topK == 1 )
    
    # count number of '0' class
    zeros = np.count_nonzero( topK == 0 )
    
    if ones > zeros:
        return 1
    else:
        return 0

In [547]:
def classify(training, new_point, k):
    closestk = closest(training, new_point, k)
    topkclasses = closestk[['Class']]
    return majority(topkclasses)


In [548]:
new_point = patient_data.take([3]).drop('Class', axis=1)

classify(patient_data, new_point.values, 5)

<class 'pandas.core.frame.DataFrame'>


1

In [602]:
shuffled_patients = patient_data.sample(683, replace=False) 
training_set = shuffled_patients.take(np.arange(342))
test_set  = shuffled_patients.take(np.arange(342, 683))

In [581]:
patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class,Distance
0,5,1,1,1,2,1,3,1,1,0,17.029386
1,5,4,4,5,7,10,3,2,1,0,11.226218
2,3,1,1,1,2,2,3,1,1,0,15.625247
3,6,8,8,1,3,4,3,7,1,0,0.000000
4,4,1,1,3,2,1,3,1,1,0,15.752725
5,8,10,10,8,7,10,9,7,1,1,14.544480
6,1,1,1,1,2,10,3,1,1,0,14.170677
7,2,1,2,1,2,1,3,1,1,0,15.028049
8,2,1,1,1,2,1,1,1,5,0,14.976917
9,4,2,1,1,2,1,2,1,1,0,15.565561


In [595]:
def count_zero(array):
    """Counts the number of 0's in an array"""
    return len(array) - np.count_nonzero(array)

def count_equal(array1, array2):
    """Takes two numerical arrays of equal length
    and counts the indices where the two are equal"""
    return count_zero(array1 - array2)

def evaluate_accuracy(training, test, k):
    test_attributes = test.drop('Class', axis=1)
    #def classify_testrow(row):
    #    return classify(training, row, k)
    
    c = []
    for row in test_attributes.itertuples(index=False):
        #print(type(row))
        predict = classify(training_set, row, k)
        c.append(predict)

    #c = test_attributes.apply(classify_testrow)
    return count_equal(np.array(c), test['Class'].values) / test.shape[0]


In [600]:
training_set
#test_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
365,2,1,1,1,2,1,2,1,1,0
41,10,4,3,1,3,3,6,5,2,1
406,4,2,2,1,2,1,2,1,1,0
39,2,5,3,3,6,7,7,5,1,1
230,7,4,7,4,3,7,7,6,1,1
574,10,9,7,3,4,2,7,7,1,1
261,5,10,10,6,10,10,10,6,5,1
217,1,1,1,1,2,1,3,1,1,0
428,1,1,1,1,2,1,2,1,1,0
336,6,5,5,8,4,10,3,4,1,1


In [601]:
evaluate_accuracy(training_set, test_set, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


<class 'pandas.core.frame.DataFrame'>


ValueError: operands could not be broadcast together with shapes (10,) (9,) 

In [587]:
test_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
328,8,10,3,2,6,4,3,10,1,1
583,3,1,1,1,2,1,1,1,1,0
682,5,1,1,1,2,1,3,2,1,0
213,10,10,10,10,7,10,7,10,4,1
324,1,1,1,1,2,1,3,1,1,0
649,3,1,1,1,2,1,2,1,1,0
211,8,10,8,8,4,8,7,7,1,1
87,3,6,6,6,5,10,6,8,3,1
635,3,1,4,1,2,1,1,1,1,0
287,3,1,1,1,3,1,2,1,1,0


In [588]:
training_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
80,2,2,2,1,1,1,7,1,1,0
236,10,8,8,2,8,10,4,8,10,1
543,4,1,1,1,2,1,2,1,1,0
577,1,1,1,1,2,1,2,1,1,0
687,3,1,1,1,2,1,2,3,1,0
22,3,1,1,1,2,1,2,1,1,0
269,1,1,1,1,2,1,3,1,1,0
134,3,1,1,1,3,1,2,1,1,0
422,4,3,3,1,2,1,3,3,1,0
254,9,10,10,1,10,8,3,3,1,1


In [603]:
test_att = test_set.drop('Class', axis=1)
test_att

c = []
for row in test_att.itertuples(index=False):
    #print(type(row))
    predict = classify(training_set, row, 5)
    c.append(predict)
    
c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


<class 'pandas.core.frame.DataFrame'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


ValueError: operands could not be broadcast together with shapes (10,) (9,) 

In [566]:
test_set.shape[0]

a = test_set['Class'].values

a

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0])

In [564]:
res

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0])

In [573]:
res = np.array(c)

#error_count = np.count_nonzero(res - a)
#error_rate = error_count / test_set.shape[0]

#error_rate
#s = 1 - error_rate
#s


#return count_equal(c, test.column('Class')) / test.num_rows
count_equal(res, test_set['Class'].values) / test_set.shape[0]


0.9030837004405287

In [519]:
evaluate_accuracy(training_set, test_set, 5)


ValueError: operands could not be broadcast together with shapes (10,) (9,) 