In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plots
%matplotlib inline



In [476]:
# read data from csv file
# drop the patien id column

patient_data =  pd.read_csv('../data/breast-cancer-wisconsin.csv')

patient_data.drop( ['id'], axis=1, inplace=True )
patient_data = patient_data[~patient_data["Bare Nuclei"].str.contains('\?')]

patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
5,8,10,10,8,7,10,9,7,1,4
6,1,1,1,1,2,10,3,1,1,2
7,2,1,2,1,2,1,3,1,1,2
8,2,1,1,1,2,1,1,1,5,2
9,4,2,1,1,2,1,2,1,1,2


In [477]:
# class value: 2 is benign, 4 is malignant, change to: 0 is benign (no cancer), 1 is malignant (cancer)

# change class 2 -> 0, benign, no cancer
mask = patient_data.Class == 2
column_name = 'Class'
patient_data.loc[mask, column_name] = 0

# change class 4 -> 1, malignant, cancer
mask = patient_data.Class == 4
column_name = 'Class'
patient_data.loc[mask, column_name] = 1

# change dtype column ['Bare Nuclei'] from string to numeric
patient_data['Bare Nuclei'] = pd.to_numeric(patient_data['Bare Nuclei'])

# print data frame
patient_data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
5,8,10,10,8,7,10,9,7,1,1
6,1,1,1,1,2,10,3,1,1,0
7,2,1,2,1,2,1,3,1,1,0
8,2,1,1,1,2,1,1,1,5,0
9,4,2,1,1,2,1,2,1,1,0


In [405]:
def distance(point1, point2):
    """Return Euclidien distance between point1 and point2,
    each point is an array of coordinates"""
    
    return np.sqrt(np.sum((point1 - point2)**2))

def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table"""
    return distance(np.array(row1), np.array(row2))

In [143]:
# test distance
p1 = np.array([1,2])
p2 = np.array([0,10])

p1 = patient_data.take([0])
p2 = patient_data.iloc[[1]]

p1 = p1.values
p2 = p2.values
print(p1)

type(p1)
distance(p1, p2)



[[5 1 1 1 2 1 3 1 1 0]]


11.874342087037917

In [461]:
def all_distances(training, new_point):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes)"""
    attributes = training.drop('Class', axis=1)
    #new_point = np.delete(new_point, -1)
    
    all_distance = []
    for row in attributes.itertuples(index=False):
        #print(type(row))
        d = row_distance(row, new_point)
        all_distance.append(d)
        
    return all_distance

In [479]:
new_point = patient_data.take([0]).drop('Class', axis=1)

table_with_distances(patient_data, new_point)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class,Distance
0,5,1,1,1,2,1,3,1,1,0,0.000000
1,5,4,4,5,7,10,3,2,1,0,11.874342
2,3,1,1,1,2,2,3,1,1,0,2.236068
3,6,8,8,1,3,4,3,7,1,0,12.041595
4,4,1,1,3,2,1,3,1,1,0,2.236068
5,8,10,10,8,7,10,9,7,1,1,19.949937
6,1,1,1,1,2,10,3,1,1,0,9.848858
7,2,1,2,1,2,1,3,1,1,0,3.162278
8,2,1,1,1,2,1,1,1,5,0,5.385165
9,4,2,1,1,2,1,2,1,1,0,1.732051


In [466]:
def table_with_distances(training, new_point):
    """Augments the training table 
    with a column of distances from new_point"""
    training['Distance'] = all_distances(training, new_point)
    return training


In [470]:
def closest(training, new_point, k):
    """Returns a data frame of the k rows 
    corresponding to the k smallest distances"""
    with_dists = table_with_distances(training, new_point)
    sorted_by_distance = with_dists.sort_values(by=['Distance'])
    
    topk = sorted_by_distance.take(np.arange(k))
    return topk

In [478]:
closest(patient_data, new_point, 5)

ValueError: operands could not be broadcast together with shapes (9,) (1,10) 

In [302]:
#print(type(topkclasses))

#print(topkclasses['Class'].values)

x = topkclasses['Class'].values
print(x)
a = np.count_nonzero( x == 1 )

print(a)

b = np.count_nonzero( x == 0 )
print(b)

print("Majority:")
majority(topkclasses)


[0 1 1 1 1]
4
1
Majority:


1

In [316]:
def majority(topkclasses):
    """Return majority of neighbour labels 
    """

    # convert 'Class' column to numpy array
    topK = topkclasses.values
    
    # count number of '1' class
    ones = np.count_nonzero( topK == 1 )
    
    # count number of '0' class
    zeros = np.count_nonzero( topK == 0 )
    
    if ones > zeros:
        return 1
    else:
        return 0

In [317]:
def classify(training, new_point, k):
    closestk = closest(training, new_point, k)
    topkclasses = closestk[['Class']]
    return majority(topkclasses)


In [348]:
classify(patient_data, patient_data.take([3]).values, 5)

ValueError: labels ['Class'] not contained in axis

### Try to remove for loop in function all_distance()

In [327]:
n = patient_data.shape[0]

train = n * (2/3)
train

#test = n - train - 1
#test


455.3333333333333

In [329]:
shuffled_patients = patient_data.sample(683, replace=False) 
training_set = shuffled_patients.take(np.arange(456))
test_set  = shuffled_patients.take(np.arange(456, 683))

In [332]:
training_set.shape
test_set.shape

683

In [335]:
def count_zero(array):
    """Counts the number of 0's in an array"""
    return len(array) - np.count_nonzero(array)

def count_equal(array1, array2):
    """Takes two numerical arrays of equal length
    and counts the indices where the two are equal"""
    return count_zero(array1 - array2)

def evaluate_accuracy(training, test, k):
    test_attributes = test.drop('Class')
    def classify_testrow(row):
        return classify(training, row, k)
    c = test_attributes.apply(classify_testrow)
    return count_equal(c, test.column('Class')) / test.num_rows

In [338]:
evaluate_accuracy(training_set, test_set, 5)


ValueError: labels ['Class'] not contained in axis