In [4]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn.model_selection import train_test_split
import pandas as pd

In [1]:
# Classification is using a dataset to create a model that identifies patterns or groups on graphs based features
# New input is usually classified based on proximity to labelled data

# K Nearest Neighbors algorithm looks at closest points to new unknown input and then categorizes/classfies the input based labelled data around it
# K is controlled to determine the number of closest points to analyze
# If K = 3 and the closest points are labelled: DOG, DOG, CAT then the input is classified as DOG because it is the majority
# It can also have include a degree of confidence for each input/unknown point labelled

# Downfalls of K Nearest Neighbors: Finding closest points requires finding Euclidean distance of all points and selecting the lowest K, which can consume lots of time and power when dealing with large datasets

In [53]:
accuracies = []
for i in range(25):
    df = pd.read_csv('breast-cancer-wisconsin.data')
    df.replace('?', -99999, inplace=True)
    # in classification it is important to eliminate useless features such as id because it has no real life impact on class but its effects on classification algorithms are immense
    df.drop(['id'], 1, inplace=True)

    X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
    y = np.array(df['class'])

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)
    print(accuracy)
    accuracies.append(accuracy)
    # NOT REMOVING 'id' FEATURE RESULTS IN ACCURACY OF 0.57
print('Average accuracy', sum(accuracies)/len(accuracies))

  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it shoul

0.9714285714285714
0.9714285714285714
0.9571428571428572
0.9642857142857143
0.9714285714285714
0.9571428571428572
0.9642857142857143
0.9642857142857143
0.9785714285714285
0.9857142857142858
0.9857142857142858
0.9642857142857143
0.9785714285714285
0.9857142857142858
0.9714285714285714
0.9785714285714285
1.0
0.9857142857142858
0.9714285714285714
0.9285714285714286
0.9928571428571429
0.9714285714285714
0.9785714285714285
0.9285714285714286
0.9571428571428572
Average accuracy 0.9705714285714283


  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it should not be in X
  df.drop(['id'], 1, inplace=True)
  X = np.array(df.drop(['class'],1)) # does not permanently remove 'class' but it is the y / predicted value so it shoul

In [48]:
# testing out the classifier with made up data
example_measures = np.array([[4,2,1,1,1,2,3,2,1], [4,2,1,2,2,2,3,2,1]])
example_measures = example_measures.reshape(len(example_measures),-1) # 1st param = number of samples


prediction = clf.predict(example_measures)
print(prediction)

[2 2]
