In [1]:
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV



In [2]:
# read in the breast cancer data table
df = pd.read_csv('data.csv')
# create a map to change the labels and changed them
diagnosis_map = {'M':1, 'B':0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_map)


In [3]:
# seperate the inputs from the labels
X = df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1)
y = df['diagnosis']

input_labels = X.columns

## Attribute Selection
Use the ANOVA filter-based feature selection method to get the top 10 attributes from the 32 available

In [4]:
best = SelectKBest(score_func=f_classif, k=10)
X_new = best.fit_transform(X, y)
# print the top 'k' attributes selected in no particular order..
print(input_labels[best.get_support()])

Index(['radius_mean', 'perimeter_mean', 'area_mean', 'concavity_mean',
       'concave points_mean', 'radius_worst', 'perimeter_worst', 'area_worst',
       'concavity_worst', 'concave points_worst'],
      dtype='object')


## Train test Split
Split the data into a training and testing set


In [5]:
# X_new = StandardScaler().fit_transform(X_new) # scaling doesnt seem to help
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)


## Finiding k
Do a grid search for to find the best value of k for the model

In [6]:
knn = KNeighborsClassifier(weights='uniform', algorithm='auto')
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17],
    'weights': ['uniform', 'distance']
}
CV_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)
CV_knn.fit(X_train, y_train)
print('Best Params = ',CV_knn.best_params_)
print('CV : Accuracy Score = ',CV_knn.best_score_ )


Best Params =  {'n_neighbors': 11, 'weights': 'uniform'}
CV : Accuracy Score =  0.9271356783919598




## Final Test Accuracy
Now train all the training data into a final model and test it on the test set. Print the model evaluation metrics to see how well the model preformed.

In [7]:
knn = KNeighborsClassifier(n_neighbors=11, weights='uniform', algorithm='auto')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [8]:
# get the metrices
acc = accuracy_score(y_pred, y_test)
preci = precision_score(y_pred, y_test)
recall = recall_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test)

print(acc)
print('precision:',preci)
print('recall:', recall)
print('f1 score:', f1)

0.9649122807017544
precision: 0.9365079365079365
recall: 0.9672131147540983
f1 score: 0.9516129032258064


## Comparison
Compare to a model that uses all the attributes 

In [9]:
# compare to a model fit on all the attributes 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
knn = KNeighborsClassifier(weights='uniform', algorithm='auto')
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17],
    'weights': ['uniform', 'distance']
}
CV_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)
CV_knn.fit(X_train, y_train)
print('Best Params = ',CV_knn.best_params_)
print('CV : Accuracy Score = ',CV_knn.best_score_ )

Best Params =  {'n_neighbors': 7, 'weights': 'uniform'}
CV : Accuracy Score =  0.9271356783919598


In [10]:
knn = KNeighborsClassifier(n_neighbors=7, weights='uniform', algorithm='auto')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# get the metrices
acc = accuracy_score(y_pred, y_test)
preci = precision_score(y_pred, y_test)
recall = recall_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test)

print(acc)
print('precision:',preci)
print('recall:', recall)
print('f1 score:', f1)

0.9532163742690059
precision: 0.9365079365079365
recall: 0.9365079365079365
f1 score: 0.9365079365079365


In [68]:
# attributes = ['diagnosis', 'radius_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
# df_subset = df[attributes]

# # model = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

# #Split the data into a training set and a test set.
# # get the inputs and scale them
# X = df_subset.drop('diagnosis', axis=1)
# X = StandardScaler().fit_transform(X)
# y = df_subset[['diagnosis']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# knn = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto')

# knn.fit(X_train, y_train)

# # we get 91.2 percent accuracy without scaling. 91.8 with scaling. with k=5
# # we get 96.5% accuracy with k=10 using the top ten selected features 
# knn.score(X_test, y_test)
