In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


In [2]:
cols=['id','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitose','Class']

df=pd.read_csv('breast-cancer-wisconsin.data.csv',names=cols,header=None)
df.head()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitose,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Data Preprocessing

In [3]:
df.isnull().sum()

id                             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitose                         0
Class                          0
dtype: int64

In [4]:
np.where(df.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitose                       699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [7]:
df['Bare Nuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: Bare Nuclei, dtype: object

In [8]:
df.describe()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitose,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [9]:
df['Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [10]:
## Finding the '?' value
df[df['Bare Nuclei']=='?']

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitose,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [11]:
df['Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

In [12]:
df['Bare Nuclei'].replace('?',np.NAN,inplace=True)
df=df.dropna()

In [16]:
df.nunique()

id                             630
Clump Thickness                 10
Uniformity of Cell Size         10
Uniformity of Cell Shape        10
Marginal Adhesion               10
Single Epithelial Cell Size     10
Bare Nuclei                     10
Bland Chromatin                 10
Normal Nucleoli                 10
Mitose                           9
Class                            2
dtype: int64

## Note that for class:2 is benign, 4 is for maligant
$$\frac{df["Class"]}{2}-1$$

In [17]:
df['Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [18]:
df['Class']=df['Class']/2-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Class']=df['Class']/2-1


In [19]:
df['Class'].value_counts()

0.0    444
1.0    239
Name: Class, dtype: int64

In [20]:
df.columns

Index(['id', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitose', 'Class'],
      dtype='object')

In [23]:
x=df.drop(['id','Class'],axis=1)
x_col=x.columns
x_col

Index(['Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitose'],
      dtype='object')

In [32]:
x

array([[ 0.19790469, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       [ 0.19790469,  0.27725185,  0.26278299, ..., -0.18182716,
        -0.28510482, -0.34839971],
       [-0.51164337, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       ...,
       [ 0.19790469,  2.23617957,  2.2718962 , ...,  1.86073779,
         2.33747554,  0.22916583],
       [-0.15686934,  1.58320366,  0.93248739, ...,  2.67776377,
         1.02618536, -0.34839971],
       [-0.15686934,  1.58320366,  1.6021918 , ...,  2.67776377,
         0.37054027, -0.34839971]])

In [33]:
y=df['Class']
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
694    0.0
695    0.0
696    1.0
697    1.0
698    1.0
Name: Class, Length: 683, dtype: float64

In [34]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

In [39]:
x_std=sc.fit_transform(x)
x_std

array([[ 0.19790469, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       [ 0.19790469,  0.27725185,  0.26278299, ..., -0.18182716,
        -0.28510482, -0.34839971],
       [-0.51164337, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       ...,
       [ 0.19790469,  2.23617957,  2.2718962 , ...,  1.86073779,
         2.33747554,  0.22916583],
       [-0.15686934,  1.58320366,  0.93248739, ...,  2.67776377,
         1.02618536, -0.34839971],
       [-0.15686934,  1.58320366,  1.6021918 , ...,  2.67776377,
         0.37054027, -0.34839971]])

In [40]:
df1=pd.DataFrame(x_std,columns=x_col)
df1

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitose
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.348400
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.348400
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.348400
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.348400
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.348400
...,...,...,...,...,...,...,...,...,...
678,-0.511643,-0.702212,-0.741774,-0.639366,-0.105454,-0.424217,-0.998853,-0.612927,-0.348400
679,-0.866417,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.998853,-0.612927,-0.348400
680,0.197905,2.236180,2.271896,0.059333,1.695166,-0.149582,1.860738,2.337476,0.229166
681,-0.156869,1.583204,0.932487,0.408682,-0.105454,0.125054,2.677764,1.026185,-0.348400


In [44]:
from sklearn.preprocessing import MinMaxScaler
pd.DataFrame(MinMaxScaler().fit_transform(x),columns=x_col).head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitose
0,0.444444,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0
2,0.222222,0.0,0.0,0.0,0.111111,0.111111,0.222222,0.0,0.0
3,0.555556,0.777778,0.777778,0.0,0.222222,0.333333,0.222222,0.666667,0.0
4,0.333333,0.0,0.0,0.222222,0.111111,0.0,0.222222,0.0,0.0


## Training

In [45]:
x_train,x_test,y_train,y_test=train_test_split(df1,y,test_size=0.2,random_state=42)

In [46]:
knn=KNeighborsClassifier(n_neighbors=5,p=2, metric='minkowski')

In [47]:
knn.fit(x_train,y_train)

KNeighborsClassifier()

In [48]:
knn.score(x_test,y_test)

0.9562043795620438

In [49]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score


In [66]:
def print_score(knn,x_train,x_test,y_train,y_test,train=True):
    lb=preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        result=knn.predict(x_train)
        print("Training Results:\n")
        print('Accuracy Score : {0:.4f}\n'.format(accuracy_score(y_train,result)))
        print('Classification report: \n{}\n'.format(classification_report(y_train,result)))
        print('Confusion matrix : \n{}\n'.format(confusion_matrix(y_train,result)))
        print('ROC AUC    : {0:.4f}'.format(roc_auc_score(lb.transform(y_train),lb.transform(result))))
        print('Average accuracy : {0:.4f}'.format(np.mean(cross_val_score(knn,x_train,y_train,cv=10,scoring='accuracy'))))
        print('Average SD : {0:.4f}'.format(np.std(cross_val_score(knn,x_train,y_train,cv=10,scoring='accuracy'))))
        
    elif train==False:
        res=knn.predict(x_test)
        print("Test Results:\n")
        print('Accuracy Score : {0:.4f}\n'.format(accuracy_score(y_test,res)))
        print('Classification report: \n{}\n'.format(classification_report(y_test,res)))
        print('Confusion matrix : \n{}\n'.format(confusion_matrix(y_test,res)))
        print('ROC AUC    : {0:.4f}'.format(roc_auc_score(lb.transform(y_test),lb.transform(res))))
        print('Average SD : {0:.4f}'.format(np.std(cross_val_score(knn,x_test,y_test,cv=10,scoring='accuracy'))))
        

In [67]:
print_score(knn,x_train,x_test,y_train,y_test,train=True)

Training Results:

Accuracy Score : 0.9725

Classification report: 
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546


Confusion matrix : 
[[358   7]
 [  8 173]]

ROC AUC    : 0.9683
Average accuracy : 0.9634
Average SD : 0.0200


In [68]:
print_score(knn,x_train,x_test,y_train,y_test,train=False)

Test Results:

Accuracy Score : 0.9562

Classification report: 
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137


Confusion matrix : 
[[78  1]
 [ 5 53]]

ROC AUC    : 0.9506
Average SD : 0.0479


## GridSearchCV

In [69]:
from sklearn.model_selection import GridSearchCV


In [71]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [72]:
params={'n_neighbors':[1,2,3,4,5,6,7,8,9,10]}

In [73]:
gscv=GridSearchCV(KNeighborsClassifier(),params,n_jobs=-1,verbose=1,cv=10)

In [74]:
gscv.fit(x_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             verbose=1)

In [75]:
gscv.best_estimator_

KNeighborsClassifier(n_neighbors=7)

In [76]:
print_score(gscv,x_train,x_test,y_train,y_test,train=True)

Training Results:

Accuracy Score : 0.9725

Classification report: 
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546


Confusion matrix : 
[[358   7]
 [  8 173]]

ROC AUC    : 0.9683
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 cand

In [77]:
print_score(gscv,x_train,x_test,y_train,y_test,train=False)

Test Results:

Accuracy Score : 0.9562

Classification report: 
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137


Confusion matrix : 
[[78  1]
 [ 5 53]]

ROC AUC    : 0.9506
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, 

In [78]:
gscv.best_params_

{'n_neighbors': 7}

In [79]:
gscv.cv_results_

{'mean_fit_time': array([0.05073555, 0.01213691, 0.00940626, 0.00961292, 0.01089909,
        0.01019657, 0.00952067, 0.01032255, 0.01019802, 0.00950572]),
 'std_fit_time': array([0.04714699, 0.00660133, 0.00119371, 0.00295517, 0.00325911,
        0.00368907, 0.0030182 , 0.00281699, 0.0030331 , 0.00201789]),
 'mean_score_time': array([0.01600909, 0.01199555, 0.01424062, 0.01695755, 0.01395421,
        0.0123306 , 0.0130316 , 0.0145416 , 0.01355987, 0.01386597]),
 'std_score_time': array([0.00436672, 0.00245508, 0.00347072, 0.00341055, 0.00234661,
        0.00328402, 0.00281708, 0.00362886, 0.0020057 , 0.00276668]),
 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 1},
  {'n_neighbors': 2},
  {'n_neighbors': 3},
  {'n_neighbors': 4},
  {'n_neighbors': 5},
  {'n_neighbors': 6},
