# Project Cancer Detection

## Breast Cancer Wisconsin (Diagnostic) Data Set

In [9]:
import numpy as np
import pandas as pd

In [16]:
df = pd.read_csv('data.csv', header = 0)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [18]:
df.pop('Unnamed: 32')

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    NaN
29    NaN
       ..
539   NaN
540   NaN
541   NaN
542   NaN
543   NaN
544   NaN
545   NaN
546   NaN
547   NaN
548   NaN
549   NaN
550   NaN
551   NaN
552   NaN
553   NaN
554   NaN
555   NaN
556   NaN
557   NaN
558   NaN
559   NaN
560   NaN
561   NaN
562   NaN
563   NaN
564   NaN
565   NaN
566   NaN
567   NaN
568   NaN
Name: Unnamed: 32, Length: 569, dtype: float64

In [19]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non

In [21]:
df['diagnosis'].describe()

count     569
unique      2
top         B
freq      357
Name: diagnosis, dtype: object

In [22]:
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

For diagnosis B = Benign, M = Malignant, convert to 0,1

Benign = 0, Malignant = 1

In [24]:
diag_to_num = {'B' : 0,
              'M' : 1}
df['diagnosis'] = df['diagnosis'].map(diag_to_num)

In [25]:
df['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

In [26]:
X = df.drop(['id', 'diagnosis'], axis=1)
X_col = X.columns

In [28]:
y = df['diagnosis']

## Pre-processing

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
X = StandardScaler().fit_transform(X.values)

## Training

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
df1 = pd.DataFrame(X, columns = X_col)
df1.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,1.51187,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,1.298575,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971


In [34]:
X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size = 0.2, random_state = 42)

In [35]:
from sklearn.preprocessing import MinMaxScaler
pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id','diagnosis'], axis = 1).values), columns=X_col).head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [36]:
from sklearn.preprocessing import Normalizer
pd.DataFrame(Normalizer().fit_transform(df.drop(['id','diagnosis'], axis = 1).values), columns=X_col).head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0.007925,0.004573,0.054099,0.440986,5.2e-05,0.000122,0.000132,6.5e-05,0.000107,3.5e-05,...,0.011181,0.007635,0.081325,0.889462,7.1e-05,0.000293,0.000314,0.000117,0.000203,5.2e-05
1,0.008666,0.007486,0.055988,0.558619,3.6e-05,3.3e-05,3.7e-05,3e-05,7.6e-05,2.4e-05,...,0.010528,0.009862,0.066899,0.824026,5.2e-05,7.9e-05,0.000102,7.8e-05,0.000116,3.8e-05
2,0.009367,0.010109,0.061842,0.572276,5.2e-05,7.6e-05,9.4e-05,6.1e-05,9.8e-05,2.9e-05,...,0.011212,0.012145,0.072545,0.812984,6.9e-05,0.000202,0.000214,0.000116,0.000172,4.2e-05
3,0.016325,0.029133,0.110899,0.551922,0.000204,0.000406,0.000345,0.00015,0.000371,0.000139,...,0.021314,0.037881,0.141333,0.811515,0.0003,0.001238,0.000982,0.000368,0.000949,0.000247
4,0.009883,0.006985,0.065808,0.631774,4.9e-05,6.5e-05,9.6e-05,5.1e-05,8.8e-05,2.9e-05,...,0.010979,0.00812,0.074137,0.767189,6.7e-05,0.0001,0.000195,7.9e-05,0.000115,3.7e-05


In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [38]:
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')

In [39]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [40]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [41]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
    '''
    print accuracy score and confusion matrix
    '''
    
    if train:
        '''
        train performance
        '''
        print('Train Result: \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Average SD: \t\t {0:.4f}'.format(np.std(res)))
        
    elif train == False:
        '''
        test performance
        '''
        print('Test Result \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))

In [42]:
print_score(knn, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.9802

Classification Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       286
           1       1.00      0.95      0.97       169

   micro avg       0.98      0.98      0.98       455
   macro avg       0.98      0.97      0.98       455
weighted avg       0.98      0.98      0.98       455
 

Confusion Matrix: 
 [[286   0]
 [  9 160]] 

Average Accuracy: 	 0.9668
Average SD: 		 0.0271


In [43]:
print_score(knn, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.9474

Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        71
           1       0.93      0.93      0.93        43

   micro avg       0.95      0.95      0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114
 

Confusion Matrix: 
 [[68  3]
 [ 3 40]] 



# Grid Search

In [44]:
from sklearn.model_selection import GridSearchCV

In [45]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [50]:
params = {'n_neighbors' : [1,2,3,4,5,6,7,8,9,10],
         'metric' : ['euclidean', 'manhattan', 'chebyshev']}

In [51]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),
                             params,
                             n_jobs=-1,
                             verbose=1)

In [52]:
grid_search_cv.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.2s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'metric': ['euclidean', 'manhattan', 'chebyshev']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [53]:
grid_search_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [54]:
grid_search_cv.best_params_

{'metric': 'euclidean', 'n_neighbors': 5}

In [55]:
grid_search_cv.cv_results_['mean_train_score']



array([1.        , 0.97582636, 0.9868058 , 0.97032212, 0.97801934,
       0.9659289 , 0.97032212, 0.96263942, 0.96814366, 0.96263942,
       1.        , 0.98022685, 0.98900604, 0.97362612, 0.97362612,
       0.96812914, 0.97142587, 0.96483967, 0.96703991, 0.96264668,
       1.        , 0.96921837, 0.97690833, 0.96263942, 0.96481788,
       0.95605321, 0.9626249 , 0.95054897, 0.95054171, 0.94066603])

In [56]:
grid_search_cv.cv_results_



{'mean_fit_time': array([0.00266695, 0.00300034, 0.00266695, 0.00233332, 0.00233348,
        0.0023334 , 0.00266671, 0.0023334 , 0.00200009, 0.00233348,
        0.00266695, 0.0023334 , 0.00233356, 0.00266695, 0.00166678,
        0.00233356, 0.00300034, 0.00200025, 0.00233356, 0.0023334 ,
        0.0023334 , 0.00233364, 0.00233332, 0.00233364, 0.0023334 ,
        0.00200009, 0.00166678, 0.00166663, 0.0016667 , 0.00100009]),
 'std_fit_time': array([4.71370354e-04, 8.16437410e-04, 4.71370354e-04, 4.71426560e-04,
        4.71314168e-04, 4.71370354e-04, 4.71370354e-04, 4.71370394e-04,
        0.00000000e+00, 4.71482745e-04, 4.71370354e-04, 4.71370354e-04,
        4.71257962e-04, 4.71370394e-04, 4.71201776e-04, 4.71426560e-04,
        1.41439204e-03, 1.12391596e-07, 4.71426560e-04, 4.71370354e-04,
        4.71370354e-04, 4.71538951e-04, 4.71426560e-04, 4.71370354e-04,
        4.71370354e-04, 0.00000000e+00, 4.71538951e-04, 4.71426560e-04,
        4.71482745e-04, 2.24783192e-07]),
 'mean_scor

## Comparing w/ Other Models

In [58]:
from sklearn.ensemble import RandomForestClassifier

In [59]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train.ravel())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [60]:
print_score(rf_clf, X_train, X_test, y_train.ravel(), y_test, train = True)

Train Result: 

Accuracy Score: 0.9956

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       286
           1       1.00      0.99      0.99       169

   micro avg       1.00      1.00      1.00       455
   macro avg       1.00      0.99      1.00       455
weighted avg       1.00      1.00      1.00       455
 

Confusion Matrix: 
 [[286   0]
 [  2 167]] 

Average Accuracy: 	 0.9537
Average SD: 		 0.0289


In [61]:
print_score(rf_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.9561

Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97        71
           1       0.97      0.91      0.94        43

   micro avg       0.96      0.96      0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114
 

Confusion Matrix: 
 [[70  1]
 [ 4 39]] 



In [62]:
import xgboost as xgb

In [63]:
xgb_clf = xgb.XGBClassifier(max_depth = 3, n_estimators = 5000, learning_rate = 0.2)

In [64]:
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.2,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=5000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [65]:
print_score(xgb_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       286
           1       1.00      1.00      1.00       169

   micro avg       1.00      1.00      1.00       455
   macro avg       1.00      1.00      1.00       455
weighted avg       1.00      1.00      1.00       455
 

Confusion Matrix: 
 [[286   0]
 [  0 169]] 

Average Accuracy: 	 0.9670
Average SD: 		 0.0224


In [66]:
print_score(xgb_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.9649

Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

   micro avg       0.96      0.96      0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114
 

Confusion Matrix: 
 [[70  1]
 [ 3 40]] 

