In [30]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier

##### About Data
The Wisconsin Breast Cancer dataset has 699 observations, each collected by fine-needle aspiration of tissue from a mass under the skin. It has 11 variables: an ID, nine predictor variables (cytological characteristics used to identify a mass as benign or malignant), and a class variable (2 for benign, 4 for malignant). 458 of the samples are benign and 241 are malignant. <br/>

There are 16 samples with missing data. The data file has no header row; the column names are listed in a separate file. <br/>
The 'class' variable is the outcome (2=benign, 4=malignant).

In [31]:
# Load the raw comma-separated file. It has no header row, so header=None
# makes pandas label the columns with integers instead of consuming row 0.
raw_path = 'data/breast-cancer-wisconsin.data.txt'
data = pd.read_csv(raw_path, header=None, sep=",")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


###### Adding column headers to data

In [32]:
# Names for the 11 columns, in file order: sample ID, nine predictors, outcome.
# NOTE: 'maginalAdhesion' is spelled exactly as in the displayed outputs; it is
# kept unchanged so the rendered tables stay in sync with the code.
column_names = [
    "ID",
    "clumpThickness",
    "sizeUniformity",
    "shapeUniformity",
    "maginalAdhesion",
    "singleEpithelialCellSize",
    "bareNuclei",
    "blandChromatin",
    "normalNucleoli",
    "mitosis",
    "class",
]
data.columns = column_names
data.head(10)

Unnamed: 0,ID,clumpThickness,sizeUniformity,shapeUniformity,maginalAdhesion,singleEpithelialCellSize,bareNuclei,blandChromatin,normalNucleoli,mitosis,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [33]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(699, 11)

###### Counting the different classes of 'class' variable

In [34]:
# Number of samples per outcome label (2 = benign, 4 = malignant).
data['class'].value_counts()

2    458
4    241
Name: class, dtype: int64

###### Checking column data types before converting classes '2','4' to binary '0' (benign) and '1' (malignant)

In [35]:
# Inspect per-column dtypes to spot columns that were not parsed as numbers.
data.dtypes

ID                           int64
clumpThickness               int64
sizeUniformity               int64
shapeUniformity              int64
maginalAdhesion              int64
singleEpithelialCellSize     int64
bareNuclei                  object
blandChromatin               int64
normalNucleoli               int64
mitosis                      int64
class                        int64
dtype: object

######  Converting 'class' variable from int to string

In [36]:
# Cast the outcome labels from int64 to str; they are integer-encoded again
# with LabelEncoder further down, before training.
data['class'] = data['class'].astype(str)

In [37]:
# 'bareNuclei' was read as dtype object, so parse it as numbers.
# errors='coerce' turns every entry that cannot be parsed into NaN instead of
# raising, which is why a plain astype(int) is not used here.
bare_nuclei = pd.to_numeric(data['bareNuclei'], errors='coerce')
data['bareNuclei'] = bare_nuclei

In [38]:
# Re-check dtypes after the 'class' and 'bareNuclei' conversions.
data.dtypes

ID                            int64
clumpThickness                int64
sizeUniformity                int64
shapeUniformity               int64
maginalAdhesion               int64
singleEpithelialCellSize      int64
bareNuclei                  float64
blandChromatin                int64
normalNucleoli                int64
mitosis                       int64
class                        object
dtype: object

In [39]:
# Count missing (NaN) values in each column.
data.isnull().sum()

ID                           0
clumpThickness               0
sizeUniformity               0
shapeUniformity              0
maginalAdhesion              0
singleEpithelialCellSize     0
bareNuclei                  16
blandChromatin               0
normalNucleoli               0
mitosis                      0
class                        0
dtype: int64

###### Replacing missing values with mean value of column

In [40]:
# Impute the missing 'bareNuclei' values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection data["bareNuclei"]: an in-place call on
# a column selection is deprecated in pandas and, with Copy-on-Write enabled,
# only modifies a temporary object and leaves `data` unchanged.
# NOTE(review): the mean is computed over all rows, including those later held
# out as the test split, so a little test information leaks into training.
data["bareNuclei"] = data["bareNuclei"].fillna(data["bareNuclei"].mean())

In [41]:
# Confirm that no missing values remain in any column after imputation.
data.isnull().sum()

ID                          0
clumpThickness              0
sizeUniformity              0
shapeUniformity             0
maginalAdhesion             0
singleEpithelialCellSize    0
bareNuclei                  0
blandChromatin              0
normalNucleoli              0
mitosis                     0
class                       0
dtype: int64

In [42]:
# 'ID' only identifies the sample and is not a predictor, so remove it.
# Re-running this cell on the already-trimmed frame raises KeyError.
data = data.drop('ID', axis='columns')
data.columns

Index(['clumpThickness', 'sizeUniformity', 'shapeUniformity',
       'maginalAdhesion', 'singleEpithelialCellSize', 'bareNuclei',
       'blandChromatin', 'normalNucleoli', 'mitosis', 'class'],
      dtype='object')

In [43]:
# Summary statistics for the numeric columns; 'class' was cast to str above
# and therefore does not appear in the table.
data.describe()

Unnamed: 0,clumpThickness,sizeUniformity,shapeUniformity,maginalAdhesion,singleEpithelialCellSize,bareNuclei,blandChromatin,normalNucleoli,mitosis
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.601852,2.438364,3.053634,1.715078
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [44]:
# Frequency of each 'mitosis' score.
data['mitosis'].value_counts()

1     579
2      35
3      33
10     14
4      12
7       9
8       8
5       6
6       3
Name: mitosis, dtype: int64

In [45]:
#np.ravel(data['class'])

###### Splitting data into input and output variables

In [46]:
# After dropping 'ID' the frame has ten columns: positions 0-8 hold the nine
# predictors and position 9 holds 'class'.
n_features = 9
X = data.iloc[:, :n_features].values
y = data.iloc[:, n_features].values

In [47]:
# Map the string labels to integer codes ('2' -> 0 benign, '4' -> 1 malignant),
# then one-hot encode the codes into a two-column indicator matrix that matches
# the two-unit softmax output of the network.
encoder = LabelEncoder()
encoder.fit(y)
y1 = encoder.transform(y)
Y = pd.get_dummies(y1).values

###### Splitting data into train and test sets

In [48]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [49]:
# Print the shapes in the order: train features, train labels,
# test features, test labels.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(489, 9)
(489, 2)
(210, 9)
(210, 2)


In [50]:
# Small fully connected classifier:
# 9 input features -> 10 ReLU units -> 8 ReLU units -> 2-way softmax.
model = Sequential([
    Dense(10, input_shape=(9,), activation='relu'),
    Dense(8, activation='relu'),
    Dense(2, activation='softmax'),
])

In [51]:
# Adam with a step size of 0.04; categorical cross-entropy matches the one-hot
# labels and the softmax output layer.
# NOTE(review): newer Keras releases call this argument `learning_rate`;
# confirm against the installed version before renaming it.
optimizer = Adam(lr=0.04)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [52]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 10)                100       
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 88        
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 18        
Total params: 206
Trainable params: 206
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Train for 200 epochs on the training split; no validation data is passed.
# NOTE(review): no random seed is set for Keras in the visible cells, so the
# trained weights and the scores reported below may differ between runs.
model.fit(X_train, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x19e046ea828>

In [55]:
# Evaluate on the held-out test split. `scores` lines up with
# model.metrics_names; index 1 is the accuracy metric requested at compile time.
scores = model.evaluate(X_test, y_test)
metric_name = model.metrics_names[1]
metric_value = scores[1]
print("\n%s: %.2f%%" % (metric_name, metric_value * 100))


acc: 95.24%


In [56]:
# Class probabilities for every test sample, one column per class.
y_pred = model.predict(X_test)

# Collapse the one-hot targets and the predicted probabilities to class
# indices by taking the position of the largest value in each row.
y_test_class = y_test.argmax(axis=1)
y_pred_class = y_pred.argmax(axis=1)

In [57]:
# Per-class precision / recall / F1 and the confusion matrix on the test split
# (0 = benign, 1 = malignant). In the confusion matrix each row is a true
# class and each column a predicted class.
# classification_report and confusion_matrix are imported in the notebook's
# top import cell rather than mid-notebook.
print(classification_report(y_test_class, y_pred_class))
print(confusion_matrix(y_test_class, y_pred_class))

             precision    recall  f1-score   support

          0       0.98      0.95      0.96       135
          1       0.91      0.96      0.94        75

avg / total       0.95      0.95      0.95       210

[[128   7]
 [  3  72]]
