In [190]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

import random

np.set_printoptions(precision=3, suppress=True)


In [191]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

column_names = ['ID', 'Diagnosis',
                'Radius Mean', 'Texture Mean', 'Perimeter Mean', 'Area Mean', 'Smoothness Mean', 'Compactness Mean', 'Concavity Mean', 'Concavity Points Mean', 'Symmetry Mean', 'Fractal Dimension Mean',
                'Radius SE', 'Texture SE', 'Perimeter SE', 'Area SE', 'Smoothness SE', 'Compactness SE', 'Concavity SE', 'Concavity Points SE', 'Symmetry SE', 'Fractal Dimension SE',
                'Radius Worst', 'Texture Worst', 'Perimeter Worst', 'Area Worst', 'Smoothness Worst', 'Compactness Worst', 'Concavity Worst', 'Concavity Points Worst', 'Symmetry Worst', 'Fractal Dimension Worst']

rawdataset = pd.read_csv(url, names = column_names, na_values = '?', comment = '\t', skipinitialspace=True)

In [192]:
dataset = rawdataset.copy()
dataset = dataset.dropna() # there are no na values just wanted to put there in case
dataset.pop('ID') # remove id values because it's useless

dataset.head()

Unnamed: 0,Diagnosis,Radius Mean,Texture Mean,Perimeter Mean,Area Mean,Smoothness Mean,Compactness Mean,Concavity Mean,Concavity Points Mean,Symmetry Mean,...,Radius Worst,Texture Worst,Perimeter Worst,Area Worst,Smoothness Worst,Compactness Worst,Concavity Worst,Concavity Points Worst,Symmetry Worst,Fractal Dimension Worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [193]:
dataset['Diagnosis'] = dataset['Diagnosis'].map({'M': 1, 'B': 0}) # convert M and B to 1 and 0 respectively
dataset.tail()

Unnamed: 0,Diagnosis,Radius Mean,Texture Mean,Perimeter Mean,Area Mean,Smoothness Mean,Compactness Mean,Concavity Mean,Concavity Points Mean,Symmetry Mean,...,Radius Worst,Texture Worst,Perimeter Worst,Area Worst,Smoothness Worst,Compactness Worst,Concavity Worst,Concavity Points Worst,Symmetry Worst,Fractal Dimension Worst
564,1,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,1,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,1,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,1,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,0,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [194]:
dataset.shape

(569, 31)

In [195]:
train,test = train_test_split(dataset, train_size=0.75) # train dataset is 75% of dataset

trainlabels = train.pop('Diagnosis') # set train and test labels to the diagnosis columns
testlabels = test.pop('Diagnosis')

In [196]:
trainlabels.index = pd.RangeIndex(1, len(trainlabels.index) + 1) # reset the indices
testlabels.index = pd.RangeIndex(1, len(testlabels.index) + 1)

In [208]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(36, input_shape = (30, ), activation = 'sigmoid'),  # too many values will cause overfitting
    tf.keras.layers.Dense(1)    
])

In [209]:
model.compile(optimizer='adam',
              loss = tf.keras.losses.MeanSquaredError(), # MSE will be enough for this case
              metrics = ['accuracy'])

In [215]:
logdir = "logs/36hiddensigmoid"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)  # log it into tensorboard bc you can see the loss and accuracy graphs

model.fit(train, trainlabels, epochs=100, callbacks = [tensorboard_callback], verbose = 0) # train for 100 epochs due to small dataset

<keras.callbacks.History at 0x7f03d5a01780>

In [216]:
test_loss, test_acc = model.evaluate(test,  testlabels, verbose=2) 

print('Accuracy: ', test_acc) # yay

5/5 - 0s - loss: 0.0474 - accuracy: 0.9301 - 17ms/epoch - 3ms/step
Accuracy:  0.9300699234008789


In [296]:
predictions = model.predict(test)

In [310]:
for i in range(len(predictions)):
    if predictions[i] < 0.5:
        predictions[i] = 0
    else:                    # is there a better way of doing this
        predictions[i] = 1

In [307]:
test.shape

(143, 30)

In [391]:
index = random.randint(0, 143)

print(f"prediction for {index}: {int(predictions[index])}") # 0 is benign, 1 is malignant
print(f"actual: {testlabels[index + 1]}")
print(f"equal? {int(predictions[index]) == testlabels[index + 1]}")

prediction for 109: 1
actual: 1
equal? True


### Final Remarks

This was a great example of how a model overfits to a small dataset.
To counter this, lowering the total amount of trainable values will work.

I also feel like 90ish percent isn't the best that I can achieve, but I have no idea how I can improve.