In [1]:
# import required packages
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor #this is the neural network part
import matplotlib.pylab as plt
from sklearn.metrics import *

In [2]:
# Special package from the class book to import regression summary statistics
!pip install -U dmba;
from dmba import regressionSummary

Collecting dmba
  Downloading dmba-0.1.0-py3-none-any.whl (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 8.9 MB/s 
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.1.0
no display found. Using non-interactive Agg backend


# Work with only 'mean' columns

In [3]:
# import package from the google drive to import data
from google.colab import drive
drive.mount('/content/drive')

# load the data
df = pd.read_csv('/content/drive/Shareddrives/BA305- Team Project/breast-cancer.csv')

df = df.drop(df.filter(regex='_se').columns, axis=1)
df = df.drop(df.filter(regex='_worst').columns, axis=1)


print(df.info())
df.head(10)

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      569 non-null    int64  
 1   diagnosis               569 non-null    object 
 2   radius_mean             569 non-null    float64
 3   texture_mean            569 non-null    float64
 4   perimeter_mean          569 non-null    float64
 5   area_mean               569 non-null    float64
 6   smoothness_mean         569 non-null    float64
 7   compactness_mean        569 non-null    float64
 8   concavity_mean          569 non-null    float64
 9   concave points_mean     569 non-null    float64
 10  symmetry_mean           569 non-null    float64
 11  fractal_dimension_mean  569 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 53.5+ KB
None


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243


In [4]:
# convert categorical data into dummy variables, in this case Diagnosis
df['diagnosis'] = np.where(df['diagnosis']=='B',0, 1)
df.head(10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883
5,843786,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613
6,844359,1,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742
7,84458202,1,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451
8,844981,1,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389
9,84501001,1,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243


In [5]:
# clear unrelated columns
df = df.drop(columns = 'id')

In [6]:
print(df.columns)
df.head(15)

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean'],
      dtype='object')


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883
5,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613
6,1,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742
7,1,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451
8,1,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389
9,1,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243


In [7]:
# separate out output neuron and input neurons
y_nonscaled = df[['diagnosis']]
X_nonscaled = df.drop(columns=['diagnosis'])
X_nonscaled.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [8]:
# normalize the data into [0,1] range
# MinMax = (X - Xmin) / (Xmax - Xmin)
scaleOutput = MinMaxScaler()
scaleInput = MinMaxScaler()

X = scaleInput.fit_transform(X_nonscaled)
y = scaleOutput.fit_transform(y_nonscaled)

# partition data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=1)

print( "Number of 1's in training set:", sum(y_train))
print( "Number of 1's in validing set:", sum(y_valid))
print( "Proportion of 1's in validing set", sum(y_valid)/(sum(y_train)+sum(y_valid)))

Number of 1's in training set: [132.]
Number of 1's in validing set: [80.]
Proportion of 1's in validing set [0.37735849]


Neural Network with a single hidden layer and 2 nodes

In [39]:
# train neural network with single hidden layer and 2 nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(2), activation='logistic', solver='adam', random_state=1)
cancer_nnet.fit(X_train, y_train.ravel())



MLPRegressor(activation='logistic', hidden_layer_sizes=2, random_state=1)

In [40]:
# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

In [41]:
# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

In [42]:
# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[209   0]
 [132   0]]
Confusion Matrix on valid data:
[[148   0]
 [ 80   0]]
Accuracy on train data = 61.3 %
Accuracy on valid data = 64.9 %
Sensitivity on valid data = 0.0 %


Neural Network with a single hidden layer and 5 nodes


In [28]:
# train neural network with single hidden layer and 5 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(5), activation='logistic', solver='adam', random_state=1)
cancer_nnet.fit(X_train, y_train.ravel())

# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[208   1]
 [ 24 108]]
Confusion Matrix on valid data:
[[147   1]
 [ 19  61]]
Accuracy on train data = 92.7 %
Accuracy on valid data = 91.2 %
Sensitivity on valid data = 76.2 %




## Deep learning: Neural Network with 3 hidden layers, each with 2 nodes


In [30]:
# train neural network with 3 hidden layer and 2 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(2), activation='logistic', solver='sgd', random_state=3)
cancer_nnet.fit(X_train, y_train.ravel())

# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[209   0]
 [132   0]]
Confusion Matrix on valid data:
[[148   0]
 [ 80   0]]
Accuracy on train data = 61.3 %
Accuracy on valid data = 64.9 %
Sensitivity on valid data = 0.0 %


## Summary

Architecture|NN (2) log. | NN (5) log. | NN (2,2,2) log. |  | |
---|---|---|---|---|---|
Accuracy|93.4|93.4|93.4
Sensitivity |88.8|87.5|88.8

# Work only with 'Worst' columns

In [15]:
# import package from the google drive to import data
from google.colab import drive
drive.mount('/content/drive')

# load the data
df = pd.read_csv('/content/drive/Shareddrives/BA305- Team Project/breast-cancer.csv')

df = df.drop(df.filter(regex='_mean').columns, axis=1)
df = df.drop(df.filter(regex='_se').columns, axis=1)

print(df.info())

# convert categorical data into dummy variables, in this case Diagnosis
df['diagnosis'] = np.where(df['diagnosis']=='B',0, 1)

# clear unrelated columns
df = df.drop(columns = 'id')
print(df.columns)
df.head(15)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_worst             569 non-null    float64
 3   texture_worst            569 non-null    float64
 4   perimeter_worst          569 non-null    float64
 5   area_worst               569 non-null    float64
 6   smoothness_worst         569 non-null    float64
 7   compactness_worst        569 non-null    float64
 8   concavity_worst          569 non-null    float64
 9   concave points_worst     569 non-null    float64
 10  symmetry_worst           569 non-null    float64
 11  fractal_dimension_worst  569 non-null    float64
dtypes: fl

Unnamed: 0,diagnosis,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,1,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,1,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,1,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,1,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,1,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [16]:
y_nonscaled = df[['diagnosis']]
X_nonscaled = df.drop(columns=['diagnosis'])
X_nonscaled.head()

# normalize the data into [0,1] range
# MinMax = (X - Xmin) / (Xmax - Xmin)
scaleOutput = MinMaxScaler()
scaleInput = MinMaxScaler()

X = scaleInput.fit_transform(X_nonscaled)
y = scaleOutput.fit_transform(y_nonscaled)

# partition data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=1)

print( "Number of 1's in training set:", sum(y_train))
print( "Number of 1's in validing set:", sum(y_valid))
print( "Proportion of 1's in validing set", sum(y_valid)/(sum(y_train)+sum(y_valid)))

Number of 1's in training set: [132.]
Number of 1's in validing set: [80.]
Proportion of 1's in validing set [0.37735849]


Neural Network with a single hidden layer and 2 nodes



In [43]:
# train neural network with single hidden layer and 2 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(2), activation='logistic', solver='sgd', random_state=1)
cancer_nnet.fit(X_train, y_train.ravel())


# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[209   0]
 [132   0]]
Confusion Matrix on valid data:
[[148   0]
 [ 80   0]]
Accuracy on train data = 61.3 %
Accuracy on valid data = 64.9 %
Sensitivity on valid data = 0.0 %


Neural Network with a single hidden layer and 5 nodes

In [18]:
# train neural network with single hidden layer and 5 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(5), activation='logistic', solver='lbfgs', random_state=1)
cancer_nnet.fit(X_train, y_train.ravel())

# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[208   1]
 [  2 130]]
Confusion Matrix on valid data:
[[145   3]
 [  6  74]]
Accuracy on train data = 99.1 %
Accuracy on valid data = 96.1 %
Sensitivity on valid data = 92.5 %


Deep learning: Neural Network with 3 hidden layers, each with 2 nodes

In [19]:
# train neural network with 3 hidden layer and 2 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(2), activation='logistic', solver='lbfgs', random_state=3)
cancer_nnet.fit(X_train, y_train.ravel())


# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[207   2]
 [  2 130]]
Confusion Matrix on valid data:
[[145   3]
 [  3  77]]
Accuracy on train data = 98.8 %
Accuracy on valid data = 97.4 %
Sensitivity on valid data = 96.2 %


## Summary

Architecture|NN (2) log. | NN (5) log. | NN (2,2,2) log. |  | |
---|---|---|---|---|---|
Accuracy|96.5|96.1|97.4
Sensitivity |95|92.5|96.2

# Work with ‘means’ and ‘worst’ columns combined

In [20]:
# import package from the google drive to import data
from google.colab import drive
drive.mount('/content/drive')

# load the data
df = pd.read_csv('/content/drive/Shareddrives/BA305- Team Project/breast-cancer.csv')

df = df.drop(df.filter(regex='_se').columns, axis=1)

print(df.info())

# convert categorical data into dummy variables, in this case Diagnosis
df['diagnosis'] = np.where(df['diagnosis']=='B',0, 1)

# clear unrelated columns
df = df.drop(columns = 'id')
print(df.columns)
df.head(15)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radiu

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,1,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,1,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,1,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,1,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [21]:
y_nonscaled = df[['diagnosis']]
X_nonscaled = df.drop(columns=['diagnosis'])
X_nonscaled.head()

# normalize the data into [0,1] range
# MinMax = (X - Xmin) / (Xmax - Xmin)
scaleOutput = MinMaxScaler()
scaleInput = MinMaxScaler()

X = scaleInput.fit_transform(X_nonscaled)
y = scaleOutput.fit_transform(y_nonscaled)

# partition data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=1)

print( "Number of 1's in training set:", sum(y_train))
print( "Number of 1's in validing set:", sum(y_valid))
print( "Proportion of 1's in validing set", sum(y_valid)/(sum(y_train)+sum(y_valid)))

Number of 1's in training set: [132.]
Number of 1's in validing set: [80.]
Proportion of 1's in validing set [0.37735849]


Neural Network with a single hidden layer and 2 nodes

In [22]:
# train neural network with single hidden layer and 2 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(2), activation='logistic', solver='lbfgs', random_state=1)
cancer_nnet.fit(X_train, y_train.ravel())

# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[208   1]
 [  2 130]]
Confusion Matrix on valid data:
[[145   3]
 [  5  75]]
Accuracy on train data = 99.1 %
Accuracy on valid data = 96.5 %
Sensitivity on valid data = 93.8 %


Neural Network with a single hidden layer and 5 nodes

In [23]:
# train neural network with single hidden layer and 5 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(5), activation='logistic', solver='lbfgs', random_state=1)
cancer_nnet.fit(X_train, y_train.ravel())

# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[208   1]
 [  1 131]]
Confusion Matrix on valid data:
[[143   5]
 [  3  77]]
Accuracy on train data = 99.4 %
Accuracy on valid data = 96.5 %
Sensitivity on valid data = 96.2 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Deep learning: Neural Network with 3 hidden layers, each with 2 nodes

In [24]:
# train neural network with 3 hidden layer and 2 hidden nodes
cancer_nnet = MLPRegressor(hidden_layer_sizes=(2), activation='logistic', solver='lbfgs', random_state=3)
cancer_nnet.fit(X_train, y_train.ravel())

# Make predictions
y_pred_train = cancer_nnet.predict(X_train)
y_pred_valid = cancer_nnet.predict(X_valid)

# Convert arry to binary values
y_pred_train = np.where(y_pred_train > 0.5, 1, 0)
y_pred_valid = np.where(y_pred_valid > 0.5, 1, 0)

# Confusion matrix on train set
print("Confusion Matrix on train data:")
print(confusion_matrix(y_train, y_pred_train))


# Confusion matrix on valid set
print("Confusion Matrix on valid data:")
print(confusion_matrix(y_valid, y_pred_valid))

# Accuracy of full tree on training and test sets
train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_valid, y_pred_valid)

print('Accuracy on train data =', round(train_score*100,1), '%')
print('Accuracy on valid data =', round(test_score*100,1), '%')

# recall score
m3 = recall_score(y_valid, y_pred_valid).round(4)
print('Sensitivity on valid data =', round(m3*100,1), '%')

Confusion Matrix on train data:
[[208   1]
 [  2 130]]
Confusion Matrix on valid data:
[[144   4]
 [  5  75]]
Accuracy on train data = 99.1 %
Accuracy on valid data = 96.1 %
Sensitivity on valid data = 93.8 %


## Summary

Architecture|NN (2) log. | NN (5) log. | NN (2,2,2) log. |  | |
---|---|---|---|---|---|
Accuracy|96.5|96.5|96.1
Sensitivity |93.8|96.2|93.8