In [1]:
# importing our libraries 

import numpy as np
# scikit learn comes with a number of preloaded datasets
from sklearn.datasets import load_breast_cancer


In [2]:
data = load_breast_cancer()

In [3]:
type(data) # this is a lot like python dictionaries 

sklearn.utils.Bunch

In [4]:
data.keys() # we can use these keys as labels

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
data.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [6]:
data.data.shape

(569, 30)

In [7]:
data.target # we see that these are just bunch of zeroes and ones 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [8]:
# we can see what these labels means as 0 and 1 are not the actual targets they bound to have meaning 
data.target_names
# so 1 is malignant
# 2 is benign

array(['malignant', 'benign'], dtype='<U9')

In [9]:
data.target.shape # so we can interpret this as every row has a target 

(569,)

In [10]:
# so we make the  predictions never on the data we use to train the model
# as we know that could mean that our model has memorized the results
# thats why in sci-kit learn we have the trian test split function
# which divides the dataset into a randomly generated two parts 
# one is used for training and the other is used for testing 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.33)
# so according to this call 33% of the data is going to test data and rest train data 

In [11]:
# Now it is time to make a machine learning model which will do the classsification
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [12]:
model.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
model.score(X_train,y_train)
# this gives us the accuracy of our predictions on the training set 
# 1.0 means all predictions are perfect

0.994750656167979

In [14]:
model.score(X_test, y_test) # this gives us the accuracy of the test set

0.9627659574468085

In [15]:
# now let us make predictions from the model we have trained
predictions = model.predict(X_test)

In [16]:
predictions

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1])

In [17]:
len(predictions) 

188

In [18]:
len(y_test)

188

In [19]:
# let is check the accuracy manually 
correct = 0
for i  in range(0,188):
    if predictions[i]==y_test[i]:
        correct = correct+1
correct_predicts = correct/len(predictions)

In [20]:
correct # this gives us the number of correct predictions

181

In [21]:
correct_predicts # comes out to be the same as calculated earlier 
# using the model.score() function

0.9627659574468085

In [22]:
predict1 = model.predict([X_test[176,:]]) # making one single prediction

In [23]:
predict1

array([1])

In [24]:
if predict1 == [y_test[176]]:
    print("right prediction")

right prediction


In [25]:
# now let do the classification using deep learning


In [26]:
from sklearn.neural_network import MLPClassifier

In [27]:
# its is important to scale the training data to reduce the computation load

from sklearn.preprocessing import StandardScaler 


In [28]:
scaler = StandardScaler()

In [29]:
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)

In [30]:
model = MLPClassifier()
model.fit(X_train2, y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [31]:
model.score(X_train2, y_train)

0.9921259842519685

In [32]:
model.score(X_test2, y_test) 
# we can see that the accuracy of neural net based classifier is better than 
# Random

0.9787234042553191