In [61]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC # SVR for regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

%config Completer.use_jedi = False

In [2]:
data = load_breast_cancer()
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [3]:
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [4]:
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
df = pd.DataFrame(np.c_[data.data, data.target], columns = [list(data.feature_names) + ["target"]])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [6]:
df.shape

(569, 31)

In [7]:
df.select_dtypes(include="O").columns

MultiIndex([], )

## Train Test Split

In [8]:
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, random_state=2020)

print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of y_test: ", y_test.shape)

Shape of X_train:  (455, 30)
Shape of X_test:  (114, 30)
Shape of y_train:  (455,)
Shape of y_test:  (114,)


## Train Support Vector Classification Model

In [10]:
classification_rbf = SVC(kernel="rbf")
classification_rbf.fit(X_train,y_train)

SVC()

In [11]:
classification_rbf.predict(X_test)

array([0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1.,
       0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1.])

In [12]:
y_test

236    0.0
106    1.0
284    1.0
262    0.0
356    1.0
      ... 
370    0.0
167    0.0
108    0.0
131    0.0
336    1.0
Name: (target,), Length: 114, dtype: float64

In [13]:
classification_rbf.score(X_test, y_test)

0.9122807017543859

## Feature Scaling

In [14]:
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [15]:
classification_rbf2 = SVC(kernel="rbf")
classification_rbf2.fit(X_train_sc,y_train)
classification_rbf2.score(X_test_sc, y_test)

0.956140350877193

## SVC with different Kernel

In [16]:
classification_poly = SVC(kernel="poly", degree =1)
classification_poly.fit(X_train_sc,y_train)
classification_poly.score(X_test_sc, y_test)

0.9736842105263158

In [17]:
classification_linear = SVC(kernel="linear")
classification_linear.fit(X_train_sc,y_train)
classification_linear.score(X_test_sc, y_test)

0.9649122807017544

## Using Decision Tree

In [18]:
dt_classifier = DecisionTreeClassifier(criterion="gini")
dt_classifier.fit(X_train_sc,y_train)
dt_classifier.score(X_test_sc, y_test)

0.956140350877193

In [19]:
# Without Scaling
dt_classifier = DecisionTreeClassifier(criterion="gini")
dt_classifier.fit(X_train,y_train)
dt_classifier.score(X_test, y_test)

0.9473684210526315

In [20]:
dt_classifier_entropy = DecisionTreeClassifier(criterion="entropy")
dt_classifier_entropy.fit(X_train_sc,y_train)
dt_classifier_entropy.score(X_test_sc, y_test)

0.9385964912280702

## Random Forest Model

In [33]:
# Without Scaling
rf_classifier = RandomForestClassifier(n_estimators=100, criterion="gini")
rf_classifier.fit(X_train,y_train)
rf_classifier.score(X_test, y_test)

0.9473684210526315

In [34]:
rf_classifier_scaled = RandomForestClassifier(n_estimators=100)
rf_classifier_scaled.fit(X_train_sc,y_train)
rf_classifier_scaled.score(X_test_sc, y_test)

0.956140350877193

## K Nearest Neighbor Model

In [49]:
# Without Scaling
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train,y_train)
knn_classifier.score(X_test, y_test)

0.9298245614035088

In [55]:
knn_classifier_scaled = KNeighborsClassifier(n_neighbors = 5)
knn_classifier_scaled.fit(X_train_sc,y_train)
knn_classifier_scaled.score(X_test_sc, y_test)

0.956140350877193

## Naive Bayes Model

In [62]:
# Without Scaling
naive_classifier = GaussianNB()
naive_classifier.fit(X_train,y_train)
naive_classifier.score(X_test, y_test)

0.9736842105263158

In [63]:
naive_classifier_scaled = GaussianNB()
naive_classifier_scaled.fit(X_train_sc,y_train)
naive_classifier_scaled.score(X_test_sc, y_test)

0.956140350877193

In [64]:
# Without Scaling
naive_classifier = MultinomialNB()
naive_classifier.fit(X_train,y_train)
naive_classifier.score(X_test, y_test)

0.8947368421052632

In [69]:
# Gives Error because after standardisation it contains negative value
# naive_classifier_scaled = MultinomialNB()
# naive_classifier_scaled.fit(X_train_sc,y_train)
# naive_classifier_scaled.score(X_test_sc, y_test)

In [67]:
# Without Scaling
naive_classifier = BernoulliNB()
naive_classifier.fit(X_train,y_train)
naive_classifier.score(X_test, y_test)

0.5789473684210527

In [68]:
naive_classifier_scaled = BernoulliNB()
naive_classifier_scaled.fit(X_train_sc,y_train)
naive_classifier_scaled.score(X_test_sc, y_test)

0.9298245614035088

## Predict Cancer

In [21]:
index = 2
patient = X_test.iloc[index,:]
patient_sc = sc.transform(np.array([patient]))
patient_sc

array([[-0.33443758, -0.81744874, -0.30626226, -0.37684746, -1.28957549,
        -0.12772961,  0.31556048, -0.36685311, -1.37174904, -0.47573344,
        -0.37190425,  0.33430883, -0.22536382, -0.34696803, -0.22852062,
         0.82591904,  1.49122533,  0.93565729, -0.18678167, -0.01508166,
        -0.47334286, -0.97041341, -0.43273443, -0.4854511 , -1.46807446,
        -0.11142846,  0.33689495, -0.17759689, -1.46415587, -0.71427857]])

In [22]:
pred = classification_poly.predict(patient_sc)[0]
pred = dt_classifier.predict(patient_sc)[0]
pred

0.0

In [23]:
y_test.iloc[index]

1.0

In [24]:
if pred == 1:
    print("Sorry you may have Cancer")
else:
    print("Congrats you don't have cancer")

Congrats you don't have cancer


## Saving Model

In [35]:
import pickle

In [39]:
pickle.dump(rf_classifier_scaled, open("breast_cancer_model", "wb"))

In [40]:
model = pickle.load(open("breast_cancer_model","rb"))

In [41]:
model.predict(patient_sc)[0]

1.0

In [42]:
import joblib

In [43]:
joblib.dump(rf_classifier_scaled,"breast_cancer_model")

['breast_cancer_model']

In [44]:
model = joblib.load("breast_cancer_model")
model.predict(patient_sc)[0]

1.0