#  1.Reading Data

In [45]:
import pandas as pd

In [46]:
data = pd.read_csv('breast-cancer-wisconsin.data')

In [47]:
data.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [48]:
columns_names= ['sample_code_number',
    'clump_thickness',
    'uniformity_of_cell_size',
    'uniformity_of_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class']

In [49]:
data = pd.read_csv('breast-cancer-wisconsin.data', names=columns_names)

In [50]:
data.head()

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# 2.Data Preprocessing

In [51]:
data.dtypes

sample_code_number              int64
clump_thickness                 int64
uniformity_of_cell_size         int64
uniformity_of_cell_shape        int64
marginal_adhesion               int64
single_epithelial_cell_size     int64
bare_nuclei                    object
bland_chromatin                 int64
normal_nucleoli                 int64
mitoses                         int64
class                           int64
dtype: object

In [52]:
def check_numeric(x):
    return x.isnumeric()
def check_not_numeric(x):
    return not x.isnumeric()

In [53]:
mask = data['bare_nuclei'].apply(check_not_numeric)
mask

0      False
1      False
2      False
3      False
4      False
       ...  
694    False
695    False
696    False
697    False
698    False
Name: bare_nuclei, Length: 699, dtype: bool

In [54]:
data[mask]

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [55]:
data_numeric = data[~mask]

In [56]:
data_numeric.head()

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [57]:
data_numeric.dtypes

sample_code_number              int64
clump_thickness                 int64
uniformity_of_cell_size         int64
uniformity_of_cell_shape        int64
marginal_adhesion               int64
single_epithelial_cell_size     int64
bare_nuclei                    object
bland_chromatin                 int64
normal_nucleoli                 int64
mitoses                         int64
class                           int64
dtype: object

In [58]:
data_numeric['bare_nuclei'] = data_numeric['bare_nuclei'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_numeric['bare_nuclei'] = data_numeric['bare_nuclei'].astype('int64')


In [59]:
data_numeric.dtypes

sample_code_number             int64
clump_thickness                int64
uniformity_of_cell_size        int64
uniformity_of_cell_shape       int64
marginal_adhesion              int64
single_epithelial_cell_size    int64
bare_nuclei                    int64
bland_chromatin                int64
normal_nucleoli                int64
mitoses                        int64
class                          int64
dtype: object

In [60]:
#preparing input and output data
data_input = data_numeric.drop(columns=['sample_code_number', 'class'])
data_output = data_numeric['class']

In [61]:
data_input.head()

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [62]:
data_output.head()

0    2
1    2
2    2
3    2
4    2
Name: class, dtype: int64

In [63]:
# print unique values (we can see the values are 2 and 4)
data_output.unique()

array([2, 4], dtype=int64)

In [64]:
data_output = data_output.replace({2:0,4:1})

In [65]:
# print unique values (the values now become 0 and 1)
data_output.unique()

array([0, 1], dtype=int64)

# 3.Splitting the data into (train - validation - test)


In [66]:
from sklearn.model_selection import train_test_split

In [67]:
# splitting 33% for testing
X, X_test, y, y_test = train_test_split(data_input,data_output,test_size=0.33,random_state=0)
# splitting the remaining part (X, y) to training and validation
X_train,X_val,y_train,y_val = train_test_split(X, y,test_size=0.33,random_state=0)

In [68]:
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('------------')
print('X_val:', X_val.shape)
print('y_val:', y_val.shape)
print('------------')
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (306, 9)
y_train: (306,)
------------
X_val: (151, 9)
y_val: (151,)
------------
X_test: (226, 9)
y_test: (226,)


# 4. Training and validation (model selection)


In [69]:
from sklearn.naive_bayes import CategoricalNB

In [70]:
model = CategoricalNB()
model.fit(X_train, y_train)

CategoricalNB()

In [71]:
from sklearn.metrics import accuracy_score

In [72]:
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)

In [73]:
print('Training accuracy =', accuracy_score(y_train, y_pred_train))
print('Validation accuracy =', accuracy_score(y_val, y_pred_val))

Training accuracy = 0.9934640522875817
Validation accuracy = 0.9536423841059603


In [74]:
y_pred_test = model.predict(X_test)

In [75]:
print('Test accuracy:', accuracy_score(y_test, y_pred_test))

Test accuracy: 0.9646017699115044


# 6. Saving our model

We use `pickle` to save our model to a file. For example, the following code will save `modelt` to the file `CategroicalNB-model.pickle`.

In [76]:
import pickle
with open('CategroicalNB-model.pickle','wb') as f:
    pickle.dump(model, f)

To load a saved model we use pickle.load() function.

In [77]:
with open('CategroicalNB-model.pickle','rb') as f:
    loaded_model=pickle.load(f)

In [78]:
loaded_model 

CategoricalNB()