# Predicting the Class of Cancer

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Importing Dataset

In [2]:
df = pd.read_csv('Cancer data set.txt')

## Exploratory Data Analysis

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                        699 non-null int64
clump_thickness           699 non-null int64
unif_cell_size            699 non-null int64
unif_cell_shape           699 non-null int64
marg_adhesion             699 non-null int64
single_epith_cell_size    699 non-null int64
bare_nuclei               699 non-null object
bland_chrom               699 non-null int64
norm_nucleoli             699 non-null int64
mitoses                   699 non-null int64
class                     699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [4]:
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
df.describe()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bland_chrom,norm_nucleoli,mitoses,class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


The 'bare_nuclei' attribute is an object and contains some missing values in the form of '?'.

So Replacing the '?' with NaN and then replacing NaN using forward fill method.

After that convert the column into int64 data type.

In [6]:
df['bare_nuclei'].replace('?',np.nan,inplace=True)
df['bare_nuclei'].fillna(method='ffill',inplace=True)
df['bare_nuclei'] = df['bare_nuclei'].astype('int64')

## Training the data using Naive Bayes Classifier

#### Using the Multinomial Naive Bayes classifier since the target data is discrete.

In [7]:
# Imports
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [8]:
# Extracting the Independent and Dependent variables from the dataset
X = df.loc[:,:'mitoses']
y = df.loc[:,'class':]

In [9]:
# Splitting into training and testing data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3) 

In [10]:
# Training the data
NBClassifier = MultinomialNB(alpha=1.0)
NBClassifier.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Predicting the values for test dataset
y_pred = NBClassifier.predict(X_test)

In [12]:
# Testing the Accuracy of the model
print('Confusion Matrix:-\n')
print(metrics.confusion_matrix(y_test,y_pred))
print(metrics.classification_report(y_test,y_pred),'\n')
mat = metrics.confusion_matrix(y_test,y_pred)

print('Accuracy Score {}'.format(metrics.accuracy_score(y_test,y_pred)),'\n')  # Best Result

Confusion Matrix:-

[[131  10]
 [  2  67]]
             precision    recall  f1-score   support

          2       0.98      0.93      0.96       141
          4       0.87      0.97      0.92        69

avg / total       0.95      0.94      0.94       210
 

Accuracy Score 0.9428571428571428 

