### Classifying	human cells	to whether the samples are benign or malignant.	

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('cell_data.csv')
data.head(10)

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


The	dataset consists of	several	hundred	human cell sample records, each	of which contains the values of	a set of cell characteristics. The ID field	contains the patient identifiers. The characteristics of the cell samples from each	patient	are	contained in fields	Clump to Mit. The values are graded	from 1 to 10, with 1 being the closest to benign. The Class	field contains the diagnosis, as confirmed by separate medical procedures, as to whether the samples are benign	(value=2) or malignant (value=4).

In [3]:
# Checking number of rows and columns,
# names of all the columns,
# as well as checking for missing values
# and number of unique valus per column.
print('Rows: ', data.shape[0])
print('Columns: ', data.shape[1])
print('\nFeatures: ', '\n',data.columns.tolist())
print('\nMissing values:', '\n',data.isnull().sum().values.sum())
print('\nUnique values:', '\n',data.nunique())

Rows:  699
Columns:  11

Features:  
 ['ID', 'Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize', 'BareNuc', 'BlandChrom', 'NormNucl', 'Mit', 'Class']

Missing values: 
 0

Unique values: 
 ID             645
Clump           10
UnifSize        10
UnifShape       10
MargAdh         10
SingEpiSize     10
BareNuc         11
BlandChrom      10
NormNucl        10
Mit              9
Class            2
dtype: int64


In [4]:
#Checking data format of columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
ID             699 non-null int64
Clump          699 non-null int64
UnifSize       699 non-null int64
UnifShape      699 non-null int64
MargAdh        699 non-null int64
SingEpiSize    699 non-null int64
BareNuc        699 non-null object
BlandChrom     699 non-null int64
NormNucl       699 non-null int64
Mit            699 non-null int64
Class          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [5]:
data.nunique()

ID             645
Clump           10
UnifSize        10
UnifShape       10
MargAdh         10
SingEpiSize     10
BareNuc         11
BlandChrom      10
NormNucl        10
Mit              9
Class            2
dtype: int64

In [6]:
print(data['BareNuc'].tolist())

['1', '10', '2', '4', '1', '10', '10', '1', '1', '1', '1', '1', '3', '3', '9', '1', '1', '1', '10', '1', '10', '7', '1', '?', '1', '7', '1', '1', '1', '1', '1', '1', '5', '1', '1', '1', '1', '1', '10', '7', '?', '3', '10', '1', '1', '1', '9', '1', '1', '8', '3', '4', '5', '8', '8', '5', '6', '1', '10', '2', '3', '2', '8', '2', '1', '2', '1', '10', '9', '1', '1', '2', '1', '10', '4', '2', '1', '1', '3', '1', '1', '1', '1', '2', '9', '4', '8', '10', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '6', '10', '5', '5', '1', '3', '1', '3', '10', '10', '1', '9', '2', '9', '10', '8', '3', '5', '2', '10', '3', '2', '1', '2', '10', '10', '7', '1', '10', '1', '10', '1', '1', '1', '10', '1', '1', '2', '1', '1', '1', '?', '1', '1', '5', '5', '1', '?', '8', '2', '1', '10', '1', '10', '5', '3', '1', '10', '1', '1', '?', '10', '10', '1', '1', '3', '?', '2', '10', '1', '1', '1', '1', '1', '1', '10', '10', '10', '1', '1', '1', '10', '1', '1', '1', '10', '10', '1', '8', '10', '8', '1', '8', '10', '1',

Column BareNuc has several categorical variables '?'. In order to drop them I transform the column values to numeric first, then the categorical variables will become 'NaN' (missing values). And then I drop them with dropna() function.

In [7]:
data['BareNuc'] = pd.to_numeric(data['BareNuc'], errors='coerce')

In [8]:
data.isnull().sum()

ID              0
Clump           0
UnifSize        0
UnifShape       0
MargAdh         0
SingEpiSize     0
BareNuc        16
BlandChrom      0
NormNucl        0
Mit             0
Class           0
dtype: int64

In [9]:
data = data.dropna(subset=['BareNuc'])
data.isnull().sum()

ID             0
Clump          0
UnifSize       0
UnifShape      0
MargAdh        0
SingEpiSize    0
BareNuc        0
BlandChrom     0
NormNucl       0
Mit            0
Class          0
dtype: int64

In [10]:
# Replace values of the dependent variable Class into:
# benign - 0 and malignant - 1
data.Class.replace({2:0, 4:1}, inplace=True)

In [11]:
# Defining features and predictor variable
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [12]:
# Splitting dataset into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
from sklearn.svm import LinearSVC
classifier = LinearSVC(C=1, loss='hinge')
classifier.fit(X_train, y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [15]:
# Measuring accuracy of the model 
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
jacc_index = jaccard_score(y_test, y_pred)

print('Accuracy score = {:.3f}'.format(acc_score))
print('Jaccard Index = {:.3f}'.format(jacc_index))

cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix: ', '\n',cm)

Accuracy score = 0.956
Jaccard Index = 0.885
Confusion matrix:  
 [[85  4]
 [ 2 46]]


In [16]:
from sklearn.svm import SVC
clsfr_rbf = SVC(kernel='rbf', gamma=2, C=1)
clsfr_rbf.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [17]:
# Measuring accuracy of the model 
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import confusion_matrix

y_pred = clsfr_rbf.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
jacc_index = jaccard_score(y_test, y_pred)

print('Accuracy score = {:.3f}'.format(acc_score))
print('Jaccard Index = {:.3f}'.format(jacc_index))

cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix: ', '\n',cm)

Accuracy score = 0.964
Jaccard Index = 0.906
Confusion matrix:  
 [[84  5]
 [ 0 48]]


Model with __rbf__ kernel type performed better than linear SVM classifier.