# Support Vector Machine
## by Sklearn Module

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Reading Cancer Cell Data

In [2]:
data = pd.read_csv('../data/cell_samples.csv')
data.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


### Analysing Data

In [3]:
for attribute in data.columns[1:]:
    print(attribute, ": ", np.unique(data[attribute]))
print("\n",data.dtypes)

Clump :  [ 1  2  3  4  5  6  7  8  9 10]
UnifSize :  [ 1  2  3  4  5  6  7  8  9 10]
UnifShape :  [ 1  2  3  4  5  6  7  8  9 10]
MargAdh :  [ 1  2  3  4  5  6  7  8  9 10]
SingEpiSize :  [ 1  2  3  4  5  6  7  8  9 10]
BareNuc :  ['1' '10' '2' '3' '4' '5' '6' '7' '8' '9' '?']
BlandChrom :  [ 1  2  3  4  5  6  7  8  9 10]
NormNucl :  [ 1  2  3  4  5  6  7  8  9 10]
Mit :  [ 1  2  3  4  5  6  7  8 10]
Class :  [2 4]

 ID              int64
Clump           int64
UnifSize        int64
UnifShape       int64
MargAdh         int64
SingEpiSize     int64
BareNuc        object
BlandChrom      int64
NormNucl        int64
Mit             int64
Class           int64
dtype: object


We see that all the Attribute list are in the range of 1 - 10 except BareNuc which is a String, and the Dependent Variable is having values as: 2 (benign) or 4 (malignant).

### Data Pre-Processing
Removing '?' rows in BareNuc Coloum and converting the rest to int

In [4]:
data = data[pd.to_numeric(data['BareNuc'], errors='coerce').notnull()]  # If 'coerce', then invalid parsing will be set as NaN
data['BareNuc'] = data['BareNuc'].astype(int)
print('BareNuc', np.unique(data['BareNuc']))
print(data['BareNuc'].dtypes)

BareNuc [ 1  2  3  4  5  6  7  8  9 10]
int64


Classifying the Independent Variables x and Dependent variables y

In [5]:
x = data[data.columns[1:-1]]
x.head()

Unnamed: 0,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [6]:
y = data[data.columns[-1:]]
y.head()

Unnamed: 0,Class
0,2
1,2
2,2
3,2
4,2


### Spliting the data into Train/Test Split 80:20

In [7]:
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(np.asanyarray(x), np.asanyarray(y).ravel(), test_size=0.2)

print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

Train set: (546, 9) (546,)
Test set: (137, 9) (137,)


### Training SVM classifier

In [8]:
from sklearn import svm

clf = svm.SVC(kernel='linear')    # kernel = 'linear', 'poly', 'rbf', 'sigmoid'
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

### Prediction & Evaluation

In [9]:
y_hat = clf.predict(x_test)

from sklearn import metrics
print("Accuracy\t\t: ", metrics.accuracy_score(y_test, y_hat) * 100)
# print("Jccard Score\t\t: ", metrics.jaccard_score(y_test, y_hat))
print("Jaccard Similarity\t: ", metrics.jaccard_similarity_score(y_test, y_hat))
# print("log loss\t\t: ", metrics.log_loss(y_test, y_hat))
print("\nConfusion Matrix: \n", metrics.confusion_matrix(y_test, y_hat))
print("\nClassification Report: \n", metrics.classification_report(y_test, y_hat))

Accuracy		:  95.62043795620438
Jaccard Similarity	:  0.9562043795620438

Confusion Matrix: 
 [[83  4]
 [ 2 48]]

Classification Report: 
               precision    recall  f1-score   support

           2       0.98      0.95      0.97        87
           4       0.92      0.96      0.94        50

    accuracy                           0.96       137
   macro avg       0.95      0.96      0.95       137
weighted avg       0.96      0.96      0.96       137



