In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [73]:
data = pd.read_csv('breast-cancer-wisconsin.data', names =['Sample code number',
'Clump Thickness',
'Uniformity of Cell Size',
'Uniformity of Cell Shape',
'Marginal Adhesion',
'Single Epithelial Cell Size',
'Bare Nuclei',
'Bland Chromatin',
'Normal Nucleoli', 
'Mitoses',
'Class'])

In [74]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Sample code number             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [76]:
data['Bare Nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [77]:
data['Bare Nuclei'] = data['Bare Nuclei'].replace({'?' : np.nan})

In [78]:
data['Bare Nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', nan, '5', '8', '6'],
      dtype=object)

In [79]:
data['Bare Nuclei'].isnull().sum()

16

In [80]:
data['Bare Nuclei'] = data['Bare Nuclei'].astype('float64')

Replacing the missing values with mean as the missing values account to about 2.28 percent of the entire dataset.

In [81]:
data['Bare Nuclei'].fillna(data['Bare Nuclei'].mean(), inplace = True)

In [82]:
data['Bare Nuclei'].unique()

array([ 1.        , 10.        ,  2.        ,  4.        ,  3.        ,
        9.        ,  7.        ,  3.54465593,  5.        ,  8.        ,
        6.        ])

In [83]:
data['Bare Nuclei'] = data['Bare Nuclei'].astype('int64')

In [84]:
data['Bare Nuclei'].unique()

array([ 1, 10,  2,  4,  3,  9,  7,  5,  8,  6], dtype=int64)

In [85]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Sample code number             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null int64
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(11)
memory usage: 60.1 KB


Splitting the dataset into attributes and labels

In [86]:
le = LabelEncoder()

In [87]:
data['Class'] = le.fit_transform(data['Class'])

In [88]:
X = data.drop(['Class'], axis= 1)

In [89]:
y= data['Class']

In [90]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

In [92]:
scaler = MinMaxScaler()

In [93]:
X = scaler.fit_transform(X)

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =  0 , test_size = 0.3)

Now building Linear SVM model using the training dataset

In [116]:
linearsvm = LinearSVC(max_iter = 40000)
model = linearsvm.fit(X_train, y_train)

In [117]:
y_predict = model.predict(X_test)

In [118]:
metrics.accuracy_score(y_test, y_predict)

0.9523809523809523

In [119]:
print(metrics.classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       135
           1       0.92      0.95      0.93        75

    accuracy                           0.95       210
   macro avg       0.95      0.95      0.95       210
weighted avg       0.95      0.95      0.95       210



Now building a model for Polynomial SVM 

In [132]:
polysvm = SVC(gamma = 'scale', kernel = 'poly', degree = 6)
modelpoly = polysvm.fit(X_train, y_train)

In [133]:
y_pred = modelpoly.predict(X_test)

In [134]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       135
           1       0.95      0.92      0.93        75

    accuracy                           0.95       210
   macro avg       0.95      0.95      0.95       210
weighted avg       0.95      0.95      0.95       210

