In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2



# 1) Preprocessing Data

#### Load Data

In [9]:
breast_cancer_data = pd.read_csv('data.txt', header=None, names=['id','target','mean radius', 'mean texture', 'mean perimeter', 'mean area',
 'mean smoothness', 'mean compactness', 'mean concavity',
 'mean concave points', 'mean symmetry', 'mean fractal dimension',
 'radius error', 'texture error', 'perimeter error', 'area error',
 'smoothness error', 'compactness error','concavity error',
 'concave points error', 'symmetry error' ,'fractal dimension error',
 'worst radius', 'worst texture', 'worst perimeter', 'worst area',
 'worst smoothness', 'worst compactness' ,'worst concavity',
 'worst concave points' ,'worst symmetry', 'worst fractal dimension'])

In [10]:
breast_cancer_data.head()

Unnamed: 0,id,target,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


#### Check for null values & drop ID column

In [12]:
print(breast_cancer_data.isnull().values.any())

breast_cancer_data.drop('id',axis=1,inplace=True)



False


In [13]:
#### Convert M and B labels into binary 1 and 0, respectively

In [15]:
breast_cancer_data['target'] = breast_cancer_data['target'].map({'M':1,'B':0})

In [2]:
breast_cancer_data = pd.read_csv('data.txt', header=None)
breast_cancer_data = breast_cancer_data.drop(breast_cancer_data.columns[0], axis=1)

breast_cancer_data[1] = breast_cancer_data[1].replace(["B", "M"], [-1,+1])

In [3]:
train, test = train_test_split(breast_cancer_data, test_size=0.2)

train.shape

(455, 31)

In [4]:
y_train = train[1]
X_train = train.drop(train.columns[1], axis=1)

y_test = test[1]
X_test = test.drop(test.columns[1], axis=1)

print(y_train.shape)
print(X_train.shape)

(455,)
(455, 30)


In [5]:
X_test

Unnamed: 0,1,3,4,5,6,7,8,9,10,11,...,22,23,24,25,26,27,28,29,30,31
292,-1,16.02,83.14,513.7,0.10050,0.07943,0.061550,0.033700,0.1730,0.06470,...,13.740,19.93,88.81,585.4,0.14830,0.20680,0.22410,0.10560,0.3380,0.09584
350,-1,17.07,73.70,421.0,0.07561,0.03630,0.008306,0.011620,0.1671,0.05731,...,13.280,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
501,1,24.49,92.33,595.9,0.11620,0.16810,0.135700,0.067590,0.2275,0.07237,...,16.010,32.94,106.00,788.0,0.17940,0.39660,0.33810,0.15210,0.3651,0.11830
91,1,22.76,100.20,728.2,0.09200,0.10360,0.112200,0.074830,0.1717,0.06097,...,16.430,25.84,107.50,830.9,0.12570,0.19970,0.28460,0.14760,0.2556,0.06828
403,-1,16.17,83.18,507.6,0.09879,0.08836,0.032960,0.023900,0.1735,0.06200,...,13.860,23.02,89.69,580.9,0.11720,0.19580,0.18100,0.08388,0.3297,0.07834
319,-1,17.00,78.60,477.3,0.07557,0.03454,0.013420,0.016990,0.1472,0.05561,...,12.900,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932
195,-1,16.33,82.53,516.4,0.07941,0.05366,0.038730,0.023770,0.1829,0.05667,...,13.880,22.00,90.81,600.6,0.10970,0.15060,0.17640,0.08235,0.3024,0.06949
85,1,18.52,121.10,1075.0,0.09874,0.10530,0.133500,0.087950,0.2132,0.06022,...,22.930,27.68,152.20,1603.0,0.13980,0.20890,0.31570,0.16420,0.3695,0.08579
7,1,20.83,90.20,577.9,0.11890,0.16450,0.093660,0.059850,0.2196,0.07451,...,17.060,28.14,110.60,897.0,0.16540,0.36820,0.26780,0.15560,0.3196,0.11510
200,-1,19.56,78.54,461.0,0.09586,0.08087,0.041870,0.041070,0.1979,0.06013,...,14.440,28.36,92.15,638.4,0.14290,0.20420,0.13770,0.10800,0.2668,0.08174


In [8]:
y_test

292   -1
350   -1
501    1
91     1
403   -1
319   -1
195   -1
85     1
7      1
200   -1
57     1
59    -1
251   -1
44     1
568   -1
330    1
331   -1
73     1
351    1
426   -1
415   -1
196    1
430    1
5      1
32     1
213    1
64     1
423   -1
250    1
264    1
      ..
259    1
191   -1
175   -1
185   -1
477   -1
375   -1
521    1
478   -1
111   -1
68    -1
365    1
324   -1
247   -1
325   -1
96    -1
531   -1
40     1
338   -1
222   -1
400    1
300    1
371   -1
147   -1
143   -1
348   -1
402   -1
543   -1
8      1
506   -1
31     1
Name: 1, dtype: int64

In [6]:
def svm():
    model = SVC()
    model.fit(X_train, y_train) 
    score = model.score(X_test, y_test)
    print(score)

In [7]:
svm()

0.59649122807
