# 乳癌資料庫預測SVM分類
>使用scikit-learn 機器學習套件裡的SVC(支持向量機分類)演算法

* (一)引入函式庫及內建乳癌資料集<br>
引入之函式庫如下<br>
sklearn.datasets: 用來匯入內建之乳癌資料集`datasets.load_breast_cancer()`<br>
sklearn.svm.SVC: 支持向量機分類之演算法<br>
matplotlib.pyplot: 用來繪製影像(僅用於下方已停用的繪圖程式碼)

In [1]:
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Step1. 下載資料

In [2]:
# Load scikit-learn's built-in breast-cancer dataset.
breast_cancer = datasets.load_breast_cancer()

# Feature matrix (X) and class labels (y) used by the cells below.
X, y = breast_cancer.data, breast_cancer.target

# Print the dataset's keys, feature names, and class names, in that order.
for summary in (
    breast_cancer.keys(),
    breast_cancer.feature_names,
    breast_cancer.target_names,
):
    print(summary)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']


In [3]:
# Display the feature matrix loaded above.
X

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [4]:
# Display the class-label vector loaded above.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [5]:
# Look for features that are highly correlated.
# Disabled: `pd` (pandas) is not imported in the cells shown here.
#df = pd.DataFrame(X)
#print(df.corr())

In [6]:
# Disabled exploratory plot, kept as a bare string literal so it does not run
# (the notebook simply echoes the string as the cell output).
# If enabled, it would scatter column 0 of X ('mean radius') against column 1
# ('mean texture'), with malignant samples in red and benign samples in blue.
'''
target_names=breast_cancer.target_names
labels=target_names[y]

import numpy as np

malignant_radius=X[labels=='malignant',0]
malignant_texture=X[labels=='malignant',1]
malignant=np.c_[malignant_radius,malignant_texture]

benign_radius=X[labels=='benign',0]
benign_texture=X[labels=='benign',1]
benign=np.c_[benign_radius,benign_texture]

import matplotlib.pyplot as plt

plt.scatter(malignant[:,0],malignant[:,1],color='red')
plt.scatter(benign[:,0],benign[:,1],color='blue')
'''

"\ntarget_names=breast_cancer.target_names\nlabels=target_names[y]\n\nimport numpy as np\n\nmalignant_radius=X[labels=='malignant',0]\nmalignant_texture=X[labels=='malignant',1]\nmalignant=np.c_[malignant_radius,malignant_texture]\n\nbenign_radius=X[labels=='benign',0]\nbenign_texture=X[labels=='benign',1]\nbenign=np.c_[benign_radius,benign_texture]\n\nimport matplotlib.pyplot as plt\n\nplt.scatter(malignant[:,0],malignant[:,1],color='red')\nplt.scatter(benign[:,0],benign[:,1],color='blue')\n"

In [7]:
# Disabled two-feature experiment: stack the malignant/benign (radius, texture)
# pairs into one training set, label them 0/1, and fit a LinearSVC on them.
#training_data=np.r_[malignant, benign]
#training_labels=np.r_[np.zeros(len(malignant)),np.ones(len(benign))]

#clf=svm.LinearSVC()
#clf.fit(training_data,training_labels)

# Disabled decision-region plot, kept as a bare string literal so it does not
# run. It evaluates `clf` on a mesh grid that extends the training range by 1
# on each side (step 0.02) and draws the predicted regions under the scatter.
# It depends on names that only exist in disabled code (`training_data`,
# `malignant`, `benign`, `np`, `plt`); `title` is not defined in the cells
# shown here.
'''
training_x_min=training_data[:,0].min()-1
training_x_max=training_data[:,0].max()+1
training_y_min=training_data[:,1].min()-1
training_y_max=training_data[:,1].max()+1
grid_interval=0.02
xx,yy=np.meshgrid(np.arange(training_x_min,training_x_max,grid_interval),np.arange(training_y_min,training_y_max,grid_interval))
    
Z=clf.predict(np.c_[xx.ravel(),yy.ravel()])
Z=Z.reshape(xx.shape)
plt.contourf(xx,yy,Z,cmap=plt.cm.bone,alpha=0.2)
plt.autoscale()
plt.grid()
plt.scatter(malignant[:,0],malignant[:,1],color='red')
plt.scatter(benign[:,0],benign[:,1],color='blue')
plt.title(title)
plt.show()
'''

"\ntraining_x_min=training_data[:,0].min()-1\ntraining_x_max=training_data[:,0].max()+1\ntraining_y_min=training_data[:,1].min()-1\ntraining_y_max=training_data[:,1].max()+1\ngrid_interval=0.02\nxx,yy=np.meshgrid(np.arange(training_x_min,training_x_max,grid_interval),np.arange(training_y_min,training_y_max,grid_interval))\n    \nZ=clf.predict(np.c_[xx.ravel(),yy.ravel()])\nZ=Z.reshape(xx.shape)\nplt.contourf(xx,yy,Z,cmap=plt.cm.bone,alpha=0.2)\nplt.autoscale()\nplt.grid()\nplt.scatter(malignant[:,0],malignant[:,1],color='red')\nplt.scatter(benign[:,0],benign[:,1],color='blue')\nplt.title(title)\nplt.show()\n"

## Step2. 區分訓練集與測試集

In [8]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Step3. 建模

In [9]:
# Disabled kernel comparison, kept as a bare string literal so it does not run.
# Error recorded by the author when trying it:
#Error: X has 2 features, but SVC is expecting 30 features as input
# NOTE(review): presumably raised when a 2-feature plotting grid was passed to
# a model fitted on all 30 features -- confirm. `SVM_plot` is not defined in
# the cells shown here. Each `clf=` line overwrites the previous one, so only
# the last kernel listed would actually be fitted.
'''
clf=svm.SVC(kernel='linear',gamma=1,C=10)
clf=svm.SVC(kernel='poly',gamma=1,C=10)
clf=svm.SVC(kernel='rbf',gamma=1,C=10)
clf=svm.SVC(kernel='sigmoid',gamma=1,C=10)

clf.fit(X_train,y_train)
SVM_plot(clf, 'rbf')
'''

"\nclf=svm.SVC(kernel='linear',gamma=1,C=10)\nclf=svm.SVC(kernel='poly',gamma=1,C=10)\nclf=svm.SVC(kernel='rbf',gamma=1,C=10)\nclf=svm.SVC(kernel='sigmoid',gamma=1,C=10)\n\nclf.fit(X_train,y_train)\nSVM_plot(clf, 'rbf')\n"

In [22]:
# Kernel / hyper-parameter experiments recorded by the author.
# Each entry lists: configuration -> train accuracy, test accuracy.
#
#   svm.SVC(kernel='poly', gamma=1, C=100, max_iter=100000)
#       -> takes a very, very long time to train
#   svm.SVC(kernel='rbf', gamma=1, C=10)
#       -> 1.0, 0.631578947368421
#   svm.SVC(kernel='rbf', gamma=1, C=100)
#       -> 1.0, 0.631578947368421
#   svm.SVC(kernel='rbf', gamma=3, C=10)
#       -> 1.0, 0.631578947368421
#   svm.SVC(kernel='linear', gamma=1, C=10)
#       -> 0.9698492462311558, 0.9473684210526315
#
# NOTE(review): the rbf runs fit the training set perfectly but reach only
# ~0.63 on the test set; presumably this is overfitting on unscaled features
# -- confirm by standardizing the inputs before retrying rbf.
#
# Selected model: linear kernel with C=100. `gamma` is only a kernel
# coefficient for 'rbf', 'poly' and 'sigmoid', so it is not passed here;
# with kernel='linear' it has no effect on the fitted model.
clf = svm.SVC(kernel='linear', C=100)
clf.fit(X_train, y_train)

## Step4. 預測

```

```


In [23]:
# Predict the class label (0 = malignant, 1 = benign) for every test sample.
clf.predict(X_test)

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1])

## Step5. 準確度分析

In [24]:
# Report mean accuracy on the training split first, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, labels))

0.964824120603015
0.9590643274853801
