In [83]:
'''
这段代码演示svm关于乳腺癌检测应用的一个实例，仔细阅读，能够发现---模型选择与参数调优的重要性！！！！！

'''
import time  #时间戳
import numpy as np
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer       #导入乳腺癌数据集，sklearn的datasets里有很多数据集
from sklearn.model_selection import train_test_split  #导入数据集切分函数！！！！
from sklearn.model_selection import GridSearchCV
from sklearn import metrics   #与查准率，召回率，f1有关

In [84]:
#加载数据集
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
print('data shape:{0}; no. positive:{1}; no. negative:{2}'.format(X.shape, y[y==1].shape[0], y[y==0].shape[0]))  #打印正负类数
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  #数据集切分,分成训练集，测试集

data shape:(569, 30); no. positive:357; no. negative:212


In [85]:
#注意下面输出结果----过拟合（gamma的影响），！！！！！注意这里gamma可能与吴恩达讲解有出入,C的影响与吴恩达一致
#使用RBF高斯核函数
clf = SVC(C=1, kernel='rbf', gamma=0.1)
start = time.clock()
clf.fit(X_train, y_train)
print("耗时：",time.clock()-start)
print("Train_score:{0}\nTest_score:{1}".format(clf.score(X_train, y_train), clf.score(X_test, y_test))) #打印训练集测试集正确率

耗时： 0.016562696480100596
Train_score:1.0
Test_score:0.6754385964912281


In [86]:
'''
下面用GridSearchCV来自动选择γ的最优参数以及对应的交叉验证评分及召回率和F1得分！！！！！！！
'''

thresholds = np.linspace(0, 0.001, 100)
# 通过交叉验证设置参数
param_grid = {'gamma': thresholds}

clf = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
start = time.clock()
clf.fit(X_train, y_train)
print("耗时：",time.clock()-start)
print("best param: {0}\nbest score: {1}".format(clf.best_params_,  clf.best_score_))
y_pred = clf.predict(X_test)

print("查准率：",metrics.precision_score(y_pred, y_test))
print("召回率：",metrics.recall_score(y_pred, y_test))
print("F1：",metrics.f1_score(y_pred, y_test))

耗时： 6.52321767588745
best param: {'gamma': 8.080808080808081e-05}
best score: 0.945054945054945
查准率： 0.935064935064935
召回率： 0.9473684210526315
F1： 0.9411764705882352


In [89]:
#使用多项式核函数
clf = SVC(C=1., kernel='poly',gamma='auto', degree=2)
start = time.clock()
clf.fit(X_train, y_train)
print("耗时：",time.clock()-start)

y_pred = clf.predict(X_test)
print("Train_score:{0}\nTest_score:{1}".format(clf.score(X_train, y_train), clf.score(X_test, y_test)))
print("查准率：",metrics.precision_score(y_pred, y_test))
print("召回率：",metrics.recall_score(y_pred, y_test))
print("F1：",metrics.f1_score(y_pred, y_test))

耗时： 30.933629662018404
Train_score:0.9714285714285714
Test_score:0.9736842105263158
查准率： 0.987012987012987
召回率： 0.9743589743589743
F1： 0.9806451612903225


In [None]:
#注意对比以上两种模型的优缺点！！！！！！