In [1]:
import sys
import os
sys.path.insert(0, os.path.dirname(os.getcwd()))
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import svm, linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc
from evaluate_model import evaluate_on_sample, calculate_acc_spe_sen

In [2]:
# Directory that holds features.csv; later cells also write performance.csv
# and ROC.png into it.
# The location can be overridden with the FEATURE_PATH environment variable so
# the notebook is not tied to one machine; the original author's directory is
# kept as the fallback so existing setups keep working unchanged.
feature_path = os.environ.get(
    'FEATURE_PATH',
    'C:\\Users\\hy\\Desktop\\bsp_project\\ML\\data\\features',
)
# The first row of the CSV is the header.
feature = pd.read_csv(os.path.join(feature_path, 'features.csv'), header=0)
feature.head(10)

FileNotFoundError: [Errno 2] File b'C:\\Users\\hy\\Desktop\\bsp_project\\ML\\data\\features\\features.csv' does not exist: b'C:\\Users\\hy\\Desktop\\bsp_project\\ML\\data\\features\\features.csv'

读取提前提取好的特征准备分类

In [None]:
# Raw values of the feature table loaded above.
feature_array = feature.values
# Column 1 carries the class label: 'A' becomes class 1, everything else class 0.
# The comparison is done on the pandas column so it is always element-wise.
labels = (feature.iloc[:, 1] == 'A').values.astype(int)
# Columns 2 onward are the features; astype returns a fresh float array, so the
# in-place operations below never touch `feature` itself.
data = feature_array[:, 2:].astype('float')
# z-score standardisation, column by column.
data -= data.mean(axis=0)
feature_std = data.std(axis=0)
# A constant column has zero standard deviation; dividing by it would fill the
# column with NaN and make every estimator below fail. Such a column is already
# all zeros after centring, so dividing by 1 leaves it as zeros.
feature_std[feature_std == 0] = 1.0
data /= feature_std
# NOTE(review): the mean/std above are computed on the full data set, i.e. the
# held-out rows influence the scaling. Compute them on X_train only if strict
# train/test isolation is required.
# 80/20 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=30)

首先将通过pandas读入的DataFrame数据结构的数据提取出来并进行强制类型转换，标签中N置为0类，A置为1类  
对特征数据进行z-score标准化从而避免不同特征取值范围不同引起的偏差，提升分类器效果  
按照4:1的比例划分数据集

In [None]:
# Base estimator; probability=True is needed for predict_proba further down.
# gamma='auto' is only the starting value: the RBF grid overrides it.
svc = svm.SVC(probability=True, gamma='auto')
# Two sub-grids: an RBF kernel searched over C x gamma, and a linear kernel
# searched over C only.
parameters = [
    {
        'C': [0.5, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
        'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        'kernel': ['rbf']
    },
    {
        'C': [0.5, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
        'kernel': ['linear']
    }
]
# 5-fold cross-validated grid search on 8 worker processes.
# The former `iid=True` argument is intentionally not passed: it was deprecated
# in scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
svm_clf = GridSearchCV(svc, parameters, cv=5, n_jobs=8)
svm_clf.fit(X_train, y_train)
# Best parameter combination, refitted on the whole training split.
svm_best = svm_clf.best_estimator_
predictions = svm_best.predict(X_test)
y_scores = svm_best.predict_proba(X_test)
# Helpers imported from evaluate_model: confusion counts, then
# accuracy / specificity / sensitivity derived from them.
TP, TN, FP, FN = evaluate_on_sample(predictions, y_test)
ACC, SPE, SEN = calculate_acc_spe_sen(TP, TN, FP, FN)
# ROC curve built from the second predict_proba column (the class-1 score).
svm_fpr, svm_tpr, _ = roc_curve(y_test, y_scores[:, 1])
AUC = auc(svm_fpr, svm_tpr)
# One-row summary table; metrics are stored as strings with three decimals.
svm_performance = pd.DataFrame.from_dict([{'Classifier': 'SVM', 
                                           'ACC': '{:.3f}'.format(ACC), 
                                           'SPE': '{:.3f}'.format(SPE), 
                                           'SEN': '{:.3f}'.format(SEN), 
                                           'AUC': '{:.3f}'.format(AUC)}])

训练SVM并执行测试，训练过程中采用网格搜索寻找最佳参数组合（5折交叉验证）

In [None]:
# Logistic-regression baseline with the L-BFGS solver, fitted on the training split.
lr = linear_model.LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Class probabilities and hard predictions for the held-out split.
y_scores = lr.predict_proba(X_test)
predictions = lr.predict(X_test)

# Helpers imported from evaluate_model: confusion counts first, then
# accuracy / specificity / sensitivity derived from them.
TP, TN, FP, FN = evaluate_on_sample(predictions, y_test)
ACC, SPE, SEN = calculate_acc_spe_sen(TP, TN, FP, FN)

# ROC curve from the second predict_proba column (the class-1 score) and its area.
lr_fpr, lr_tpr, _ = roc_curve(y_test, y_scores[:, 1])
AUC = auc(lr_fpr, lr_tpr)

# One-row summary table; every metric is stored as a string with three decimals.
lr_row = {'Classifier': 'Logistic Regression'}
lr_row.update(zip(('ACC', 'SPE', 'SEN', 'AUC'),
                  (format(metric, '.3f') for metric in (ACC, SPE, SEN, AUC))))
lr_performance = pd.DataFrame.from_dict([lr_row])

训练LR并执行测试

In [None]:
# Random forest with a fixed seed so repeated runs build the same trees.
rf = RandomForestClassifier(random_state=40)
# Only the number of trees is tuned: 20, 40, ..., 400.
rf_parameters = {'n_estimators': np.arange(20, 420, 20)}
# 5-fold cross-validated grid search.
# The former `iid=True` argument is intentionally not passed: it was deprecated
# in scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
rf_clf = GridSearchCV(rf, rf_parameters, cv=5)
rf_clf.fit(X_train, y_train)
# Best forest size, refitted on the whole training split.
rf_best = rf_clf.best_estimator_
predictions = rf_best.predict(X_test)
y_scores = rf_best.predict_proba(X_test)
# Helpers imported from evaluate_model: confusion counts, then
# accuracy / specificity / sensitivity derived from them.
TP, TN, FP, FN = evaluate_on_sample(predictions, y_test)
ACC, SPE, SEN = calculate_acc_spe_sen(TP, TN, FP, FN)
# ROC curve built from the second predict_proba column (the class-1 score).
rf_fpr, rf_tpr, _ = roc_curve(y_test, y_scores[:, 1])
AUC = auc(rf_fpr, rf_tpr)
# One-row summary table; metrics are stored as strings with three decimals.
rf_performance = pd.DataFrame.from_dict([{'Classifier': 'Random Forest', 
                                           'ACC': '{:.3f}'.format(ACC), 
                                           'SPE': '{:.3f}'.format(SPE), 
                                           'SEN': '{:.3f}'.format(SEN), 
                                           'AUC': '{:.3f}'.format(AUC)}])

训练RF并执行测试，训练过程中采用网格搜索寻找最佳参数组合（5折交叉验证）

In [None]:
# Stack the three one-row summaries into a single comparison table.
# ignore_index=True renumbers the rows 0..2; without it every row keeps the
# index label 0 of its one-row source frame, which makes the displayed table
# ambiguous and label-based row selection return all rows.
performance = pd.concat([svm_performance, lr_performance, rf_performance], axis=0, ignore_index=True)
# Persist next to the feature file; the index is not written, so the CSV
# content is unaffected by the renumbering above.
performance.to_csv(os.path.join(feature_path, 'performance.csv'), index=False)

performance

打印各个分类器的分类效果，可见SVM与LR表现类似，RF表现略差，可能是因为还没有寻找到最佳参数组合

In [None]:
# ROC curves of the three classifiers on a single 9x6 inch figure.
fig, ax = plt.subplots(figsize=(9, 6))

# Curves are drawn in legend order: SVM (blue), logistic regression (red),
# random forest (green).
for fpr, tpr, colour in ((svm_fpr, svm_tpr, 'b'),
                         (lr_fpr, lr_tpr, 'r'),
                         (rf_fpr, rf_tpr, 'g')):
    ax.plot(fpr, tpr, colour, linewidth=3)

# Dashed black y = x reference line, drawn last so it receives no legend entry.
ax.plot(svm_fpr, svm_fpr, linestyle='--', color='k')

# Ticks every 0.1 on both axes; the limits are padded slightly so the curves
# are not hidden behind the left and top borders.
ax.set_xlabel('False Positive Rate')
ax.set_xticks(np.arange(0, 1.1, 0.1))
ax.set_xlim(-0.002, 1)
ax.set_ylabel('True Positive Rate')
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.set_ylim(0, 1.004)

ax.set_title('ROC Curve')
ax.legend(['SVM', 'Logistic Regression', 'Random Forest'])
ax.grid()

# Save before showing so the written file contains the rendered figure.
fig.savefig(os.path.join(feature_path, 'ROC.png'))
plt.show()

绘制ROC曲线，可以看出三个分类器的ROC曲线都在黑线上方（分类效果好于随机）