In [2]:
# system lib
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn import model_selection

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier  #随机森林
from sklearn import tree

#用于参数搜索
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

from sklearn.metrics import roc_curve, auc #绘制ROC曲线
import pylab as pl

from time import time
import datetime
import numpy as np

In [3]:
import pickle
from sklearn.model_selection import cross_validate
import pandas as pd

In [4]:
def load_data(filename):
    """根据数据格式，读取数据中的X和分类标签y
    """

    return x_data, ylabel

def evaluate_classifier( real_label_list,predict_label_list):
    """
       return Precision, Recall and ConfusionMatrix
       Input : predict_label_list,real_label_list
    """
    msg=''
    Confusion_matrix = confusion_matrix( real_label_list,predict_label_list)
    msg += '\n Confusion Matrix\n ' + str(Confusion_matrix)
    precision = precision_score(real_label_list,predict_label_list, average=None)
    recall = recall_score(real_label_list,predict_label_list, average=None)
    msg += '\n Precision of tag 0 and 1 =%s' %str(precision)
    msg += '\n Recall of tag 0 and 1 =%s' %str(recall)

    return msg

def test_svm(train_file, test_file):
    """用SVM分类 """
    # use SVM directly

    train_xdata, train_ylabel = load_data(train_file)

    test_xdata, test_ylabel = load_data(test_file)

    print('\nuse SVM directly')

    #classifier1 = SVC(kernel='linear')
    #classifier1 = SVC(kernel='linear',probability=True, C=200, cache_size=500)
    classifier1 = SVC(kernel='linear',probability=True,C=10, cache_size=500)

    classifier1.fit(train_xdata, train_ylabel)

    predict_labels = classifier1.predict(test_xdata)
    accuracy = accuracy_score(test_ylabel, predict_labels)
    print("\n The Classifier's Accuracy is : %f" %accuracy)
    #
    eval_msg = evaluate_classifier(test_ylabel,predict_labels)
    print(eval_msg)
    #
    #GridSearchCV搜索最优参数示例
    print("GridSearchCV搜索最优参数......")
    t0 = time()
    param_grid = {
        "C": [1e3, 5e3, 1e4, 5e4, 1e5],
        "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier1 = GridSearchCV(SVC(kernel="rbf", class_weight="balanced",probability=True), param_grid)
    classifier1 = classifier1.fit(train_xdata, train_ylabel)
    print("done in %0.3fs" % (time() - t0))
    print("Best estimator found by grid search:")
    print(classifier1.best_estimator_)


    #对于SVM来说，概率是通过交叉验证得到的，与其预测的结果未必一致，对小数据集来说，此概率没什么意义
    probas_ = classifier1.predict_proba(test_xdata)

    #对于二分类问题，可为分类器绘制ROC曲线，计算AUC
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(test_ylabel, probas_[:, 1])
    roc_auc = auc(fpr, tpr)
    print("Area under the ROC curve : %f" % roc_auc)

    # Plot ROC curve
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.0])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('%s SVM ROC' %train_file)
    pl.legend(loc="lower right")
    pl.show()


In [5]:
data = pd.read_csv('preprocess_train.csv')

In [6]:
# 使用平均数填充缺失值
data = data.fillna(data.mean())

In [7]:
print(data.describe())

         sample_id     feature0       feature1     feature2     feature3  \
count  6296.000000  6296.000000    6296.000000  6296.000000  6296.000000   
mean   3147.500000    63.748003  285239.586221     1.132154     1.177755   
std    1817.642979    38.489715   72644.008774     0.320382     1.490461   
min       0.000000   -34.739442 -575880.089809    -2.157527    -3.055975   
25%    1573.750000    48.145602  288358.400000     1.102452     0.350584   
50%    3147.500000    63.609268  288358.400000     1.105857     0.875418   
75%    4721.250000    71.931683  288358.400000     1.110334     1.363743   
max    6295.000000   463.739205  860586.441356     3.373289    14.456153   

          feature4     feature5     feature6      feature7      feature8  ...  \
count  6296.000000  6296.000000  6296.000000  6.296000e+03  6.296000e+03  ...   
mean    251.501641    11.553379     4.526707  8.634842e+10  8.238839e+04  ...   
std     141.786644    12.995523    17.906249  6.430692e+11  7.998810e+05

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [9]:
# 分割特征和标签
X = data.iloc[:, :-1]  # 特征
y = data.iloc[:, -1]   # 标签

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 可根据需求设置测试集比例和随机种子


In [10]:
# 求出各个列的方差
variances = X_train.var(axis=0)
print(variances)

sample_id     3.291326e+06
feature0      1.524526e+03
feature1      4.999518e+09
feature2      1.057878e-01
feature3      2.171464e+00
                  ...     
feature102    1.086364e+04
feature103    3.220143e+00
feature104    3.493834e+00
feature105    8.062620e+02
feature106    1.524351e+04
Length: 108, dtype: float64


In [11]:
# 展示方差大于0.1的特征
print(variances[variances > 0.1])
#  输出个数
print(len(variances[variances > 0.1]))

sample_id     3.291326e+06
feature0      1.524526e+03
feature1      4.999518e+09
feature2      1.057878e-01
feature3      2.171464e+00
                  ...     
feature102    1.086364e+04
feature103    3.220143e+00
feature104    3.493834e+00
feature105    8.062620e+02
feature106    1.524351e+04
Length: 103, dtype: float64
103


In [12]:
# 选择方差大于0.1的特征
X_train = X_train.loc[:, variances > 0.1]

In [13]:
# 对于test集选择相同的特征
X_test = X_test.loc[:, variances > 0.1]

In [14]:
# 特征归一化
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) # 注意这里是fit_transform
X_test = scaler.transform(X_test) # 注意这里是transform

print('X_train.shape:', X_train.shape)
print('X_test.shape:', X_test.shape)


X_train.shape: (5036, 103)
X_test.shape: (1260, 103)


In [15]:
# 方差选择法数据预处理
from sklearn.feature_selection import VarianceThreshold
# 创建VarianceThreshold对象
selector = VarianceThreshold(threshold=0.01)

# 在训练集上拟合并应用特征选择
X_train = selector.fit_transform(X_train)

# 在测试集上应用相同的特征选择
X_test = selector.transform(X_test)

print('X_train.shape:', X_train.shape)
print('X_test.shape:', X_test.shape)

X_train.shape: (5036, 103)
X_test.shape: (1260, 103)


In [16]:
classifier1 = SVC(kernel='linear',probability=True,C=10, cache_size=10000)
classifier1.fit(X_train, y_train)

SVC(C=10, cache_size=10000, kernel='linear', probability=True)

In [17]:
from sklearn.metrics import f1_score

predict_labels = classifier1.predict(X_test)
accuracy = accuracy_score(y_test, predict_labels)
print("\n The Classifier's Accuracy is : %f" %accuracy)
# 计算f1score
f1score = f1_score(y_test, predict_labels, average='macro')
print("\n The Classifier's f1score is : %f" %f1score)


 The Classifier's Accuracy is : 0.800000

 The Classifier's f1score is : 0.761745


In [18]:
classifier1 = SVC(kernel='linear',probability=True,C=10, cache_size=5000)
classifier1.fit(X_train, y_train)

SVC(C=10, cache_size=5000, kernel='linear', probability=True)

In [19]:
eval_msg = evaluate_classifier(y_test,predict_labels)
print(eval_msg)


 Confusion Matrix
 [[595  12  19   8   4   0]
 [ 17  77  36   1   2   0]
 [ 45  33 127   0   2   0]
 [ 35   0   0  81   0   0]
 [ 11   5   8   2  53   0]
 [ 11   0   0   1   0  75]]
 Precision of tag 0 and 1 =[0.83333333 0.60629921 0.66842105 0.87096774 0.86885246 1.        ]
 Recall of tag 0 and 1 =[0.93260188 0.57894737 0.61352657 0.69827586 0.67088608 0.86206897]


In [20]:
print("GridSearchCV搜索最优参数......")
t0 = time()
param_grid = {
    "C": [1e3, 5e3, 1e4, 5e4, 1e5],
    "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
classifier1 = GridSearchCV(SVC(kernel="rbf",probability=True), param_grid) #balance不需要 
classifier1 = classifier1.fit(X_train, y_train)

GridSearchCV搜索最优参数......


In [21]:
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(classifier1.best_estimator_)

done in 6246.634s
Best estimator found by grid search:
SVC(C=1000.0, gamma=0.01, probability=True)


In [22]:
probas_ = classifier1.predict_proba(X_test)
print(probas_)

[[8.16101179e-01 5.15765803e-03 2.01856723e-02 6.17129875e-03
  2.13426512e-03 1.50249927e-01]
 [1.46075673e-03 9.90209436e-01 8.13670713e-03 1.77052973e-04
  5.63004452e-06 1.04171678e-05]
 [4.21160362e-01 9.83590689e-02 4.49655009e-01 2.09297166e-02
  5.03880709e-03 4.85703640e-03]
 ...
 [1.55972085e-02 8.39649573e-02 8.98602806e-01 3.59081993e-04
  1.05043782e-04 1.37090226e-03]
 [8.14255823e-01 1.95148847e-02 1.58429002e-01 1.05554180e-04
  1.09025247e-04 7.58571109e-03]
 [3.90514747e-01 2.01705647e-02 2.55324086e-01 2.96305335e-01
  1.66721826e-02 2.10130850e-02]]
