In [2]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from numpy import loadtxt, sum, size, where, abs, average, std, ones,sign, sqrt, max, min, diff, sort, floor, array, concatenate
import numpy as np
from scipy import stats
from os import listdir
from sklearn.metrics import classification_report
import joblib
# import graphviz

## 定义对mag的特征提取函数

In [3]:
def calculate_F(n, m, mag):
    flux = 10 ** (-0.4 * mag)
    N = size(flux)
    sorted_flux = sort(flux)
    n = int(floor(N * n / 100))
    m = int(floor(N * m / 100))
    f_n = sorted_flux[n]
    f_m = sorted_flux[m]
    return f_m - f_n

In [4]:
def moment_based_features(mag):
    n = size(mag)
    ave_mag = average(mag)
    weights = 1
    wtd_ave_mag = average(mag)

    delta = std(mag)
    beyond1std = size(where(abs(mag - wtd_ave_mag) > delta)) / n
    kurtosis = stats.kurtosis(mag)
    skew = stats.skew(mag)

    sk_1 = n * (n + 1) / ((n - 1) * (n - 2) * (n - 3))
    sk_2 = 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))
    sk = sk_1 * sum(((mag - ave_mag) / std(mag, ddof=1)) ** 4) - sk_2

    delta_i = sqrt(n / (n - 1)) * ((mag - ave_mag) / delta)
    P_k = delta_i ** 2 - 1
    stetson_j = sum(weights * sign(P_k) * sqrt(abs(P_k))) / sum(weights)
    stetson_k = 1 / n * sum(abs(delta_i)) / sqrt(1 / n * sum(delta_i ** 2))

    return [beyond1std, kurtosis, skew, sk, stetson_j, stetson_k]

In [5]:
def magnitude_based_features(mag):
    slope = diff(mag)
    amp = max(mag) - min(mag)
    max_slope = max(abs(slope))
    mad = stats.median_abs_deviation(mag)

    return [amp, max_slope, mad]


In [6]:
def percentile_based_features(mag):
    F_5_95 = calculate_F(5, 95, mag)
    fpr20 = calculate_F(40, 60, mag) / F_5_95
    fpr35 = calculate_F(32.5, 67.5, mag) / F_5_95
    fpr50 = calculate_F(25, 75, mag) / F_5_95
    fpr65 = calculate_F(17.5, 82.5, mag) / F_5_95
    fpr80 = calculate_F(10, 90, mag) / F_5_95

    return [fpr20, fpr35, fpr50, fpr65, fpr80]

In [7]:
def get_feature(mag):
    mag_scaled = preprocessing.scale(mag)
    features1 = moment_based_features(mag_scaled)
    features2 = magnitude_based_features(mag_scaled)
    features3 = percentile_based_features(mag_scaled)
    return array(features1 + features2 + features3)

In [8]:
# 得到提取的特征，依次为beyond1std, kurtosis, skew, sk, stetson_j, stetson_k，amp, max_slope, mad，fpr20, fpr35, fpr50, fpr65, fpr80
def get_data(path):
    magarray = loadtxt(path,dtype=str,delimiter = ',',encoding='utf-8-sig')
    feature = ones([len(magarray), 14])
    for i,maglist in enumerate(magarray):
        mag = maglist[0:30]
        feature[i, :] = get_feature(mag)
    return feature

## 加载数据，计算特征，并保存在文件中

In [9]:
## 计算train和test的特征矩阵，
feature_train_0 = get_data('../../dataset/train_0.csv')
# np.savetxt('./feature_train_0.txt', feature_train_0 ,fmt='%f',delimiter=',')
feature_train_1 = get_data('../../dataset/train_1.csv')
# np.savetxt('./feature_train_1.txt', feature_train_1 ,fmt='%f',delimiter=',')
feature_test_1 = get_data('../../dataset/test_1.csv')
# np.savetxt('./feature_test_1.txt', feature_test_1 ,fmt='%f',delimiter=',')
feature_test_0 = get_data('../../dataset/test_0.csv')
# np.savetxt('./feature_test_0.txt', feature_test_0 ,fmt='%f',delimiter=',')
feature_train = concatenate((feature_train_0,feature_train_1))
feature_test = concatenate((feature_test_0,feature_test_1))

## 对数据进行整理，准备输入模型，包括shuffle

In [11]:
## 生成train和test的label矩阵
train_label_0 = ones(size(feature_train_0, 0))*0
train_label_1 = ones(size(feature_train_1, 0))
train_label = concatenate((train_label_0, train_label_1), axis=0)

test_label_0 = ones(size(feature_test_0, 0))*0
test_label_1 = ones(size(feature_test_1, 0))
test_label = concatenate((test_label_0, test_label_1), axis=0)

In [12]:
#将train_label的shape改成（,38022，1）才能和feature拼接，如果不改，是（38022，）就不能拼接
train_label=train_label[:,np.newaxis]
print(feature_train.shape)
print(train_label.shape)
train = concatenate((feature_train,train_label),axis=1)
print(train.shape)
## 打乱顺序
np.random.shuffle(train)

(38022, 14)
(38022, 1)
(38022, 15)


In [13]:
test_label=test_label[:,np.newaxis]
print(feature_test.shape)
print(test_label.shape)
test= concatenate((feature_test,test_label),axis=1)
print(test.shape)
## 打乱顺序
np.random.shuffle(test)

(9505, 14)
(9505, 1)
(9505, 15)


## 决策树模型

https://blog.csdn.net/qq_38384924/article/details/98382513

https://blog.csdn.net/TeFuirnever/article/details/99656571

https://blog.csdn.net/yxc9681/article/details/88285061

In [18]:
##  训练决策树
clf = DecisionTreeClassifier(max_features=14, max_depth=15 )
clf = clf.fit(train[:,0:14], train[:,14])
train_score_c = clf.score(train[:,0:14], train[:,14])  # 返回预测的准确度
test_score_c = clf.score(test[:,0:14], test[:,14])
print(train_score_c,test_score_c)
print(classification_report(test[:,14], clf.predict(test[:,0:14]),digits=4))
# joblib.dump(clf, './DecisionTreeClassifier.pkl')

0.9882436484140761 0.983903208837454
              precision    recall  f1-score   support

         0.0     0.9962    0.9815    0.9888      6872
         1.0     0.9535    0.9901    0.9715      2633

    accuracy                         0.9839      9505
   macro avg     0.9749    0.9858    0.9801      9505
weighted avg     0.9844    0.9839    0.9840      9505



In [26]:
# 得到提取的特征，依次为beyond1std, kurtosis, skew, sk, stetson_j, stetson_k，amp, max_slope, mad，fpr20, fpr35, fpr50, fpr65, fpr80
train[:,[1,2,5,6,7,8,9,10,11,12,13]].shape

(38022, 11)

In [None]:
##  使用较少特征训练决策树
clf = DecisionTreeClassifier(max_features=14, max_depth=15 )
clf = clf.fit(train[:,[1,2,5,6,7,8,9,10,11,12,13]], train[:,15])
train_score_c = clf.score(train[:,[1,2,5,6,7,8,9,10,11,12,13]], train[:,15])  # 返回预测的准确度
test_score_c = clf.score(test[:,[1,2,5,6,7,8,9,10,11,12,13]], test[:,15])
print(train_score_c,test_score_c)
print(classification_report(test[:,15], clf.predict(test[:,[1,2,5,6,7,8,9,10,11,12,13]]),digits=4))
# joblib.dump(clf, './DecisionTreeClassifier.pkl')

In [26]:
# 决策树的训练时间
from timeit import default_timer as timer
train_time_dt = []
for i in range(10):
    start_time = timer()
    print(start_time)
    clf.fit(train[:,0:14], train[:,14])
    current_time = timer()
    print(current_time)
    train_time_dt.append(current_time-start_time)
    print(current_time-start_time)
    i = i+1
print(train_time_dt)
print(np.mean(train_time_dt))

48859178.31864036
48859178.74525647
0.4266161099076271
48859178.74530517
48859179.16138619
0.4160810187458992
48859179.16143391
48859179.57808605
0.4166521355509758
48859179.578132436
48859179.994732246
0.4165998101234436
48859179.99477812
48859180.411218256
0.41644013673067093
48859180.41126411
48859180.82846632
0.4172022119164467
48859180.82851262
48859181.24497774
0.416465125977993
48859181.24502636
48859181.66162826
0.41660190373659134
48859181.66167414
48859182.07750164
0.4158274978399277
48859182.077546306
48859182.49354739
0.4160010814666748
[0.4266161099076271, 0.4160810187458992, 0.4166521355509758, 0.4165998101234436, 0.41644013673067093, 0.4172022119164467, 0.416465125977993, 0.41660190373659134, 0.4158274978399277, 0.4160010814666748]
0.417448703199625


In [10]:
# 决策树的测试时间
test_time_dt = []
for i in range(10):
    start_time = timer()
    print(start_time)
    clf.predict(test[:,0:14])
    current_time = timer()
    print(current_time)
    test_time_dt.append(current_time-start_time)
    print(current_time-start_time)
    i = i+1
print(test_time_dt)
print(np.mean(test_time_dt))

NameError: name 'timer' is not defined

In [15]:
### 进行决策树的模型剖析
feature_name = ['beyond1std', 'kurtosis', 'skew', 'sk', 'stetson_j', 'stetson_k','amp', 'max_slope', 'mad','fpr20', 'fpr35', 'fpr50', 'fpr65', 'fpr80']
clf_importances = clf.feature_importances_
clf_indices = np.argsort(clf_importances)[::-1]
print("决策树的特征重要性排序")
for f in range(train.shape[1]-1):
    print("%2d) %-*s %f" % (f + 1, 30, feature_name[clf_indices[f]], clf_importances[clf_indices[f]]))

决策树的特征重要性排序
 1) skew                           0.394200
 2) max_slope                      0.124816
 3) amp                            0.101379
 4) fpr65                          0.067451
 5) fpr35                          0.061255
 6) fpr20                          0.058771
 7) mad                            0.041256
 8) stetson_k                      0.032251
 9) fpr80                          0.027791
10) fpr50                          0.025341
11) sk                             0.021403
12) beyond1std                     0.018633
13) kurtosis                       0.014852
14) stetson_j                      0.010599


In [16]:
dot_data = tree.export_graphviz(clf,feature_names= feature_name,class_names=["耀发","非耀发"],filled=True,rounded=True)
graph = graphviz.Source(dot_data)#画树
# graph.render('./desiciontree5',view=True)

## 随机森林模型

https://blog.csdn.net/xiaonannanxn/article/details/51374483

https://www.jianshu.com/p/a493ebb90cce

https://blog.csdn.net/w952470866/article/details/78987265

https://zhuanlan.zhihu.com/p/212361817#:~:text=sklearn%EF%BC%88%E4%B8%83%EF%BC%89-%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97.%20%E5%9C%A8%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E4%B8%AD%EF%BC%8C%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97%E6%98%AF%E4%B8%80%E4%B8%AA%E5%8C%85%E5%90%AB%E5%A4%9A%E4%B8%AA%E5%86%B3%E7%AD%96%E6%A0%91%EF%BC%88%E5%86%B3%E7%AD%96%E6%A0%91%E7%9A%84%E6%A0%B8%E5%BF%83%E9%97%AE%E9%A2%98%E6%9C%89%E4%B8%A4%E4%B8%AA%EF%BC%9A%E6%89%BE%E5%87%BA%E6%AD%A3%E7%A1%AE%E7%9A%84%E7%89%B9%E5%BE%81%E5%88%86%E7%B1%BB%EF%BC%8C%E5%8D%B3%E5%A6%82%E4%BD%95%E5%88%86%E6%9E%9D%EF%BC%9B%E4%BA%8C%E6%98%AF%E6%A0%91%E7%94%9F%E9%95%BF%E5%88%B0%E4%BB%80%E4%B9%88%E6%97%B6%E5%80%99%E5%BA%94%E8%AF%A5%E5%81%9C%E4%B8%8B%EF%BC%8C%E6%88%AA%E6%9E%9D%EF%BC%89%E7%9A%84%E5%88%86%E7%B1%BB%E5%99%A8%EF%BC%8C%E5%B9%B6%E4%B8%94%E5%85%B6%E8%BE%93%E5%87%BA%E7%9A%84%E7%B1%BB%E5%88%AB%E6%98%AF%E6%9C%89%E4%B8%AA%E5%88%AB%E6%A0%91%E8%BE%93%E5%87%BA%E7%9A%84%E7%B1%BB%E5%88%AB%E7%9A%84%E4%BC%97%E6%95%B0%E8%80%8C%E5%AE%9A%E3%80%82.,%E9%80%9A%E8%BF%87%E6%9E%84%E5%BB%BA%E5%A4%9A%E4%B8%AA%E5%88%86%E7%B1%BB%E5%99%A8%EF%BC%8C%E8%BE%BE%E5%88%B0%E6%9C%80%E7%BB%88%E7%9A%84%E4%B8%80%E4%B8%AA%E5%88%86%E7%B1%BB%E6%95%88%E6%9E%9C%E8%83%BD%E5%A4%9F%E8%B6%85%E8%BF%87%E5%86%B3%E7%AD%96%E6%A0%91%EF%BC%88%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97%E6%98%AF%E7%94%B1%E5%BE%88%E5%A4%9A%E5%86%B3%E7%AD%96%E6%A0%91%E6%9E%84%E6%88%90%E7%9A%84%EF%BC%8C%E4%B8%8D%E5%90%8C%E5%86%B3%E7%AD%96%E6%A0%91%E4%B9%8B%E9%97%B4%E6%B2%A1%E6%9C%89%E5%85%B3%E8%81%94%EF%BC%89%EF%BC%8C%E6%9C%89%E6%96%B0%E7%9A%84%E6%95%B0%E6%8D%AE%E8%BF%9B%E5%85%A5%EF%BC%8C%E5%B0%B1%E8%AE%A9%E6%A3%AE%E6%9E%97%E4%B8%AD%E7%9A%84%E6%AF%8F%E4%B8%80%E5%88%BB%E5%86%B3%E7%AD%96%E6%A0%91%E5%88%86%E5%88%AB%E8%BF%9B%E8%A1%8C%E5%88%A4%E6%96%AD%E5%92%8C%E5%88%86%E7%B1%BB%EF%BC%8C%E6%AF%8F%E4%B8%AA%E5%86%B3%E7%AD%96%E6%A0%91%E4%BC%9A%E5%BE%97%E5%88%B0%E4%B8%80%E4%B8%AA%E5%88%86%E7%B1%BB%E7%BB%93%E6%9E%9C%EF%BC%8C%E5%86%B3%E7%AD%96%E6%A0%91%E7%9A%84%E5%88%86%E7%B1%BB%E7%BB%93%E6%9E%9C%E4%B8%AD%E7%9A%84%E5%93%AA%E4%B8%80%E4%B8%AA%E5%88%86%E7%B1%BB%E6%9C%80%E5%A4%9A%EF%BC%8C%E9%82%A3%E4%B9%88%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97%E5%B0%B1%E4%BC%9A%E6%8A%8A%E8%BF%99%E4%B8%AA%E7%BB%93%E6%9E%9C%E5%BD%93%E4%BD%9C%E6%9C%80%E7%BB%88%E7%9A%84%E7%BB%93%E6%9E%9C%E3%80%82.%20%E6%9E%84%E5%BB%BA%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97%E7%9A%84%E6%AD%A5%E9%AA%A4%E5%A4%A7%E6%A6%82%E5%8C%85%E6%8B%AC%EF%BC%9A1.%E6%95%B0%E6%8D%AE%E7%9A%84%E9%9A%8F%E6%9C%BA%E9%80%89%E5%8F%96%EF%BC%8C%E7%89%B9%E5%BE%81%E7%9A%84%E9%9A%8F%E6%9C%BA%E9%80%89%E5%8F%96%E3%80%82.


In [17]:
## 训练随机森林
rfc = RandomForestClassifier(max_features=14, max_depth=15,n_estimators=70)
rfc = rfc.fit(train[:,0:14], train[:,14])
train_score_r = rfc.score(train[:,0:14], train[:,14])  # 返回预测的准确度
test_score_r = rfc.score(test[:,0:14], test[:,14])
print(train_score_r,test_score_r)
print(classification_report(test[:,14], rfc.predict(test[:,0:14]),digits=4))
# joblib.dump(rfc, './RandomForestClassifier.pkl')
# crfc2 = joblib.load('./RandomForestClassifier.pkl')

0.9938719688601336 0.9917937927406628
              precision    recall  f1-score   support

         0.0     0.9997    0.9889    0.9943      6872
         1.0     0.9719    0.9992    0.9854      2633

    accuracy                         0.9918      9505
   macro avg     0.9858    0.9941    0.9898      9505
weighted avg     0.9920    0.9918    0.9918      9505



In [28]:
# 随机森林的训练时间
from timeit import default_timer as timer
train_time_rf = []
for i in range(10):
    start_time = timer()
    print(start_time)
    rfc.fit(train[:,0:14], train[:,14])
    current_time = timer()
    print(current_time)
    train_time_rf.append(current_time-start_time)
    print(current_time-start_time)
    i = i+1
print(train_time_rf)
print(np.mean(train_time_rf))

48859235.02238029
48859253.05061975
18.028239458799362
48859253.050768524
48859270.97318602
17.922417499125004
48859270.97324957
48859288.91578768
17.942538112401962
48859288.9158538
48859307.00465871
18.08880490809679
48859307.00482542
48859324.992248766
17.987423345446587
48859324.99231303
48859342.96112135
17.968808323144913
48859342.961186685
48859360.88742349
17.92623680830002
48859360.88748919
48859378.853128836
17.965639643371105
48859378.853275776
48859396.83971216
17.986436381936073
48859396.839771174
48859414.92508482
18.08531364798546
[18.028239458799362, 17.922417499125004, 17.942538112401962, 18.08880490809679, 17.987423345446587, 17.968808323144913, 17.92623680830002, 17.965639643371105, 17.986436381936073, 18.08531364798546]
17.990185812860727


In [29]:
# 随机森林的测试时间
from timeit import default_timer as timer
test_time_rf = []
for i in range(10):
    start_time = timer()
    print(start_time)
    rfc.predict(test[:,0:14])
    current_time = timer()
    print(current_time)
    test_time_rf.append(current_time-start_time)
    print(current_time-start_time)
    i = i+1
print(test_time_rf)
print(np.mean(test_time_rf))

48859512.28598021
48859512.341637254
0.0556570440530777
48859512.34168719
48859512.39169664
0.05000945180654526
48859512.39173919
48859512.43981989
0.048080697655677795
48859512.43985836
48859512.48727471
0.04741634428501129
48859512.48731089
48859512.53490145
0.04759056121110916
48859512.534942634
48859512.58241224
0.0474696084856987
48859512.58244896
48859512.62980332
0.04735436290502548
48859512.62983981
48859512.677211314
0.047371506690979004
48859512.67724808
48859512.72474357
0.047495484352111816
48859512.72478063
48859512.77343614
0.04865550994873047
[0.0556570440530777, 0.05000945180654526, 0.048080697655677795, 0.04741634428501129, 0.04759056121110916, 0.0474696084856987, 0.04735436290502548, 0.047371506690979004, 0.047495484352111816, 0.04865550994873047]
0.04871005713939667


In [18]:
### 进行随机森林模型剖析
feature_name = ['beyond1std', 'kurtosis', 'skew', 'sk', 'stetson_j', 'stetson_k','amp', 'max_slope', 'mad','fpr20', 'fpr35', 'fpr50', 'fpr65', 'fpr80']
rfc_importances = rfc.feature_importances_
rfc_indices = np.argsort(rfc_importances)[::-1]
print("随机森林的特征重要性排序")
for f in range(train.shape[1]-1):
    print("%2d) %-*s %f" % (f + 1, 30, feature_name[rfc_indices[f]], rfc_importances[rfc_indices[f]]))

随机森林的特征重要性排序
 1) skew                           0.392478
 2) max_slope                      0.129698
 3) amp                            0.109865
 4) fpr35                          0.060976
 5) fpr65                          0.058131
 6) fpr20                          0.054748
 7) mad                            0.041733
 8) fpr80                          0.034711
 9) fpr50                          0.029315
10) stetson_k                      0.027215
11) stetson_j                      0.024212
12) sk                             0.014194
13) kurtosis                       0.012859
14) beyond1std                     0.009865


## 使用提取出的特征的支持向量机模型，但是效果不好，在另一份代码里直接使用序列数据作为SVM的输入

https://www.jianshu.com/p/a9f9954355b3

https://blog.csdn.net/moriarty_jack/article/details/108432867

In [15]:
from sklearn import svm
X_train = train[:,0:14]
y_train = train[:,14]
X_test = test[:,0:14]
y_test = test[:,14]


# kernel = 'rbf'
clf_rbf = svm.SVC(kernel='rbf')
clf_rbf.fit(X_train,y_train)
score_rbf = clf_rbf.score(X_test,y_test)
print("The score of rbf is : %f"%score_rbf)

# kernel = 'linear'
clf_linear = svm.SVC(kernel='linear')
clf_linear.fit(X_train,y_train)
score_linear = clf_linear.score(X_test,y_test)
print("The score of linear is : %f"%score_linear)

# kernel = 'poly'
clf_poly = svm.SVC(kernel='poly')
clf_poly.fit(X_train,y_train)
score_poly = clf_poly.score(X_test,y_test)
print("The score of poly is : %f"%score_poly)

# kernel = 'sigmoid'
clf_sigmoid = svm.SVC(kernel='sigmoid')
clf_sigmoid.fit(X_train,y_train)
score_sigmoid = clf_sigmoid.score(X_test,y_test)
print("The score of sigmoid is : %f"%score_sigmoid)

The score of rbf is : 0.906575
The score of linear is : 0.889216
The score of poly is : 0.891741
The score of sigmoid is : 0.615992


In [16]:
print(classification_report(y_test, clf_rbf.predict(X_test),digits=4))

              precision    recall  f1-score   support

         0.0     0.9072    0.9700    0.9376      6872
         1.0     0.9045    0.7410    0.8146      2633

    accuracy                         0.9066      9505
   macro avg     0.9058    0.8555    0.8761      9505
weighted avg     0.9064    0.9066    0.9035      9505

