In [59]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from scipy import signal

In [2]:
# Per-channel feature names, used as DataFrame column names (before the
# DE_/FE_ prefix is applied). Time-domain statistics come first, followed by
# the three harmonic amplitudes and their energy ratios.
# Disabled for now: 'time_rms', 'time_wavefactor', 'time_peakfactor'.
df_out_columns = [
    'time_mean', 'time_std', 'time_max', 'time_min', 'time_ptp',
    'time_median', 'time_iqr', 'time_pr', 'time_skew', 'time_kurtosis',
    'time_var', 'time_amp', 'time_smr', 'time_pulse', 'time_margin',
] + [
    '1X', '2X', '3X',
    '1XRatio', '2XRatio', '3XRatio',
]

In [12]:
# Build the full training-table header: every feature name once with a
# 'DE_' prefix, once with an 'FE_' prefix, then the trailing 'label' column.
DE_columns = [f'DE_{feature_name}' for feature_name in df_out_columns]
FE_columns = [f'FE_{feature_name}' for feature_name in df_out_columns]
label_columns = ['label']
full_columns = [*DE_columns, *FE_columns, *label_columns]

In [4]:
# Feature-extraction function (borrowed as-is from an existing implementation).
def featureget(df_line):
    """Compute time-domain statistics and FFT harmonic features for one window.

    Parameters
    ----------
    df_line : 1-D numeric samples
        One window of a vibration signal. Callers in this notebook pass a
        pandas Series; the object must provide ``mean``/``std``/``max``/``min``
        methods.

    Returns
    -------
    list
        21 values in this order: mean, std, max, min, peak-to-peak, median,
        IQR (75th-25th percentile), 90th-10th percentile range, skewness,
        kurtosis, variance, mean absolute amplitude, square-mean-root
        amplitude, pulse factor, margin factor, |FFT| at bins 32/64/96, and
        the share of total spectral energy at each of those three bins.

    Notes
    -----
    * ``time_std`` comes from the input's own ``std()`` (sample standard
      deviation, ddof=1, for a pandas Series) whereas ``time_var`` uses
      ``np.var`` (ddof=0), so ``time_std ** 2 != time_var`` in general.
    * ``time_pulse``, ``time_margin`` and the energy ratios divide by values
      that are zero for an all-zero window; the result is then inf/nan.
    """
    # --- Time-domain features -------------------------------------------
    time_mean = df_line.mean()
    time_std = df_line.std()
    time_max = df_line.max()
    time_min = df_line.min()
    # time_rms is disabled; the original attempt failed with
    # "AttributeError: 'float' object has no attribute 'astype'":
    # time_rms = np.sqrt(np.square(df_line).mean().astype(np.float64))
    # np.ptp (function) instead of ndarray.ptp (method): the method was
    # removed in NumPy 2.0.
    time_ptp = np.ptp(np.asarray(df_line))
    time_median = np.median(df_line)
    time_iqr = np.percentile(df_line, 75) - np.percentile(df_line, 25)
    time_pr = np.percentile(df_line, 90) - np.percentile(df_line, 10)
    time_skew = stats.skew(df_line)
    time_kurtosis = stats.kurtosis(df_line)
    time_var = np.var(df_line)
    time_amp = np.abs(df_line).mean()
    # Square of the mean of sqrt(|x|).
    time_smr = np.square(np.sqrt(np.abs(df_line).astype(np.float64)).mean())
    # The ratio features below can blow up when the denominator is zero or
    # close to zero. The rms-based ones are disabled along with time_rms:
    # time_wavefactor = time_rms/time_amp
    # time_peakfactor = time_max/time_rms
    time_pulse = time_max / time_amp
    time_margin = time_max / time_smr

    # --- Frequency-domain features --------------------------------------
    # Magnitude spectrum of the window, zero-padded/truncated to 1024 points.
    plist_raw = np.fft.fft(np.asarray(df_line, dtype=np.float64), n=1024)
    plist = np.abs(plist_raw)
    plist_energy = np.square(plist).sum()
    # Per the original author's note, bins 32, 64 and 96 hold the 1x, 2x and
    # 3x harmonic amplitudes; extract them and their share of total energy.
    # NOTE(review): this mapping depends on sampling rate and shaft speed,
    # neither of which is visible here -- confirm.
    return_list = [
        time_mean, time_std, time_max, time_min, time_ptp,
        time_median, time_iqr, time_pr, time_skew, time_kurtosis,
        time_var, time_amp, time_smr,
        time_pulse, time_margin, plist[32], plist[64], plist[96],
        np.square(plist[32]) / plist_energy,
        np.square(plist[64]) / plist_energy,
        np.square(plist[96]) / plist_energy,
    ]  # disabled: time_rms, time_wavefactor, time_peakfactor
    return return_list

In [38]:
# Samples per window. Author's note: raising this from 100 to 150 improved
# the score from 81.7 to 84.
windowSize = 150

# Label convention for the prediction output:
# normal (NORMAL) -> 0, ball (B) -> 1, outer race (OR) -> 2, inner race (IR) -> 3


def _labelled_window_features(frame, label, window_size):
    """Cut ``frame`` into consecutive windows and extract features per window.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'DE_time' and 'FE_time' columns.
    label : str
        Class label appended as the last value of every row (kept as a
        string, as in the original code).
    window_size : int
        Number of consecutive rows per window.

    Returns
    -------
    pandas.DataFrame
        One row per complete window with columns ``full_columns``. Rows left
        over after the last complete window are dropped.
    """
    rows = []
    for k in range(len(frame) // window_size):
        # Positional, end-exclusive slicing: exactly `window_size` rows per
        # window, starting at row 0. The former label-based
        # `.loc[k*w+1:(k+1)*w]` skipped row 0 and could leave the final
        # window one sample short.
        window = frame.iloc[k * window_size:(k + 1) * window_size]
        de_features = featureget(window['DE_time'])
        fe_features = featureget(window['FE_time'])
        rows.append(de_features + fe_features + [label])
    return pd.DataFrame(rows, columns=full_columns)


####################################################################################################################
# Ball fault (label '1')
B_fault = pd.read_csv('./merge_4/B_fault.csv')
feature_B = _labelled_window_features(B_fault, '1', windowSize)
# Optional save location for the extracted features:
#feature_B.to_csv('../Data_set/feature_4/feature_B.csv',index=False)

####################################################################################################################
# Inner-race fault (label '3')
IR_fault = pd.read_csv('./merge_4/IR_fault.csv')
feature_IR = _labelled_window_features(IR_fault, '3', windowSize)
#feature_IR.to_csv('../Data_set/feature_4/feature_IR.csv',index=False)

###############################################################################################################
# Normal condition (label '0')
NORMAL = pd.read_csv('./merge_4/NORMAL.csv')
feature_NORMAL = _labelled_window_features(NORMAL, '0', windowSize)
#feature_NORMAL.to_csv('../Data_set/feature_4/feature_NORMAL.csv',index=False)

##################################################################################################################################
# Outer-race fault (label '2')
OR_fault = pd.read_csv('./merge_4/OR_fault.csv')
feature_OR = _labelled_window_features(OR_fault, '2', windowSize)
#feature_OR.to_csv('../Data_set/feature_4/feature_OR.csv',index=False)

测试集特征提取，结果保存在 ../Data_set/feature_test_all/TESTxx_all.csv

In [42]:
# Load the 14 test recordings TEST01.csv .. TEST14.csv, in numeric order,
# into test1 .. test14.
(test1, test2, test3, test4, test5, test6, test7,
 test8, test9, test10, test11, test12, test13, test14) = (
    pd.read_csv(f'../Data_set/test/TEST{file_number:02d}.csv')
    for file_number in range(1, 15)
)

In [45]:
# Extract windowed features from every test recording and write one CSV per
# recording (TEST01_all.csv .. TEST14_all.csv). Test data carries no label,
# hence full_columns[:-1].
index = 0
for index, test in enumerate(
        [test1, test2, test3, test4, test5, test6, test7,
         test8, test9, test10, test11, test12, test13, test14],
        start=1):
    window_rows = []
    # Rows left over after the last complete window are dropped.
    for i in range(len(test) // windowSize):
        # Positional, end-exclusive slicing: exactly `windowSize` rows per
        # window, starting at row 0. The former label-based
        # `.loc[i*w+1:(i+1)*w]` skipped row 0 and could leave the final
        # window one sample short.
        window = test.iloc[i * windowSize:(i + 1) * windowSize]
        fea_DE = featureget(window['DE_time'])
        fea_FE = featureget(window['FE_time'])
        window_rows.append(fea_DE + fea_FE)
    # Convert to a DataFrame
    feature_test = pd.DataFrame(window_rows, columns=full_columns[:-1])

    feature_test.to_csv('../Data_set/feature_test_all/TEST'+str(index).zfill(2)+'_all.csv',index=False)

### 特征筛选

In [60]:
# Features kept for modelling, listed once and applied to both the DE_ and
# FE_ channels, followed by the 'label' column.
# Deliberately left out for both channels: 'time_var' and 'time_margin'.
_selected_feature_names = [
    'time_mean',
    'time_std',
    'time_max',
    'time_min',
    'time_ptp',
    'time_median',
    'time_iqr',
    'time_pr',
    'time_skew',
    'time_kurtosis',
    'time_amp',
    'time_smr',
    'time_pulse',
    '1X',
    '2X',
    '3X',
    '1XRatio',
    '2XRatio',
    '3XRatio',
]
feature_selected_list = [
    channel_prefix + feature_name
    for channel_prefix in ('DE_', 'FE_')
    for feature_name in _selected_feature_names
] + ['label']

训练集特征筛选，筛选后的训练文件保存为 ../Data_set/feature_4/feature_selected.csv

In [61]:
# The combined training table is read back from disk. Alternative that
# builds it in memory instead:
#   feature_all = pd.concat([feature_B, feature_IR, feature_NORMAL, feature_OR])
feature_all = pd.read_csv('../Data_set/feature_4/feature_all.csv')

# Keep only the selected columns (including 'label') and write them out.
feature_selected = feature_all.loc[:, feature_selected_list]
feature_selected.to_csv('../Data_set/feature_4/feature_selected.csv', index=False)

测试集特征筛选，筛选后的测试文件保存为 ../Data_set/feature_test/TESTxx.csv

In [62]:
# Apply the feature selection to every extracted test file and write the
# result as TEST01.csv, TEST02.csv, ... in ../Data_set/feature_test.
path = '../Data_set/feature_test_all'
# os.listdir returns entries in arbitrary order, and the output number is
# taken from the iteration order. Sort so that TESTnn_all.csv always maps to
# TESTnn.csv, and skip anything that is not a CSV (e.g. checkpoint folders),
# which would otherwise break read_csv and shift the numbering.
files = sorted(name for name in os.listdir(path) if name.endswith('.csv'))

index = 0
for index, file2select in enumerate(files, start=1):
    dffile = pd.read_csv(os.path.join(path, file2select))
    # Feature selection; the test files have no 'label' column, hence [:-1].
    dffile_selected = dffile[feature_selected_list[:-1]]
    dffile_selected.to_csv('../Data_set/feature_test/TEST'+str(index).zfill(2)+'.csv',index=False)

记录
1. 使用全部特征：model1_0，score 81
3. 不使用 DE_time_var、DE_time_margin：model1_2，score 84.65259032102647