### 1.训练数据集处理
- 训练集train_15和train_21
- 根据故障时间为训练集设定标签
- 结冰为1，无结冰为0

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import norm 
from scipy import stats
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.combine import SMOTETomek
from sklearn.utils import shuffle

# Project root on the author's machine (hard-coded absolute Windows path).
path=r"F:\Diverse\statistics\Python_data_analysis\首届（2017）中国工业大数据创新竞赛\叶片结冰预测"
# Make it the working directory so the relative paths used below resolve.
os.chdir(path)
# IPython magic (not plain Python): draw matplotlib figures inline.
%matplotlib inline

In [2]:
# Load the raw sensor logs of turbines 15 and 21.
train_15 = pd.read_csv(r"train\15\15_data.csv")
train_21 = pd.read_csv(r"train\21\21_data.csv")
print(
    "train_15.shape: ", train_15.shape,
    "train_21.shape: ", train_21.shape,
)

In [171]:
# Load the failure (icing) and normal-operation time intervals of both
# turbines.  All four paths use forward slashes, which Windows accepts as
# well as POSIX systems, so the cell no longer mixes separators.
failureInfo_15 = pd.read_csv("train/15/15_failureInfo.csv")
normalInfo_15 = pd.read_csv("train/15/15_normalInfo.csv")
failureInfo_21 = pd.read_csv("train/21/21_failureInfo.csv")
normalInfo_21 = pd.read_csv("train/21/21_normalInfo.csv")

In [172]:
# Turbine state labelling function
def filter_state(train=None, failureInfo=None, normalInfo=None):
    """Label every sample of one turbine as failure (1), normal (0) or NaN.

    Parameters
    ----------
    train : pandas.DataFrame, optional
        Sensor log with a ``time`` column that ``pd.to_datetime`` can parse.
        When omitted, the notebook global ``train_15`` is used.
    failureInfo : pandas.DataFrame, optional
        Failure intervals: column 0 is the interval start, column 1 the
        interval end.  When omitted, ``failureInfo_15`` is used.
    normalInfo : pandas.DataFrame, optional
        Normal-operation intervals with the same two-column layout.  When
        omitted, ``normalInfo_15`` is used.

    Returns
    -------
    pandas.DataFrame
        A single ``state`` column indexed by the parsed sample timestamps.
        Samples inside a failure interval get 1, samples inside a normal
        interval get 0 and all other samples stay NaN.  Interval bounds are
        inclusive.  Normal intervals are applied last, so they win where the
        two kinds of interval overlap.

    Notes
    -----
    The interval frames are only read; their columns are parsed into
    temporary objects instead of being overwritten in the caller's frames.
    """
    # Defaults are looked up when the function is called, not when it is
    # defined, so the function never holds on to a stale DataFrame.
    if train is None:
        train = train_15
    if failureInfo is None:
        failureInfo = failureInfo_15
    if normalInfo is None:
        normalInfo = normalInfo_15

    labels = pd.DataFrame(
        np.full((train.shape[0], 1), np.nan),
        index=pd.to_datetime(train["time"]),
        columns=["state"],
    )
    timestamps = labels.index

    # Failures first, normal intervals second (the latter override overlaps).
    for intervals, value in ((failureInfo, 1), (normalInfo, 0)):
        # Columns are addressed by position, so the two interval frames may
        # use different column names and any kind of row index.
        starts = pd.to_datetime(intervals.iloc[:, 0])
        ends = pd.to_datetime(intervals.iloc[:, 1])
        for start, end in zip(starts, ends):
            # A boolean mask works whether or not the timestamps are sorted
            # or unique, unlike label slicing.
            inside = (timestamps >= start) & (timestamps <= end)
            labels.loc[inside, "state"] = value

    return labels

In [173]:
# Build the per-sample state labels of each turbine.
time_15 = filter_state(
    train=train_15,
    failureInfo=failureInfo_15,
    normalInfo=normalInfo_15,
)
time_21 = filter_state(
    train=train_21,
    failureInfo=failureInfo_21,
    normalInfo=normalInfo_21,
)

In [190]:
# Data joining function
def data_merge(train, time):
    """Attach the state labels in ``time`` to the sensor log ``train``.

    The ``time`` column of ``train`` is converted to datetimes in place, then
    ``train`` is joined (pandas' default merge) to ``time`` by matching that
    column against the index of ``time``.  The merged frame is returned.
    """
    train["time"] = pd.to_datetime(train["time"])
    return train.merge(time, left_on="time", right_index=True)

In [193]:
# Join each turbine's sensor log with its state labels.
data_15 = data_merge(train=train_15, time=time_15)
data_21 = data_merge(train=train_21, time=time_21)

In [194]:
# Feature preprocessing function
def filter_data(train=None):
    """Reduce the raw sensor columns to the engineered feature set.

    Parameters
    ----------
    train : pandas.DataFrame, optional
        Frame holding the raw sensor columns referenced below.  When
        omitted, the notebook global ``train_15`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame made of the untouched input columns (original order)
        followed by ``pitch_angle``, ``pitch_speed``, ``pitch_moto_tmp``,
        ``pitch_ng5_tmp``, ``pitch_ng5_DC``, ``acc``, ``difference_tmp`` and
        ``difference_wind_direction``.  The input frame is not modified.
    """
    # Resolve the default at call time so no DataFrame is captured (and
    # later mutated) as a default argument.
    if train is None:
        train = train_15

    # The three per-blade readings of each pitch signal are averaged into a
    # single feature for that signal.
    pitch_signals = ("angle", "speed", "moto_tmp", "ng5_tmp", "ng5_DC")
    blade_columns = [
        ["pitch{}_{}".format(blade, signal) for blade in (1, 2, 3)]
        for signal in pitch_signals
    ]

    # Raw columns that are replaced by a derived feature or discarded.
    # Yaw speed is constrained by the yaw system and barely varies, so it is
    # discarded without a replacement.
    redundant = [name for cols in blade_columns for name in cols]
    redundant += [
        "time", "acc_x", "acc_y",
        "yaw_speed", "int_tmp", "environment_tmp",
        "wind_direction_mean", "wind_direction",
    ]

    # drop() returns a new frame, so the caller's frame stays untouched.
    data = train.drop(redundant, axis=1)

    for signal, cols in zip(pitch_signals, blade_columns):
        # Row-wise mean; no need to transpose the whole frame.
        data["pitch_" + signal] = train[cols].mean(axis=1)

    # Magnitude of the (x, y) acceleration vector.
    data["acc"] = np.hypot(train["acc_x"].values, train["acc_y"].values)

    # Difference between the int_tmp and environment_tmp temperatures.
    data["difference_tmp"] = train["int_tmp"] - train["environment_tmp"]
    # Difference between the mean wind direction and the wind direction.
    data["difference_wind_direction"] = (
        train["wind_direction_mean"] - train["wind_direction"]
    )
    return data

In [195]:
# Feature processing: engineer the features of both turbines, stack them and
# drop every row that still contains a missing value (unlabelled samples
# have a NaN state).
data_15 = filter_data(train=data_15)
data_21 = filter_data(train=data_21)
data = pd.concat([data_15, data_21], axis=0)
data.dropna(inplace=True)

# Split features and label by column NAME.  filter_data appends its derived
# columns after "state", so "state" is not the last column and a positional
# split would export a feature as the label and leave the label among the
# features.
data.drop("state", axis=1).to_csv(r"train\train.csv", index=False)
# A one-column DataFrame (not a Series) so the CSV always gets a "state" header.
data[["state"]].to_csv(r"train\results.csv", index=False)

print("data-state: ", data.groupby("state").size(), "data.shape: ", data.shape)

### 2.特征工程
- 生成特征多项式
- 根据filter、embedded和wrapper方法筛选特征
- 统计各种方法出现的特征情况，依据频数选取特征

#### 2.1导入数据集

In [None]:
# Reload the exported features and labels.
train = pd.read_csv(r"train\train.csv")
results = pd.read_csv(r"train\results.csv")
train.drop(["pitch_speed", "acc", "pitch_ng5_DC"], axis=1, inplace=True)
print(results.groupby("state").size())

# Working copies: x holds the features, y the labels, and sample holds both.
x = train.copy()
y = results.values.copy()
sample = x.copy()
sample["state"] = y

# DataFrame.info() prints its own report and returns None, so it is called
# on its own line instead of being passed to print().
print(sample.shape)
sample.info()

#### 2.2简单随机抽样,减少计算时间

In [None]:
from sklearn.utils import shuffle

# Fraction of each class kept by the random draw.
positive_fraction = 0.75
negative_fraction = 0.06650808 * 0.75

sample1 = sample[sample["state"] == 0].sample(frac=negative_fraction)
sample2 = sample[sample["state"] == 1].sample(frac=positive_fraction)

# Stack both draws, split features from labels, then shuffle the rows.
data = pd.concat([sample1, sample2], ignore_index=True)
x_resample = data.iloc[:, :-1]
y_resample = data["state"].values
x_resample, y_resample = shuffle(x_resample, y_resample)

print(sample.shape, "\n", x_resample.shape, "\n", y_resample.shape)

#### 2.3创造多项式特征

In [None]:
from sklearn.preprocessing import PolynomialFeatures 

def polyfeature(x_resample):
    """Expand a feature frame with ``PolynomialFeatures`` (default settings).

    Parameters
    ----------
    x_resample : pandas.DataFrame
        Input features; its column names are used to name the generated
        terms.

    Returns
    -------
    pandas.DataFrame
        The transformed features, one column per generated term, with a
        fresh default row index.
    """
    # Fit once and reuse the fitted transformer for both the data and the
    # term names.
    transformer = PolynomialFeatures().fit(x_resample)
    expanded = transformer.transform(x_resample)

    # Newer scikit-learn releases expose get_feature_names_out(); fall back
    # to get_feature_names() on older releases.
    if hasattr(transformer, "get_feature_names_out"):
        names = transformer.get_feature_names_out(x_resample.columns)
    else:
        names = transformer.get_feature_names(input_features=x_resample.columns)

    return pd.DataFrame(expanded, columns=names)

In [None]:
# Imported here because no cell shown above brings train_test_split into scope.
from sklearn.model_selection import train_test_split

# Polynomial expansion of the resampled features, then a train/test split.
x_resample_poly = polyfeature(x_resample)
X_train, X_test, y_train, y_test = train_test_split(
    x_resample_poly, y_resample, random_state=42
)
# Number of positive samples divided by number of negative samples in the
# training split.
ratio = np.count_nonzero(y_train == 1) / np.count_nonzero(y_train == 0)

print(X_train.shape, "\n", ratio)

#### 2.4特征选择

In [None]:
# 1) Filter method: order the features by the scores SelectKBest computes.
from sklearn.feature_selection import SelectKBest, mutual_info_classif

f_test = SelectKBest(k=10).fit(X_train, y_train)
# Column positions sorted from highest to lowest score.
score_order = np.argsort(-f_test.scores_)
f_index = X_train.columns[score_order]
print(score_order, "\n", f_index[:20])

In [None]:
# 2) Wrapper method: recursive feature elimination with cross-validation.
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier(
    learning_rate=0.0001,
    n_estimators=100,
    max_depth=5,
    subsample=0.5,
    max_features=0.7,
)
rfe_gbc = RFECV(estimator, cv=3, scoring="precision").fit(X_train, y_train.ravel())

# ranking_ holds one rank per feature (1 = selected), so it is the attribute
# that lines up with X_train.columns.  The cross-validation scores are
# indexed by feature-subset size, not by feature, and cannot be used to
# order the columns.  A stable sort keeps the original column order among
# equally ranked features.
rfe_index = X_train.columns[np.argsort(rfe_gbc.ranking_, kind="mergesort")]
print(rfe_gbc.support_, "\n", rfe_index[:20])

In [None]:
# 3.1) Embedded method: SelectFromModel around a GradientBoostingClassifier.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

gbc_params = dict(
    learning_rate=0.0001,
    n_estimators=100,
    max_depth=5,
    subsample=0.5,
    max_features=0.7,
)
estimator = GradientBoostingClassifier(**gbc_params)
emb_gbc = SelectFromModel(estimator).fit(X_train, y_train.ravel())

# Boolean mask of the retained columns and the matching column names.
gbc_mask = emb_gbc.get_support()
emb_gb_index = X_train.columns[gbc_mask]
print(gbc_mask, "\n", emb_gb_index)

In [None]:
# 3.2) Embedded method: SelectFromModel around a RandomForestClassifier.
from sklearn.ensemble import RandomForestClassifier

rfc_params = dict(
    n_estimators=100,
    max_depth=5,
    max_features=0.7,
)
estimator = RandomForestClassifier(**rfc_params)

emb_rfc = SelectFromModel(estimator).fit(X_train, y_train.ravel())

# Boolean mask of the retained columns and the matching column names.
rfc_mask = emb_rfc.get_support()
emb_rfc_index = X_train.columns[rfc_mask]
print(rfc_mask, "\n", emb_rfc_index)

In [None]:
# 3.3) Embedded method: SelectFromModel around an XGBoost classifier.
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb

# NOTE(review): ratio is positives/negatives where it is computed; XGBoost's
# documentation suggests negatives/positives for scale_pos_weight — confirm
# that the value used here is intended.
xgb_params = dict(
    learning_rate=0.0001,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0.2,
    subsample=0.5,
    colsample_bytree=0.7,
    reg_alpha=120,
    reg_lambda=10,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=ratio*0.8,
    seed=27,
)
xgb_final = xgb.XGBClassifier(**xgb_params)

emb_xgb = SelectFromModel(xgb_final).fit(X_train, y_train.ravel())

# Boolean mask of the retained columns and the matching column names.
xgb_mask = emb_xgb.get_support()
emb_xgb_index = X_train.columns[xgb_mask]
print(xgb_mask, "\n", emb_xgb_index)

In [None]:
# 4) Count how many of the selection results contain each feature.
feature_index = np.hstack((
    rfe_index[:20].values,
    emb_rfc_index.values,
    emb_gb_index.values,
    emb_xgb_index.values,
))
f = pd.DataFrame(feature_index, columns=["f_index"])

# Series.value_counts() is used instead of the top-level pd.value_counts
# helper, which newer pandas releases deprecate.  Counting an unnamed Series
# keeps the index unnamed, so the CSV has exactly one "f_index" column
# (the counts) next to the feature names.
feature_rank = pd.Series(feature_index).value_counts().to_frame("f_index")

feature_rank.to_csv("feature_rank.csv")

### 3.特征数据集
- 根据特征出现频数选择特征
- 定义数据集处理函数
- 生成多项式数据集

In [None]:
# 1) Keep the features whose count in feature_rank.csv is at least 3
#    (adjust the threshold as needed).
feature_rank = pd.read_csv("feature_rank.csv", index_col=0)
frequent = feature_rank["f_index"] >= 3
feature = feature_rank.loc[frequent].index
print("Features: ", feature)

In [None]:
# 2) Dataset builder: turn a raw feature CSV into a resampled polynomial
#    dataset that can be used directly.
def polydataset(dataname=r"train\train.csv",resultname=r"train_15_results.csv",feature=feature,outputfilename=r"train_ensemble.csv"):
    """Build and save a resampled polynomial-feature dataset.

    Parameters
    ----------
    dataname : str
        CSV file holding the feature columns.
    resultname : str
        CSV file holding the labels.
    feature : sequence of str
        Names of the polynomial terms to keep.
    outputfilename : str
        Destination CSV; it receives the kept terms plus a ``results``
        column.

    Returns
    -------
    None
        The dataset is written to ``outputfilename``; shapes are printed
        along the way.
    """
    # Imported here because no cell shown in this notebook brings
    # EasyEnsemble into scope.
    # NOTE(review): EasyEnsemble / fit_sample belong to older
    # imbalanced-learn releases — confirm against the installed version.
    from imblearn.ensemble import EasyEnsemble

    data = pd.read_csv(dataname)
    results = pd.read_csv(resultname)
    data.drop(["pitch_speed", "acc", "pitch_ng5_DC"], axis=1, inplace=True)
    data = polyfeature(data)[feature]
    print("data: ", data.shape, results.shape)

    ee = EasyEnsemble()
    x_rem, y_rem = ee.fit_sample(data, results)
    print("x_rem: ", x_rem.shape)

    # Only the first element of the resampled output is written out.
    draw = pd.DataFrame(x_rem[0], columns=feature)
    draw["results"] = y_rem[0]
    draw.to_csv(outputfilename, index=False)
    print("drawdata: ", draw.shape)

In [None]:
# 3) Build the datasets: turbine 15, turbine 21, then both turbines combined.
dataset_jobs = (
    (r"train\train_15.csv", r"train\train_15_results.csv", r"train_15_ensemble.csv"),
    (r"train\train_21.csv", r"train\train_21_results.csv", r"train_21_ensemble.csv"),
    (r"train\train.csv", r"train\results.csv", r"train_ensemble.csv"),
)
for source_csv, label_csv, target_csv in dataset_jobs:
    polydataset(
        dataname=source_csv,
        resultname=label_csv,
        feature=feature,
        outputfilename=target_csv,
    )