In [1]:
import numpy as np
import pandas as pd
import os
import sys as sys
import lightgbm as lgb
from sklearn.externals import joblib
import matplotlib.pyplot as plt
%matplotlib notebook
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold

In [2]:
path = '工程机械寿命预测选手/工程机械寿命预测选手/train/'

In [3]:
#处理统计数据
def stat(data,c,name):
        c[name + '_max'] = data.max()
        c[name + '_min'] = data.min()
        c[name + '_count'] = data.count()
        c[name + '_median'] = data.median()
        c[name + '_mean'] = data.mean()
        c[name + '_ptp'] = data.ptp()
        c[name + '_std'] = data.std()
        return c

In [4]:
def get_dir(path):
    file_names = os.listdir(path)
    return file_names

In [5]:
def get_abs(x):
    return abs(x)

In [6]:
def handle_single_file(file_name,path):
    split_points = [0.35,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95]
    local_path = path + file_name
    df_temp = pd.read_csv(local_path)
    lifemax = df_temp['部件工作时长'].max()
    df_train = pd.DataFrame()
    for split_point in split_points:
        df_temp_split = df_temp[df_temp['部件工作时长']<=lifemax*split_point]
        sigle_sample = {
            'train_file_name':file_name+str(split_point),
            '开关1_sum':df_temp_split['开关1信号'].sum(),
            '开关2_sum':df_temp_split['开关2信号'].sum(),
            '告警1_sum':df_temp_split['告警信号1'].sum(),
            '设备类型':df_temp_split['设备类型'][0],
            'life':lifemax-df_temp_split['部件工作时长'].max()# 剩余寿命
        }
        for column in ['部件工作时长', '累积量参数1', '累积量参数2','转速信号1','转速信号2',
                       '压力信号1','压力信号2','温度信号','流量信号','电流信号']:
            sigle_sample = stat(df_temp_split[column],sigle_sample,column)# 构造时序特征
        df_single_train = pd.DataFrame(sigle_sample,index=[0])
        df_train = pd.concat([df_train,df_single_train])
    return df_train

In [7]:
def merge_train_data(path):
    file_names = get_dir(path)
    df_all_train = pd.DataFrame()
    for file_name in file_names:
        df_train = handle_single_file(file_name,path)
        df_all_train = pd.concat([df_all_train,df_train])
    return df_all_train

In [8]:
df_all_train = merge_train_data(path = path)

  


In [9]:
def judge_type(x):
    if x == 'S100':
        return 0
    elif x == 'S26a':
        return 1
    elif x == 'S508':
        return 2
    elif x == 'S51d':
        return 3
    elif x == 'Saa3':
        return 4

In [10]:
df_all_train['设备类型'] = df_all_train['设备类型'].apply(judge_type)

In [11]:
def load_model(model_name='gbm_S100'):
    local_path = 'model/'+model_name+'.pkl'
    local_gbm = joblib.load(local_path)
    return local_gbm
def dump_model(model,model_name='gbm_S100'):
    local_path = 'model/'+model_name+'.pkl'
    joblib.dump(model,local_path)

In [14]:
floder = KFold(n_splits=10,random_state=27,shuffle=False)
X = df_all_train.drop(['life','train_file_name'],axis=1)
y = df_all_train['life']
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=1/10,random_state =44)
X = pd.concat([X_train,X_valid])
y = pd.concat([y_train,y_valid])# 这个步骤是因为我电脑不知道怎么了，用不了StratifiedKFold
for index,(train_index,test_index) in enumerate(floder.split(X, y)):
    X_train,X_valid,y_train,y_valid = X.iloc[train_index],X.iloc[test_index],y.iloc[train_index],y.iloc[test_index]
    gbm = lgb.LGBMRegressor(objective='regression',
                            num_leaves=250,
                            max_depth=3,
                            learning_rate=0.1,
                            verbose=-1,
                            n_estimators=9000,
                            boosting_type='gbdt')
    gbm.fit(X_train,y_train,
            eval_set=[(X_valid,y_valid)],
            eval_metric='l1',
            early_stopping_rounds=200,verbose=200)
    dump_model(gbm,str(index))

Training until validation scores don't improve for 200 rounds.
[200]	valid_0's l2: 1.10297e+07	valid_0's l1: 1334.6
[400]	valid_0's l2: 7.77211e+06	valid_0's l1: 1198.04
[600]	valid_0's l2: 6.40597e+06	valid_0's l1: 1103.78
[800]	valid_0's l2: 5.707e+06	valid_0's l1: 1048.88
[1000]	valid_0's l2: 5.30371e+06	valid_0's l1: 1007.45
[1200]	valid_0's l2: 5.00347e+06	valid_0's l1: 974.69
[1400]	valid_0's l2: 4.82283e+06	valid_0's l1: 943.923
[1600]	valid_0's l2: 4.64359e+06	valid_0's l1: 916.434
[1800]	valid_0's l2: 4.56755e+06	valid_0's l1: 887.196
[2000]	valid_0's l2: 4.49241e+06	valid_0's l1: 861.013
[2200]	valid_0's l2: 4.39182e+06	valid_0's l1: 840.151
[2400]	valid_0's l2: 4.28561e+06	valid_0's l1: 816.695
[2600]	valid_0's l2: 4.23188e+06	valid_0's l1: 794.792
[2800]	valid_0's l2: 4.22359e+06	valid_0's l1: 776.097
[3000]	valid_0's l2: 4.19939e+06	valid_0's l1: 761.07
[3200]	valid_0's l2: 4.17898e+06	valid_0's l1: 748.378
Early stopping, best iteration is:
[3192]	valid_0's l2: 4.17294e+0

[3200]	valid_0's l2: 1.17754e+07	valid_0's l1: 948.397
[3400]	valid_0's l2: 1.17301e+07	valid_0's l1: 933.715
Early stopping, best iteration is:
[3396]	valid_0's l2: 1.17219e+07	valid_0's l1: 933.647
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's l2: 1.568e+07	valid_0's l1: 1490.37
[400]	valid_0's l2: 1.28189e+07	valid_0's l1: 1369.03
[600]	valid_0's l2: 1.11307e+07	valid_0's l1: 1298.53
[800]	valid_0's l2: 1.01669e+07	valid_0's l1: 1236.92
[1000]	valid_0's l2: 9.72169e+06	valid_0's l1: 1190.86
[1200]	valid_0's l2: 9.31388e+06	valid_0's l1: 1150.4
[1400]	valid_0's l2: 9.03129e+06	valid_0's l1: 1121
[1600]	valid_0's l2: 8.84261e+06	valid_0's l1: 1086.48
[1800]	valid_0's l2: 8.56659e+06	valid_0's l1: 1056.21
[2000]	valid_0's l2: 8.26487e+06	valid_0's l1: 1029.87
[2200]	valid_0's l2: 8.11875e+06	valid_0's l1: 1004.71
[2400]	valid_0's l2: 7.93759e+06	valid_0's l1: 981.857
[2600]	valid_0's l2: 7.8355e+06	valid_0's l1: 966.195
[2800]	valid_0's l2: 7.7642e+06	v

In [35]:
def kfold_predict(X):
    df_pred = pd.DataFrame()
    indexes = [0,3,4,5,7,8,9]# 没有到的是因为模型没有收敛好
    for index in indexes:
        gbm = load_model(str(index))
        df_pred[str(index)] = gbm.predict(X)
    return df_pred

In [16]:
def compute_loss(target, predict):
    temp = np.square(np.log(abs(target + 1)) - np.log(abs(predict + 1)))
    res = np.sqrt(np.sum(temp) / len(temp))
    return res

In [102]:
compute_loss(gbm.predict(X_valid),y_valid)

1.1864295564504823

In [197]:
df_temp = pd.DataFrame(columns=['pred','real'])
df_temp.pred = gbm.predict(X_valid)
df_temp.pred = df_temp.pred.round(decimals=2)
df_temp.real = np.array(y_valid)

In [195]:
df_temp.pred[df_temp.pred<0] = 200

In [198]:
compute_loss(df_temp.pred,df_temp.real)

1.2245317392429114

In [17]:
path = '工程机械寿命预测选手/工程机械寿命预测选手/test1/'

In [78]:
def handle_single_test_file(file_name,path):
    split_points = [0.65,0.75,0.85,0.95,1.0]
    local_path = path + file_name
    df_temp = pd.read_csv(local_path)
    lifemax = df_temp['部件工作时长'].max()
    df_train = pd.DataFrame()
    for split_point in split_points:
        df_temp_split = df_temp[df_temp['部件工作时长']<=lifemax*split_point]
        sigle_sample = {
            'test_file_name':file_name,
            '开关1_sum':df_temp_split['开关1信号'].sum(),
            '开关2_sum':df_temp_split['开关2信号'].sum(),
            '告警1_sum':df_temp_split['告警信号1'].sum(),
            '设备类型':df_temp_split['设备类型'][0],
        }
        for column in ['部件工作时长', '累积量参数1', '累积量参数2','转速信号1','转速信号2',
                       '压力信号1','压力信号2','温度信号','流量信号','电流信号']:
            sigle_sample = stat(df_temp_split[column],sigle_sample,column)# 构造时序特征
        df_single_train = pd.DataFrame(sigle_sample,index=[0])
        df_train = pd.concat([df_train,df_single_train])
    return df_train

In [79]:
def merge_test_data(path):
    file_names = get_dir(path)
    df_all_train = pd.DataFrame()
    for file_name in file_names:
        df_train = handle_single_test_file(file_name,path)
        df_all_train = pd.concat([df_all_train,df_train])
    return df_all_train

In [80]:
df_all_test = merge_test_data(path)

  


In [81]:
df_all_test['设备类型'] = df_all_test['设备类型'].apply(judge_type)

In [82]:
df_pred = kfold_predict(df_all_test.drop(['test_file_name'],axis=1))

In [83]:
df_pred['test_file_name'] = np.array(df_all_test['test_file_name'])
df_pred['部件工作时长_max'] = np.array(df_all_test['部件工作时长_max'])

In [84]:
def voted(a, b, c, d, e, f, g, h):
    lst = pd.Series([a, b, c, d, e, f, g])
    lst += h
    return lst.median()

In [85]:
df_pred['total_life'] = df_pred.apply(lambda row:voted(row['0'],row['3'],row['4'],row['5'],row['7'],row['8'],row['9'],row['部件工作时长_max']),axis=1)

In [98]:
def predict(df_test):
    file_names = get_dir(path)
    df_test = df_test.set_index(['test_file_name'])
    df_result = pd.DataFrame(columns=['test_file_name','life'])
    for file_name in file_names:
        temp = (df_test.loc[file_name].loc[:,'total_life'] - df_test.loc[file_name].loc[:,'部件工作时长_max'].max())
        temp = list(temp)
        temp.reverse()
        for y_pred in temp:
            if y_pred>0:
                life = y_pred
                break
        pred_single = {
            'test_file_name':[file_name],
            'life':[life]
        }
        df_result = pd.concat([df_result,pd.DataFrame(pred_single)])
    return df_result

In [130]:
df_result = predict(df_pred)

In [131]:
df_result

Unnamed: 0,test_file_name,life
0,002ece6be8b41a5613aa.csv,694.711080
0,004a2ad4b735329a3e23.csv,502.858393
0,004c675bb05d447aa94b.csv,4768.875103
0,0120bc1afed1186f5b79.csv,2272.544211
0,013898c56ddfd3f53350.csv,1766.960524
0,01dda06b5ab0a3de630d.csv,1733.618138
0,01e7ba5662e5d918871f.csv,702.023790
0,01fe37e0300faca21f25.csv,1905.564130
0,02cf0f73d0eca3a1de31.csv,2383.820762
0,02fc6df7ff590a82d546.csv,422.598775


In [132]:
df_result.test_file_name[df_result.life>15000] #= 15000
# df_result = df_result.replace(np.nan,200)

0    13a61495355fe904e7c9.csv
0    1ff6a09806b4c92d55c5.csv
0    5478d69a236045cb5f55.csv
0    639252e21fcdb0d1dfa1.csv
0    639d1fecbe425742185c.csv
0    82ccd24fcadd4f2ee2c8.csv
0    8accdf1f4798efb4cd5c.csv
0    acaf787e2d91ef2e1941.csv
0    e29d795c67c8c0daac72.csv
0    fddbe0925ebbe517afe7.csv
Name: test_file_name, dtype: object

In [133]:
df_result.life[df_result.test_file_name == '1ff6a09806b4c92d55c5.csv'] = 15000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [134]:
df_result.life[df_result.test_file_name == str('5478d69a236045cb5f55'+'.csv')] = 15000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [135]:
df_result.life[df_result.test_file_name == str('639d1fecbe425742185c'+'.csv')] = 15000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [136]:
df_result.life[df_result.test_file_name == str('82ccd24fcadd4f2ee2c8'+'.csv')] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [137]:
df_result.life[df_result.test_file_name == str('8accdf1f4798efb4cd5c'+'.csv')] = 10000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [138]:
df_result.life[df_result.test_file_name == str('fddbe0925ebbe517afe7'+'.csv')] = 10000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [139]:
df_result.life[df_result.life>20000] = 20000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [140]:
df_result.to_csv('data/df_result_15_2.csv',index=False)