In [7]:
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Setting seed for reproducability
np.random.seed(1234)  
PYTHONHASHSEED = 0
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Activation, Conv1D, MaxPooling1D, Masking, GaussianNoise
%matplotlib inline

In [8]:
train_df = pd.read_csv('./data/processed_train.csv')
td = pd.read_csv('./data/processed_test.csv')

In [9]:
train_df.loc[train_df['RUL']>130,['RUL']]=130

In [10]:
train_df

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s20,s21,mode1,mode2,mode3,mode4,mode5,mode6,RUL,cycle_norm
0,1,1,0.833134,0.997625,1.0,0.060269,0.181576,0.311201,0.273095,0.146592,...,0.156036,0.159082,0.0,0.0,0.0,1.0,0.0,0.0,130,0.000000
1,1,2,0.999767,0.998575,1.0,0.000000,0.131847,0.296600,0.245535,0.000000,...,0.007888,0.014562,0.0,1.0,0.0,0.0,0.0,0.0,130,0.002653
2,1,3,0.595096,0.738480,0.0,0.238089,0.016332,0.035297,0.056997,0.293184,...,0.133745,0.151414,0.0,0.0,1.0,0.0,0.0,0.0,130,0.005305
3,1,4,0.999993,0.999525,1.0,0.000000,0.128269,0.298795,0.246979,0.000000,...,0.014060,0.026144,0.0,1.0,0.0,0.0,0.0,0.0,130,0.007958
4,1,5,0.595137,0.736698,0.0,0.238089,0.014130,0.037871,0.058152,0.293184,...,0.135460,0.143240,0.0,0.0,1.0,0.0,0.0,0.0,130,0.010610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53754,260,312,0.476188,0.831354,1.0,0.626985,0.672172,0.682297,0.591489,0.507937,...,0.486283,0.483993,0.0,0.0,0.0,0.0,1.0,0.0,4,0.824934
53755,260,313,0.238102,0.298100,1.0,0.597937,0.644830,0.733008,0.722934,0.617180,...,0.614540,0.622022,1.0,0.0,0.0,0.0,0.0,0.0,3,0.827586
53756,260,314,0.595222,0.736342,0.0,0.238089,0.017892,0.088067,0.082198,0.293184,...,0.137517,0.144474,0.0,0.0,1.0,0.0,0.0,0.0,2,0.830239
53757,260,315,0.595203,0.738717,0.0,0.238089,0.021195,0.079155,0.102368,0.293184,...,0.132716,0.134383,0.0,0.0,1.0,0.0,0.0,0.0,1,0.832891


In [11]:
# add noice to it


In [12]:
# function to reshape features into (samples, time steps, features) 
def gen_sequence(id_df, seq_length, seq_cols):
    """ Only sequences that meet the window-length are considered, no padding is used. This means for testing
    we need to drop those which are below the window-length. An alternative would be to pad sequences so that
    we can use shorter ones """
    data_array = id_df[seq_cols].values
    num_elements = data_array.shape[0]
    for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
        yield data_array[start:stop, :]
    # For example id1 have 192 rows and sequence_length is equal to 50
    # so zip iterate over two following list of numbers (0,112),(50,192)
    # 0 50 -> from row 0 to row 50
    # 1 51 -> from row 1 to row 51
    # 2 52 -> from row 2 to row 52
    # ...
    # 111 191 -> from row 111 to 191

In [13]:
# pick the feature columns 
sensor_cols = ['s' + str(i) for i in range(1,22)]
sequence_cols = ['setting1', 'setting2', 'setting3','cycle_norm','mode1','mode2','mode3','mode4','mode5','mode6']
# sequence_cols = ['setting1', 'setting2', 'setting3','cycle_norm']
sequence_cols.extend(sensor_cols)

In [14]:
sequence_length = 50

In [15]:
# generator for the sequences
seq_gen = (list(gen_sequence(train_df[train_df['id']==id], sequence_length, sequence_cols)) 
           for id in train_df['id'].unique())

# generate sequences and convert to numpy array
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
seq_array.shape

(40759, 50, 31)

In [16]:
# function to generate labels
def gen_labels(id_df, seq_length, label):
    data_array = id_df[label].values
    num_elements = data_array.shape[0]
    return data_array[seq_length:num_elements, :]

In [17]:
# generate labels
label_gen = [gen_labels(train_df[train_df['id']==id], sequence_length, ['RUL']) 
             for id in train_df['id'].unique()]
label_array = np.concatenate(label_gen).astype(np.float32)


In [18]:
import keras.backend as K

In [19]:
def r2_keras(y_true, y_pred):
    """Coefficient of Determination 
    """
    SS_res =  K.sum(K.square( y_true - y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [20]:
def scoring_error(y_true, y_pred):
    diff = y_pred - y_true
    diff_pos = diff[diff>=0]
    diff_neg = diff[diff<0]
    score = K.sum(K.exp(-1./13.*diff_neg)-1.)+K.sum(K.exp(1./10.*diff_pos)-1.)

    return score

In [21]:
# Build model with mask
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

model = Sequential()

model.add(Masking(mask_value=0., input_shape=(sequence_length, nb_features)))
model.add(GaussianNoise(0.1))
model.add(LSTM(
         units=100,
         return_sequences=True,))
model.add(Dropout(0.2))
model.add(LSTM(
          units=50,
          return_sequences=False,))
model.add(Dropout(0.2))
model.add(Dense(units=nb_out))
model.add(Activation("linear"))
model.compile(loss='mean_squared_error', optimizer='rmsprop',metrics=['mae',r2_keras])
# model.compile(loss=scoring_error, optimizer='rmsprop',metrics=['mae',r2_keras])

print(model.summary())






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (None, 50, 31)            0         
_________________________________________________________________
gaussian_noise_1 (GaussianNo (None, 50, 31)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           52800     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                30200     
___________________________________________________________

In [22]:
# fit the network
model_path='deeplstm.h5'
history = model.fit(seq_array, label_array, epochs=100, batch_size=256, validation_split=0.05, verbose=2,
          callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min'),
                       keras.callbacks.ModelCheckpoint(model_path,monitor='val_loss', save_best_only=True, mode='min', verbose=0)]
          )

# list all data in history
print(history.history.keys())

Train on 38721 samples, validate on 2038 samples
Epoch 1/100
 - 82s - loss: 5622.2617 - mean_absolute_error: 63.0922 - r2_keras: -2.1166e+00 - val_loss: 5826.4429 - val_mean_absolute_error: 64.5330 - val_r2_keras: -2.3025e+00
Epoch 2/100
 - 74s - loss: 4688.0120 - mean_absolute_error: 56.9563 - r2_keras: -1.5944e+00 - val_loss: 4928.0742 - val_mean_absolute_error: 58.9412 - val_r2_keras: -1.7886e+00
Epoch 3/100
 - 74s - loss: 3928.2320 - mean_absolute_error: 51.9269 - r2_keras: -1.1735e+00 - val_loss: 4152.1102 - val_mean_absolute_error: 54.0705 - val_r2_keras: -1.3461e+00
Epoch 4/100
 - 74s - loss: 3287.2837 - mean_absolute_error: 47.6671 - r2_keras: -8.1834e-01 - val_loss: 3491.5413 - val_mean_absolute_error: 49.8814 - val_r2_keras: -9.7071e-01
Epoch 5/100
 - 74s - loss: 2767.7518 - mean_absolute_error: 44.1787 - r2_keras: -5.2976e-01 - val_loss: 2948.7354 - val_mean_absolute_error: 46.3849 - val_r2_keras: -6.6394e-01
Epoch 6/100
 - 75s - loss: 2369.3418 - mean_absolute_error: 41.491

In [77]:
scores = model.evaluate(seq_array, label_array, verbose=1, batch_size=200)
print('Loss: {}'.format(scores[0]))

Loss: 241.46708622557622


In [78]:
test = []
for i in range(len(td.groupby('id'))):
    if len(td[td['id']==(i+1)]) >= sequence_length:
        test.append(np.asarray(td[td['id']==(i+1)][sequence_cols].values[-sequence_length:]).astype(np.float32).tolist())
    else:
        test.append(np.asarray(td[td['id']==(i+1)][sequence_cols].values[:]).astype(np.float32).tolist())

In [79]:
padded_inputs = keras.preprocessing.sequence.pad_sequences(test,dtype='float32',                                padding='post')

In [80]:
result = model.predict(padded_inputs,verbose=1)



In [81]:
submit = pd.read_excel('../results_submission.xls')
submit['RUL prediction']=result
submit.to_csv('deeplstm.csv')

In [82]:
ans = np.loadtxt('../RUL_FD002.txt')
ans.shape

(259,)

In [83]:
def score(user_ans):
    user_ans_arr = np.array(user_ans)
    score = 0
    diff = user_ans_arr - ans
    diff_pos = diff[diff>=0]
    diff_neg = diff[diff<0]
    score = np.sum((np.exp(-1./13.*diff_neg)-1))+np.sum((np.exp(1./10.*diff_pos)-1))
    return score

In [84]:
re = pd.read_csv('deeplstm.csv')
pred_rul=re['RUL prediction']

In [85]:
score(pred_rul)

70475.54484426416