In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import scipy
# Load the three FD001 text files (space-separated, no header row), in the
# order train, test, ground-truth RUL.
df_train, df_test, df_rul = (
    pd.read_csv(path, sep=" ", header=None)
    for path in (
        "/content/train_FD001.txt",
        "/content/test_FD001.txt",
        "/content/RUL_FD001.txt",
    )
)

In [None]:
# Names for the 28 positional columns of the train and test frames, listed in
# file order.
columns = (
    ['Engine_ID', 'Cycle']
    + ['ALT', 'Mach', 'TRA']
    + ['T2', 'T24', 'T30', 'T50']
    + ['P2', 'P15', 'P30']
    + ['Nf', 'Nc', 'epr', 'Ps30', 'phi']
    + ['NRf', 'NRc', 'BPR', 'farB']
    + ['htBleed', 'Nf_dmd', 'PCNfR_dmd']
    + ['W31', 'W32', "SD_22", "SD_23"]
)
# Both frames share the same header list.
df_train.columns = df_test.columns = columns

In [None]:
def prepare_train_data(data, factor = 0):
    """Return a copy of ``data`` with a remaining-useful-life column added.

    For every row, ``RUL`` is the highest ``Cycle`` seen for that row's
    ``Engine_ID`` minus the row's own ``Cycle``.

    Parameters
    ----------
    data : DataFrame with at least ``Engine_ID`` and ``Cycle`` columns.
        It is not modified.
    factor : rows whose ``Cycle`` is not strictly greater than this value are
        left out of the result.

    Returns
    -------
    DataFrame with the input columns plus ``RUL``. The merge step renumbers
    the rows 0..n-1 before the ``Cycle`` filter is applied.
    """
    last_cycle = (
        data.groupby('Engine_ID')['Cycle']
        .max()
        .reset_index()
        .rename(columns={'Cycle': 'max'})
    )
    merged = data.copy().merge(last_cycle, on=['Engine_ID'], how='left')
    merged['RUL'] = merged['max'] - merged['Cycle']
    merged = merged.drop(columns=['max'])

    keep = merged['Cycle'] > factor
    return merged[keep]

In [None]:
# Add the RUL column to the training frame. With factor = 0 only rows whose
# Cycle is greater than 0 are kept.
df_train = prepare_train_data(df_train, factor = 0)

In [None]:
# Keep one row per engine: its final recorded cycle. Sorting first makes "final"
# independent of file order, and tail(1) returns that row as-is (GroupBy.last()
# would instead take the last non-null value of each column separately).
df_test = (
    df_test.sort_values(['Engine_ID', 'Cycle'])
           .groupby('Engine_ID')
           .tail(1)
           .reset_index(drop=True)
)
print(len(df_test))

# Ground-truth RUL values from the RUL file, one per test engine
max_rul = df_rul[0].tolist() #len = 100

# Attach the ground truth as the RUL column. The values are used as given (no
# subtraction of Cycle) and are matched to engines purely by position, so this
# assumes the RUL file lists engines in ascending Engine_ID order - TODO confirm.
df_test["RUL"] = max_rul

100


In [None]:
# Input columns for the tree model. Compared with the full header, TRA, T2, P2,
# epr, farB, Nf_dmd, PCNfR_dmd, SD_22 and SD_23 are left out.
features = [
    'Cycle',
    'ALT', 'Mach',
    'T24', 'T30', 'T50',
    'P15', 'P30',
    'Nf', 'Nc', 'Ps30', 'phi',
    'NRf', 'NRc', 'BPR',
    'htBleed', 'W31', 'W32',
]

# Target column, kept as a one-element list so selections stay 2-D.
output = ['RUL']

In [None]:
# Split each frame into model inputs (X) and the single-column target frame (y).
X_train, y_train = df_train.loc[:, features], df_train.loc[:, output]
X_test, y_test = df_test.loc[:, features], df_test.loc[:, output]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRFRegressor

# XGBoost random-forest regressor. Only parameters XGBoost actually knows are
# passed: the former n_fold=5 argument is not an XGBoost parameter and was
# silently carried along without effect (cross-validation is set up via `cv`).
rf = XGBRFRegressor(n_estimators=300, subsample=0.9, colsample_bynode=0.2, eval_metric = 'rmse')
# set up 5-fold cross-validation
cv = model_selection.KFold(5)
# pipeline standardization and model
pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler())
                           , ('model', rf) ])
# tune the model
# Minimum leaf sizes to try. XGBoost has no `min_samples_leaf`; its equivalent
# is `min_child_weight` (minimum hessian sum in a child, which for squared-error
# regression with unit weights is the number of rows in the child). Searching
# over the unknown name refits the identical model for every value.
my_min_samples_leaf = [2, 10, 25, 50, 100]
my_max_depth = [7, 8, 9, 10, 11, 12]
# run the model using gridsearch, select the model with best search
optimized_rf = GridSearchCV(estimator=pipeline
                            , cv=cv
                            , param_grid =dict(model__min_child_weight = my_min_samples_leaf, model__max_depth = my_max_depth)
                            , scoring = 'neg_mean_squared_error'
                            , verbose = 1
                            , n_jobs = -1
                           )
optimized_rf.fit(X_train, y_train)
# show the best model estimators
print(optimized_rf.best_estimator_)
# evaluate metrics on holdout
y_pred = optimized_rf.predict(X_test)
print("Random Forest Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("Random Forest Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("Random Forest r-squared: ", r2_score(y_test, y_pred))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  9.1min finished


Pipeline(memory=None,
         steps=[('standardize',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 XGBRFRegressor(base_score=0.5, colsample_bylevel=1,
                                colsample_bynode=0.2, colsample_bytree=1,
                                eval_metric='rmse', gamma=0, learning_rate=1,
                                max_delta_step=0, max_depth=12,
                                min_child_weight=1, min_samples_leaf=2,
                                missing=None, n_estimators=300, n_fold=5,
                                n_jobs=1, nthread=None, objective='reg:linear',
                                random_state=0, reg_alpha=0, reg_lambda=1,
                                scale_pos_weight=1, seed=None, silent=None,
                                subsample=0.9, verbosity=1))],
         verbose=False)
Random Forest Mean Squared Error:  629.4551136839975
Random Forest Mean Absolute Error:  18.857

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRFRegressor

# XGBoost random-forest regressor. Only parameters XGBoost actually knows are
# passed: the former n_fold=5 argument is not an XGBoost parameter and was
# silently carried along without effect (cross-validation is set up via `cv`).
rf = XGBRFRegressor(n_estimators=300, subsample=0.9, colsample_bynode=0.2, eval_metric = 'rmse')
# set up 5-fold cross-validation
cv = model_selection.KFold(5)
# pipeline standardization and model
pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler())
                           , ('model', rf) ])
# tune the model
# Minimum leaf sizes to try. XGBoost has no `min_samples_leaf`; its equivalent
# is `min_child_weight` (minimum hessian sum in a child, which for squared-error
# regression with unit weights is the number of rows in the child). Searching
# over the unknown name refits the identical model for every value.
my_min_samples_leaf = [2, 10, 25, 50, 100]
my_max_depth = [7, 8, 9, 10, 11, 12]
# Shrinkage applied to the forest's output; this is the only grid dimension
# that differs from the previous search cell.
my_learning_rate = [0.1, 0.5, 1]
# run the model using gridsearch, select the model with best search
optimized_rf = GridSearchCV(estimator=pipeline
                            , cv=cv
                            , param_grid =dict(model__min_child_weight = my_min_samples_leaf, model__max_depth = my_max_depth, model__learning_rate = my_learning_rate)
                            , scoring = 'neg_mean_squared_error'
                            , verbose = 1
                            , n_jobs = -1
                           )
optimized_rf.fit(X_train, y_train)
# show the best model estimators
print(optimized_rf.best_estimator_)
# evaluate metrics on holdout
y_pred = optimized_rf.predict(X_test)
print("Random Forest Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("Random Forest Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("Random Forest r-squared: ", r2_score(y_test, y_pred))

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 27.2min finished


Pipeline(memory=None,
         steps=[('standardize',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 XGBRFRegressor(base_score=0.5, colsample_bylevel=1,
                                colsample_bynode=0.2, colsample_bytree=1,
                                eval_metric='rmse', gamma=0, learning_rate=1,
                                max_delta_step=0, max_depth=12,
                                min_child_weight=1, min_samples_leaf=2,
                                missing=None, n_estimators=300, n_fold=5,
                                n_jobs=1, nthread=None, objective='reg:linear',
                                random_state=0, reg_alpha=0, reg_lambda=1,
                                scale_pos_weight=1, seed=None, silent=None,
                                subsample=0.9, verbosity=1))],
         verbose=False)
Random Forest Mean Squared Error:  629.4551136839975
Random Forest Mean Absolute Error:  18.857

In [None]:
import keras
import keras.backend as K
from keras.layers.core import Activation
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, LSTM

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn import preprocessing

# Setting seed for reproducibility
np.random.seed(1234)
# A Python variable called PYTHONHASHSEED has no effect by itself: the
# interpreter reads the *environment variable* of that name at start-up.
# Exporting it here cannot change the already-running kernel, but child
# processes started from it inherit the setting.
# NOTE(review): only NumPy is seeded in this cell; the Keras/TensorFlow
# weight initialisation is not, so training runs can still differ.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)

# define path to save model
model_path = '/content/Output/regression_model.h5'
# Create the target directory up front so saving the checkpoint does not
# depend on the callback (or the user) having created it.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Reload the raw FD001 files under separate names for the LSTM workflow.
train_df = pd.read_csv("/content/train_FD001.txt", sep=" ", header=None)
test_df = pd.read_csv("/content/test_FD001.txt", sep=" ", header=None)
truth_df = pd.read_csv("/content/RUL_FD001.txt", sep=" ", header=None)

In [None]:
# Training frame: discard the columns at positions 26 and 27, name the 26 that
# remain, and order the rows by engine id and cycle.
train_df = train_df.drop(columns=train_df.columns[[26, 27]])
train_df.columns = (
    ['id', 'cycle']
    + ['setting1', 'setting2', 'setting3']
    + ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
    + ['s8', 's9', 's10', 's11', 's12', 's13', 's14']
    + ['s15', 's16', 's17', 's18', 's19', 's20', 's21']
)
train_df = train_df.sort_values(by=['id', 'cycle'])

# Test frame: same column treatment (rows are left in file order).
test_df = test_df.drop(columns=test_df.columns[[26, 27]])
test_df.columns = (
    ['id', 'cycle']
    + ['setting1', 'setting2', 'setting3']
    + ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
    + ['s8', 's9', 's10', 's11', 's12', 's13', 's14']
    + ['s15', 's16', 's17', 's18', 's19', 's20', 's21']
)

# Ground-truth frame: discard its second column (position 1).
truth_df = truth_df.drop(columns=truth_df.columns[[1]])

In [None]:
# Highest cycle reached by each engine in the training data.
rul = (
    train_df.groupby('id')['cycle']
    .max()
    .reset_index()
)
rul.columns = ['id', 'max']

# RUL of a row = that engine's last cycle minus the row's cycle; the helper
# 'max' column is removed again afterwards.
train_df = train_df.merge(rul, on=['id'], how='left')
train_df = train_df.assign(RUL=train_df['max'] - train_df['cycle']).drop(columns='max')

In [None]:
# RUL thresholds for the label columns.
w1, w0 = 30, 15
# label1: 1 when RUL <= w1, otherwise 0.
train_df['label1'] = (train_df['RUL'] <= w1).astype(int)
# label2: 2 when RUL <= w0, otherwise the same value as label1.
train_df['label2'] = np.where(train_df['RUL'] <= w0, 2, train_df['label1'])

In [None]:
# MinMax normalization (from 0 to 1) train set
# cycle_norm is a copy of cycle that gets scaled; cycle itself stays raw.
train_df['cycle_norm'] = train_df['cycle']
# Scale every column except the identifiers and the targets. label1/label2 are
# excluded as well: they exist only in the training frame, so leaving them in
# would both rescale the labels and make test_df[cols_normalize] below fail
# with a KeyError whenever the label cell has been run before this one.
# (Index.difference ignores names that are not present.)
cols_normalize = train_df.columns.difference(['id','cycle','RUL','label1','label2'])
min_max_scaler = preprocessing.MinMaxScaler()
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]), 
                             columns=cols_normalize, 
                             index=train_df.index)
# Re-attach the untouched columns and restore the original column order.
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
train_df = join_df.reindex(columns = train_df.columns)

# MinMax normalization (from 0 to 1) test set
# The scaler fitted on the training data is reused (transform only), so test
# values can fall outside [0, 1].
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = pd.DataFrame(min_max_scaler.transform(test_df[cols_normalize]), 
                            columns=cols_normalize, 
                            index=test_df.index)
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns = test_df.columns)
test_df = test_df.reset_index(drop=True)
print(test_df.head())

# Labels for the test data come from the ground-truth frame.
# Last recorded cycle of each test engine:
rul = (
    test_df.groupby('id')['cycle']
    .max()
    .reset_index()
)
rul.columns = ['id', 'max']

# Ground-truth row i is treated as engine id i + 1; adding its value to that
# engine's last recorded cycle gives the 'max' column (rows are paired by index).
truth_df.columns = ['more']
truth_df = truth_df.assign(
    id=truth_df.index + 1,
    max=rul['max'] + truth_df['more'],
).drop(columns='more')

# RUL of every test row = 'max' minus the row's cycle; 'max' is dropped again.
test_df = test_df.merge(truth_df, on=['id'], how='left')
test_df = test_df.assign(RUL=test_df['max'] - test_df['cycle']).drop(columns='max')



   id  cycle  setting1  setting2  ...  s19       s20       s21  cycle_norm
0   1      1  0.632184  0.750000  ...  0.0  0.558140  0.661834     0.00000
1   1      2  0.344828  0.250000  ...  0.0  0.682171  0.686827     0.00277
2   1      3  0.517241  0.583333  ...  0.0  0.728682  0.721348     0.00554
3   1      4  0.741379  0.500000  ...  0.0  0.666667  0.662110     0.00831
4   1      5  0.580460  0.500000  ...  0.0  0.658915  0.716377     0.01108

[5 rows x 27 columns]


In [None]:
# Label columns for the training data, derived from RUL with two thresholds.
w1, w0 = 30, 15
# label1: 1 when RUL <= w1, otherwise 0.
train_df['label1'] = (train_df['RUL'] <= w1).astype(int)
# label2: 2 when RUL <= w0, otherwise the same value as label1.
train_df['label2'] = np.where(train_df['RUL'] <= w0, 2, train_df['label1'])

In [None]:
# Number of consecutive rows in one model input window.
sequence_length = 50


def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows of ``seq_length`` consecutive rows.

    ``id_df[seq_cols]`` is converted to a 2-D array and windows starting at
    rows 0, 1, ... are yielded one at a time, each of shape
    (seq_length, len(seq_cols)). There are ``rows - seq_length`` windows, so
    the window that would end on the very last row is not produced, and
    nothing is yielded when the frame has ``seq_length`` rows or fewer.
    """
    values = id_df[seq_cols].values
    total_rows = values.shape[0]
    for first_row in range(total_rows - seq_length):
        yield values[first_row:first_row + seq_length, :]

In [None]:
# Feature columns fed to the network: the three settings, the scaled cycle
# counter, then sensors s1..s21.
sensor_cols = ['s{}'.format(i) for i in range(1, 22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm'] + sensor_cols

# One list of windows per engine, produced lazily in order of first appearance.
seq_gen = (
    list(gen_sequence(train_df.loc[train_df['id'] == engine_id], sequence_length, sequence_cols))
    for engine_id in train_df['id'].unique()
)

# Stack all windows into one float32 array (windows, time steps, features).
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)

In [None]:
def gen_labels(id_df, seq_length, label):
    """Return the label rows from position ``seq_length`` to the end.

    ``label`` is a list of column names, so ``id_df[label]`` is 2-D and the
    result has shape (max(rows - seq_length, 0), len(label)): the first
    ``seq_length`` rows are skipped.
    """
    values = id_df[label].values
    return values[seq_length:, :]

# RUL targets per engine, in the same engine order used for the input windows.
label_gen = [
    gen_labels(train_df.loc[train_df['id'] == engine_id], sequence_length, ['RUL'])
    for engine_id in train_df['id'].unique()
]

# Stack into a single float32 column and show its shape.
label_array = np.concatenate(label_gen).astype(np.float32)
label_array.shape

(15631, 1)

In [None]:
def r2_keras(y_true, y_pred):
    """R^2 metric built from Keras backend ops.

    Returns 1 - SS_res / (SS_tot + epsilon), where SS_res is the sum of
    squared differences between targets and predictions and SS_tot is the sum
    of squared deviations of the targets from their mean. The backend epsilon
    keeps the division defined when SS_tot is zero.
    """
    errors = y_true - y_pred
    centered = y_true - K.mean(y_true)
    residual_ss = K.sum(K.square(errors))
    total_ss = K.sum(K.square(centered))
    return 1 - residual_ss / (total_ss + K.epsilon())

In [None]:
# Network dimensions taken from the prepared arrays: features per time step
# and number of regression outputs.
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

# Two stacked LSTM layers (100 then 50 units), each followed by 20% dropout,
# and a linear dense head for regression.
model = Sequential()
model.add(LSTM(
         input_shape=(sequence_length, nb_features),
         units=100,
         return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(
          units=50,
          return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=nb_out))
model.add(Activation("linear"))
model.compile(loss='mean_squared_error', optimizer='rmsprop',metrics=['mae',r2_keras])


# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would append a stray "None" line.
model.summary()

# fit the network
# The last 5% of the samples are held out for validation. With epochs=5 and
# patience=10 the early-stopping callback cannot trigger in this run; the
# checkpoint callback writes the lowest-val_loss model to model_path.
history = model.fit(seq_array, label_array, epochs=5, batch_size=10, validation_split=0.05, verbose=2,
          callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min'),
                       keras.callbacks.ModelCheckpoint(model_path,monitor='val_loss', save_best_only=True, mode='min', verbose=0)])


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 100)           50400     
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
_________________________________________________________________
activation (Activation)      (None, 1)                 0         
Total params: 80,651
Trainable params: 80,651
Non-trainable params: 0
____________________________________________________

In [None]:
# Metrics on the training windows; `scores` holds [loss, mae, r2_keras] in the
# order given to compile().
scores = model.evaluate(seq_array, label_array, batch_size=200, verbose=1)
print('\nMAE: {}'.format(scores[1]),
      '\nR^2: {}'.format(scores[2]),
      sep='\n')


MAE: 15.501461029052734

R^2: 0.7874729037284851


In [None]:
# For every test engine with at least `sequence_length` rows, take its last
# `sequence_length` rows of feature columns (engines in order of first
# appearance); shorter engines are skipped.
seq_array_test_last = [
    engine_rows[sequence_cols].values[-sequence_length:]
    for _, engine_rows in test_df.groupby('id', sort=False)
    if len(engine_rows) >= sequence_length
]

# Stack into a float32 array (engines, time steps, features).
seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)

In [None]:
# True for each engine that has at least `sequence_length` test rows, i.e. the
# engines for which an input window was built.
y_mask = [
    bool((test_df['id'] == engine_id).sum() >= sequence_length)
    for engine_id in test_df['id'].unique()
]

# RUL at each engine's last row, restricted to the masked engines, as a
# float32 column vector.
label_array_test_last = (
    test_df.groupby('id')['RUL']
    .nth(-1)[y_mask]
    .values
    .reshape(-1, 1)
    .astype(np.float32)
)

In [None]:
# Metrics on the last window of each test engine; `scores_test` holds
# [loss, mae, r2_keras] in the order given to compile().
scores_test = model.evaluate(seq_array_test_last, label_array_test_last, batch_size = 200, verbose = 1)
print('\nMAE: {}'.format(scores_test[1]),
      '\nR^2: {}'.format(scores_test[2]),
      sep='\n')


MAE: 14.383890151977539

R^2: 0.7537777423858643


In [None]:
# Display the stacked test windows (prints the full array repr).
seq_array_test_last

array([[[0.3505747 , 0.8333333 , 0.        , ..., 0.        ,
         0.4728682 , 0.45457056],
        [0.39655173, 0.5833333 , 0.        , ..., 0.        ,
         0.6511628 , 0.5608948 ],
        [0.5114943 , 0.25      , 0.        , ..., 0.        ,
         0.6356589 , 0.4975145 ],
        ...,
        [0.43678162, 0.75      , 0.        , ..., 0.        ,
         0.41860464, 0.4710025 ],
        [0.5804598 , 0.6666667 , 0.        , ..., 0.        ,
         0.3255814 , 0.45954156],
        [0.40804598, 0.8333333 , 0.        , ..., 0.        ,
         0.6124031 , 0.52444077]],

       [[0.55172414, 0.25      , 0.        , ..., 0.        ,
         0.5503876 , 0.6974593 ],
        [0.39655173, 0.5833333 , 0.        , ..., 0.        ,
         0.37209302, 0.59748685],
        [0.5344828 , 0.16666667, 0.        , ..., 0.        ,
         0.51937985, 0.602596  ],
        ...,
        [0.43103448, 0.33333334, 0.        , ..., 0.        ,
         0.56589144, 0.49461475],
        [0.3

In [None]:
# Display the stacked test windows again (same expression as the previous cell).
seq_array_test_last

array([[[0.3505747 , 0.8333333 , 0.        , ..., 0.        ,
         0.46456692, 0.44467875],
        [0.39655173, 0.5833333 , 0.        , ..., 0.        ,
         0.6456693 , 0.55293125],
        [0.5114943 , 0.25      , 0.        , ..., 0.        ,
         0.62992126, 0.48840153],
        ...,
        [0.43678162, 0.75      , 0.        , ..., 0.        ,
         0.40944883, 0.46140867],
        [0.5804598 , 0.6666667 , 0.        , ..., 0.        ,
         0.31496063, 0.4497399 ],
        [0.40804598, 0.8333333 , 0.        , ..., 0.        ,
         0.6062992 , 0.5158161 ]],

       [[0.55172414, 0.25      , 0.        , ..., 0.        ,
         0.54330707, 0.69197243],
        [0.39655173, 0.5833333 , 0.        , ..., 0.        ,
         0.36220473, 0.59018695],
        [0.5344828 , 0.16666667, 0.        , ..., 0.        ,
         0.511811  , 0.5953887 ],
        ...,
        [0.43103448, 0.33333334, 0.        , ..., 0.        ,
         0.5590551 , 0.48544917],
        [0.3

In [None]:
# Display the ground-truth RUL column used for the test windows.
label_array_test_last

array([[ 69.],
       [ 82.],
       [ 91.],
       [ 93.],
       [ 91.],
       [ 95.],
       [111.],
       [ 96.],
       [ 97.],
       [124.],
       [ 95.],
       [ 83.],
       [ 84.],
       [ 50.],
       [ 28.],
       [ 87.],
       [ 16.],
       [ 57.],
       [113.],
       [ 20.],
       [119.],
       [ 66.],
       [ 97.],
       [ 90.],
       [115.],
       [  8.],
       [ 48.],
       [106.],
       [  7.],
       [ 11.],
       [ 19.],
       [ 21.],
       [ 50.],
       [ 28.],
       [ 18.],
       [ 10.],
       [ 59.],
       [109.],
       [114.],
       [ 47.],
       [135.],
       [ 92.],
       [ 21.],
       [ 79.],
       [114.],
       [ 29.],
       [ 26.],
       [ 97.],
       [137.],
       [ 15.],
       [103.],
       [ 37.],
       [114.],
       [100.],
       [ 21.],
       [ 54.],
       [ 72.],
       [ 28.],
       [128.],
       [ 14.],
       [ 77.],
       [  8.],
       [121.],
       [ 94.],
       [118.],
       [ 50.],
       [13

In [None]:
# Predicted RUL for each test window.
# NOTE: this rebinds `scores_test`, which earlier held the list returned by
# model.evaluate(); from here on it is the prediction array.
scores_test = model.predict(seq_array_test_last)
scores_test

array([[ 57.18303  ],
       [ 83.78577  ],
       [ 86.9017   ],
       [124.13684  ],
       [ 94.4136   ],
       [ 91.85351  ],
       [136.23607  ],
       [ 86.8845   ],
       [ 90.25686  ],
       [106.21658  ],
       [ 85.67246  ],
       [110.121284 ],
       [ 96.855995 ],
       [ 54.178593 ],
       [ 31.306421 ],
       [ 86.202194 ],
       [ 15.824972 ],
       [ 71.56887  ],
       [116.368805 ],
       [ 22.186783 ],
       [135.33075  ],
       [ 78.286316 ],
       [115.57735  ],
       [106.111885 ],
       [ 85.731445 ],
       [  9.271268 ],
       [ 42.866665 ],
       [114.6494   ],
       [  7.8766527],
       [ 15.538999 ],
       [ 23.194567 ],
       [ 28.226822 ],
       [ 52.82044  ],
       [ 25.51702  ],
       [ 23.33508  ],
       [  9.628778 ],
       [ 64.65748  ],
       [136.5018   ],
       [ 79.01032  ],
       [ 38.667725 ],
       [128.13979  ],
       [119.47623  ],
       [ 25.605177 ],
       [ 87.59459  ],
       [ 94.26726  ],
       [ 3

In [None]:
# Number of ground-truth labels (one per evaluated test engine).
len(label_array_test_last)

93

In [None]:
# Number of predictions; should equal the number of labels.
len(scores_test)

93

In [None]:
# Shape of the prediction array.
scores_test.shape

(93, 1)

In [None]:
# Shape of the label array; should match the prediction array's shape.
label_array_test_last.shape

(93, 1)

In [None]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error on the test windows, computed as sqrt(MSE). The
# `squared=False` keyword used previously is deprecated and removed in newer
# scikit-learn releases; taking the square root explicitly gives the same
# value for this single-output case on every version.
rms = np.sqrt(mean_squared_error(label_array_test_last, scores_test))
rms

16.605112

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Coefficient of determination of the predictions against the ground truth.
print(r2_score(y_true=label_array_test_last, y_pred=scores_test))

0.8362035798143256
