In [1]:
# Module Importations
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Module Versioning
# Print the scikit-learn version this notebook was executed with, so the
# environment behind the recorded results can be identified.
print(sklearn.__version__) 

0.22.2.post1


In [2]:
# Custom Modules
from Source.Data import ntfp_dataset_import as data_load
from Source.Data import ntfp_split_data as split_data
from Source.Features import ntfp_dataset_preprocessing as preprocessing

In [3]:
# Constants
# NOTE(review): presumably the mean remaining useful life (RUL) of the engines.
# It is not referenced in the visible cells, and it is positive while the RUL
# column of the dataset holds non-positive values - TODO confirm meaning, units
# and sign convention before relying on it.
MEAN_RUL = 107.8

In [4]:
# Load Dataset
# Read the pickled RUL dataframe through the project's data-loading helper.
# NOTE(review): the helper presumably unpickles the file; unpickling can execute
# arbitrary code, so only load files from a trusted source.
data_filename = 'rul_dataset.pkl'
rul_df = data_load.load_pickled_data(data_filename)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
rul_df.info()

Loading pickled dataframe started ...
Loading pickled dataframe complete ...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20631 entries, 1 to 100
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Cycles  20631 non-null  int64  
 1   Sn_02   20631 non-null  float64
 2   Sn_03   20631 non-null  float64
 3   Sn_04   20631 non-null  float64
 4   Sn_07   20631 non-null  float64
 5   Sn_09   20631 non-null  float64
 6   Sn_11   20631 non-null  float64
 7   Sn_12   20631 non-null  float64
 8   Sn_14   20631 non-null  float64
 9   Sn_15   20631 non-null  float64
 10  Sn_17   20631 non-null  int64  
 11  Sn_20   20631 non-null  float64
 12  Sn_21   20631 non-null  float64
 13  RUL     20631 non-null  float64
dtypes: float64(12), int64(2)
memory usage: 2.4 MB
None


In [5]:
# Normalise columns into array
# NOTE(review): the helper is named standardise_columns, so this is presumably
# z-score standardisation rather than min-max normalisation - confirm against
# the helper before describing the result as "normalised".
normalised_array = preprocessing.standardise_columns(rul_df)

# Show the array (large arrays are printed abbreviated) and its dimensions.
print(normalised_array)
print("Shape:", normalised_array.shape)

[[-1.72172539 -0.13425518 -0.92593596 ... -0.78170979  1.34849274
   1.19442705]
 [-1.06177971  0.21152849 -0.64372587 ... -0.78170979  1.01652793
   1.23692196]
 [-0.66181262 -0.41316559 -0.52595315 ... -2.07309423  0.73989059
   0.50342281]
 ...
 [ 1.47801126  1.94697106  2.13837684 ...  3.09244354 -2.08181033
  -3.29248147]
 [ 1.09804254  2.40366648  1.95505138 ...  1.15536688 -2.91172236
  -2.08507166]
 [ 2.33794049  1.60771161  2.57835793 ...  1.8010591  -2.46910261
  -2.19408035]]
Shape: (20631, 12)


In [6]:
# Calculate slopes for each column
# The helper receives both the original dataframe and the standardised array and
# returns the slopes twice: slopes_df (summarised below) and slopes_array (passed
# on to the ordering step).
# NOTE(review): presumably one slope per engine and sensor - describe() reports
# a count of 100 for every sensor column; confirm against the helper.
slopes_df, slopes_array = preprocessing.calculate_slopes_all_engines(rul_df, normalised_array)

print(slopes_df.describe())

            Sn_02       Sn_03       Sn_04       Sn_07       Sn_09       Sn_11  \
count  100.000000  100.000000  100.000000  100.000000  100.000000  100.000000   
mean     0.010975    0.010625    0.012324   -0.011917    0.007100    0.012641   
std      0.002842    0.002459    0.003010    0.003444    0.009205    0.003387   
min      0.005116    0.005192    0.007006   -0.020537   -0.005766    0.006454   
25%      0.009114    0.008862    0.010195   -0.013892   -0.000709    0.010143   
50%      0.010601    0.010617    0.012143   -0.011197    0.003977    0.012485   
75%      0.012328    0.012220    0.013796   -0.009588    0.014261    0.014375   
max      0.018116    0.016361    0.019765   -0.005189    0.026665    0.021425   

            Sn_12       Sn_14       Sn_15       Sn_17       Sn_20       Sn_21  
count  100.000000  100.000000  100.000000  100.000000  100.000000  100.000000  
mean    -0.012249    0.005574    0.011670    0.010988   -0.011533   -0.011540  
std      0.003736    0.009765 

In [7]:
# Order slopes by value
# This cell has no print call of its own: the "Slope Order" listing in the
# output is emitted by the helper. The returned value is used further down to
# index the array of sensor column names.
# NOTE(review): judging by the helper's name the ranking is by absolute slope -
# confirm the direction (largest first?) against the helper.
slope_order = preprocessing.return_data_ordered_abs_value(slopes_array, rul_df)


Slope Order: 
['Sn_11' 'Sn_04' 'Sn_12' 'Sn_07' 'Sn_15' 'Sn_21' 'Sn_20' 'Sn_17' 'Sn_02'
 'Sn_03' 'Sn_09' 'Sn_14']


In [8]:
# TODO: Visualise slopes for each reading (placeholder - this cell has no code yet)

In [9]:
# Determine the columns outside the most influential sensors
# (i.e. the ones that will be dropped from the dataset).

# Number of top-ranked sensors to keep; previously a bare literal in the slice.
NUM_INFLUENTIAL_COLUMNS = 5

# Candidate columns: every column except the first and the last one
# (Cycles and RUL in the dataframe as loaded).
data_columns = rul_df.columns.values[1:-1]

# slope_order is used as a positional index into data_columns; everything after
# the first NUM_INFLUENTIAL_COLUMNS entries is treated as not influential.
slope_slice = slope_order[NUM_INFLUENTIAL_COLUMNS:]

data_columns_not_influential = data_columns[slope_slice]

# NOTE(review): rul_df is rebound to a reduced dataframe by the column-removal
# step, so this cell only yields the intended columns when it runs before that
# step (i.e. in a top-to-bottom run on a fresh kernel).

print("Not influential:", data_columns_not_influential)

Not influential: ['Sn_21' 'Sn_20' 'Sn_17' 'Sn_02' 'Sn_03' 'Sn_09' 'Sn_14']


In [10]:
# Remove least influential columns
# NOTE: rul_df is rebound to the reduced dataframe here. Earlier cells that read
# rul_df (standardisation, slope calculation, column selection) would see the
# reduced frame if re-run afterwards, so run the notebook top to bottom.
rul_df = preprocessing.dataset_remove_columns(rul_df, data_columns_not_influential)

# Plain-text dump of the reduced dataframe (pandas abbreviates the rows).
print(rul_df)

        Cycles    Sn_04   Sn_07  Sn_11   Sn_12   Sn_15    RUL
Engine                                                       
1            1  1400.60  554.36  47.47  521.66  8.4195 -191.0
1            2  1403.14  553.75  47.49  522.28  8.4318 -190.0
1            3  1404.20  554.26  47.27  522.42  8.4178 -189.0
1            4  1401.87  554.45  47.13  522.86  8.3682 -188.0
1            5  1406.22  554.00  47.28  522.19  8.4294 -187.0
...        ...      ...     ...    ...     ...     ...    ...
100        196  1428.63  551.43  48.07  519.49  8.4956   -4.0
100        197  1433.58  550.86  48.04  519.68  8.5139   -3.0
100        198  1428.18  550.94  48.09  520.01  8.5646   -2.0
100        199  1426.53  550.68  48.39  519.67  8.5389   -1.0
100        200  1432.14  550.79  48.20  519.30  8.5036    0.0

[20631 rows x 7 columns]


In [11]:
# Save dataset
# Persist the reduced dataframe through the project's pickling helper. The name
# is passed without an extension; the helper's log output shows a ".pkl" file
# being written under Data/Interim.
filename = 'rul_dataset_preprocessed'
data_load.pickle_data(rul_df, filename)

Pickling dataframe ...
Pickled dataframe to: C:/Developer/nasa-turbofan-failure-prediction/Data/Interim/rul_dataset_preprocessed.pkl


In [12]:
# Split data into training and evaluation sets
# 0.2 is the evaluation share: the output reports 4126 of 20631 rows held out.
# NOTE(review): the training rows come back in non-sequential cycle order, so the
# helper presumably shuffles; no random seed is visible in this notebook -
# confirm the helper is seeded so the split is reproducible.
training_set, evaluation_set = split_data.split_train_eval(rul_df, 0.2)

Original Data Items: 20631
Training Data Items: 16505
Evaluation Data Items: 4126


In [13]:
# Create RUL Target Dataset
# Separate the feature columns from the RUL target and convert both to arrays,
# for the training split and for the evaluation split.

# Training Set as Array
rul_training_data = training_set.drop('RUL', axis = 1).values
rul_training_label = training_set['RUL'].copy().values

print(rul_training_data)
print(rul_training_label)

# Evaluation Set as Array
# Built exactly like the training arrays; this section previously had a heading
# but no code, so the evaluation split was never turned into arrays.
rul_evaluation_data = evaluation_set.drop('RUL', axis = 1).values
rul_evaluation_label = evaluation_set['RUL'].copy().values

[[  88.     1410.13    552.95     47.65    521.17      8.4184]
 [  24.     1406.22    552.91     47.43    521.53      8.4314]
 [ 116.     1404.9     554.3      47.42    522.76      8.3808]
 ...
 [  82.     1415.54    553.5      47.48    521.68      8.3974]
 [  14.     1396.28    554.52     47.26    522.19      8.4214]
 [  42.     1401.35    554.01     47.12    521.91      8.3522]]
[-187. -167. -115. ...  -74. -255. -157.]


In [15]:
# Linear Regression Model (Target - RUL)
lin_reg_rul = LinearRegression()
model_name = "LinReg_RUL"

# Train model on the full training set
lin_reg_rul.fit(rul_training_data, rul_training_label)

# Save model

# Cross-validate once and score both metrics on the same 5 folds, instead of
# running a separate cross-validation pass (and refitting every fold) per metric.
# cross_validate works on clones of the estimator, so the fit above is untouched.
cv_results = cross_validate(
    lin_reg_rul,
    rul_training_data,
    rul_training_label,
    scoring = ("neg_mean_squared_error", "neg_mean_absolute_error"),
    cv = 5,
)

# Compute RMSE via cross validation
# scikit-learn reports errors negated (higher is better), so flip the sign
# before taking the square root.
scores_mse = cv_results["test_neg_mean_squared_error"]
scores_rmse = np.sqrt(-scores_mse)

# Compute MAE via cross validation
scores_mae = -1 * cv_results["test_neg_mean_absolute_error"]

# Visualise cross validation results
print(model_name, "rmse mean (cv):", scores_rmse.mean())
print(model_name, "rmse std (cv):", scores_rmse.std())

print(model_name, "mae mean (cv):", scores_mae.mean())
print(model_name, "mae std (cv):", scores_mae.std())

LinReg_RUL rmse mean (cv): 41.18492574232157
LinReg_RUL rmse std (cv): 0.5652551597479265
LinReg_RUL mae mean (cv): 31.591276452712655
LinReg_RUL mae std (cv): 0.4205946666188317
