In [68]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn_pandas import DataFrameMapper

In [30]:
# Data
df_train = pd.read_csv("CMaps/train_FD001.txt", sep=" ", header=None)
df_test = pd.read_csv("CMaps/test_FD001.txt", sep=" ", header=None)
df_rul = pd.read_csv("CMaps/RUL_FD001.txt", sep=" ", header=None)

In [31]:
print(f"Train: {df_train.shape}")
print(f"Test: {df_test.shape}")
print(f"RUL: {df_rul.shape}")

Train: (20631, 28)
Test: (13096, 28)
RUL: (100, 2)


In [32]:
columns = ['Engine_ID', 'Cycle',
          'ALT', 'Mach', 'TRA',
          'T2', 'T24', 'T30', 'T50',
           'P2', 'P15', 'P30',
           'Nf', 'Nc', 'epr', 'Ps30', 'phi',
           'NRf', 'NRc', 'BPR', 'farB',
           'htBleed', 'Nf_dmd', 'PCNfR_dmd',
           'W31', 'W32', "SD_22", "SD_23"
          ]

In [33]:
df_train.columns = columns
df_test.columns = columns

In [34]:
# Count number of unique engines
print(f"df_train unique engines: {df_train['Engine_ID'].nunique()}")
print(f"df_test unique engines: {df_test['Engine_ID'].nunique()}")

df_train unique engines: 100
df_test unique engines: 100


In [35]:
print(f"df_rul engines count: {len(df_rul)}")

df_rul engines count: 100


In [36]:
#function for preparing training data and forming a RUL column with information about the remaining
# before breaking cycles
def prepare_train_data(data, factor = 0):
    df = data.copy()
    fd_RUL = df.groupby('Engine_ID')['Cycle'].max().reset_index()
    fd_RUL = pd.DataFrame(fd_RUL)
    fd_RUL.columns = ['Engine_ID','max']
    df = df.merge(fd_RUL, on=['Engine_ID'], how='left')
    df['RUL'] = df['max'] - df['Cycle']
    df.drop(columns=['max'],inplace = True)
    
    return df[df['Cycle'] > factor]

In [37]:
df_train = prepare_train_data(df_train, factor = 0)

In [77]:
# Add RUL to df_test

df_test = df_test.groupby('Engine_ID').last().reset_index()
print(len(df_test))

# Add max RUL to df_test
# Make list of df_rul
max_rul = df_rul[0].tolist() #len = 100

# Create actual RUL column: MAX_RUL - Cycle
df_test["RUL"] = max_rul

100


In [38]:
display(df_train)

Unnamed: 0,Engine_ID,Cycle,ALT,Mach,TRA,T2,T24,T30,T50,P2,...,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32,SD_22,SD_23,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,8.4195,0.03,392,2388,100.0,39.06,23.4190,,,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8.4318,0.03,392,2388,100.0,39.00,23.4236,,,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,8.4178,0.03,390,2388,100.0,38.95,23.3442,,,189
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8.3682,0.03,392,2388,100.0,38.88,23.3739,,,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8.4294,0.03,393,2388,100.0,38.90,23.4044,,,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8.4956,0.03,397,2388,100.0,38.49,22.9735,,,4
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,8.5139,0.03,395,2388,100.0,38.30,23.1594,,,3
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8.5646,0.03,398,2388,100.0,38.44,22.9333,,,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8.5389,0.03,395,2388,100.0,38.29,23.0640,,,1


In [39]:
df_train = df_train[['Engine_ID', 'Cycle',
          'ALT', 'Mach', 'TRA',
          'T2', 'T24', 'T30', 'T50',
           'P2', 'P15', 'P30',
           'Nf', 'Nc', 'epr', 'Ps30', 'phi',
           'NRf', 'NRc', 'BPR', 'farB',
           'htBleed', 'Nf_dmd', 'PCNfR_dmd',
           'W31', 'W32', 'RUL']]

In [40]:
response_column = "RUL"
training_columns = list(df_train.columns)
training_columns.remove('RUL')
training_columns.remove('Engine_ID')
training_columns.remove('Cycle')

### 1. Feature Ranking Using Sklearn Feature Selection (Top 8)

In [41]:
# Set mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])

In [42]:
# Train data - pandas to sklearn
train = df_mapper.fit_transform(df_train)

In [43]:
column_count = len(train[0, :])
# train
xx = train[:, 0:column_count-1]
# response
yy = train[:, column_count-1]

In [51]:
model = RandomForestRegressor(max_depth=20, n_estimators=50)
rfe = RFE(model, 8)
fit = rfe.fit(xx, yy)



In [63]:
print(f"Num Features: {fit.n_features_} \n")
print(f"Selected Features: \n {fit.support_} \n")
print(f"Feature Ranking: {fit.ranking_}")

Num Features: 8 

Selected Features: 
 [False False False False False False  True False False  True False  True
 False  True  True False  True  True False False False False False  True] 

Feature Ranking: [ 5  8 17 15  3  2  1 12 10  1  7  1 16  1  1  6  1  1 11  9 13 14  4  1]


In [66]:
top_features = []
pointer = 1
index =0
while len(top_features) < 8:
    for i in range(len(fit.ranking_)):
        if fit.ranking_[i] == pointer:
            top_features.append(training_columns[i])
    pointer += 1

print(top_features)

['T50', 'P30', 'Nc', 'Ps30', 'phi', 'NRc', 'BPR', 'W32']


### 2. Regularization

In [90]:
features = [
          'ALT', 'Mach', 'TRA',
          'T2', 'T24', 'T30', 'T50',
           'P2', 'P15', 'P30',
           'Nf', 'Nc', 'epr', 'Ps30', 'phi',
           'NRf', 'NRc', 'BPR', 'farB',
           'htBleed', 'Nf_dmd', 'PCNfR_dmd',
           'W31', 'W32'
          ]

output = ['RUL']

In [91]:
X_train = df_train[features]
X_test = df_test[features]

y_train = df_train[output]
y_test = df_test[output]

In [92]:
scaler = StandardScaler()
scaler.fit(X_train.fillna(0))

StandardScaler()

In [93]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l2'))

sel_.fit(scaler.transform(X_train.fillna(0)), y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SelectFromModel(estimator=LogisticRegression(C=1))

In [94]:
sel_.get_support()

array([False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False, False, False, False,  True])

In [95]:
selected_feat = X_train.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 24
selected features: 13
features with coefficients shrank to zero: 1810


In [102]:
# Sklearn feature selection (8)
top_features

['BPR', 'NRc', 'Nc', 'P30', 'Ps30', 'T50', 'W32', 'phi']

In [104]:
# Regularisation (13)
x = selected_feat.tolist()
x

['T24',
 'T50',
 'P2',
 'P30',
 'Nf',
 'Nc',
 'Ps30',
 'phi',
 'NRf',
 'NRc',
 'BPR',
 'farB',
 'W32']