In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [None]:
# First, un-trimmed load of the training file with a single-space separator and
# no header row, so the shape/head cells below show exactly what pandas parses.
df = pd.read_csv("PM_train.txt", sep = ' ',header=None)

In [None]:
# (rows, columns) of the raw training frame as parsed.
df.shape

(20631, 28)

In [None]:
# Preview the first rows of the raw frame (note the trailing all-NaN columns).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,,


These CMAPSS data files are space-separated.

However, each line ends with extra trailing spaces, so when pandas reads the file with `sep=' '` it creates two extra, empty columns (26 and 27) at the end. We drop those two columns below.


In [None]:
# Reload the training file and discard the two empty trailing columns (26, 27).
df = pd.read_csv("PM_train.txt", sep=' ', header=None).drop(columns=[26, 27])

In [None]:
# Confirm the frame now has 26 columns after dropping the empty ones.
df.shape

(20631, 26)

In [None]:
# Preview the trimmed training frame (columns are still integer-labelled).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


Let's now rename our columns to id, cycle, setting1, setting2, setting3, followed by the 21 sensor columns s1, s2, ..., s21.

In [None]:
# Column labels for the 26 data columns: engine id, cycle counter,
# three operational settings, then the sensor readings s1..s21.
col_names = (
    ['id', 'cycle']
    + [f'setting{k}' for k in range(1, 4)]
    + [f's{k}' for k in range(1, 22)]
)

In [None]:
# Replace the integer column labels of the training frame with the readable names.
df.columns=col_names

In [None]:
# Check that the new column names were applied.
df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [None]:
# Load the test file the same way as the training file and discard the
# two empty trailing columns (26, 27).
df_test = pd.read_csv("PM_test.txt", sep=' ', header=None).drop(columns=[26, 27])

In [None]:
# Give the test frame the same column names as the training frame.
df_test.columns = col_names

In [None]:
# Preview the first rows of the renamed test frame.
df_test.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [None]:
# (rows, columns) of the test frame.
df_test.shape


(13096, 26)

In [None]:
# First, un-trimmed load of the ground-truth file (single-space separator, no header).
df_truth = pd.read_csv("truth.txt", sep = ' ', header=None)

In [None]:
# Preview the raw truth frame (note the empty second column).
df_truth.head()

Unnamed: 0,0,1
0,112,
1,98,
2,69,
3,82,
4,91,


The same trailing-space issue occurs here, so we drop the one extra empty column.

In [None]:
# Reload the truth file and discard the empty trailing column (1).
df_truth = pd.read_csv("truth.txt", sep=' ', header=None).drop(columns=[1])
df_truth.head()

Unnamed: 0,0
0,112
1,98
2,69
3,82
4,91


In [None]:
# Name the single value column 'more', then derive a 1-based 'id' column
# from the row index (row 0 -> id 1, row 1 -> id 2, ...).
df_truth.columns = ['more']
df_truth['id'] = df_truth.index+1

In [None]:
# Check the renamed column and the new id column.
df_truth.head()

Unnamed: 0,more,id
0,112,1
1,98,2
2,69,3
3,82,4
4,91,5


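Group by ```id``` and take the maximum value of ```cycle``` to get the last recorded cycle of each test engine. We need it below, together with the RUL (remaining useful life), to work out when each engine fails.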

In [None]:
# One row per engine id holding the largest cycle number seen in the test data.
rul = df_test.groupby('id', as_index=False)['cycle'].max()

In [None]:
# Preview the per-engine maximum cycle.
rul.head()

Unnamed: 0,id,cycle
0,1,31
1,2,49
2,3,126
3,4,106
4,5,98


In [None]:
# Label the aggregated cycle column 'max' (last recorded cycle per engine).
rul = rul.rename(columns={'cycle': 'max'})

In [None]:
# Check the renamed columns.
rul.head()

Unnamed: 0,id,max
0,1,31
1,2,49
2,3,126
3,4,106
4,5,98


* ```df_truth``` Contains the Remaining Useful Life (RUL) of each test engine at the last cycle in the test data.

* The column ```more``` in ```df_truth``` means "how many more cycles until failure" from the last point in ```df_test```

In [None]:
# Show df_truth again for reference next to the explanation of its columns.
df_truth.head()

Unnamed: 0,more,id
0,112,1
1,98,2
2,69,3
3,82,4
4,91,5


The actual failure cycle, or remaining-to-failure (**RTF**), is given by:

last recorded cycle + remaining useful life

i.e.

 ```df_truth['more']``` + ```rul['max']```

In [None]:
# Failure cycle per engine = last recorded test cycle + remaining cycles.
# Look the last cycle up by engine id rather than relying on df_truth and rul
# happening to share the same positional index; an id missing from rul then
# shows up as NaN instead of being silently paired with the wrong engine.
last_cycle_by_id = rul.set_index('id')['max']
df_truth['rtf'] = df_truth['more'] + df_truth['id'].map(last_cycle_by_id)
df_truth.head()

Unnamed: 0,more,id,rtf
0,112,1,143
1,98,2,147
2,69,3,195
3,82,4,188
4,91,5,189


We needed ```more``` only to calculate ```rtf```, so now we can drop it.

In [None]:
# 'more' is no longer needed; keep only id and rtf.
df_truth = df_truth.drop(columns=['more'])

In [None]:
# Attach the truth columns to every test row of the matching engine
# (left join keeps all test rows).
df_test = pd.merge(df_test, df_truth, how='left', on=['id'])

In test, the engines have not failed yet — we only get partial data.
So we don’t know from the test file when each engine will fail.

**That info is given separately in truth.txt as:**

“Engine X will fail Y cycles after the last recorded cycle.”

so we first find **RTF**

then for **TTF**:

* **TTF** = actual failure cycle (**RTF**) - current cycle


and we can then remove rtf



In [None]:
# Time to failure for every test row = failure cycle - current cycle;
# the helper column rtf is discarded once ttf has been derived.
cycles_left = df_test['rtf'] - df_test['cycle']
df_test = df_test.assign(ttf=cycles_left).drop(columns=['rtf'])
df_test.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,ttf
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,142
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,141
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166,140
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737,139
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413,138


Now lets calculate **TTF** for training set too.

Here the machine runs until it actually fails

so for TTF we find maximum of cycle for particular machine id and then we apply :
* ttf = max (cycle) - cycle

for all rows; for each engine the last recorded row has ttf equal to zero, i.e. the point of failure.

In [None]:
# Training engines run until failure, so time to failure for a row is the
# engine's last cycle minus the current cycle. The aggregation is passed as the
# string 'max' rather than the builtin max, which pandas warns about
# (FutureWarning) because the callable's handling will change in a future version.
df['ttf'] = df.groupby('id')['cycle'].transform('max') - df['cycle']
df.head()

  df['ttf'] = df.groupby(['id'])['cycle'].transform(max)-df['cycle']


Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,ttf
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [None]:
# List the distinct time-to-failure values present in the training frame.
df['ttf'].unique()

array([191, 190, 189, 188, 187, 186, 185, 184, 183, 182, 181, 180, 179,
       178, 177, 176, 175, 174, 173, 172, 171, 170, 169, 168, 167, 166,
       165, 164, 163, 162, 161, 160, 159, 158, 157, 156, 155, 154, 153,
       152, 151, 150, 149, 148, 147, 146, 145, 144, 143, 142, 141, 140,
       139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127,
       126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114,
       113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101,
       100,  99,  98,  97,  96,  95,  94,  93,  92,  91,  90,  89,  88,
        87,  86,  85,  84,  83,  82,  81,  80,  79,  78,  77,  76,  75,
        74,  73,  72,  71,  70,  69,  68,  67,  66,  65,  64,  63,  62,
        61,  60,  59,  58,  57,  56,  55,  54,  53,  52,  51,  50,  49,
        48,  47,  46,  45,  44,  43,  42,  41,  40,  39,  38,  37,  36,
        35,  34,  33,  32,  31,  30,  29,  28,  27,  26,  25,  24,  23,
        22,  21,  20,  19,  18,  17,  16,  15,  14,  13,  12,  1

Now lets apply a threshold for converting this label into binary classification column (with 0s and 1s)


so let's say we want the threshold to be 30 cycles, then
* ttf <= 30 = failure (label 1)
* ttf > 30 = okay (label 0)

In [None]:
# Work on copies so the unlabelled df / df_test frames stay untouched.
df_train1 = df.copy()
df_test1 = df_test.copy()

# Threshold in cycles: rows with ttf <= period are labelled 1, all others 0.
period = 30

# Vectorised comparison instead of a per-row Python lambda; the result is the
# same 0/1 integer column.
df_train1['label_bc'] = (df_train1['ttf'] <= period).astype(int)
df_test1['label_bc'] = (df_test1['ttf'] <= period).astype(int)


In [None]:
# Preview the labelled training frame (new label_bc column at the end).
df_train1.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,ttf,label_bc
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191,0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190,0
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189,0
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188,0
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187,0


In [None]:
# Preview the labelled test frame (new label_bc column at the end).
df_test1.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,ttf,label_bc
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,142,0
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,141,0
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166,140,0
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737,139,0
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413,138,0


In [None]:
# Both classes should be present in the training labels.
df_train1['label_bc'].unique()

array([0, 1])

In [None]:
# Both classes should be present in the test labels.
df_test1['label_bc'].unique()

array([0, 1])

In [None]:
# Model inputs: the three operational settings followed by sensors s1..s21
# (id, cycle and ttf are deliberately not part of the feature list).
features_col_name = (
    [f'setting{k}' for k in range(1, 4)]
    + [f's{k}' for k in range(1, 22)]
)
# Binary label column built from the ttf threshold.
target_col_name = 'label_bc'

In [None]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaler to the test features, so the test data does not influence the
# scaling parameters.
sc=MinMaxScaler()
df_train1[features_col_name]=sc.fit_transform(df_train1[features_col_name])
df_test1[features_col_name]=sc.transform(df_test1[features_col_name])

In [None]:
# Preview the training frame after scaling the feature columns.
df_train1.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,ttf,label_bc
0,1,1,0.45977,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,...,0.199608,0.363986,0.0,0.333333,0.0,0.0,0.713178,0.724662,191,0
1,1,2,0.609195,0.25,0.0,0.0,0.283133,0.453019,0.352633,0.0,...,0.162813,0.411312,0.0,0.333333,0.0,0.0,0.666667,0.731014,190,0
2,1,3,0.252874,0.75,0.0,0.0,0.343373,0.369523,0.370527,0.0,...,0.171793,0.357445,0.0,0.166667,0.0,0.0,0.627907,0.621375,189,0
3,1,4,0.54023,0.5,0.0,0.0,0.343373,0.256159,0.331195,0.0,...,0.174889,0.166603,0.0,0.333333,0.0,0.0,0.573643,0.662386,188,0
4,1,5,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,...,0.174734,0.402078,0.0,0.416667,0.0,0.0,0.589147,0.704502,187,0


# Let's try Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Split each labelled frame into its feature matrix and binary target vector.
X_train, Y_train = df_train1[features_col_name], df_train1[target_col_name]
X_test, Y_test = df_test1[features_col_name], df_test1[target_col_name]

In [None]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf_model.fit(X_train, Y_train)

In [None]:
# Make predictions on the test data
y_pred_default = rf_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(Y_test, y_pred_default)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(Y_test, y_pred_default)
print('Confusion Matrix:')
print(cm)


Accuracy: 98.69%
Confusion Matrix:
[[12722    42]
 [  129   203]]


# With Hyperparameter Tuning


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the forest: 3 * 3 * 3 * 3 * 2 = 162 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],      # how many trees to grow
    max_depth=[10, 20, None],          # depth cap per tree (None = unlimited)
    min_samples_split=[2, 5, 10],      # samples required to split an internal node
    min_samples_leaf=[1, 2, 4],        # samples required at a leaf
    bootstrap=[True, False],           # whether trees are built on bootstrap samples
)


In [None]:
from sklearn.model_selection import GroupKFold

rf = RandomForestClassifier(random_state=42)

# Consecutive cycles of one engine are strongly correlated. Plain cv=3 can place
# rows of the same engine in both the training and the validation part of a
# fold, which inflates the cross-validation score. Build the 3 folds grouped by
# engine id so every engine lies entirely on one side of each split.
engine_cv_splits = list(
    GroupKFold(n_splits=3).split(X_train, Y_train, groups=df_train1['id'])
)

# NOTE(review): the classes are imbalanced, so 'accuracy' is a weak selection
# metric; consider 'f1' or 'recall' (and update the report cell accordingly).
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=engine_cv_splits,
                           n_jobs=-1,
                           verbose=2,
                           scoring='accuracy')


In [None]:
# Run the exhaustive search on the training data; this is the expensive cell
# (the log below reports 486 fits: 162 candidates x 3 folds).
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


In [None]:
# Report the winning hyperparameter combination and its mean cross-validated
# score (the score is accuracy, as set by scoring='accuracy' on the search).
print("Best Parameters:\n", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)


Best Parameters:
 {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best Accuracy:  0.9599631622315933


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Take the refitted best estimator from the search and score it on the test set.
best_rf_model = grid_search.best_estimator_
y_pred_tuned = best_rf_model.predict(X_test)

# Accuracy alone hides the minority class, so also print the confusion matrix
# and the per-class precision/recall report.
print("Test Accuracy:", accuracy_score(Y_test, y_pred_tuned))
print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred_tuned))
print("Classification Report:\n", classification_report(Y_test, y_pred_tuned))


Test Accuracy: 0.9871716554673182
Confusion Matrix:
 [[12722    42]
 [  126   206]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     12764
           1       0.83      0.62      0.71       332

    accuracy                           0.99     13096
   macro avg       0.91      0.81      0.85     13096
weighted avg       0.99      0.99      0.99     13096

