# 2.   FEATURE ENGINEERING

Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Set default Seaborn style

In [2]:
sns.set(style="whitegrid")

Load the training and test datasets

In [3]:
train_data_path = '../data/CMaps/train_FD001.txt'
test_data_path = '../data/CMaps/test_FD001.txt'
rul_data_path = '../data/CMaps/RUL_FD001.txt'

Define column names based on the dataset's structure

In [4]:
col_names = [
    'engine_id', 'time_in_cycles', 
    'operational_setting_1', 'operational_setting_2', 'operational_setting_3'
] + [f'sensor_{i}' for i in range(1, 27)]  # This creates sensor_1 to sensor_21

Load the train and test datasets

In [5]:
train_df = pd.read_csv(train_data_path, sep=' ', header=None, names=col_names)
test_df = pd.read_csv(test_data_path, sep=' ', header=None, names=col_names)

Checking the shape of dataframes

In [6]:
print("The shape of the train dataframe is: ", train_df.shape)
print("The shape of the test dataframe is: ", train_df.shape)

The shape of the train dataframe is:  (20631, 31)
The shape of the test dataframe is:  (20631, 31)


Drop columns that are completely empty (if any)

In [7]:
train_df.dropna(axis=1, how='all', inplace=True)
test_df.dropna(axis=1, how='all', inplace=True)

Checking if shape has changed

In [8]:
print("The shape of the train dataframe is: ", train_df.shape)
print("The shape of the test dataframe is: ", train_df.shape)

The shape of the train dataframe is:  (20631, 26)
The shape of the test dataframe is:  (20631, 26)


Load the Remaining Useful Life (RUL) data for the test set

In [9]:
rul_df = pd.read_csv(rul_data_path, header=None, names=['RUL'])

Checking the data

In [10]:
rul_df.head()

Unnamed: 0,RUL
0,112
1,98
2,69
3,82
4,91


Add RUL values to the test dataframe
Since the RUL values correspond to each engine in the test set, we add them directly to the test_df

In [11]:
test_df['RUL'] = test_df['engine_id'].map(lambda x: rul_df.loc[x - 1, 'RUL'])

Now, for each engine, the RUL value is the same for all cycles but corresponds to the final cycle for that engine.
Let's verify the first few rows


In [12]:
test_df[['engine_id', 'time_in_cycles', 'RUL']].head()

Unnamed: 0,engine_id,time_in_cycles,RUL
0,1,1,112
1,1,2,112
2,1,3,112
3,1,4,112
4,1,5,112


In [13]:
test_df[['engine_id', 'time_in_cycles', 'RUL']].tail()

Unnamed: 0,engine_id,time_in_cycles,RUL
13091,100,194,20
13092,100,195,20
13093,100,196,20
13094,100,197,20
13095,100,198,20


Each cycle for an engine will have the same RUL value that corresponds to the remaining cycles for that engine, starting from the last recorded cycle in the test data.

## 2.1 Rolling Statistics

Let's create rolling means and rolling standard deviations for each sensor.

In [14]:
window_size = 5  # Set the window size for rolling calculations


Create rolling mean and rolling std for each sensor

In [15]:
for sensor in [f'sensor_{i}' for i in range(1, 22)]:
    train_df[f'{sensor}_rolling_mean'] = train_df[sensor].rolling(window=window_size).mean()
    train_df[f'{sensor}_rolling_std'] = train_df[sensor].rolling(window=window_size).std()
    test_df[f'{sensor}_rolling_mean'] = test_df[sensor].rolling(window=window_size).mean()
    test_df[f'{sensor}_rolling_std'] = test_df[sensor].rolling(window=window_size).std()

Drop NaN values that result from the rolling mean and std calculations

In [16]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

Check the first few rows to ensure the features have been added

In [17]:
train_df[['engine_id', 'time_in_cycles', 'sensor_2', 'sensor_2_rolling_mean', 'sensor_2_rolling_std']].head()

Unnamed: 0,engine_id,time_in_cycles,sensor_2,sensor_2_rolling_mean,sensor_2_rolling_std
4,1,5,642.37,642.208,0.234776
5,1,6,642.1,642.264,0.128374
6,1,7,642.48,642.33,0.139463
7,1,8,642.56,642.372,0.17427
8,1,9,642.12,642.326,0.208519


## 2.2 Delta Features (Difference between Consecutive Cycles)
Calculate the delta (difference) between consecutive cycles for each sensor.

In [18]:
for sensor in [f'sensor_{i}' for i in range(1, 22)]:
    train_df[f'{sensor}_delta'] = train_df[sensor] - train_df.groupby('engine_id')[sensor].shift(1)
    test_df[f'{sensor}_delta'] = test_df[sensor] - test_df.groupby('engine_id')[sensor].shift(1)

Drop NaN values resulting from the shift operation

In [19]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

Check the first few rows to ensure the delta features have been added

In [20]:
train_df[['engine_id', 'time_in_cycles', 'sensor_2', 'sensor_2_delta']].head()

Unnamed: 0,engine_id,time_in_cycles,sensor_2,sensor_2_delta
5,1,6,642.1,-0.27
6,1,7,642.48,0.38
7,1,8,642.56,0.08
8,1,9,642.12,-0.44
9,1,10,641.71,-0.41


## 2.3 Cumulative Features (Cumulative Sum and Mean)
Create cumulative sum and cumulative mean for each sensor.

In [21]:
for sensor in [f'sensor_{i}' for i in range(1, 22)]:
    train_df[f'{sensor}_cumsum'] = train_df[sensor].cumsum()
    train_df[f'{sensor}_cummean'] = train_df[sensor].expanding().mean()
    test_df[f'{sensor}_cumsum'] = test_df[sensor].cumsum()
    test_df[f'{sensor}_cummean'] = test_df[sensor].expanding().mean()

  train_df[f'{sensor}_cummean'] = train_df[sensor].expanding().mean()
  test_df[f'{sensor}_cumsum'] = test_df[sensor].cumsum()
  test_df[f'{sensor}_cummean'] = test_df[sensor].expanding().mean()
  train_df[f'{sensor}_cumsum'] = train_df[sensor].cumsum()
  train_df[f'{sensor}_cummean'] = train_df[sensor].expanding().mean()
  test_df[f'{sensor}_cumsum'] = test_df[sensor].cumsum()
  test_df[f'{sensor}_cummean'] = test_df[sensor].expanding().mean()
  train_df[f'{sensor}_cumsum'] = train_df[sensor].cumsum()
  train_df[f'{sensor}_cummean'] = train_df[sensor].expanding().mean()
  test_df[f'{sensor}_cumsum'] = test_df[sensor].cumsum()
  test_df[f'{sensor}_cummean'] = test_df[sensor].expanding().mean()
  train_df[f'{sensor}_cumsum'] = train_df[sensor].cumsum()
  train_df[f'{sensor}_cummean'] = train_df[sensor].expanding().mean()
  test_df[f'{sensor}_cumsum'] = test_df[sensor].cumsum()
  test_df[f'{sensor}_cummean'] = test_df[sensor].expanding().mean()


Check the first few rows to ensure the cumulative features have been added

In [22]:
train_df[['engine_id', 'time_in_cycles', 'sensor_2', 'sensor_2_cumsum', 'sensor_2_cummean']].head()

Unnamed: 0,engine_id,time_in_cycles,sensor_2,sensor_2_cumsum,sensor_2_cummean
5,1,6,642.1,642.1,642.1
6,1,7,642.48,1284.58,642.29
7,1,8,642.56,1927.14,642.38
8,1,9,642.12,2569.26,642.315
9,1,10,641.71,3210.97,642.194


## 2.4 Lag Features (Values from Previous Cycles)
Create lag features that capture previous cycle values. Let's capture values from the previous cycle for each sensor.

In [23]:
for sensor in [f'sensor_{i}' for i in range(1, 22)]:
    train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[sensor].shift(1)
    test_df[f'{sensor}_lag_1'] = test_df.groupby('engine_id')[sensor].shift(1)

  train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[sensor].shift(1)
  test_df[f'{sensor}_lag_1'] = test_df.groupby('engine_id')[sensor].shift(1)
  train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[sensor].shift(1)
  test_df[f'{sensor}_lag_1'] = test_df.groupby('engine_id')[sensor].shift(1)
  train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[sensor].shift(1)
  test_df[f'{sensor}_lag_1'] = test_df.groupby('engine_id')[sensor].shift(1)
  train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[sensor].shift(1)
  test_df[f'{sensor}_lag_1'] = test_df.groupby('engine_id')[sensor].shift(1)
  train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[sensor].shift(1)
  test_df[f'{sensor}_lag_1'] = test_df.groupby('engine_id')[sensor].shift(1)
  train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[sensor].shift(1)
  test_df[f'{sensor}_lag_1'] = test_df.groupby('engine_id')[sensor].shift(1)
  train_df[f'{sensor}_lag_1'] = train_df.groupby('engine_id')[se

Drop NaN values that result from the lag features

In [24]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

Check the first few rows to ensure the lag features have been added

In [25]:
train_df[['engine_id', 'time_in_cycles', 'sensor_2', 'sensor_2_lag_1']].head()

Unnamed: 0,engine_id,time_in_cycles,sensor_2,sensor_2_lag_1
6,1,7,642.48,642.1
7,1,8,642.56,642.48
8,1,9,642.12,642.56
9,1,10,641.71,642.12
10,1,11,642.28,641.71


## 2.5. Rolling Percentile (Additional transformation)
Calculate rolling percentiles (e.g., 10th and 90th percentile) for each sensor.

In [26]:
for sensor in [f'sensor_{i}' for i in range(1, 22)]:
    train_df[f'{sensor}_rolling_10th'] = train_df[sensor].rolling(window=window_size).quantile(0.1)
    train_df[f'{sensor}_rolling_90th'] = train_df[sensor].rolling(window=window_size).quantile(0.9)
    test_df[f'{sensor}_rolling_10th'] = test_df[sensor].rolling(window=window_size).quantile(0.1)
    test_df[f'{sensor}_rolling_90th'] = test_df[sensor].rolling(window=window_size).quantile(0.9)

  train_df[f'{sensor}_rolling_10th'] = train_df[sensor].rolling(window=window_size).quantile(0.1)
  train_df[f'{sensor}_rolling_90th'] = train_df[sensor].rolling(window=window_size).quantile(0.9)
  test_df[f'{sensor}_rolling_10th'] = test_df[sensor].rolling(window=window_size).quantile(0.1)
  test_df[f'{sensor}_rolling_90th'] = test_df[sensor].rolling(window=window_size).quantile(0.9)
  train_df[f'{sensor}_rolling_10th'] = train_df[sensor].rolling(window=window_size).quantile(0.1)
  train_df[f'{sensor}_rolling_90th'] = train_df[sensor].rolling(window=window_size).quantile(0.9)
  test_df[f'{sensor}_rolling_10th'] = test_df[sensor].rolling(window=window_size).quantile(0.1)
  test_df[f'{sensor}_rolling_90th'] = test_df[sensor].rolling(window=window_size).quantile(0.9)
  train_df[f'{sensor}_rolling_10th'] = train_df[sensor].rolling(window=window_size).quantile(0.1)
  train_df[f'{sensor}_rolling_90th'] = train_df[sensor].rolling(window=window_size).quantile(0.9)
  test_df[f'{sensor}_rolling

Drop NaN values resulting from the rolling percentile calculations

In [27]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

Check the first few rows to ensure the rolling percentiles have been added

In [28]:
train_df[['engine_id', 'time_in_cycles', 'sensor_2', 'sensor_2_rolling_10th', 'sensor_2_rolling_90th']].head()

Unnamed: 0,engine_id,time_in_cycles,sensor_2,sensor_2_rolling_10th,sensor_2_rolling_90th
10,1,11,642.28,641.874,642.528
11,1,12,642.06,641.85,642.448
12,1,13,643.07,641.85,642.754
13,1,14,642.35,641.85,642.782
14,1,15,642.43,642.148,642.814


## 2.6. Feature Interaction (Product of sensors)
You can also create new features by interacting different sensor values. For example, you could take the product of two sensors.

In [29]:
train_df['sensor_1_sensor_2_interaction'] = train_df['sensor_1'] * train_df['sensor_2']
test_df['sensor_1_sensor_2_interaction'] = test_df['sensor_1'] * test_df['sensor_2']

  train_df['sensor_1_sensor_2_interaction'] = train_df['sensor_1'] * train_df['sensor_2']
  test_df['sensor_1_sensor_2_interaction'] = test_df['sensor_1'] * test_df['sensor_2']


Check the first few rows to ensure the interaction feature has been added

In [30]:
train_df[['engine_id', 'time_in_cycles', 'sensor_1', 'sensor_2', 'sensor_1_sensor_2_interaction']].head()

Unnamed: 0,engine_id,time_in_cycles,sensor_1,sensor_2,sensor_1_sensor_2_interaction
10,1,11,518.67,642.28,333131.3676
11,1,12,518.67,642.06,333017.2602
12,1,13,518.67,643.07,333541.1169
13,1,14,518.67,642.35,333167.6745
14,1,15,518.67,642.43,333209.1681


Summary of engineered features

In [31]:
train_df.describe()

Unnamed: 0,engine_id,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_17_rolling_90th,sensor_18_rolling_10th,sensor_18_rolling_90th,sensor_19_rolling_10th,sensor_19_rolling_90th,sensor_20_rolling_10th,sensor_20_rolling_90th,sensor_21_rolling_10th,sensor_21_rolling_90th,sensor_1_sensor_2_interaction
count,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,...,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0,20423.0
mean,51.536209,109.898791,-8e-06,2e-06,100.0,518.67,642.684118,1590.561123,1408.999687,14.62,...,394.055496,2388.0,2388.0,100.0,100.0,38.721022,38.9086,23.232931,23.344809,333340.971688
std,29.219596,68.372872,0.002188,0.000293,0.0,6.525784e-11,0.500361,6.132682,9.00027,3.382266e-12,...,1.350983,0.0,0.0,0.0,0.0,0.160895,0.155586,0.096134,0.093711,259.522156
min,1.0,3.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,390.6,2388.0,2388.0,100.0,100.0,38.188,38.392,22.93864,23.03132,332576.3907
25%,26.0,54.0,-0.0015,-0.0002,100.0,518.67,642.33,1586.31,1402.42,14.62,...,393.0,2388.0,2388.0,100.0,100.0,38.622,38.814,23.17395,23.28686,333157.3011
50%,52.0,105.0,0.0,0.0,100.0,518.67,642.65,1590.14,1408.1,14.62,...,394.0,2388.0,2388.0,100.0,100.0,38.74,38.924,23.2433,23.3538,333323.2755
75%,77.0,157.0,0.0015,0.0003,100.0,518.67,643.01,1594.42,1414.63,14.62,...,395.0,2388.0,2388.0,100.0,100.0,38.84,39.026,23.3032,23.41368,333509.9967
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,398.8,2388.0,2388.0,100.0,100.0,39.15,39.334,23.45984,23.59458,334298.3751
