# **Feature Engineering**

This notebook applies feature engineering techniques such as rolling window statistics, normalization, and dimensionality reduction (PCA). These transformations help capture degradation patterns while reducing noise in sensor data.

In [10]:
import pandas as pd # Import necessary libraries for data analysis and wrangling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [5]:
# Processed output of the EDA step, read straight from the project repository.
url = "https://raw.githubusercontent.com/Akarsh-Doki/Doki-NASA-engine-failure-ML-project/refs/heads/main/data/processed/eda_data.csv"

# The file carries an "Unnamed: 0" column (it looks like a row index that was written
# out with the CSV). It is only a row counter, so drop it here to keep it from being
# carried into the engineered feature set. errors="ignore" keeps the cell working if
# the file is later re-exported without that column.
data = pd.read_csv(url).drop(columns=["Unnamed: 0"], errors="ignore")
data.head(20)

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_8,sensor_measurement_10,sensor_measurement_11,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,RUL
0,1,1,34.9983,0.84,100.0,449.44,555.32,2222.65,1.02,42.02,8048.56,9.3461,0.02,87.5
1,1,2,41.9982,0.8408,100.0,445.0,549.9,2211.57,1.02,42.2,8072.3,9.3774,0.02,87.5
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1915.11,0.94,36.69,7864.87,10.8941,0.02,87.5
3,1,4,42.0077,0.8416,100.0,445.0,549.51,2211.58,1.02,41.96,8068.66,9.3528,0.02,87.5
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1915.1,0.94,36.89,7861.23,10.8963,0.02,87.5
5,1,6,25.0045,0.6205,60.0,462.54,537.02,1915.15,0.94,36.78,7868.87,10.8912,0.02,87.5
6,1,7,42.0043,0.8409,100.0,445.0,549.74,2211.62,1.02,42.19,8075.54,9.3753,0.02,87.5
7,1,8,20.002,0.7002,100.0,491.19,607.44,2323.87,1.08,44.27,8049.26,9.2369,0.02,87.5
8,1,9,41.9995,0.8407,100.0,445.0,549.33,2211.61,1.02,42.3,8065.78,9.3878,0.02,87.5
9,1,10,42.0011,0.84,100.0,445.0,549.33,2211.56,1.02,42.02,8069.11,9.3957,0.02,87.5


**Rolling Feature Engineering**

We create new features that summarize sensor and operational setting behavior over time.

Instead of using raw values at each cycle, rolling statistics capture *trends* (slopes, averages, variability).

This helps the model detect gradual degradation patterns that are predictive of Remaining Useful Life (RUL).

In [8]:
# Work on a copy so the frame loaded earlier stays untouched and this cell can be
# re-run without stacking features on top of features.
rolling_data = data.copy()

# Define operational and sensor columns
op_cols = [column for column in data.columns if "operational" in column]
sensor_cols = [column for column in data.columns if "sensor" in column]
feature_cols = op_cols + sensor_cols

# 1. Rolling Standard Deviation (window=5 cycles)
#    - Std shows variability over the window, which can indicate instability or early fault signals.
#    - Computed per engine (unit_number); reset_index drops the group level so the
#      result aligns with rolling_data's own row index.
rolling_data[[f'{col}_std' for col in feature_cols]] = (
    rolling_data.groupby("unit_number")[feature_cols]
    .rolling(window=5, min_periods=1).std()
    .reset_index(level=0, drop=True)
)

# 2. Rolling Slope (window=5 cycles)
#    - Measures the change of each signal over a 5-cycle lag, smoothed with a 5-cycle mean.
#    - Positive slope = increasing trend (e.g., rising temperature or vibration).
#    - Negative slope = decreasing trend (e.g., dropping pressure).
#    - This can reveal whether an engine is degrading quickly or stabilizing.
#    - groupby(...).diff() returns a plain (ungrouped) frame, so the smoothing step
#      must be grouped by unit again; otherwise the rolling window runs across engine
#      boundaries and the first cycles of one engine inherit the previous engine's values.
#    - The first 5 cycles of every engine have no 5-cycle lag and therefore stay NaN.
lagged_diff = rolling_data.groupby("unit_number")[feature_cols].diff(periods=5)
rolling_data[[f'{col}_slope' for col in feature_cols]] = (
    lagged_diff.groupby(rolling_data["unit_number"])
    .rolling(window=5, min_periods=1).mean()
    .reset_index(level=0, drop=True)
)

# 3. Rolling Variance (window=20 cycles)
#    - Captures longer-term fluctuations in sensor signals.
#    - High variance can signal abnormal oscillations or instability in the engine
rolling_data[[f'{col}_var' for col in feature_cols]] = (
    rolling_data.groupby("unit_number")[feature_cols]
    .rolling(window=20, min_periods=1).var()
    .reset_index(level=0, drop=True)
)

# 4. Normalized Cycle Index
#    - Each engine runs for a different number of cycles before failure.
#    - Normalized cycle = (current cycle / engine's max cycle).
#    - This gives the model a relative measure of how far along the engine is in its lifecycle.
#    - transform("max") broadcasts each engine's maximum back onto its rows, so no
#      helper column or merge is needed.
# NOTE(review): the denominator is the engine's last observed cycle, which is only known
# once the run is over. Confirm this feature is available at prediction time and does
# not leak the target.
rolling_data["normalized_cycle"] = (
    rolling_data["time_in_cycles"]
    / rolling_data.groupby("unit_number")["time_in_cycles"].transform("max")
)

# Feature engineering must only add columns, never rows.
assert len(rolling_data) == len(data)

rolling_data

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_8,sensor_measurement_10,sensor_measurement_11,...,operational_setting_3_var,sensor_measurement_1_var,sensor_measurement_2_var,sensor_measurement_8_var,sensor_measurement_10_var,sensor_measurement_11_var,sensor_measurement_14_var,sensor_measurement_15_var,sensor_measurement_16_var,normalized_cycle
0,1,1,34.9983,0.8400,100.0,449.44,555.32,2222.65,1.02,42.02,...,,,,,,,,,,0.003876
1,1,2,41.9982,0.8408,100.0,445.00,549.90,2211.57,1.02,42.20,...,0.000000,9.856800,14.688200,61.383200,0.000000,0.016200,281.793800,0.000490,0.00000,0.007752
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1915.11,0.94,36.69,...,533.333333,83.162533,85.374100,30432.024933,0.002133,9.800233,12888.801433,0.782944,0.00000,0.011628
3,1,4,42.0077,0.8416,100.0,445.00,549.51,2211.58,1.02,41.96,...,400.000000,68.861700,57.916067,22550.762958,0.001600,7.219625,9940.036025,0.589493,0.00000,0.015504
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1915.10,0.94,36.89,...,480.000000,80.662680,67.373770,27049.550470,0.001920,8.344670,12098.198030,0.708325,0.00000,0.019380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87745,259,119,35.0015,0.8403,100.0,449.44,555.56,2223.24,1.02,41.96,...,269.473684,779.047047,1680.549805,28260.105015,0.017340,13.806499,9116.955668,0.721466,0.00002,0.580488
87746,259,120,42.0066,0.8405,100.0,445.00,549.42,2212.19,1.02,42.00,...,214.736842,818.488045,1627.973813,23216.720336,0.016451,11.850784,7384.184108,0.593299,0.00002,0.585366
87747,259,121,42.0061,0.8400,100.0,445.00,549.65,2212.29,1.02,42.15,...,151.578947,856.309824,1573.086300,17708.944038,0.015529,9.661706,5407.908738,0.460406,0.00002,0.590244
87748,259,122,0.0024,0.0003,100.0,518.67,642.58,2388.05,1.30,47.47,...,151.578947,856.309824,1575.231234,17708.944038,0.015529,9.578006,5545.886352,0.460561,0.00002,0.595122


**PCA (Principal Component Analysis)**

PCA is a dimensionality reduction technique.

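It transforms correlated features (like sensors and operational settings) into a smaller set of uncorrelated components, ordered by how much of the data's variance each one explains.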


***Why use PCA here?***


*  Many sensors are strongly correlated (e.g., temperature/pressure pairs).
*  High correlation adds redundancy and noise, making it harder for the models to generalize.
*  PCA reduces this redundancy by keeping the main variance directions.
*  We keep only the top components that explain most of the variance, which reduces dimensionality while preserving useful signal.

***Steps:***


1.   Standardize features (zero mean, unit variance), which ensures all sensors are on the same scale.
2.   Apply PCA to extract 10 principal components.
3.   Store components as new features (PC1, PC2, ..., PC10) for use in training instead of (or alongside) the original sensors.

This helps improve training efficiency and may reduce overfitting.

In [11]:
# Step 1: Standardize the operational + sensor data
# The fitted scaler is kept in `scaler` (instead of being discarded) so the exact same
# centering/scaling can later be applied to unseen data with scaler.transform(...).
pca_input_cols = op_cols + sensor_cols
scaler = StandardScaler()
standardized_data = scaler.fit_transform(rolling_data[pca_input_cols])

# Show the first 20 rows to verify standardized values
display(pd.DataFrame(standardized_data, columns=pca_input_cols).head(20))

# Step 2: Run PCA to extract top 10 components
pca = PCA(n_components=10)
X_pca = pca.fit_transform(standardized_data)

# Print summary: number of components and their explained variance ratio
# Explained variance ratio = % of dataset variance captured by each component
print(f"Number of components selected: {pca.n_components_}")
print(pca.explained_variance_ratio_)
print(f"Cumulative explained variance: {pca.explained_variance_ratio_.sum():.4f}")

# Step 3: Create a DataFrame for the PCA features (PC1–PC10)
# The default 0..n-1 index is kept on purpose: the components are attached to the
# feature table by row position.
X_pca_df = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(X_pca.shape[1])])

# Show the first 20 rows of transformed data
display(X_pca_df.head(20))

    operational_setting_1  operational_setting_2  operational_setting_3  \
0                0.745887               0.864365               0.417857   
1                1.220153               0.866942               0.417857   
2                0.068389               0.161386              -2.393163   
3                1.220797               0.869520               0.417857   
4                0.068505               0.156554              -2.393163   
5                0.068776               0.157198              -2.393163   
6                1.220566               0.867265               0.417857   
7               -0.270160               0.413969               0.417857   
8                1.220241               0.866620               0.417857   
9                1.220349               0.864365               0.417857   
10               1.220471               0.864365               0.417857   
11              -1.625257              -1.838656               0.417857   
12              -0.270275

**Final Dataset (combined with the PCA data)**

In [12]:
# Give the engineered features a fresh 0..n-1 index, then attach the PC columns
# row-by-row via an index join.
final_data = (
    rolling_data
    .reset_index(drop=True)
    .join(X_pca_df)
)

display(final_data.head(30))

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_8,sensor_measurement_10,sensor_measurement_11,...,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,1,1,34.9983,0.84,100.0,449.44,555.32,2222.65,1.02,42.02,...,-1.415737,1.235317,-0.045534,0.059475,-0.314511,0.077781,-0.001137,-0.089143,0.01212,-0.009799
1,1,2,41.9982,0.8408,100.0,445.0,549.9,2211.57,1.02,42.2,...,-1.582346,1.573582,0.234995,-0.030723,0.010782,0.198608,0.035993,0.023566,-0.003982,0.004569
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1915.11,0.94,36.69,...,-4.388519,-2.891644,0.017728,0.012535,-0.081822,0.073224,-0.014172,0.017833,-0.000877,0.013502
3,1,4,42.0077,0.8416,100.0,445.0,549.51,2211.58,1.02,41.96,...,-1.613489,1.557304,0.240857,-0.02759,-0.04501,0.197794,-0.019627,0.055663,0.002984,0.001893
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1915.1,0.94,36.89,...,-4.38149,-2.899509,0.002798,0.021081,-0.092768,0.122572,0.026435,-0.0078,-0.021465,0.005065
5,1,6,25.0045,0.6205,60.0,462.54,537.02,1915.15,0.94,36.78,...,-4.36595,-2.869543,0.030833,-0.001252,-0.05047,0.058358,-0.002838,0.003893,-0.013726,0.005712
6,1,7,42.0043,0.8409,100.0,445.0,549.74,2211.62,1.02,42.19,...,-1.573216,1.588635,0.246945,-0.040247,0.03068,0.175612,0.030804,0.023808,-0.005389,0.001308
7,1,8,20.002,0.7002,100.0,491.19,607.44,2323.87,1.08,44.27,...,0.576352,0.308131,-1.462428,0.027715,-0.134315,0.017583,0.05611,0.060628,0.025171,0.001896
8,1,9,41.9995,0.8407,100.0,445.0,549.33,2211.61,1.02,42.3,...,-1.603,1.548477,0.2166,-0.011682,-0.024404,0.252677,0.068483,0.01129,-0.008233,-0.009578
9,1,10,42.0011,0.84,100.0,445.0,549.33,2211.56,1.02,42.02,...,-1.624119,1.545871,0.23716,-0.022029,-0.015696,0.197418,0.019538,0.043957,0.031709,-0.006027
