## Data Preprocessing

### Load the data

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler

In [2]:
# Show the kernel's working directory before it is changed in the next cell.
os.getcwd()

'c:\\Users\\amman\\Documents\\MLOPS\\Aircraft-Engine-Predictive-Maintenance\\notebooks'

In [3]:
# Move from the notebooks/ folder up to its parent so that the relative
# "data/..." paths used further down resolve.
# Guarded on the folder name: an unconditional chdir("../") climbs one more
# level every time this cell is re-run, which silently breaks those paths.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")
os.getcwd()

'c:\\Users\\amman\\Documents\\MLOPS\\Aircraft-Engine-Predictive-Maintenance'

In [11]:
# Raw FD001 files, addressed relative to the current working directory.
data_path = r"data/raw/CMAPSSData"
train_file = os.path.join(data_path, "train_FD001.txt")
test_file = os.path.join(data_path, "test_FD001.txt")
rul_file = os.path.join(data_path, "RUL_FD001.txt")

# Column names based on dataset description (the files are read without a header row)
column_names = ["unit", "time", "setting_1", "setting_2", "setting_3"] + [f"sensor_{i}" for i in range(1, 22)]

# Load data
# The separator is a regex, so it is written as a raw string: in a normal
# string literal "\s" is an invalid escape sequence and triggers a warning.
whitespace_opts = dict(sep=r"\s+", header=None, names=column_names, engine="python")
train_df = pd.read_csv(train_file, **whitespace_opts)
test_df = pd.read_csv(test_file, **whitespace_opts)

# The RUL file is read as a single column named "RUL".
rul_df = pd.read_csv(rul_file, names=["RUL"])

#### Add RUL columns for Train Set and Test Set

In [12]:
# Add RUL for Train set
# RUL at each row = the unit's largest recorded cycle minus the current cycle.
# transform("max") broadcasts the per-unit maximum back onto every row, so no
# temporary merge column has to be created and dropped.
train_df["RUL"] = train_df.groupby("unit")["time"].transform("max") - train_df["time"]

# Since the true RUL values for the test set are only provided for the last time cycle of each engine,
# the test set is subsetted to represent the same.
# The row with the largest "time" is selected explicitly per unit:
# groupby(...).last() takes the last non-null value of each column separately
# and depends on the row order of the file, so it is not guaranteed to return
# one real row per unit.
last_cycle_idx = test_df.groupby("unit")["time"].idxmax()
test_df = test_df.loc[last_cycle_idx].reset_index(drop=True)

# RUL values are attached positionally (rows are ordered by unit after the
# groupby), so the two frames must have exactly one row per unit each.
assert len(test_df) == len(rul_df), "RUL file must contain one value per test unit"
test_df["RUL"] = rul_df["RUL"].to_numpy()

In [13]:
# Preview the first rows of the training frame, which now carries the RUL column.
train_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [14]:
# Preview the test frame: one row per unit, with the RUL column attached.
test_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,31,-0.0006,0.0004,100.0,518.67,642.58,1581.22,1398.91,14.62,...,2388.06,8130.11,8.4024,0.03,393,2388,100.0,38.81,23.3552,112
1,2,49,0.0018,-0.0001,100.0,518.67,642.55,1586.59,1410.83,14.62,...,2388.09,8126.9,8.4505,0.03,391,2388,100.0,38.81,23.2618,98
2,3,126,-0.0016,0.0004,100.0,518.67,642.88,1589.75,1418.89,14.62,...,2388.14,8131.46,8.4119,0.03,395,2388,100.0,38.93,23.274,69
3,4,106,0.0012,0.0004,100.0,518.67,642.78,1594.53,1406.88,14.62,...,2388.11,8133.64,8.4634,0.03,395,2388,100.0,38.58,23.2581,82
4,5,98,-0.0013,-0.0004,100.0,518.67,642.27,1589.94,1419.36,14.62,...,2388.15,8125.74,8.4362,0.03,394,2388,100.0,38.75,23.4117,91


#### Apply Scaling to Features

In [15]:
# Columns to standardize: the three operational settings followed by sensors 1..21.
scale_columns = [f"setting_{i}" for i in range(1, 4)]
scale_columns += [f"sensor_{i}" for i in range(1, 22)]

# Pull the feature columns out of both frames.
train_sensors, test_sensors = train_df[scale_columns], test_df[scale_columns]

# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(train_sensors)
train_sensors_scaled = scaler.transform(train_sensors)
test_sensors_scaled = scaler.transform(test_sensors)

# Write the scaled values back over the original feature columns.
train_df[scale_columns] = train_sensors_scaled
test_df[scale_columns] = test_sensors_scaled

In [16]:
# Preview the training frame after the setting/sensor columns were standardized.
train_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.31598,-1.372953,0.0,0.0,-1.721725,-0.134255,-0.925936,-1.776357e-15,...,-1.05889,-0.269071,-0.603816,-1.387779e-17,-0.78171,0.0,0.0,1.348493,1.194427,191
1,1,2,0.872722,-1.03172,0.0,0.0,-1.06178,0.211528,-0.643726,-1.776357e-15,...,-0.363646,-0.642845,-0.275852,-1.387779e-17,-0.78171,0.0,0.0,1.016528,1.236922,190
2,1,3,-1.961874,1.015677,0.0,0.0,-0.661813,-0.413166,-0.525953,-1.776357e-15,...,-0.919841,-0.551629,-0.649144,-1.387779e-17,-2.073094,0.0,0.0,0.739891,0.503423,189
3,1,4,0.32409,-0.008022,0.0,0.0,-0.661813,-1.261314,-0.784831,-1.776357e-15,...,-0.224597,-0.520176,-1.971665,-1.387779e-17,-0.78171,0.0,0.0,0.352598,0.777792,188
4,1,5,-0.864611,-0.690488,0.0,0.0,-0.621816,-1.251528,-0.301518,-1.776357e-15,...,-0.780793,-0.521748,-0.339845,-1.387779e-17,-0.136018,0.0,0.0,0.463253,1.059552,187


In [17]:
# Preview the test frame after applying the scaler fitted on the training data.
test_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,31,-0.27026,1.35691,0.0,0.0,-0.20185,-1.51739,-1.113706,-1.776357e-15,...,-0.502695,-0.715188,-1.059765,-1.387779e-17,-0.136018,0.0,0.0,-0.034694,0.605041,112
1,2,49,0.827003,-0.349255,0.0,0.0,-0.261846,-0.641513,0.210682,-1.776357e-15,...,-0.085548,-0.883465,0.22276,-1.387779e-17,-1.427402,0.0,0.0,-0.034694,-0.25779,98
2,3,126,-0.727453,1.35691,0.0,0.0,0.3981,-0.1261,1.106199,-1.776357e-15,...,0.609696,-0.644417,-0.80646,-1.387779e-17,1.155367,0.0,0.0,0.629236,-0.145087,69
3,4,106,0.552687,1.35691,0.0,0.0,0.198117,0.653544,-0.228188,-1.776357e-15,...,0.192549,-0.530136,0.566722,-1.387779e-17,1.155367,0.0,0.0,-1.307226,-0.291971,82
4,5,98,-0.590295,-1.372953,0.0,0.0,-0.821799,-0.09511,1.158419,-1.776357e-15,...,0.748745,-0.944275,-0.158531,-1.387779e-17,0.509675,0.0,0.0,-0.366659,1.126989,91


#### Remove unnecessary columns

We will drop the sensors that showed constant values for the whole time series data. We will also drop the operational settings columns for this analysis. Additionally, the unit and time data will be removed as they are deemed not important for prediction purposes.

In [18]:
# Sensors excluded from the feature set, listed by sensor number.
drop_sensors = [f"sensor_{i}" for i in (1, 5, 6, 10, 16, 18, 19)]

# Identifier, cycle and operational-setting columns are removed as well.
drop_labels = ["unit", "time", "setting_1", "setting_2", "setting_3", *drop_sensors]

# Apply the same column removal to both frames so they keep matching layouts.
train_df = train_df.drop(columns=drop_labels)
test_df = test_df.drop(columns=drop_labels)

In [19]:
# Preview the training frame after dropping the unused columns.
train_df.head()

Unnamed: 0,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,RUL
0,-1.721725,-0.134255,-0.925936,1.121141,-0.516338,-0.862813,-0.266467,0.334262,-1.05889,-0.269071,-0.603816,-0.78171,1.348493,1.194427,191
1,-1.06178,0.211528,-0.643726,0.43193,-0.798093,-0.958818,-0.191583,1.174899,-0.363646,-0.642845,-0.275852,-0.78171,1.016528,1.236922,190
2,-0.661813,-0.413166,-0.525953,1.008155,-0.234584,-0.557139,-1.015303,1.364721,-0.919841,-0.551629,-0.649144,-2.073094,0.739891,0.503423,189
3,-0.661813,-1.261314,-0.784831,1.222827,0.188048,-0.713826,-1.539489,1.961302,-0.224597,-0.520176,-1.971665,-0.78171,0.352598,0.777792,188
4,-0.621816,-1.251528,-0.301518,0.714393,-0.516338,-0.457059,-0.977861,1.052871,-0.780793,-0.521748,-0.339845,-0.136018,0.463253,1.059552,187


In [20]:
# Preview the test frame after dropping the same columns as in the training frame.
test_df.head()

Unnamed: 0,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,RUL
0,-0.20185,-1.51739,-1.113706,1.188932,-0.234584,-0.400453,-1.165071,0.510525,-0.502695,-0.715188,-1.059765,-0.136018,-0.034694,0.605041,112
1,-0.261846,-0.641513,0.210682,0.172064,0.047171,-0.927118,0.48237,0.442731,-0.085548,-0.883465,0.22276,-1.427402,-0.034694,-0.25779,98
2,0.3981,-0.1261,1.106199,-0.878699,0.892435,-0.723788,1.268649,-0.791108,0.609696,-0.644417,-0.80646,1.155367,0.629236,-0.145087,69
3,0.198117,0.653544,-0.228188,-0.822207,0.469803,-0.631407,0.407486,0.632553,0.192549,-0.530136,0.566722,1.155367,-1.307226,-0.291971,82
4,-0.821799,-0.09511,1.158419,-0.087802,0.047171,-0.50959,-0.303908,-0.56061,0.748745,-0.944275,-0.158531,0.509675,-0.366659,1.126989,91


### Save Preprocessed Data

In [21]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail on a fresh checkout.
os.makedirs("data/processed", exist_ok=True)

# Write both frames as CSV without the index column. The target file names
# are kept exactly as they were so existing consumers still find them.
train_df.to_csv("data/processed/train_FD001_processed", index=False)
test_df.to_csv("data/processed/test_FD001_processed", index=False)