## Data Preprocessing

### Load the data

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler

In [2]:
os.getcwd()

'c:\\Users\\amman\\Documents\\MLOPS\\Aircraft-Engine-Predictive-Maintenance\\notebooks'

In [3]:
os.chdir("../")
os.getcwd()

'c:\\Users\\amman\\Documents\\MLOPS\\Aircraft-Engine-Predictive-Maintenance'

In [10]:
data_path = r"data/raw/CMAPSSData"
train_file = os.path.join(data_path, "train_FD001.txt")
test_file = os.path.join(data_path, "test_FD001.txt")

# Column names based on dataset description
column_names = ["unit", "time", "setting_1", "setting_2", "setting_3"] + [f"sensor_{i}" for i in range(1, 22)]

# Load data
train_df = pd.read_csv(train_file, sep='\s+', header=None, names=column_names, engine='python')
test_df = pd.read_csv(test_file, sep='\s+', header=None, names=column_names, engine='python')

#### Add RUL columns for Train and Test Sets

In [11]:
# Add RUL for Train set
rul_max = train_df.groupby("unit")["time"].max().reset_index()
rul_max.columns = ["unit", "max_time"]
train_df = train_df.merge(rul_max, on="unit", how="left")
train_df["RUL"] = train_df["max_time"] - train_df["time"]
train_df.drop(columns=["max_time"], inplace=True)

# Add RUL for Test set
rul_max = test_df.groupby("unit")["time"].max().reset_index()
rul_max.columns = ["unit", "max_time"]
test_df = test_df.merge(rul_max, on="unit", how="left")
test_df["RUL"] = test_df["max_time"] - test_df["time"]
test_df.drop(columns=["max_time"], inplace=True)

In [12]:
train_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [13]:
test_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,553.9,2388.04,9050.17,1.3,47.2,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,30
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,554.85,2388.01,9054.42,1.3,47.5,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,29
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,554.11,2388.05,9056.96,1.3,47.5,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166,28
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,21.61,554.07,2388.03,9045.29,1.3,47.28,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737,27
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,21.61,554.16,2388.01,9044.55,1.3,47.31,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413,26


#### Apply Scaling to Features

In [14]:
# Select sensor and settings columns to scale
scale_columns = ["setting_1", "setting_2", "setting_3"] + [f"sensor_{i}" for i in range(1, 22)]

# Extract sensor data
train_sensors = train_df[scale_columns]
test_sensors = test_df[scale_columns]

# Scale sensor data
scaler = StandardScaler()
train_sensors_scaled = scaler.fit_transform(train_sensors)
test_sensors_scaled = scaler.transform(test_sensors)

# Replace the original sensor columns with scaled data
train_df[scale_columns] = train_sensors_scaled
test_df[scale_columns] = test_sensors_scaled

In [15]:
train_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.31598,-1.372953,0.0,0.0,-1.721725,-0.134255,-0.925936,-1.776357e-15,0.141683,1.121141,-0.516338,-0.862813,0.0,-0.266467,0.334262,-1.05889,-0.269071,-0.603816,-1.387779e-17,-0.78171,0.0,0.0,1.348493,1.194427,191
1,1,2,0.872722,-1.03172,0.0,0.0,-1.06178,0.211528,-0.643726,-1.776357e-15,0.141683,0.43193,-0.798093,-0.958818,0.0,-0.191583,1.174899,-0.363646,-0.642845,-0.275852,-1.387779e-17,-0.78171,0.0,0.0,1.016528,1.236922,190
2,1,3,-1.961874,1.015677,0.0,0.0,-0.661813,-0.413166,-0.525953,-1.776357e-15,0.141683,1.008155,-0.234584,-0.557139,0.0,-1.015303,1.364721,-0.919841,-0.551629,-0.649144,-1.387779e-17,-2.073094,0.0,0.0,0.739891,0.503423,189
3,1,4,0.32409,-0.008022,0.0,0.0,-0.661813,-1.261314,-0.784831,-1.776357e-15,0.141683,1.222827,0.188048,-0.713826,0.0,-1.539489,1.961302,-0.224597,-0.520176,-1.971665,-1.387779e-17,-0.78171,0.0,0.0,0.352598,0.777792,188
4,1,5,-0.864611,-0.690488,0.0,0.0,-0.621816,-1.251528,-0.301518,-1.776357e-15,0.141683,0.714393,-0.516338,-0.457059,0.0,-0.977861,1.052871,-0.780793,-0.521748,-0.339845,-1.387779e-17,-0.136018,0.0,0.0,0.463253,1.059552,187


In [16]:
test_df.head()

Unnamed: 0,unit,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,1.055599,1.015677,0.0,0.0,0.678077,-0.85355,-1.19148,-1.776357e-15,0.141683,0.601408,-0.798093,-0.682579,0.0,-1.277396,0.415614,-0.919841,-0.954235,-0.985107,-1.387779e-17,-0.78171,0.0,0.0,0.241943,0.774097,30
1,1,2,-1.230366,-1.03172,0.0,0.0,-1.941707,-0.338137,-1.501467,-1.776357e-15,0.141683,1.674769,-1.220725,-0.490117,0.0,-0.154141,1.012195,-0.502695,-0.216648,-1.649034,-1.387779e-17,-0.136018,0.0,0.0,1.127183,0.941305,29
2,1,3,0.141213,0.333211,0.0,0.0,-0.441831,-0.584426,-0.843717,-1.776357e-15,0.141683,0.838677,-0.657216,-0.375093,0.0,-0.154141,0.754581,-0.919841,-0.715712,0.052112,-1.387779e-17,-0.136018,0.0,0.0,1.459148,1.172256,28
3,1,4,1.924266,-0.008022,0.0,0.0,-0.481827,-1.044384,-0.279297,-1.776357e-15,0.141683,0.793483,-0.93897,-0.90357,0.0,-0.977861,-0.045381,-0.641744,-0.568929,-1.345067,-1.387779e-17,-1.427402,0.0,0.0,1.016528,0.775945,27
4,1,5,0.644125,-0.008022,0.0,0.0,-0.341839,-0.54365,-0.779276,-1.776357e-15,0.141683,0.89517,-1.220725,-0.937081,0.0,-0.865536,0.998637,-0.919841,-0.745069,-1.041101,-1.387779e-17,-2.073094,0.0,0.0,0.9612,1.138999,26


### Save Preprocessed Data

In [17]:
train_df.to_csv("data/processed/train_FD001_processed")
test_df.to_csv("data/processed/test_FD001_processed")