In [8]:
# Clone our GitHub repository into the Colab environment
!git clone https://github.com/GSebs/ml-health-forecasting-transformer.git

# cd into the project directory
%cd ml-health-forecasting-transformer

!pip install -r requirements.txt

Cloning into 'ml-health-forecasting-transformer'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 0), reused 11 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 1.73 MiB | 12.24 MiB/s, done.
/content/ml-health-forecasting-transformer
Collecting pytorch-lightning (from -r requirements.txt (line 3))
  Downloading pytorch_lightning-2.5.6-py3-none-any.whl.metadata (20 kB)
Collecting mlflow (from -r requirements.txt (line 12))
  Downloading mlflow-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting evidently (from -r requirements.txt (line 13))
  Downloading evidently-0.7.16-py3-none-any.whl.metadata (11 kB)
Collecting torchmetrics>0.7.0 (from pytorch-lightning->-r requirements.txt (line 3))
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning->-r requirements.txt (line 3))
  

In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np 
import os 
import torch
import warnings
warnings.filterwarnings('ignore')

In [24]:
# --- Input location ---
DATA_DIR = 'data'
CLEAN_FILE = 'train_FD001_clean.parquet'

# --- Preprocessing settings ---
# Lookback window: the model sees the past 30 cycles when predicting RUL.
SEQUENCE_LENGTH = 30
# Upper cap applied to the RUL target.
MAX_RUL = 125
# Sensors whose standard deviation falls below this value are treated as near-constant.
STD_THRESHOLD = 0.01

# --- Column groups ---
INDEX_COLS = ['engine_id', 'time_cycles']
SETTINGS_COLS = ['op_setting_%d' % n for n in (1, 2, 3)]
SENSOR_COLS = ['sensor_%d' % n for n in range(1, 22)]

# Load the cleaned training frame that ships with the cloned repository.
parquet_path = os.path.join(DATA_DIR, CLEAN_FILE)
try:
    df = pd.read_parquet(parquet_path)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): in a notebook exit() acts on the kernel
    # session rather than failing this cell, and it hides the original error.
    # Raising stops execution here, so later statements never see an undefined `df`.
    raise FileNotFoundError(
        f"Error: Parquet file not found at '{parquet_path}'. "
        "Ensure the repo was cloned and dependencies installed."
    ) from err
else:
    print("Cleaned data file found and loaded.")

# Cap the RUL target at MAX_RUL (125): any RUL value above the cap is set to the cap.
# Very high RUL values would otherwise distract the model. An engine whose RUL is
# above the cap is considered healthy, so the model can focus on the lower RUL
# values instead.
df['RUL_capped'] = df['RUL'].clip(lower=None, upper=MAX_RUL)

Cleaned data file found and loaded.


In [25]:
# Drop near-constant sensors

# Standard deviation of every sensor column over the whole frame.
std_sensors = df[SENSOR_COLS].std()

# A sensor whose standard deviation is below STD_THRESHOLD is considered near-constant.
# Such a sensor barely changes over the recorded cycles, so it is left out of the
# model inputs as uninformative for predicting engine health.
is_near_constant = std_sensors.lt(STD_THRESHOLD)
constant_sensors = std_sensors.loc[is_near_constant].index.tolist()

# Final feature list: all operating settings plus every sensor that was not dropped.
kept_sensors = [name for name in SENSOR_COLS if name not in constant_sensors]
final_features = SETTINGS_COLS + kept_sensors

n_original = len(SETTINGS_COLS) + len(SENSOR_COLS)
print(f"Features originally considered: {n_original}")
print(f"Sensors identified as near-constant and dropped: {constant_sensors}")
print(f"Final features used for Transformer: {len(final_features)}")

Features originally considered: 24
Sensors identified as near-constant and dropped: ['sensor_1', 'sensor_5', 'sensor_6', 'sensor_10', 'sensor_16', 'sensor_18', 'sensor_19']
Final features used for Transformer: 17


In [26]:
# Normalization
#
# Two separate scalers are kept: one for the model inputs and one for the target,
# so the target scaling can be applied or reversed on its own.
# NOTE(review): both scalers are fit on the full frame, before the train/validation
# split performed further down, so validation rows contribute to the min/max.
# Confirm this is intended.

# Fit on the selected features, then overwrite those columns with the scaled values.
feature_scaler = MinMaxScaler().fit(df[final_features])
df[final_features] = feature_scaler.transform(df[final_features])

# The scaler expects a 2-D input, hence the reshape of the capped RUL to one column.
rul_column = df['RUL_capped'].to_numpy().reshape(-1, 1)
rul_scaler = MinMaxScaler().fit(rul_column)
df['RUL_NORMALIZED'] = rul_scaler.transform(rul_column)

print("Features and RUL normalized to [0, 1] range.")

Features and RUL normalized to [0, 1] range.


In [27]:
#this function takes the flat DataFrame and reshapes it into sequences for time series modeling

'''
Each engine has its own independent run of sensor readings.
For every engine we slice those readings into a set of sequences, where each sequence is 30 cycles long.

We then use time windowing so the transformer sees overlapping windows of 30 cycles at a time and predicts the RUL
at the last cycle in that window.
This allows the transformer to learn temporal patterns (a transformer is able to learn continuous patterns)
over the last 30 cycles to predict the RUL at the end of that window.
'''

def create_sequences(df, sequence_length, features, target, group_col='engine_id'):
    """Turn a flat per-cycle frame into overlapping fixed-length windows.

    Rows are grouped by ``group_col``. Within each group, every run of
    ``sequence_length`` consecutive rows becomes one input window, and the
    target value of the window's last row becomes its label. Rows are used
    in the order they appear in ``df``; no sorting is performed here.

    Args:
        df: DataFrame containing ``group_col``, the feature columns and the
            target column.
        sequence_length: Number of consecutive rows per window (>= 1).
        features: Names of the input columns.
        target: Name of the label column.
        group_col: Column identifying independent series (default
            ``'engine_id'``). Windows never span two groups.

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(n_windows, sequence_length, len(features))`` and ``y`` has shape
        ``(n_windows,)``. Groups shorter than ``sequence_length`` contribute
        no windows; if no group is long enough, both arrays are empty but
        keep these ranks.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    features = list(features)

    # Work on plain NumPy arrays: slicing them in the loop is cheaper than
    # slicing the DataFrame. The target is stored as the last column.
    data = df[features + [target]].to_numpy()
    group_ids = df[group_col].to_numpy()

    X, y = [], []

    # unique() keeps the order of first appearance, so windows are emitted
    # group by group in the same order as the groups occur in df.
    for group_id in df[group_col].unique():
        group_data = data[group_ids == group_id]

        # A non-positive count (group shorter than the window) yields no iterations.
        n_windows = len(group_data) - sequence_length + 1
        for start in range(n_windows):
            stop = start + sequence_length
            X.append(group_data[start:stop, :-1])  # all feature columns of the window
            y.append(group_data[stop - 1, -1])     # target at the window's last row

    if not X:
        # Keep the documented ranks even when there is nothing to return.
        empty_X = np.empty((0, sequence_length, len(features)), dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    return np.array(X), np.array(y)

# Build the windowed dataset from the normalized frame.
X_seq, y_seq = create_sequences(
    df,
    sequence_length=SEQUENCE_LENGTH,
    features=final_features,
    target='RUL_NORMALIZED',
)

print("\nGenerated Sequences:")
print(f"Input Sequences (X_seq) shape: {X_seq.shape} (Samples x Window Size x Features)")
print(f"Target Labels (y_seq) shape: {y_seq.shape}")

# Positional split (80% Train, 20% Validation).
# create_sequences emits windows engine by engine, so the first 80% of the windows
# (those from the engines that appear first in df) go to training and the remaining
# 20% (the later engines) go to validation.
# NOTE(review): the engine whose windows straddle TRAIN_SPLIT_INDEX contributes
# overlapping windows to both sides. Consider splitting on an engine boundary.
TRAIN_SPLIT_INDEX = int(len(X_seq) * 0.8)

X_train, X_val = X_seq[:TRAIN_SPLIT_INDEX], X_seq[TRAIN_SPLIT_INDEX:]
y_train, y_val = y_seq[:TRAIN_SPLIT_INDEX], y_seq[TRAIN_SPLIT_INDEX:]

print("\nFinal Split Shapes:")
for split_label, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
):
    print(f"{split_label} shape: {split_array.shape}")


Generated Sequences:
Input Sequences (X_seq) shape: (17731, 30, 17) (Samples x Window Size x Features)
Target Labels (y_seq) shape: (17731,)

Final Split Shapes:
X_train shape: (14184, 30, 17)
y_train shape: (14184,)
X_val shape: (3547, 30, 17)
y_val shape: (3547,)


In [28]:
import joblib

# Directory for the preprocessing artifacts (scalers and arrays), kept so that
# new, raw test data can be preprocessed the same way later on.
artifacts_dir = os.path.join(DATA_DIR, 'artifacts')
os.makedirs(artifacts_dir, exist_ok=True)

# Scalers: needed to transform test data and to reverse RUL predictions.
for scaler_obj, scaler_file in (
    (feature_scaler, 'feature_scaler.pkl'),
    (rul_scaler, 'rul_scaler.pkl'),
):
    joblib.dump(scaler_obj, os.path.join(artifacts_dir, scaler_file))

# Final NumPy arrays (ready for PyTorch).
for array_file, array_obj in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_val.npy', X_val),
    ('y_val.npy', y_val),
):
    np.save(os.path.join(artifacts_dir, array_file), array_obj)

print(f"\nFeature engineering complete. Artifacts saved in {artifacts_dir}")


Feature engineering complete. Artifacts saved in data/artifacts
