# Data Preparation

In [1]:
# Download the Kaggle data source this notebook depends on and keep its
# local path.
# NOTE: despite the auto-generated hint, do not delete this cell: later cells
# read `behrad3d_nasa_cmaps_path` to locate the CMaps files.
# This notebook environment differs from Kaggle's Python environment, so
# libraries used by the notebook may be missing.
import kagglehub
behrad3d_nasa_cmaps_path = kagglehub.dataset_download('behrad3d/nasa-cmaps')

print('Data source import complete.')


Using Colab cache for faster access to the 'nasa-cmaps' dataset.
Data source import complete.


### Load Dataset and Define Columns

In [4]:
import os
import pandas as pd

# print to confirm
print(behrad3d_nasa_cmaps_path)

# The FD001 files are read from the "CMaps" sub-folder of the download
cmaps_path = os.path.join(behrad3d_nasa_cmaps_path, "CMaps")

# Column layout of the train/test files: two identifiers, three operational
# settings, then sensor_1 .. sensor_21
id_cols = ["engine_id", "time_in_cycles"]
setting_cols = ["op_setting_1", "op_setting_2", "op_setting_3"]
sensor_cols = [f"sensor_{i}" for i in range(1, 22)]
columns = id_cols + setting_cols + sensor_cols

# Parsing options shared by the train and test files: whitespace-separated,
# no header row, and no column promoted to the index
read_options = dict(sep=r"\s+", header=None, names=columns, index_col=False)

train_df = pd.read_csv(os.path.join(cmaps_path, "train_FD001.txt"), **read_options)
test_df = pd.read_csv(os.path.join(cmaps_path, "test_FD001.txt"), **read_options)

# The RUL file holds a single unnamed column
rul_df = pd.read_csv(os.path.join(cmaps_path, "RUL_FD001.txt"),
                     header=None, names=["RUL"])


/kaggle/input/nasa-cmaps


### Drop columns with constant values, as they provide no predictive power


In [5]:
# A column whose training values are all identical carries no signal:
# count the distinct values of every column at once and keep the names of
# those with exactly one.
distinct_counts = train_df.nunique()
constant_cols = distinct_counts[distinct_counts == 1].index.tolist()

print("Columns with constant values:", constant_cols)

# Remove the same columns from both splits so their layouts stay aligned
for split_df in (train_df, test_df):
    split_df.drop(columns=constant_cols, inplace=True, errors='ignore')

Columns with constant values: ['op_setting_3', 'sensor_1', 'sensor_5', 'sensor_10', 'sensor_16', 'sensor_18', 'sensor_19']


### Calculate RUL for the training data


In [6]:
# Last recorded cycle of each engine, broadcast to every row of that engine
last_cycle = train_df.groupby('engine_id')['time_in_cycles'].transform('max')

# RUL = cycles remaining until the engine's last recorded cycle, capped at 125
train_df['RUL'] = (last_cycle - train_df['time_in_cycles']).clip(upper=125)

print("RUL column calculated, capped at 125, and added to the training data.")
train_df.head()


RUL column calculated, capped at 125, and added to the training data.


Unnamed: 0,engine_id,time_in_cycles,op_setting_1,op_setting_2,sensor_2,sensor_3,sensor_4,sensor_6,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,641.82,1589.7,1400.6,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419,125
1,1,2,0.0019,-0.0003,642.15,1591.82,1403.14,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236,125
2,1,3,-0.0043,0.0003,642.35,1587.99,1404.2,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,125
3,1,4,0.0007,0.0,642.35,1582.79,1401.87,21.61,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,125
4,1,5,-0.0019,-0.0002,642.37,1582.85,1406.22,21.61,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044,125


# EDA + Feature Building

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler


### Set a seed for reproducibility


In [8]:
import os

# Single seed value shared by every random number generator seeded below
SEED_VALUE = 42

# Export the hash seed first, before the remaining libraries are imported
os.environ['PYTHONHASHSEED'] = str(SEED_VALUE)

import random
import numpy as np
import tensorflow as tf

# Seed, in order: Python's built-in RNG, NumPy's global RNG, TensorFlow
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED_VALUE)

print("Random seeds are set for reproducibility.")

Random seeds are set for reproducibility.


In [9]:
def generate_sequences(df, sequence_length, feature_cols):
    """
    Build sliding-window sequences and their targets for the LSTM model.

    Each engine (rows sharing an ``engine_id``) is windowed independently, so
    no sequence ever mixes rows from two engines. The target of a window is
    the ``RUL`` value of the window's last row.

    Args:
        df (pd.DataFrame): Must contain ``engine_id``, ``RUL`` and every
            column named in ``feature_cols``. Rows of an engine are used in
            the order in which they appear in ``df``.
        sequence_length (int): Number of consecutive rows per window (>= 1).
        feature_cols (sequence of str): Columns used as model inputs.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``sequences`` with shape
        ``(n_windows, sequence_length, n_features)`` and ``targets`` with
        shape ``(n_windows,)``. Engines with fewer than ``sequence_length``
        rows contribute no windows; if no engine is long enough, both arrays
        are empty but keep these shapes.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    feature_cols = list(feature_cols)
    sequences, targets = [], []

    # sort=False keeps the engines in order of first appearance
    for _, engine_df in df.groupby('engine_id', sort=False):
        # Convert to NumPy once per engine; every window below is a cheap
        # array slice instead of a fresh DataFrame selection.
        features = engine_df[feature_cols].to_numpy()
        rul = engine_df['RUL'].to_numpy()

        # range() is empty when the engine is shorter than one window
        for start in range(len(engine_df) - sequence_length + 1):
            stop = start + sequence_length
            sequences.append(features[start:stop])
            targets.append(rul[stop - 1])

    if not sequences:
        # Keep the documented rank so downstream shape lookups still work
        return (np.empty((0, sequence_length, len(feature_cols))),
                np.empty((0,)))

    return np.array(sequences), np.array(targets)

# Every column except the two identifiers and the target is a model input
feature_cols = train_df.columns.drop(['engine_id', 'time_in_cycles', 'RUL'])

# Fit the min-max scaler on the training features only, then apply the same
# fitted scaling to both splits
scaler = MinMaxScaler().fit(train_df[feature_cols])
train_df[feature_cols] = scaler.transform(train_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

# Window the scaled training data into sequences of 50 rows
sequence_length = 50
X_train_lstm, y_train_lstm = generate_sequences(
    df=train_df,
    sequence_length=sequence_length,
    feature_cols=feature_cols,
)

### Prepare the test data, taking only the last sequence for each engine


In [10]:
def _last_window(engine_rows, window, columns):
    """Return the final `window` rows of `columns`, zero-padded at the front if the engine is shorter."""
    tail = engine_rows[columns].tail(window).values
    missing = window - len(tail)
    if missing <= 0:
        return tail
    # Too few cycles: leading rows stay zero, real measurements fill the end
    padded = np.zeros((window, len(columns)))
    padded[missing:] = tail
    return padded


# One sequence per test engine, in order of first appearance in the test set
X_test_lstm = np.array([
    _last_window(engine_df, sequence_length, feature_cols)
    for _, engine_df in test_df.groupby('engine_id', sort=False)
])

# The ground truth RUL values correspond to these test sequences
y_test_lstm = rul_df['RUL'].values

print(f"LSTM Test features shape: {X_test_lstm.shape}")
print(f"LSTM Test targets shape: {y_test_lstm.shape}")

LSTM Test features shape: (100, 50, 17)
LSTM Test targets shape: (100,)


# Model Training & Validation

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Determine the input shape from our training data (sequence_length, num_features)
input_shape = (X_train_lstm.shape[1], X_train_lstm.shape[2])

# Build the stacked-LSTM regressor.
# The input shape is declared with an explicit Input layer instead of an
# `input_shape=` argument on the first LSTM, which Keras 3 flags as deprecated
# for Sequential models.
lstm_model = Sequential()
lstm_model.add(Input(shape=input_shape))

# First LSTM layer with Dropout (returns the full sequence for the next LSTM)
lstm_model.add(LSTM(128, return_sequences=True))
lstm_model.add(Dropout(0.3))

# Second LSTM layer with Dropout (still returns the full sequence)
lstm_model.add(LSTM(64, return_sequences=True, activation='tanh'))
lstm_model.add(Dropout(0.3))

# Third LSTM layer with Dropout (returns only its final output)
lstm_model.add(LSTM(units=32, activation='tanh'))
lstm_model.add(Dropout(0.3))

# Dense head: one hidden layer, then a single linear unit for the RUL value
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dense(1))

# Display the model's architecture
lstm_model.summary()

In [12]:
# Compile the model
lstm_model.compile(optimizer='adam',
                   loss='mean_squared_error',
                   metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [22]:
# Stop once the validation loss has gone 10 epochs without improving, and
# roll the model back to the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Training configuration; 20% of the training sequences are held out for
# validation
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=2,
)

print("Training the LSTM model...")
history = lstm_model.fit(X_train_lstm, y_train_lstm, **fit_options)
print("Training complete.")

Training the LSTM model...
Epoch 1/100
394/394 - 53s - 135ms/step - loss: 235.6095 - root_mean_squared_error: 15.3496 - val_loss: 170.1419 - val_root_mean_squared_error: 13.0438
Epoch 2/100
394/394 - 79s - 201ms/step - loss: 222.1821 - root_mean_squared_error: 14.9058 - val_loss: 164.9498 - val_root_mean_squared_error: 12.8433
Epoch 3/100
394/394 - 87s - 220ms/step - loss: 217.1165 - root_mean_squared_error: 14.7349 - val_loss: 229.6832 - val_root_mean_squared_error: 15.1553
Epoch 4/100
394/394 - 78s - 197ms/step - loss: 208.7185 - root_mean_squared_error: 14.4471 - val_loss: 176.2305 - val_root_mean_squared_error: 13.2752
Epoch 5/100
394/394 - 81s - 207ms/step - loss: 203.4636 - root_mean_squared_error: 14.2641 - val_loss: 188.1248 - val_root_mean_squared_error: 13.7159
Epoch 6/100
394/394 - 50s - 128ms/step - loss: 195.0103 - root_mean_squared_error: 13.9646 - val_loss: 183.1942 - val_root_mean_squared_error: 13.5349
Epoch 7/100
394/394 - 84s - 213ms/step - loss: 192.1441 - root_mean

In [24]:
# Predict on the LSTM test sequences and collapse the 2D model output into
# a 1D array for evaluation
y_pred_lstm = lstm_model.predict(X_test_lstm).flatten()

print("Predictions made on the test set using the LSTM model.")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step 
Predictions made on the test set using the LSTM model.


In [25]:
def nasa_score(y_true, y_pred):
    """
    Calculates the NASA C-MAPSS scoring function.

    With d = y_pred - y_true, each prediction contributes
    exp(-d / 13) - 1 when d < 0 (early prediction) and exp(d / 10) - 1
    otherwise (late prediction, penalised more heavily). The score is the sum
    of these penalties; 0 means every prediction was exact.

    Args:
        y_true (array-like): The actual RUL values.
        y_pred (array-like): The predicted RUL values. Lists, tuples and
            arrays of any shape are accepted as long as they hold the same
            number of elements as ``y_true``; both are flattened first.

    Returns:
        float: The total score (0.0 for empty inputs).

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in element count.
    """
    # Flatten so an (n, 1) model output cannot broadcast against (n,) targets
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    # Signed error: negative = predicted failure too early, positive = too late
    d = y_pred - y_true

    # expm1(x) == exp(x) - 1, evaluated accurately for small x
    penalties = np.where(d < 0, np.expm1(-d / 13.0), np.expm1(d / 10.0))

    return float(penalties.sum())

# The NASA score itself is computed on the test-set predictions in the evaluation cell below.

In [27]:
from sklearn.metrics import mean_squared_error, r2_score
# --- Evaluate against the ORIGINAL, UNCLIPPED ground truth RUL values ---

# Compute the three test-set metrics first, then report them together
rmse_lstm = np.sqrt(mean_squared_error(y_test_lstm, y_pred_lstm))  # root mean squared error
r2_lstm = r2_score(y_test_lstm, y_pred_lstm)                       # R-squared
nasa_score_lstm = nasa_score(y_test_lstm, y_pred_lstm)             # NASA score

print(f"LSTM Test RMSE: {rmse_lstm:.2f}")
print(f"LSTM Test R-squared Score: {r2_lstm:.2f}")
print(f"LSTM Test NASA Score: {nasa_score_lstm:.2f}")

LSTM Test RMSE: 16.26
LSTM Test R-squared Score: 0.85
LSTM Test NASA Score: 512.04


In [18]:
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

# Mean absolute error between the true and predicted RUL
mae = mean_absolute_error(y_test_lstm, y_pred_lstm)

# Coefficient of determination of the predictions
r2 = r2_score(y_test_lstm, y_pred_lstm)

# R2 expressed as a percentage. This is a regression goodness-of-fit figure,
# not a classification accuracy, so it is reported under its real name; the
# variable name is kept only so that any later cell reading it keeps working.
accuracy_percent = r2 * 100

print("R2 (% of variance explained):", accuracy_percent)
print("MAE:", mae)
print("R2 Score:", r2)

Accuracy (%): 86.3892138004303
MAE: 11.385472297668457
R2 Score: 0.863892138004303


# Deployment via Web Interface