<a href="https://colab.research.google.com/github/DGuilherme/PMTese/blob/main/CNNTransformerAllDatasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install tensorflow scikit-learn

# Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt



In [None]:
# Mount Google Drive at /content/drive. The dataset files read later in this
# notebook are addressed under /content/drive/MyDrive, so this cell has to run
# before any data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Compare all datasets

In [None]:
def run_predictive_maintenance(dataset_name, scaler_type='minmax', seq_length=50):
    """Train and evaluate a Transformer-encoder RUL regressor on one C-MAPSS subset.

    The function loads the train/test/RUL text files for ``dataset_name`` from
    Google Drive, derives a per-row RUL target, scales the selected features,
    slices each engine's history into fixed-length windows, trains the model
    with early stopping and reports RMSE/MAE on the test windows.

    Args:
        dataset_name: Subset identifier used to build the file names,
            e.g. ``'FD001'``.
        scaler_type: ``'minmax'`` (MinMaxScaler) or ``'standard'``
            (StandardScaler). The scaler is fitted on the training data only.
        seq_length: Number of consecutive cycles per input window. Engines
            with fewer than ``seq_length`` recorded cycles contribute no
            windows (this also applies to the test set).

    Returns:
        Tuple ``(model, y_pred, rmse, mae)``: the trained Keras model, the raw
        output of ``model.predict`` on the test windows, and the test RMSE and
        MAE as floats.

    Raises:
        ValueError: If ``scaler_type`` is unknown, ``seq_length`` is smaller
            than 1, the RUL file does not hold one row per test engine, or no
            window of length ``seq_length`` can be built.
    """
    # Local import: the notebook's import cell only brings in MinMaxScaler.
    from sklearn.preprocessing import StandardScaler

    scaler_classes = {'minmax': MinMaxScaler, 'standard': StandardScaler}
    if scaler_type not in scaler_classes:
        raise ValueError(
            f"Unknown scaler_type {scaler_type!r}; expected one of {sorted(scaler_classes)}")
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    # ------------------------------------------------------------------ load
    data_dir = '/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss'
    column_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + \
        [f'sensor{i}' for i in range(1, 22)]

    def read_table(prefix):
        # Each file sits inside a directory that carries the file's own name.
        file_name = f'{prefix}_{dataset_name}.txt'
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True.
        return pd.read_csv(f'{data_dir}/{file_name}/{file_name}', sep=r'\s+', header=None)

    train_data = read_table('train')
    test_data = read_table('test')
    rul_data = read_table('RUL')

    train_data.columns = column_names
    test_data.columns = column_names
    rul_data.columns = ['RUL']

    # --------------------------------------------------------------- targets
    # Training engines run to their last recorded cycle, so the target is the
    # number of cycles left until that last cycle.
    train_data['RUL'] = train_data.groupby('id')['cycle'].transform('max') - train_data['cycle']

    # The RUL file has no id column. Assumes its rows follow ascending engine
    # id (one row per test engine) -- TODO confirm against the dataset README.
    test_ids = np.sort(test_data['id'].unique())
    if len(rul_data) != len(test_ids):
        raise ValueError(
            f"RUL file has {len(rul_data)} rows but the test set has {len(test_ids)} engines")
    rul_after_last_cycle = pd.Series(rul_data['RUL'].to_numpy(), index=test_ids)

    # Test target = RUL given for the last observed cycle + cycles still to
    # come within the recorded test trajectory.
    cycles_to_last_observed = test_data.groupby('id')['cycle'].transform('max') - test_data['cycle']
    test_data['RUL'] = test_data['id'].map(rul_after_last_cycle) + cycles_to_last_observed

    # -------------------------------------------------------------- features
    # Select useful features (as suggested in papers)
    useful_sensor_cols = ['setting1', 'setting2', 'setting3'] + \
        [f'sensor{i}' for i in [2, 3, 4, 7, 8, 11, 12, 13, 14, 15, 17, 20, 21]]
    kept_columns = ['id', 'cycle'] + useful_sensor_cols + ['RUL']

    # .copy() so the scaled values are written to independent frames instead
    # of to a view of the full table (avoids SettingWithCopyWarning).
    train_data = train_data[kept_columns].copy()
    test_data = test_data[kept_columns].copy()

    # Fit on train only; reuse the fitted scaler for test to avoid leakage.
    scaler = scaler_classes[scaler_type]()
    train_data[useful_sensor_cols] = scaler.fit_transform(train_data[useful_sensor_cols])
    test_data[useful_sensor_cols] = scaler.transform(test_data[useful_sensor_cols])

    # ------------------------------------------------------------- windowing
    def create_sequences(data, sequence_length):
        """Return (windows, labels); each label is the RUL at the window's last row."""
        sequences = []
        labels = []
        # sort=False keeps engines in order of first appearance.
        for _, engine_data in data.groupby('id', sort=False):
            feature_data = engine_data[useful_sensor_cols].to_numpy()
            label_data = engine_data['RUL'].to_numpy()
            for end in range(sequence_length, len(feature_data) + 1):
                sequences.append(feature_data[end - sequence_length:end])
                labels.append(label_data[end - 1])
        return np.array(sequences), np.array(labels)

    X_train, y_train = create_sequences(train_data, seq_length)
    X_test, y_test = create_sequences(test_data, seq_length)

    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"seq_length={seq_length} is longer than every engine history in "
            f"{'train' if len(X_train) == 0 else 'test'}_{dataset_name}; no windows were built")

    print("Training set:", X_train.shape, y_train.shape)
    print("Testing set:", X_test.shape, y_test.shape)

    # ----------------------------------------------------------------- model
    def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
        """One encoder block: self-attention and a Conv1D feed-forward, each with Add & Norm."""
        # Attention block
        attn_output = layers.MultiHeadAttention(
            key_dim=head_size, num_heads=num_heads, dropout=dropout)(inputs, inputs)
        attn_output = layers.Dropout(dropout)(attn_output)
        x = layers.LayerNormalization(epsilon=1e-6)(inputs + attn_output)  # Add & Norm

        # Feed Forward block
        ff_output = layers.Conv1D(filters=ff_dim, kernel_size=3, padding="same", activation="relu")(x)
        ff_output = layers.Dropout(dropout)(ff_output)
        # Project back to the input channel count so the residual sum is valid.
        ff_output = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(ff_output)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ff_output)  # Add & Norm

        return x

    def build_model(input_shape, head_size=64, num_heads=4, ff_dim=128,
                    num_transformer_blocks=2, mlp_units=(128,), dropout=0.1, mlp_dropout=0.1):
        """Stack encoder blocks, pool over time, then an MLP head with one output unit."""
        inputs = keras.Input(shape=input_shape)
        x = inputs
        for _ in range(num_transformer_blocks):
            x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

        x = layers.GlobalAveragePooling1D()(x)
        for units in mlp_units:
            x = layers.Dense(units, activation="relu")(x)
            x = layers.Dropout(mlp_dropout)(x)
        outputs = layers.Dense(1)(x)  # Predict RUL
        return keras.Model(inputs, outputs)

    input_shape = X_train.shape[1:]  # (window_size, num_features)
    model = build_model(input_shape)

    model.compile(
        loss="mse",
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        metrics=["mae"]
    )

    model.summary()

    # -------------------------------------------------------------- training
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=100,
        batch_size=64,
        callbacks=[early_stopping]
    )

    # Plot loss curves
    plt.figure()
    plt.plot(history.history['loss'], label="Training Loss")
    plt.plot(history.history['val_loss'], label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("MSE loss")
    plt.title(f"Loss curves ({dataset_name})")
    plt.legend()
    plt.show()

    # ------------------------------------------------------------ evaluation
    y_pred = model.predict(X_test)
    # Flatten only for the metrics so both arguments are 1-D; the returned
    # y_pred keeps the shape produced by model.predict.
    flat_pred = np.ravel(y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, flat_pred))
    mae = mean_absolute_error(y_test, flat_pred)

    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")

    # Optional: Plot true vs predicted RUL
    plt.figure(figsize=(10, 5))
    plt.plot(y_test[:200], label="True RUL")
    plt.plot(y_pred[:200], label="Predicted RUL")
    plt.legend()
    plt.xlabel("Sample Index")
    plt.ylabel("RUL")
    plt.title("True vs Predicted RUL (First 200 Samples)")
    plt.show()

    return model, y_pred, rmse, mae


In [None]:
def compare_datasets(datasets, scaler_type='standard', seq_length=50):
    """Run the predictive-maintenance pipeline on several datasets and tabulate the errors.

    Args:
        datasets: Iterable of dataset names; each is passed to
            ``run_predictive_maintenance``.
        scaler_type: Forwarded unchanged to ``run_predictive_maintenance``.
        seq_length: Forwarded unchanged to ``run_predictive_maintenance``.

    Returns:
        pandas.DataFrame with one row per dataset and the columns
        ``Dataset``, ``RMSE`` and ``MAE``.
    """
    results = []
    for dataset_name in datasets:
        # The third returned value is already an RMSE (the callee applies
        # np.sqrt to the MSE itself), so it must not be square-rooted again.
        _model, _predictions, rmse, mae = run_predictive_maintenance(
            dataset_name, scaler_type, seq_length)
        results.append([dataset_name, rmse, mae])

    # Build the table once from the collected rows.
    return pd.DataFrame(results, columns=['Dataset', 'RMSE', 'MAE'])

In [None]:
# Datasets to benchmark; append further dataset names to this list as needed.
datasets = [
    'FD001',
    'FD002',
    'FD003',
    'FD004',
]

# Train and evaluate on every dataset, collecting one row of metrics per dataset.
results_df = compare_datasets(
    datasets,
    scaler_type='minmax',
    seq_length=60,
)

# Render the metrics table as the cell output.
display(results_df)

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)
  super().__init__(**kwargs)


Epoch 1/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 118ms/step - loss: 9158.2041 - val_loss: 7778.4106
Epoch 2/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 122ms/step - loss: 7496.5732 - val_loss: 6640.6499
Epoch 3/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 116ms/step - loss: 6373.9878 - val_loss: 5746.5684
Epoch 4/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 114ms/step - loss: 5414.8989 - val_loss: 5048.7661
Epoch 5/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 130ms/step - loss: 4942.2334 - val_loss: 4514.6846
Epoch 6/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 111ms/step - loss: 4352.6245 - val_loss: 4116.7588
Epoch 7/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 127ms/step - loss: 3949.4915 - val_loss: 3827.7087
Epoch 8/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 132ms/step - loss: 

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)
  super().__init__(**kwargs)


Epoch 1/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 119ms/step - loss: 8667.1641 - val_loss: 6602.0430
Epoch 2/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 119ms/step - loss: 5824.4185 - val_loss: 4837.2402
Epoch 3/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 119ms/step - loss: 4286.1221 - val_loss: 3944.8264
Epoch 4/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 128ms/step - loss: 3641.6609 - val_loss: 3603.5732
Epoch 5/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 128ms/step - loss: 3380.9634 - val_loss: 3502.2151
Epoch 6/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 130ms/step - loss: 3317.4009 - val_loss: 3478.6079
Epoch 7/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 130ms/step - loss: 3290.3787 - val_loss: 3474.0959
Epoch 8/50
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 124ms/step - loss: 

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)
  super().__init__(**kwargs)


Epoch 1/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 123ms/step - loss: 20194.4727 - val_loss: 17353.0215
Epoch 2/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 122ms/step - loss: 16968.2695 - val_loss: 15223.0166
Epoch 3/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 120ms/step - loss: 14566.4756 - val_loss: 13490.4277
Epoch 4/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 121ms/step - loss: 13150.9883 - val_loss: 12092.7939
Epoch 5/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 123ms/step - loss: 12249.2109 - val_loss: 10985.4512
Epoch 6/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 120ms/step - loss: 10584.5752 - val_loss: 10113.9014
Epoch 7/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 122ms/step - loss: 10016.6924 - val_loss: 9452.9268
Epoch 8/50
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 122ms/

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)
  super().__init__(**kwargs)


Epoch 1/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 121ms/step - loss: 15505.4883 - val_loss: 11850.2305
Epoch 2/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 125ms/step - loss: 10715.7793 - val_loss: 8914.5820
Epoch 3/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 120ms/step - loss: 8224.7959 - val_loss: 7388.7969
Epoch 4/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 120ms/step - loss: 6940.1992 - val_loss: 6721.7988
Epoch 5/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 121ms/step - loss: 6439.3896 - val_loss: 6498.5073
Epoch 6/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 119ms/step - loss: 6357.2793 - val_loss: 6447.5029
Epoch 7/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 119ms/step - loss: 6282.1709 - val_loss: 6438.3862
Epoch 8/50
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 128ms/step - los

Unnamed: 0,Dataset,RMSE,MAE
0,FD001,5.627075,21.991097
1,FD002,7.017875,37.647487
2,FD003,7.065597,34.344263
3,FD004,8.523684,50.577161
