<a href="https://colab.research.google.com/github/DGuilherme/PMTese/blob/main/notebooks/Transformer/Simple/TransformerAllDatasetsComparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install tensorflow pandas scikit-learn

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
def run_predictive_maintenance(dataset_name, scaler_type='minmax', seq_length=50):

  # Load data
  column_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 22)]
  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)


  train_data.columns = column_names
  test_data.columns = column_names
  rul_data.columns = ['RUL']

  # Generate RUL for training data
  max_cycle = train_data.groupby('id')['cycle'].max()
  train_data = train_data.merge(max_cycle.reset_index(), on='id', suffixes=('', '_max'))
  train_data['RUL'] = train_data['cycle_max'] - train_data['cycle']
  train_data.drop(columns=['cycle_max'], inplace=True)

  # RUL for test data
  max_cycle_test = test_data.groupby('id')['cycle'].max().reset_index()
  max_cycle_test.columns = ['id', 'max_cycle']
  rul_data.columns = ['RUL']
  max_cycle_test['RUL'] = rul_data['RUL']
  test_data = test_data.merge(max_cycle_test, on='id')
  test_data['RUL'] = test_data['RUL'] + test_data['max_cycle'] - test_data['cycle']
  test_data.drop(columns=['max_cycle'], inplace=True)

  useful_sensor_cols = ['setting1', 'setting2', 'setting3'] + \
      [f'sensor{i}' for i in [2, 3, 4, 7, 8, 11, 12, 13, 14, 15, 17, 20, 21]]

  train_data = train_data[['id', 'cycle'] + useful_sensor_cols + ['RUL']]
  test_data = test_data[['id', 'cycle'] + useful_sensor_cols + ['RUL']]

  # Choose scaler based on scaler_type
  if scaler_type == 'standard':
      scaler = StandardScaler()
  elif scaler_type == 'minmax':
      scaler = MinMaxScaler()
  else:
      raise ValueError("Invalid scaler_type. Choose 'standard' or 'minmax'.")

  # Normalize sensor values
  scaler = StandardScaler()
  train_data[useful_sensor_cols] = scaler.fit_transform(train_data[useful_sensor_cols])
  test_data[useful_sensor_cols] = scaler.transform(test_data[useful_sensor_cols])


  # Create sequences
  def create_sequences(data, seq_length=50):
      sequences = []
      for unit in data['id'].unique():
          unit_data = data[data['id'] == unit].reset_index(drop=True)
          for start in range(len(unit_data) - seq_length + 1):
              end = start + seq_length
              seq_X = unit_data.iloc[start:end, 2:-1].values  # all sensor + setting cols
              seq_y = unit_data.iloc[end-1]['RUL']
              sequences.append((seq_X, seq_y))
      return sequences

  seq_length = 50
  train_seqs = create_sequences(train_data, seq_length)
  test_seqs = create_sequences(test_data, seq_length)

  train_X, val_X, train_y, val_y = train_test_split(
      [seq[0] for seq in train_seqs],
      [seq[1] for seq in train_seqs],
      test_size=0.2,
      random_state=42
  )

  # Convert to numpy arrays
  train_X = np.array(train_X)
  train_y = np.array(train_y)
  val_X = np.array(val_X)
  val_y = np.array(val_y)
  test_X = np.array([seq[0] for seq in test_seqs])
  test_y = np.array([seq[1] for seq in test_seqs])

  # Define Transformer
  class TimeSeriesTransformer(tf.keras.Model):
      def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim):
          super(TimeSeriesTransformer, self).__init__()
          self.input_proj = tf.keras.layers.Dense(model_dim)

          self.encoder_layers = []
          for _ in range(num_layers):
              self.encoder_layers.append([
                  tf.keras.layers.LayerNormalization(),
                  tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=model_dim),
                  tf.keras.Sequential([
                      tf.keras.layers.Dense(model_dim * 4, activation='relu'),
                      tf.keras.layers.Dense(model_dim)
                  ]),
              ])

          self.global_pool = tf.keras.layers.GlobalAveragePooling1D()
          self.output_layer = tf.keras.layers.Dense(output_dim)

      def call(self, inputs, training=False):
          x = self.input_proj(inputs)

          for norm, mha, ffn in self.encoder_layers:
              attn_output = mha(x, x)
              x = norm(x + attn_output)
              ff_output = ffn(x)
              x = norm(x + ff_output)

          x = self.global_pool(x)
          return self.output_layer(x)

  # Instantiate model
  input_dim = train_X.shape[2]
  model_dim = 64
  num_heads = 8
  num_layers = 4
  output_dim = 1

  model = TimeSeriesTransformer(input_dim, model_dim, num_heads, num_layers, output_dim)
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='mse',
                metrics=['mae'])

  # Train
  history = model.fit(
      train_X, train_y,
      validation_data=(val_X, val_y),
      epochs=20,
      batch_size=32
  )

  # Predict
  predictions = model.predict(test_X)

  # Evaluate
  return model, predictions, mean_squared_error(test_y, predictions), mean_absolute_error(test_y, predictions)


In [7]:
def compare_datasets(datasets, scaler_type='standard', seq_length=50):
    results = []
    for dataset_name in datasets:
        model, predictions, mse, mae = run_predictive_maintenance(dataset_name, scaler_type, seq_length)
        rmse = np.sqrt(mse)  # Calculate RMSE
        results.append([dataset_name, rmse, mae])

    # Create a pandas DataFrame for the results
    results_df = pd.DataFrame(results, columns=['Dataset', 'RMSE', 'MAE'])
    return results_df

In [8]:
# Define the list of datasets
datasets = ['FD001', 'FD002', 'FD003', 'FD004']  # Add your dataset names here

# Run the comparison
results_df = compare_datasets(datasets, scaler_type='minmax', seq_length=30)

# Display the results table
display(results_df)

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)


Epoch 1/20
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 482ms/step - loss: 7457.5371 - mae: 67.3832 - val_loss: 3676.3411 - val_mae: 46.9944
Epoch 2/20
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 451ms/step - loss: 3309.5415 - mae: 43.6886 - val_loss: 2149.7900 - val_mae: 33.1701
Epoch 3/20
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 485ms/step - loss: 2115.8708 - mae: 33.8456 - val_loss: 1779.8966 - val_mae: 32.0318
Epoch 4/20
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 474ms/step - loss: 1807.7722 - mae: 32.5238 - val_loss: 1782.9747 - val_mae: 32.1338
Epoch 5/20
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 455ms/step - loss: 2661.9441 - mae: 40.3509 - val_loss: 3253.9285 - val_mae: 45.8821
Epoch 6/20
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 476ms/step - loss: 3217.0007 - mae: 46.0054 - val_loss: 3254.7253 - val_mae: 45.9473
Epoch 7/20
[1m3

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)


Epoch 1/20
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m498s[0m 470ms/step - loss: 6024.9189 - mae: 59.8365 - val_loss: 3324.3486 - val_mae: 46.4995
Epoch 2/20
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m506s[0m 475ms/step - loss: 3368.3965 - mae: 46.9465 - val_loss: 3324.4470 - val_mae: 46.4829
Epoch 3/20
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m497s[0m 470ms/step - loss: 3344.7617 - mae: 46.7450 - val_loss: 3325.2173 - val_mae: 46.4354
Epoch 4/20
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 471ms/step - loss: 3329.4814 - mae: 46.6338 - val_loss: 3324.3662 - val_mae: 46.5225
Epoch 5/20
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 470ms/step - loss: 3349.1670 - mae: 46.8771 - val_loss: 3325.5742 - val_mae: 46.4225
Epoch 6/20
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m508s[0m 475ms/step - loss: 3328.7832 - mae: 46.8005 - val_loss: 3324.5239 - val_mae: 46.4740
Epoc

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)


Epoch 1/20
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 482ms/step - loss: 16880.3457 - mae: 96.8534 - val_loss: 8665.2266 - val_mae: 64.3671
Epoch 2/20
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 483ms/step - loss: 8624.0928 - mae: 68.5658 - val_loss: 8006.9946 - val_mae: 68.7883
Epoch 3/20
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 480ms/step - loss: 8175.9712 - mae: 69.9397 - val_loss: 7142.9287 - val_mae: 63.2695
Epoch 4/20
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 472ms/step - loss: 7418.9390 - mae: 65.0536 - val_loss: 7455.7188 - val_mae: 66.5499
Epoch 5/20
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 478ms/step - loss: 7017.4565 - mae: 62.8241 - val_loss: 8011.5908 - val_mae: 68.6096
Epoch 6/20
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 484ms/step - loss: 8143.5303 - mae: 70.1167 - val_loss: 8005.7778 - val_mae: 68.9373
Epoch 7/20
[1m

  train_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/train_{dataset_name}.txt/train_{dataset_name}.txt', delim_whitespace=True, header=None)
  test_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/test_{dataset_name}.txt/test_{dataset_name}.txt', delim_whitespace=True, header=None)
  rul_data = pd.read_csv(f'/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss/RUL_{dataset_name}.txt/RUL_{dataset_name}.txt', delim_whitespace=True, header=None)


Epoch 1/20
[1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m588s[0m 467ms/step - loss: 11424.8438 - mae: 81.4910 - val_loss: 6410.5083 - val_mae: 63.6667
Epoch 2/20
[1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m617s[0m 464ms/step - loss: 6179.9951 - mae: 62.9423 - val_loss: 6410.0894 - val_mae: 64.1450
Epoch 3/20
[1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m663s[0m 497ms/step - loss: 6298.4849 - mae: 63.7576 - val_loss: 6406.7554 - val_mae: 63.9093
Epoch 4/20
[1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 496ms/step - loss: 6265.9771 - mae: 63.3727 - val_loss: 6407.8257 - val_mae: 64.0339
Epoch 5/20
[1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m620s[0m 495ms/step - loss: 6368.5034 - mae: 64.0344 - val_loss: 6406.7456 - val_mae: 63.8958
Epoch 6/20
[1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m624s[0m 497ms/step - loss: 6267.1606 - mae: 63.4407 - val_loss: 6406.8135 - val_mae: 63.8655
Epo

Unnamed: 0,Dataset,RMSE,MAE
0,FD001,52.345641,39.886798
1,FD002,65.015486,51.055007
2,FD003,97.100238,69.693253
3,FD004,101.496407,76.082064
