<a href="https://colab.research.google.com/github/DGuilherme/PMTese/blob/main/LSTMAllDatasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
!pip install tensorflow

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam



In [2]:
# Mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive/... can be read with ordinary file paths.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Compare all datasets

In [None]:
def run_predictive_maintenance(dataset_name, scaler_type='minmax', seq_length=50,
                               use_piecewise_rul=True, max_rul_cap=125,
                               useful_features=None, data_dir=None):
  """Train and evaluate a stacked-LSTM RUL regressor on one C-MAPSS subset.

  The function reads the train/test/RUL text files of `dataset_name`, derives a
  remaining-useful-life (RUL) target for every row, scales the selected
  features, cuts each engine's history into sliding windows, trains the model
  and reports the error on all test windows.

  Args:
    dataset_name: Subset identifier used in the file names, e.g. 'FD001'.
    scaler_type: 'minmax' (MinMaxScaler) or 'standard' (StandardScaler).
    seq_length: Number of consecutive cycles per input window.
    use_piecewise_rul: If True, RUL targets are clipped at `max_rul_cap`.
    max_rul_cap: Upper bound applied to RUL when `use_piecewise_rul` is True.
    useful_features: Optional list of column names to feed the model. When
      None, the three settings plus the 13 sensors listed below are used.
    data_dir: Directory holding the dataset folders. When None, the Google
      Drive location used by this notebook is assumed.

  Returns:
    Tuple `(model, y_pred, rmse, mae)`: the fitted Keras model, the raw
    predictions for the test windows, and the RMSE / MAE over those windows.

  Raises:
    ValueError: On an unknown `scaler_type`, unknown feature names, a RUL file
      whose length does not match the number of test engines, or when no
      window of length `seq_length` can be built.
  """
  # Local import: StandardScaler is not part of the notebook's import cell.
  from sklearn.preprocessing import StandardScaler

  scalers = {'minmax': MinMaxScaler, 'standard': StandardScaler}
  if scaler_type not in scalers:
    raise ValueError(f"scaler_type must be one of {sorted(scalers)}, got {scaler_type!r}")
  if seq_length < 1:
    raise ValueError(f'seq_length must be >= 1, got {seq_length}')

  if data_dir is None:
    data_dir = '/content/drive/MyDrive/Python/predictive-maintenance-main/datasets/cmapss'

  def read_split(prefix):
    # Every file sits inside a folder that carries the file's own name.
    file_name = f'{prefix}_{dataset_name}.txt'
    return pd.read_csv(f'{data_dir}/{file_name}/{file_name}', sep=r'\s+', header=None)

  # Load data
  column_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 22)]
  train_data = read_split('train')
  test_data = read_split('test')
  rul_data = read_split('RUL')

  train_data.columns = column_names
  test_data.columns = column_names
  rul_data.columns = ['RUL']

  # --- Generate RUL for training data ---
  # Training engines run to failure, so RUL is the distance to the last cycle.
  train_last_cycle = train_data.groupby('id')['cycle'].transform('max')
  train_data['RUL'] = train_last_cycle - train_data['cycle']
  if use_piecewise_rul:
    train_data['RUL'] = train_data['RUL'].clip(upper=max_rul_cap)

  # --- Generate RUL for test data ---
  # The RUL file has a single column and no engine id, so one is attached here.
  # Assumes its rows follow ascending engine id - TODO confirm against the data.
  test_ids = np.sort(test_data['id'].unique())
  if len(rul_data) != len(test_ids):
    raise ValueError(
        f'RUL file has {len(rul_data)} rows but the test set has {len(test_ids)} engines')
  final_rul = pd.Series(rul_data['RUL'].to_numpy(), index=test_ids)

  # RUL at a given cycle = RUL after the last observed cycle + cycles still to
  # come before that last observation.
  test_last_cycle = test_data.groupby('id')['cycle'].transform('max')
  test_data['RUL'] = test_data['id'].map(final_rul) + test_last_cycle - test_data['cycle']
  if use_piecewise_rul:
    test_data['RUL'] = test_data['RUL'].clip(upper=max_rul_cap)

  # Select useful features (as suggested in papers)
  if useful_features is None:
    feature_cols = ['setting1', 'setting2', 'setting3'] + \
        [f'sensor{i}' for i in [2, 3, 4, 7, 8, 11, 12, 13, 14, 15, 17, 20, 21]]
  else:
    feature_cols = list(useful_features)
  unknown = [col for col in feature_cols if col not in column_names]
  if unknown:
    raise ValueError(f'Unknown feature columns: {unknown}')

  # .copy() so the scaled values are written to independent frames, not to
  # slices of the loaded data.
  kept_cols = ['id', 'cycle'] + feature_cols + ['RUL']
  train_data = train_data[kept_cols].copy()
  test_data = test_data[kept_cols].copy()

  # Normalize feature values; the scaler is fitted on training data only.
  scaler = scalers[scaler_type]()
  train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
  test_data[feature_cols] = scaler.transform(test_data[feature_cols])

  def create_sequences(data, seq_length=50):
    """Return (windows, labels): every `seq_length`-long window per engine,
    labelled with the RUL of the window's last row."""
    sequences = []
    labels = []

    # sort=False keeps engines in their order of appearance in `data`.
    for _, unit in data.groupby('id', sort=False):
      features = unit[feature_cols].to_numpy()
      rul = unit['RUL'].to_numpy(dtype=float)

      # "+ 1" includes the window that ends on the engine's last recorded
      # cycle; engines shorter than seq_length yield an empty range.
      for start in range(len(unit) - seq_length + 1):
        end = start + seq_length
        sequences.append(features[start:end])
        labels.append(rul[end - 1])

    if not sequences:
      return np.empty((0, seq_length, len(feature_cols))), np.empty((0,))
    return np.array(sequences), np.array(labels)

  # Create sequences for training and testing
  X_train, y_train = create_sequences(train_data, seq_length)
  X_test, y_test = create_sequences(test_data, seq_length)
  if len(X_train) == 0 or len(X_test) == 0:
    raise ValueError(
        f'No engine in {dataset_name} has at least seq_length={seq_length} cycles '
        'in the training and/or test split')

  # NOTE(review): windows of one engine overlap, so this random split places
  # near-duplicate windows in both train and validation sets.
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

  # Build the LSTM model
  model = Sequential([
      LSTM(100, return_sequences=True, input_shape=(seq_length, len(feature_cols))),
      Dropout(0.2),
      LSTM(50, return_sequences=False),
      Dropout(0.2),
      Dense(1)
  ])

  model.compile(optimizer='adam', loss='mse')
  model.summary()

  # Train the model
  history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val))

  # Predictions
  y_pred = model.predict(X_test)

  # Calculate metrics
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  mae = mean_absolute_error(y_test, y_pred)

  print(f'RMSE: {rmse}')
  print(f'MAE: {mae}')

  return model, y_pred, rmse, mae



In [None]:
def compare_datasets(datasets, usefullfeatures, scaler_type='standard', seq_length=50, use_piecewise_rul=True):
  """Run the LSTM pipeline on several datasets and tabulate the test errors.

  Args:
    datasets: Iterable of dataset names, each passed to
      `run_predictive_maintenance`.
    usefullfeatures: Feature list selected by the caller (see note below).
    scaler_type: Scaler name passed through to `run_predictive_maintenance`.
    seq_length: Window length passed through to `run_predictive_maintenance`.
    use_piecewise_rul: Whether RUL targets should be capped (see note below).

  Returns:
    DataFrame with one row per dataset and the columns
    ['Dataset', 'RMSE', 'MAE'].

  NOTE(review): `usefullfeatures` and `use_piecewise_rul` are accepted for API
  compatibility but are not forwarded by the three-argument call below; wire
  them through if `run_predictive_maintenance` exposes matching parameters.
  """
  results = []
  for dataset_name in datasets:
    # The third returned value is already the RMSE (not the MSE), so it must
    # not be square-rooted a second time.
    _model, _predictions, rmse, mae = run_predictive_maintenance(dataset_name, scaler_type, seq_length)
    results.append([dataset_name, rmse, mae])

  # Create a pandas DataFrame for the results
  results_df = pd.DataFrame(results, columns=['Dataset', 'RMSE', 'MAE'])
  return results_df

In [None]:
# Datasets to evaluate. Only FD001 is enabled; the wider candidate list is
# ['FD001', 'FD002', 'FD003', 'FD004'].
datasets = ['FD001']

# Candidate feature subsets, spelled out by sensor index. Going by the
# variable names they come from a correlation analysis and from Boruta
# feature selection; only the FD001 correlation list is used below.
fd001_usefullfeatures_correlation = [
    f'sensor{idx}' for idx in (2, 3, 4, 7, 8, 9, 11, 12, 13, 15, 17, 20, 21)
]
fd002_usefullfeatures_correlation = (
    [f'setting{idx}' for idx in (1, 2, 3)]
    + [f'sensor{idx}' for idx in (2, 8, 14)]
)
fd001_usefullfeatures_boruta = [
    f'sensor{idx}' for idx in (2, 3, 4, 7, 9, 11, 12, 14, 15, 20, 21)
]
fd002_usefullfeatures_boruta = [
    f'sensor{idx}' for idx in (2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 21)
]

# Train and score one model per dataset, collecting the errors in a table.
results_df = compare_datasets(
    datasets,
    fd001_usefullfeatures_correlation,
    scaler_type='minmax',
    seq_length=60,
)

# Render the results table with the notebook's rich display.
display(results_df)