In [1]:
import sys

import tensorflow.keras
import pandas as pd
import numpy as np
import sklearn as sk
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import platform
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import GRU, Dense
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
gpu = len(tf.config.list_physical_devices('GPU'))>0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Python Platform: macOS-13.3.1-arm64-arm-64bit
Tensor Flow Version: 2.9.0
Keras Version: 2.9.0

Python 3.10.9 | packaged by conda-forge | (main, Feb  2 2023, 20:26:08) [Clang 14.0.6 ]
Pandas 1.5.3
Scikit-Learn 1.2.2
GPU is available


In [2]:
#Load data
df = pd.read_csv('data_cleaned.csv')
df = df.iloc[:, :16]
print(len(df.index))

#Instead of dropping rows where plant is off, convert the row to zero
df.loc[df['Coal Input'] < 30] = 0

32768


In [3]:
# apply MinMaxScaler to the dataframe
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df)

#convert the numpy scaled array back to a dataframe
df_scaled = pd.DataFrame(scaled_df, columns=df.columns)

In [4]:
df_scaled['Power Plant On'] = (df_scaled['Coal Input'] > 0).astype(int)

dataframes = []
indices = []
for index, row in df_scaled.iterrows():
    if row['Power Plant On'] == 0:
        if indices:
            dataframes.append(df_scaled.loc[indices].copy())
            indices = []
    else:
        indices.append(index)

# Check if there are any remaining indices after the loop ends
if indices:
    dataframes.append(df_scaled.loc[indices].copy())

# Print the number of different datasets and their lengths
print(f"Number of datasets: {len(dataframes)}")
#for i, dataset in enumerate(dataframes):
    #print(f"Dataset {i+1} length: {len(dataset)}")

# Calculate the total number of rows
total_rows = sum(len(dataset) for dataset in dataframes)
print(f"Total number of rows: {total_rows}")

Number of datasets: 32
Total number of rows: 22565


In [6]:
def timewindow_df(df, timeBackWindow, timeForwardWindow):
    # timeBackWindow = 5
    # timeForwardWindow = 2

    # Define column names for the first part
    cols_part1 = ['ECO SYS FW Supply SATN T', 'Economizer SYS FW Supply P', 'Live Steam TOT F',
                  'AMB Air T', 'Coal Input', 'BLR Primary Air F', 'BLR Secondary Air Total F',
                  'OVR FIR CORR Air NOZ 71-73 F', 'OVR FIR CORR Air NOZ 74-76 F',
                  'OVR FIR CORR Air NOZ 81-83 F', 'OVR FIR CORR Air NOZ 84-86 F']
    # Define column names for the second part
    cols_part2 = ['HP Steam Average Temp', 'Hot R/H Average Temp', 'ATT 1 m_dot', 'ATT 2 m_dot', 'R/H ATT m_dot']
    # Shift rows and concatenate for both parts
    df_temp = pd.DataFrame()

    for col in cols_part1 + cols_part2:
        # Shift columns forward for the first part
        if col in cols_part1:
            df_new = pd.concat([df[col].shift(-i) for i in range(timeBackWindow-1, -1, -1)], axis=1, 
                               keys=[f'{col}-{i}' for i in range(timeBackWindow)])
        # Shift columns backward for the second part
        else:
            shifted_cols = [df[col].shift(i) for i in range(timeForwardWindow)]
            df_new = pd.concat(shifted_cols, axis=1, 
                               keys=[f'{col}+{i}' for i in range(timeForwardWindow)])
        # Concatenate the resulting dataframe with df_temp, along the columns axis (axis=1)
        df_temp = pd.concat([df_temp, df_new], axis=1)

    return df_temp


In [7]:
#Time window the datasets
manipulated_dataframes = []

for df in dataframes:
    manipulated_df = timewindow_df(df,5,2)
    manipulated_dataframes.append(manipulated_df)


# Print the number of rows for each dataset in the manipulated dataframes
print(f"Number of manipulated datasets: {len(manipulated_dataframes)}")
#for i, dataset in enumerate(manipulated_dataframes):
    #print(f"Manipulated Dataset {i+1} length: {len(dataset)}")

# Calculate the total number of rows across all the manipulated datasets
total_rows = sum(len(dataset) for dataset in manipulated_dataframes)

# Print the total number of rows
print(f"Total number of rows: {total_rows}")

Number of manipulated datasets: 32
Total number of rows: 22565


In [8]:
# Remove rows with NaN values from each dataframe
manipulated_dataframes_cleaned = []
for dataset in manipulated_dataframes:
    cleaned_dataset = dataset.dropna()
    manipulated_dataframes_cleaned.append(cleaned_dataset)

# Print the number of rows for each dataset in the cleaned manipulated dataframes
print(f"Number of manipulated datasets: {len(manipulated_dataframes_cleaned)}")
#for i, dataset in enumerate(manipulated_dataframes_cleaned):
    #print(f"Manipulated Dataset {i+1} length: {len(dataset)}")

# Calculate the total number of rows across all the cleaned manipulated datasets
total_rows = sum(len(dataset) for dataset in manipulated_dataframes_cleaned)

# Print the total number of rows
print(f"Total number of rows: {total_rows}")

Number of manipulated datasets: 32
Total number of rows: 22412


In [9]:
# Concat datasets
processed_df = pd.concat(manipulated_dataframes_cleaned, ignore_index=True)

In [10]:
#Split data into the test
# x_data = df[['ECO SYS FW Supply SATN T', 'Economizer SYS FW Supply P', 'Live Steam TOT F','AMB Air T','Coal Input',
#              'BLR Primary Air F','BLR Secondary Air Total F','OVR FIR CORR Air NOZ 71-73 F','OVR FIR CORR Air NOZ 74-76 F',
#              'OVR FIR CORR Air NOZ 81-83 F','OVR FIR CORR Air NOZ 84-86 F',]].values
# y_data = df[['HP Steam Average Temp', 'Hot R/H Average Temp', 'ATT 1 m_dot', 'ATT 2 m_dot', 'R/H ATT m_dot']].values


x_cols = 5 * 11  # Adjust this based on the time windowing performed
x_data = processed_df.iloc[:, :x_cols].values
y_data = processed_df.iloc[:, x_cols:].values

# print("True")
# print(x_data.shape)
# print(y_data.shape)

#print("Adapated")
x_train = x_data[:int(0.9*x_data.shape[0]),:]
#print(x_train.shape)
x_test = x_data[int(0.9*x_data.shape[0]):,:]
#print(x_test.shape)
y_train = y_data[:int(0.9*y_data.shape[0]),:]
print(y_train.shape)
y_test = y_data[int(0.9*y_data.shape[0]):,:]
##print(y_test.shape)

(20170, 10)


In [21]:
class EncoderDecoderRNN(tf.keras.Model):
    def __init__(self, input_dim, latent_dim, output_dim):
        super(EncoderDecoderRNN, self).__init__()
        self.encoder_gru = GRU(latent_dim, return_state=True)
        self.decoder_gru = GRU(latent_dim, return_sequences=True)
        self.decoder_dense = Dense(output_dim)

    def call(self, inputs):
        encoder_inputs = inputs

        # Encoder
        _, state_h = self.encoder_gru(encoder_inputs)

        # Decoder
        decoder_outputs = self.decoder_gru(encoder_inputs, initial_state=state_h)
        decoder_outputs = self.decoder_dense(decoder_outputs)

        return decoder_outputs

# Example usage
input_dim = 11
latent_dim = 32
output_dim = 5

# Reshape the input data
x_train_encoder = x_train.reshape(-1, 5, 11)
x_test = x_test.reshape(-1, 5, 11)


# Reshape the target data
y_train = y_train.reshape(-1, 2, 5)
y_test = y_test.reshape(-1, 2, 5)


encoder_inputs = tf.keras.Input(shape=(5, input_dim))
model = EncoderDecoderRNN(input_dim, latent_dim, output_dim)
decoder_outputs = model(encoder_inputs)

# Training
model.compile(optimizer=RMSprop(), loss='mse')  # Customize optimizer and loss function as needed


In [22]:

# Fit the model
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))



ValueError: Data cardinality is ambiguous:
  x sizes: 20170
  y sizes: 0
Make sure all arrays contain the same number of samples.