# Data processing pipeline

### Import the necessary Python modules

In [None]:
import os
import shutil
from rich.progress import track
from sklearn import preprocessing
import numpy as np
from scipy import interpolate
import h5py

### Define the subdatasets

In [None]:
# Names of the four subdatasets, "FD001" through "FD004", processed below.
subdatasets = [f"FD00{index}" for index in range(1, 5)]

### Define hyperparameters

In [None]:
# Upper bound applied to every RUL label; labels are also divided by it
# before they are saved.
RUL_max = 125

# Number of feature columns per time step expected when the windows are
# reshaped for saving.
features = 17

# Subdataset and test unit for which the additional RTF dataset is generated.
rtf = dict(subdataset="FD001", unit=24)

# Folder that holds the raw text files and receives the processed output.
cwd = "../data"

### Delete processed data folders

In [None]:
# Give every subdataset a fresh, empty output folder: anything left over from
# a previous run is removed first, then the folder is created again.
for subdataset in track(subdatasets, "Deleting old data..."):
    output_dir = f"{cwd}/{subdataset}"
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)

### Iterate over all subdatasets and prepare data

In [None]:
# For every subdataset: load the raw train/test/RUL text files, min-max scale
# them, drop the unused columns, cut the trajectories into fixed-length
# windows and store the result as HDF5. For the unit selected in `rtf`, an
# additional file with all sliding windows of that test unit is written.
for subdataset in track(subdatasets, "Processing data..."):
    print(f"Processing {subdataset}...")

    # Defining window size
    window_size = 40 if subdataset in ("FD001", "FD003") else 60

    # Initializing the min-max scaler
    min_max_scaler = preprocessing.MinMaxScaler()

    # Import the raw datasets
    RUL = np.loadtxt(f"{cwd}/RUL_{subdataset}.txt")
    train = np.loadtxt(f"{cwd}/train_{subdataset}.txt")
    test = np.loadtxt(f"{cwd}/test_{subdataset}.txt")

    # Scale the data. The scaler is fitted on the training split only and then
    # reused for the test split. Column 0 (the unit id used for grouping
    # below) and column 1 (presumably the cycle counter - TODO confirm) are
    # left untouched.
    train[:, 2:] = min_max_scaler.fit_transform(train[:, 2:])
    test[:, 2:] = min_max_scaler.transform(test[:, 2:])

    # Delete sensors with irrelevant information and keep operation conditions
    dropped_columns = [5, 9, 10, 14, 20, 22, 23]
    train = np.delete(train, dropped_columns, axis=1)
    test = np.delete(test, dropped_columns, axis=1)

    # Initialize new arrays
    train_X = []
    train_y = []
    test_X = []
    test_y = []
    rtf_X = []
    rtf_y = []

    # Training set with sliding time window procedure: every window of
    # `window_size` consecutive rows becomes one sample; its label is the
    # number of rows remaining after the window, capped at RUL_max.
    # Units shorter than the window contribute no samples.
    for i in range(1, int(np.max(train[:, 0])) + 1):
        unit_rows = train[train[:, 0] == i]
        for j in range(len(unit_rows) - window_size + 1):
            train_X.append(unit_rows[j:j + window_size, 2:])
            train_y.append(min(len(unit_rows) - window_size - j, RUL_max))

    # Test set: exactly one window per unit (its last `window_size` rows).
    # Windows are collected in a list and converted once afterwards, instead
    # of re-copying a growing array with np.concatenate for every unit.
    x_new = np.linspace(0, window_size - 1, window_size)
    for i in range(1, int(np.max(test[:, 0])) + 1):
        unit_features = test[test[:, 0] == i, 2:]
        n_obs = len(unit_features)

        if n_obs >= window_size:
            window = unit_features[-window_size:]
        elif n_obs == 0:
            raise ValueError(
                f"Test unit {i} of {subdataset} has no samples to build a window from"
            )
        elif n_obs == 1:
            # A single sample cannot be interpolated; repeat it over the window.
            window = np.repeat(unit_features, window_size, axis=0)
        else:
            # Too short for a full window: stretch the trajectory to
            # `window_size` rows with an interpolating spline per feature.
            # splrep needs more samples than the spline degree, so the cubic
            # default is lowered for units with fewer than four samples.
            x1 = np.linspace(0, window_size - 1, n_obs)
            degree = min(3, n_obs - 1)
            window = np.array([
                interpolate.splev(x_new, interpolate.splrep(x1, column, k=degree))
                for column in unit_features.T
            ]).T

        test_X.append(window)
        # RUL holds one value per test unit, in unit order.
        test_y.append(min(RUL[i - 1], RUL_max))

    # Save the processed data. The reshape doubles as a check that every
    # window has `features` columns; labels are normalised by RUL_max.
    train_X = np.array(train_X).reshape(len(train_X), window_size, features)
    train_y = np.array(train_y) / RUL_max
    test_X = np.array(test_X).reshape(len(test_X), window_size, features)
    test_y = np.array(test_y) / RUL_max

    print(train_X.shape)
    print(train_y.shape)
    print(test_X.shape)
    print(test_y.shape)

    save_dir = f"{cwd}/{subdataset}"
    with h5py.File(f"{save_dir}/{subdataset}.h5", 'w') as f:
        f.create_dataset('X_train', data=train_X)
        f.create_dataset('Y_train', data=train_y)
        f.create_dataset('X_test', data=test_X)
        f.create_dataset('Y_test', data=test_y)

    # Creating the RTF dataset: all sliding windows of the selected test unit.
    # The label of a window is the number of rows remaining after it plus the
    # unit's value from the RUL file, capped at RUL_max.
    if rtf["subdataset"] == subdataset:
        unit_rows = test[test[:, 0] == rtf["unit"]]
        data_RUL = RUL[rtf["unit"] - 1]
        for j in range(len(unit_rows) - window_size + 1):
            rtf_X.append(unit_rows[j:j + window_size, 2:])
            rtf_y.append(min(len(unit_rows) + data_RUL - window_size - j, RUL_max))

        rtf_X = np.array(rtf_X).reshape(len(rtf_X), window_size, features)
        rtf_y = np.array(rtf_y) / RUL_max

        print(rtf_X.shape)
        print(rtf_y.shape)

        with h5py.File(f"{save_dir}/RTF.h5", 'w') as f:
            f.create_dataset('RTF_X', data=rtf_X)
            f.create_dataset('RTF_Y', data=rtf_y)