## Регрессия RUL различными методами

TO DO:
- LogisticRegression
- Random Forest 
- Gradient Boosting (sklearn, catboost, xgboost)

#### Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
from PIL import Image
import itertools
from time import time

from sklearn.linear_model import LogisticRegression


from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from tensorflow import GradientTape
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

import warnings
warnings.filterwarnings("ignore")

#### Чтение данных

In [2]:
df = pd.read_parquet('data/train_data.parquet')

In [3]:
test = pd.read_parquet('data/test_data.parquet')

Отшкалируем данные и удалим наны

In [4]:
# params = ['setting1', 'setting2', 'setting3', 's1', 's2', 's3',
#        's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
#        's15', 's16', 's17', 's18', 's19', 's20', 's21']

params = ['setting1', 'setting2', 's2', 's3', 's4', 's7', 
          's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 
          's20', 's21']

In [5]:
scaler = StandardScaler()
df[params] = scaler.fit_transform(df[params])

In [6]:
test[params] = scaler.fit_transform(test[params])

#### Нейронные сети

Генерация последовательностей с окном 50

In [7]:
def seq_generator(data, seq_len, seq_cols):
    
    dt = data[seq_cols].values
    num_elem = dt.shape[0]
    for start, stop in zip(range(0, num_elem-seq_len), range(seq_len, num_elem)):
        yield df[start:stop, :]
        
def gen_labels(data, seq_len, label):
    
    dt = data[seq_cols].values
    num_elem = dt.shape[0]
    return dt[seq_len:num_elem, :]


In [8]:
def gen_sequence(id_df, seq_length, seq_cols):

    data_matrix = id_df[seq_cols].values
    num_elements = data_matrix.shape[0]
    # Iterate over two lists in parallel.
    # For example id1 have 192 rows and sequence_length is equal to 50
    # so zip iterate over two following list of numbers (0,142),(50,192)
    # 0 50 (start stop) -> from row 0 to row 50
    # 1 51 (start stop) -> from row 1 to row 51
    # 2 52 (start stop) -> from row 2 to row 52
    # ...
    # 141 191 (start stop) -> from row 141 to 191
    for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
        yield data_matrix[start:stop, :]
        
def gen_labels(id_df, seq_length, label):

    data_matrix = id_df[label].values
    num_elements = data_matrix.shape[0]
    # I have to remove the first seq_length labels
    # because for one id the first sequence of seq_length size have as target
    # the last label (the previus ones are discarded).
    # All the next id's sequences will have associated step by step one label as target.
    return data_matrix[seq_length:num_elements, :]

In [9]:
xtrain, xtest = [], []

seq_len = 50

for eid in df.id.unique():
    for seq in gen_sequence(df[df.id==eid], seq_len, params):
        xtrain.append(seq)
    for seq in gen_sequence(test[test.id==eid], seq_len, params):
        xtest.append(seq)
        
xtrain = np.asarray(xtrain)
xtest = np.asarray(xtest)

print('Xtrain shape: {},\nXtest shape: {}'.format(xtrain.shape, xtest.shape))

Xtrain shape: (15631, 50, 16),
Xtest shape: (8162, 50, 16)


In [10]:
ytrain, ytest = [],[]

for eid in df.id.unique():
    for label in gen_labels(df[df.id==eid], seq_len, ['RUL']):
        ytrain.append(label)
    for label in gen_labels(test[test.id==eid], seq_len, ['RUL']):
        ytest.append(label)
        
ytrain = np.asarray(ytrain).reshape(-1,1)
ytest = np.asarray(ytest).reshape(-1,1)

print('Ytrain shape: {},\nYtest shape: {}'.format(ytrain.shape, ytest.shape))

Ytrain shape: (15631, 1),
Ytest shape: (8162, 1)


#### LSTM Рекуррентные нейронные сети

In [11]:
model = Sequential()
model.add(LSTM(units=64,
                input_shape = (seq_len, len(params)),
               activation='tanh', 
               recurrent_activation='hard_sigmoid',
               return_sequences = True))
model.add(BatchNormalization())
model.add(LSTM(64,
              activation='tanh', 
               recurrent_activation='hard_sigmoid'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
# model.add(LSTM(64,
#               activation='tanh', 
#                recurrent_activation='hard_sigmoid'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation("linear"))
model.compile(loss="mse", optimizer="adam")
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 64)            20736     
_________________________________________________________________
batch_normalization (BatchNo (None, 50, 64)            256       
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
_________________________________________________________________
activation (Activation)      (None, 1)                 0         
Total params: 54,337
Trainable params: 54,081
Non-trainable params: 256
__________________________________________________

In [12]:
lr_decay = ReduceLROnPlateau(monitor='loss', 
                             patience=1, verbose=0, 
                             factor=0.5, min_lr=1e-8)
# Define Early Stopping:
early_stop = EarlyStopping(monitor='loss', min_delta=0, 
                           patience=30, verbose=1, mode='auto',
                           baseline=0, restore_best_weights=True)

In [13]:
BATCH_SIZE = 300
EPOCHS = 20

History = model.fit(xtrain, ytrain,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[lr_decay, early_stop])

Train on 12504 samples, validate on 3127 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
y_pred = model.predict(xtest)

In [15]:
pd.DataFrame({'Metrics': ['RMSE', 'MAE', 'R2'],    
              'Linear': [np.sqrt(mean_squared_error(y_pred, ytest)),mean_absolute_error(y_pred, ytest),r2_score(y_pred, ytest)]
              })

Unnamed: 0,Metrics,Linear
0,RMSE,68.754123
1,MAE,59.249052
2,R2,-1.440868
