## LSTM with Keras

© Davide Posillipo

The data used from this notebook cannot be shared for confidentiality issues. The notebook is shared for educational purposes only. 

This code is mainly a Keras implementation of the paper "Predictive Business Process Monitoring with LSTM Neural Networks", https://arxiv.org/abs/1612.02130.

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

In [3]:
from config import *

Source path (cwd): /Users/davideposillipo/Documents/Didattica/Sequences
Verbose mode is activated!
Tests mode is activated!


In [4]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

## Load data

In [5]:
df = pd.read_pickle(dir_data_pro/'data_for_christmas_finished.pkl')
print(df.shape)
df.head()

(204687, 27)


Unnamed: 0,CE_CASE_ID,CE_ACTIVITY,CE_TIMESTAMP,CE_ACTIVITY_ORDER,event_label,tStamp_next,tStamp_prev,BPEM_event,BPEM_case,next_event,tDelta_next,tDelta_prev,tSince_casestart,tTotal,tRelativePosition,tSince_midnight,dWeekday,ranking,CE_GP_NR,CE_VNB_GP_NR,CE_VNB_GP_NAME,CE_LIEFERANT_NEU_NAME,CE_EVIPK_OH,CE_LIEFERANT_ALT_NAME_OH,CE_LIEFERANT_ALT_NR_OH,CE_SOURSYSTEM,CE_SPARTE_OH
518,R4-00000001000007018636,1011 / 0010: Erster Prozessschritt: Lieferbeginn,2017-06-01 09:06:10,1.0,3,2017-06-01 09:07:10,NaT,0,0,1011 / 0020: Auf Stamm- und Geschäftsdaten prüfen,60.0,0.0,0.0,97254.0,0.0,32770.0,3,1.0,1536146195,NS00000060,Westnetz GmbH,RWE Rhein-Ruhr AG,1,0,0,R4,1
519,R4-00000001000007018636,1011 / 0020: Auf Stamm- und Geschäftsdaten prüfen,2017-06-01 09:07:10,2.0,4,2017-06-01 09:07:12,2017-06-01 09:06:10,0,0,"1011 / 0030: Prüfen, ob Kündigung angestoßen w...",2.0,60.0,60.0,97254.0,0.000617,32830.0,3,2.0,1536146195,NS00000060,Westnetz GmbH,RWE Rhein-Ruhr AG,1,0,0,R4,1
520,R4-00000001000007018636,"1011 / 0030: Prüfen, ob Kündigung angestoßen w...",2017-06-01 09:07:12,3.0,5,2017-06-01 09:07:14,2017-06-01 09:07:10,0,0,1011 / 0090: ES101 (Anmeldeanfrage) an Verteil...,2.0,2.0,62.0,97254.0,0.000638,32832.0,3,3.0,1536146195,NS00000060,Westnetz GmbH,RWE Rhein-Ruhr AG,1,0,0,R4,1
521,R4-00000001000007018636,1011 / 0090: ES101 (Anmeldeanfrage) an Verteil...,2017-06-01 09:07:14,8.0,6,2017-06-01 09:07:21,2017-06-01 09:07:12,0,0,1011 / 0110: Termin für den Empfang von CA100 ...,7.0,2.0,64.0,97254.0,0.000658,32834.0,3,4.0,1536146195,NS00000060,Westnetz GmbH,RWE Rhein-Ruhr AG,1,0,0,R4,1
522,R4-00000001000007018636,1011 / 0110: Termin für den Empfang von CA100 ...,2017-06-01 09:07:21,10.0,7,2017-06-01 09:07:28,2017-06-01 09:07:14,0,0,1011 / 0250: Termin für Empfang von ES200 (Zuo...,7.0,7.0,71.0,97254.0,0.00073,32841.0,3,5.0,1536146195,NS00000060,Westnetz GmbH,RWE Rhein-Ruhr AG,1,0,0,R4,1


In [7]:
df.columns

Index(['CE_CASE_ID', 'CE_ACTIVITY', 'CE_TIMESTAMP', 'CE_ACTIVITY_ORDER',
       'event_label', 'tStamp_next', 'tStamp_prev', 'BPEM_event', 'BPEM_case',
       'next_event', 'tDelta_next', 'tDelta_prev', 'tSince_casestart',
       'tTotal', 'tRelativePosition', 'tSince_midnight', 'dWeekday', 'ranking',
       'CE_GP_NR', 'CE_VNB_GP_NR', 'CE_VNB_GP_NAME', 'CE_LIEFERANT_NEU_NAME',
       'CE_EVIPK_OH', 'CE_LIEFERANT_ALT_NAME_OH', 'CE_LIEFERANT_ALT_NR_OH',
       'CE_SOURSYSTEM', 'CE_SPARTE_OH'],
      dtype='object')

## Functions

In [8]:
def vectorize(X_enc, caseIDs, MAX_EVENTS=None):
    print("Vectorizing preprocessed data...")
 
    num_samples = X_enc.shape[0]
    num_features = X_enc.shape[1] 
    
    X_vec = np.zeros((num_samples, MAX_EVENTS, num_features), dtype=np.float32)

    i = 0
    for caseID in np.unique(caseIDs):

        rows = np.where(caseIDs == caseID)[0]
        case_array = X_enc[rows, :]
        len_sequence = case_array.shape[0]

        for j in range(1, len_sequence + 1):
            # input is array-content without caseID
            X_vec[i, MAX_EVENTS - j:] = case_array[:j, ] 
            i += 1

    return X_vec

In [9]:
def oh_encoder(df):
    print("OneHotEncoding categorical features (drop_first=True)...")
    
    categorical_features = list(df.select_dtypes(include=['object', 'category']).columns.values)
    df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)
    
    if verbose:
        print("Encoded the following  features:")
        for col in categorical_features:
             print(f"\t{col}")
        print("Shape after encoding:", df_encoded.shape)

    return df_encoded

## Prepare Data

### Remove unwanted features

In [10]:
df.columns

Index(['CE_CASE_ID', 'CE_ACTIVITY', 'CE_TIMESTAMP', 'CE_ACTIVITY_ORDER',
       'event_label', 'tStamp_next', 'tStamp_prev', 'BPEM_event', 'BPEM_case',
       'next_event', 'tDelta_next', 'tDelta_prev', 'tSince_casestart',
       'tTotal', 'tRelativePosition', 'tSince_midnight', 'dWeekday', 'ranking',
       'CE_GP_NR', 'CE_VNB_GP_NR', 'CE_VNB_GP_NAME', 'CE_LIEFERANT_NEU_NAME',
       'CE_EVIPK_OH', 'CE_LIEFERANT_ALT_NAME_OH', 'CE_LIEFERANT_ALT_NR_OH',
       'CE_SOURSYSTEM', 'CE_SPARTE_OH'],
      dtype='object')

In [11]:
X = df.drop(['CE_CASE_ID',
             'CE_TIMESTAMP',
             'CE_ACTIVITY_ORDER',
             'event_label',
             'tStamp_next',
             'tStamp_prev',
             'next_event',
             'tDelta_next',
             'tTotal',
             'tRelativePosition',
             'CE_VNB_GP_NR',
             'CE_VNB_GP_NAME'
            ], axis=1)

caseID = df['CE_CASE_ID'].to_numpy()
y_a = df['next_event'].to_numpy()
y_t = df['tDelta_next'].to_numpy()

### One-Hot-Encoding

In [12]:
X = oh_encoder(X)

OneHotEncoding categorical features (drop_first=True)...
Encoded the following  features:
	CE_ACTIVITY
	CE_LIEFERANT_NEU_NAME
	CE_SOURSYSTEM
Shape after encoding: (204687, 44)


In [13]:
lb = LabelBinarizer()
y_a_enc = lb.fit_transform(y_a); print("y_a_enc shape:", y_a_enc.shape)

y_a_enc shape: (204687, 29)


### Scale Features that do not suffer from data leakage

In [14]:
X['tSince_midnight'] = X['tSince_midnight'] / 86400
X['dWeekday'] = X['dWeekday'] / 7

In [15]:
MAX_EVENTS = df.groupby(['CE_CASE_ID'], sort=False).apply(lambda x: len(x)).max()
MAX_EVENTS

43

In [16]:
X_vec = vectorize(X.to_numpy(), caseID, MAX_EVENTS)
X_vec.shape

Vectorizing preprocessed data...


(204687, 43, 44)

In [27]:
X_vec[2, -1, : ]

array([0.0000000e+00, 0.0000000e+00, 2.0000000e+00, 6.2000000e+01,
       3.8000000e-01, 4.2857143e-01, 3.0000000e+00, 1.5361462e+09,
       1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
      dtype=float32)

In [25]:
X.columns

Index(['BPEM_event', 'BPEM_case', 'tDelta_prev', 'tSince_casestart',
       'tSince_midnight', 'dWeekday', 'ranking', 'CE_GP_NR', 'CE_EVIPK_OH',
       'CE_LIEFERANT_ALT_NAME_OH', 'CE_LIEFERANT_ALT_NR_OH', 'CE_SPARTE_OH',
       'CE_ACTIVITY_1011 - Lieferbeginn (Sicht Neuer Lieferant) / Abgelehnt [13] + Obsolet/Abgebrochen [05] + Storniert [04]',
       'CE_ACTIVITY_1011 - Lieferbeginn (Sicht Neuer Lieferant) / Zu lösender BPEM-Fall [12] + Workflow-Fehler [11]',
       'CE_ACTIVITY_1011 / 0010: Erster Prozessschritt: Lieferbeginn',
       'CE_ACTIVITY_1011 / 0020: Auf Stamm- und Geschäftsdaten prüfen',
       'CE_ACTIVITY_1011 / 0030: Prüfen, ob Kündigung angestoßen werden muss',
       'CE_ACTIVITY_1011 / 0040: Prozess 1021 starten (Kündigung zwischen Lieferanten)',
       'CE_ACTIVITY_1011 / 0050: Benachrichtigung von Prozess 1021 empfangen',
       'CE_ACTIVITY_1011 / 0060: Termin für Prozess 1021',
       'CE_ACTIVITY_1011 / 0070: Antwortergebnis von Prozess 1021 prüfen',
       'C

In [17]:
split_ratio = 0.20

caseID_train, caseID_test_temp, X_train, X_test_temp, y_a_train, y_a_test_temp, y_t_train, y_t_test_temp\
    = train_test_split(caseID, X_vec, y_a_enc, y_t, test_size=split_ratio, shuffle=True, stratify=y_a_enc, random_state=SEED)

# Split val/test split from test data
caseID_test, caseID_val, X_test, X_val, y_a_test, y_a_val, y_t_test, y_t_val\
    = train_test_split(caseID_test_temp, X_test_temp, y_a_test_temp, y_t_test_temp, test_size=0.5,
                       shuffle=True, stratify=y_a_test_temp, random_state=SEED)

In [None]:
np.save('data/caseID_test', caseID_test)
np.save('data/X_test', X_test)
np.save('data/y_a_test', y_a_test)
np.save('data/y_t_test', y_t_test)

### Scaling

In [28]:
# scaling the variables in the vectorized form
columns_to_rescale = ["tDelta_prev", "tSince_casestart"]
idx_columns_to_rescale = [a for (a, b) in enumerate(X.columns) if b in columns_to_rescale]

averages_train = np.mean(X_train, axis=(0, 1))[idx_columns_to_rescale]
averages_dict_train = {x: y for (x, y) in zip(idx_columns_to_rescale, averages_train)}

rescaling_parameters_train = {x: y for (x, y) in zip(columns_to_rescale, averages_train)}
rescaling_parameters_train["tSince_midnight"] = 86400
rescaling_parameters_train["dWeekday"] = 7
y_t_train_mean = np.mean(y_t_train)
rescaling_parameters_train["y_t_train_mean"] = y_t_train_mean

# rescaling using the information from the training set
for idx in idx_columns_to_rescale:
    X_train[:, :, idx] = X_train[:, :, idx]/averages_dict_train[idx]
    X_val[:, :, idx] = X_val[:, :, idx] / averages_dict_train[idx]
    X_test[:, :, idx] = X_test[:, :, idx] / averages_dict_train[idx]

In [29]:
# Scaling time target variable
y_t_train = y_t_train/y_t_train_mean
y_t_test = y_t_test/y_t_train_mean
y_t_val = y_t_val/y_t_train_mean

train_data = [X_train, y_a_train, y_t_train]
val_data = (X_val, y_a_val, y_t_val)
test_data = [X_test, y_a_test, y_t_test]

In [30]:
print(y_t_train_mean)

54673.161356710574


# Training

In [31]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

from google.protobuf import struct_pb2
from tensorboard.plugins.hparams import summary as hparams_summary
from tensorboard.plugins.hparams import api_pb2

from evaluate_lstm import plot_history

# ref: https://www.tensorflow.org/guide/gpu
#from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.debugging.set_log_device_placement(True)

Num GPUs Available:  0


In [32]:
def create_model(input_shape, event_output_shape, hparams=None):
    print('Creating the model...')

    if hparams:
        dropout = hparams['dropout_rate']
        num_units = hparams['num_units']
    else:
        dropout = 0.2
        num_units = 500

    # Define the input Layer
    input_layer = Input(shape=input_shape, name='input_layer')

    # Define the shared LSTM layer
    # "return_sequences" must be true when stacking LSTMs
    shared_layer = LSTM(num_units, implementation=2, return_sequences=True, dropout=dropout, recurrent_dropout=dropout)(input_layer)
    shared_batch = BatchNormalization()(shared_layer)

    # Define the layer specialized in event prediction
    event_layer = LSTM(num_units, implementation=2, dropout=dropout, recurrent_dropout=dropout)(shared_batch)
    event_batch = BatchNormalization()(event_layer)

    # Define the layer specialized in time prediction
    time_layer = LSTM(num_units, implementation=2, dropout=dropout, recurrent_dropout=dropout)(shared_batch)
    time_batch = BatchNormalization()(time_layer)

    # Define the output layers
    event_output = Dense(event_output_shape, activation='softmax', name='event_output')(event_batch)
    time_output = Dense(1, activation='relu', name='time_output')(time_batch)

    # Create the model
    # Includes all layers required in the computation of outputs given inputs
    model = Model(inputs=input_layer, outputs=[event_output, time_output])

    model.summary()

    print(hparams)

    return model


def create_callbacks(train_dir, hparams, tensorboard=False):
    print('Creating the callbacks...')

    if hparams:
        redLR_pat = hparams['redLR_pat']
        estop_pat = hparams['estop_pat']
    else:
        redLR_pat = 10
        estop_pat = 12

    callbacks = []

    # Model Checkpoint
    cp_dir = train_dir/'Checkpoints'
    cp_dir.mkdir(parents=True, exist_ok=True)
    cp_path = str(cp_dir) + '/cp_{epoch:04d}.h5'

    callbacks.append(ModelCheckpoint(cp_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto'))

    # Create callbacks for model fitting
    callbacks.append(EarlyStopping(monitor='val_loss', patience=estop_pat))
    callbacks.append(ReduceLROnPlateau(monitor='val_loss', verbose=2, factor=0.5, patience=redLR_pat,
                                       mode='auto', min_delta=0.0001, cooldown=0, min_lr=0))

    if tensorboard:
        callbacks.append(TensorBoard(dir_hparams / 'keras'))
        print(" -> Added Tensorboard to callbacks")

    print(" -> callbacks created")

    return callbacks


def load_model(model_path):
    return tf.keras.models.load_model(model_path)

In [33]:
def run_training(train_data, val_data, hparams=None, tboard=False, verbose=1):

    model_properties = f"Model_"\
        f"eps_{hparams['epochs']:03d}_" \
        f"units_{hparams['num_units']:03d}_" \
        f"drop_{hparams['dropout_rate']:.2f}_" \
        f"lr_{hparams['learning_rate']:.4f}_"\
        f"red_{hparams['redLR_pat']:02d}_" \
        f"stop_{hparams['estop_pat']:02d}" #\
        #+ str(datetime.now()).replace(" ", "_")

    train_dir = dir_models / model_properties

    # Initialise data and parameters
    X_train, y_a_train, y_t_train = train_data

    if hparams:
        learning_rate = hparams['learning_rate']
        epochs = hparams['epochs']
    else:
        learning_rate = 0.002
        epochs = 200

    """ Creating and fitting the model """
    # Create the model
    model = create_model(X_train.shape[1:], y_a_train.shape[1], hparams)

    # Compile the model
    # compile_model(model, learning_rate)
    print("Compiling the model...")
    # https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Nadam
    model.compile(optimizer=Nadam(lr=learning_rate, epsilon=1e-08, clipvalue=3.),
                  loss={'event_output': 'categorical_crossentropy', 'time_output': 'mae'}, # mae vs mse?
                  #loss_weights={'event_output': 10., 'time_output': .5},
                  metrics=['accuracy', 'mae'])

    # Create the callbacks
    callbacks = create_callbacks(train_dir, hparams, tensorboard=tboard)

    # Fit the model
    history = model.fit(X_train, {'event_output': y_a_train, 'time_output': y_t_train},
                        validation_data=(val_data[0], {'event_output': y_a_val, 'time_output': val_data[2]}),
                        batch_size=128, epochs=epochs, callbacks=None, verbose=verbose)
                        #batch_size=32, epochs=epochs, callbacks=callbacks, verbose=verbose)
        
    """ Postprocessing """

    # Save the model
    model.save(train_dir / 'final_model.h5')

    # Save the training history
    hist_df = pd.DataFrame.from_dict(history.history)
    with open(train_dir / 'trainHistoryDict.csv', 'w', newline='') as handle:
        hist_df.to_csv(handle, index=False)

    # Load the training history
    hist_df = pd.read_csv(train_dir / 'trainHistoryDict.csv')
    hist_df = hist_df.to_dict()

    # Plot the training history and save plots
    plot_history(hist_df, train_dir)

    return model, history, train_dir


def init_training(train_data, val_data, units_list, dr_list, lr_list, epochs, redLR_patience, es_patience):
    models = []
    histories = []
    train_dirs = []

    for num_units in units_list:
        for dropout_rate in dr_list:
            for learning_rate in lr_list:
                hparams = {
                    'epochs': epochs,
                    'num_units': num_units,
                    'dropout_rate': dropout_rate,
                    'learning_rate': learning_rate,
                    'redLR_pat': redLR_patience,
                    'estop_pat': es_patience
                }

                model, history, train_dir = run_training(train_data, val_data, hparams)

                models.append(model)
                histories.append(history)
                train_dirs.append(train_dir)

    return models, histories, train_dirs

In [34]:
"""
# Start Training and Evaluate the Model
#
"""
units_list = [500]              # Layer Output Dimension Space
dr_list = [0.3]                 # dropout rates
lr_list = [0.001]               # learning rates
epochs = 2                    # Number of Iterations
redLR_patience = 10             # Patience before reducing the Learning rate
es_patience = 20                # Patience before early stopping the Learning Process

models, histories, train_dirs = init_training(train_data, val_data, units_list, dr_list, lr_list, epochs, redLR_patience, es_patience)

Creating the model...
Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RandomStandardNormal in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Qr in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op DiagPart in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sign in device /job:loca

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:

KeyboardInterrupt: 