In [77]:
from log import logger
logger.info('Importing libraries...')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#import the neccessary module
from helper import Model
from helper import Auxiliary
# Modelling
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session

[38;5;39m2023-03-24 13:18:18,097 -     INFO - Importing libraries... (2238536406.py:2)[0m


Get train test data


In [94]:
#df_train, df_test = Auxiliary.train_test_split(Auxiliary.preprocess_data(Model.get_csv('BPI_Challenge_2012.csv')))
logger.info('Reading data...')
df_train = Model.get_csv('train.csv')
df_test = Model.get_csv('test.csv')

#only select needed columns
logger.info("Selecting columns:'case:concept:name', 'concept:name', 'time:timestamp', 'Next Time', 'lifecycle:transition'")
df_train = df_train[['case:concept:name', 'concept:name', 'time:timestamp', 'Next Time', 'lifecycle:transition']]
df_test = df_test[['case:concept:name', 'concept:name', 'time:timestamp', 'Next Time', 'lifecycle:transition']]

#remove entries where there is NaN
logger.info('Removing NaN values...')
df_train = df_train.dropna()
df_test = df_test.dropna()
df_train = df_train.replace(-1, 0)
df_test = df_test.replace(-1, 0)

#eliminate outliers
logger.info('Eliminating outliers...')
#q__train_low = df_train["Next Time"].quantile(0.03)
q__train_hi  = df_train["Next Time"].quantile(0.97)

#df_train = df_train[(df_train["Next Time"] < q__train_hi) & (df_train["Next Time"] > q__train_low)]
df_train = df_train[df_train["Next Time"] < q__train_hi]
#q_test_low = df_test["Next Time"].quantile(0.01)
q_test_hi = df_test["Next Time"].quantile(0.97)
#df_test = df_test[(df_test["Next Time"] < q_test_hi) & (df_test["Next Time"] > q_test_low)]
df_test = df_test[df_test["Next Time"] < q_test_hi]

#One hot encoding of the activities
logger.info('One hot encoding of the activities...')
df_train['concept:name'] = df_train['concept:name'].apply(lambda x: Auxiliary.one_hot_encode(x, 24))
df_test['concept:name'] = df_test['concept:name'].apply(lambda x: Auxiliary.one_hot_encode(x, 24))


from datetime import datetime
#get week day from timestamp and hot encode it
logger.info('Pulling the week day from the timestamp and hot encoding it...')
df_train['time:timestamp'] = pd.to_datetime(df_train['time:timestamp'])

df_train['Week Day'] = df_train['time:timestamp'].apply(lambda x: x.strftime('%w'))

df_train['Week Day'] = df_train['Week Day'].apply(lambda x: Auxiliary.one_hot_encode(int(x), 7))

df_test['time:timestamp'] = pd.to_datetime(df_test['time:timestamp'])

df_test['Week Day'] = df_test['time:timestamp'].apply(lambda x: int(x.strftime('%w')))

df_test['Week Day'] = df_test['Week Day'].apply(lambda x: Auxiliary.one_hot_encode(int(x), 7))

#drop timestamp
logger.info('Dropping timestamp...')
df_train.drop(['time:timestamp'], axis=1, inplace=True)
df_test.drop(['time:timestamp'], axis=1, inplace=True)

#encode lifecycle:transition
logger.info('Encoding lifecycle:transition...')
df_train['lifecycle:transition'] = df_train['lifecycle:transition'].apply(lambda x: Auxiliary.one_hot_encode(x, df_train['lifecycle:transition'].nunique()))
df_test['lifecycle:transition'] = df_test['lifecycle:transition'].apply(lambda x: Auxiliary.one_hot_encode(x, df_test['lifecycle:transition'].nunique()))

[38;5;39m2023-03-24 13:24:16,169 -     INFO - Reading data... (910505981.py:2)[0m
[38;5;39m2023-03-24 13:24:16,338 -     INFO - Selecting columns:'case:concept:name', 'concept:name', 'time:timestamp', 'Next Time', 'lifecycle:transition' (910505981.py:7)[0m
[38;5;39m2023-03-24 13:24:16,344 -     INFO - Removing NaN values... (910505981.py:12)[0m
[38;5;39m2023-03-24 13:24:16,356 -     INFO - Eliminating outliers... (910505981.py:19)[0m
[38;5;39m2023-03-24 13:24:16,364 -     INFO - One hot encoding of the activities... (910505981.py:31)[0m
[38;5;39m2023-03-24 13:24:16,489 -     INFO - Pulling the week day from the timestamp and hot encoding it... (910505981.py:38)[0m
[38;5;39m2023-03-24 13:24:16,953 -     INFO - Dropping timestamp... (910505981.py:52)[0m
[38;5;39m2023-03-24 13:24:16,958 -     INFO - Encoding lifecycle:transition... (910505981.py:57)[0m


Encode the concept:name

In [95]:
#x_train = df_train[['org:resource', 'lifecycle:transition','concept:name','case:AMOUNT_REQ','month', 'day']]
logger.info('Splitting data into x and y...')
x_train = df_train[['concept:name', 'lifecycle:transition', 'Week Day']]
y_train = df_train[['Next Time']]
#x_test = df_test[['org:resource', 'lifecycle:transition','concept:name','case:AMOUNT_REQ','month', 'day']]
x_test = df_test[['concept:name' , 'lifecycle:transition', 'Week Day']]
y_test = df_test[['Next Time']]

[38;5;39m2023-03-24 13:24:37,701 -     INFO - Splitting data into x and y... (4105737707.py:2)[0m


In [96]:
print(x_train.shape)
print(y_train.shape)

(37696, 3)
(37696, 1)


Normalizing y values 

In [97]:
#normalize y values

split_location = y_train.shape[0]
logger.info('Normalizing y values...')
y_df = pd.concat([y_train, y_test])

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(y_df)

y_df = scaler.transform(y_df)

#Resplit as numpy arrays
y_train = y_df[0:split_location]
y_test = y_df[split_location:]

x_train = x_train.values
x_test = x_test.values

[38;5;39m2023-03-24 13:24:44,265 -     INFO - Normalizing y values... (700907669.py:4)[0m


In [98]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(37696, 3) (37696, 1) (16684, 3) (16684, 1)


In [99]:
logger.info('Reshaping x values from 3D to 2D...')
temp_array = []
real_x_train = []
real_x_test = []
for index,value in enumerate(x_train):
    temp_array = []
    temp_array = value[0]
    temp_array = np.append(temp_array, value[1])
    temp_array = np.append(temp_array, value[2])
    real_x_train.append(temp_array)
for index,value in enumerate(x_test):
    temp_array = []
    temp_array = value[0]
    temp_array = np.append(temp_array, value[1])
    temp_array = np.append(temp_array, value[2])
    real_x_test.append(temp_array)

x_train = np.array(real_x_train)
x_test = np.array(real_x_test)

[38;5;39m2023-03-24 13:24:48,175 -     INFO - Reshaping x values from 3D to 2D... (789070195.py:1)[0m


RFR Hyperparameter optimizer through grid search

In [None]:
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

RFR_space = {'criterion': hp.choice('criterion', ['squared_error', 'absolute_error', 'poisson', 'friedman_mse']),
             'max_depth': hp.choice('max_depth', range(1, 10)),
             'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
             'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
             'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0, 0.5),
             'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
             'max_leaf_nodes': hp.choice('max_leaf_nodes', range(2, 10)),
             'min_impurity_decrease': hp.uniform('min_impurity_decrease', 0, 0.5),
             'verbose': 1,
             'n_jobs': None
             }
def RFR_objective(space):
    model = RandomForestRegressor(**space)
    accuracy = cross_val_score(model, x_train, y_train, cv = 5).mean()
    return {'loss': -accuracy, 'status': STATUS_OK}
    
trials = Trials()

best_RFR = fmin(fn = RFR_objective,
            space = RFR_space,
            algo = tpe.suggest,
            max_evals = 50,
            trials = trials)


best_RFR

'criterion' : 'friedman_mse'
'max_depth' : 9
'max_features': 'log2'
'max_leaf_nodes' : 6
'min_impurity_decrease' : 0.3345
'min_samples_leaf' : 5
'min_samples_split' : 5
'min_weight_fraction_leaf': 0.000216


In [101]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(criterion = 'friedman_mse',
                                max_depth = 9,
                                max_features= 'log2',
                                max_leaf_nodes= 6,
                                min_impurity_decrease= 0.3345,
                                min_samples_leaf= 5,
                                min_samples_split= 5,
                                min_weight_fraction_leaf= 0.000216,
                                )

model = regr.fit(x_train, y_train)

regr2 = RandomForestRegressor()

model2 = regr2.fit(x_train, y_train)


y_test_pred = model.predict(x_test)
y_2_test_pred = model2.predict(x_test)

#save 
import sklearn.metrics as sm
import scipy.stats as stats

print("Grid searched hyperparameters metrics")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2)) 
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2)) 
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2)) 
print("Explain variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2)) 
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))
print("Z score =",  np.average(stats.zscore(y_test_pred)))
print('\n')
print("No hyperparameters metrics")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_2_test_pred), 2)) 
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_2_test_pred), 2)) 
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_2_test_pred), 2)) 
print("Explain variance score =", round(sm.explained_variance_score(y_test, y_2_test_pred), 2)) 
print("R2 score =", round(sm.r2_score(y_test, y_2_test_pred), 2))
print("Z score =",  np.average(stats.zscore(y_2_test_pred)))


  model = regr.fit(x_train, y_train)
  model2 = regr2.fit(x_train, y_train)


Grid searched hyperparameters metrics
Mean absolute error = 0.04
Mean squared error = 0.01
Median absolute error = 0.02
Explain variance score = -0.0
R2 score = -0.01
Z score = 1.490589531982948e-17


No hyperparameters metrics
Mean absolute error = 0.05
Mean squared error = 0.02
Median absolute error = 0.0
Explain variance score = -0.15
R2 score = -0.16
Z score = -3.172826289506561e-17


In [28]:
clear_session()

In [None]:
print(x_train[4])

In [102]:
import numpy as npimport
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVR

regressor = SVR(kernel = 'rbf')
regressor.fit(x_train, y_train)
svm_y_test = regressor.predict(x_test)

import sklearn.metrics as sm
import scipy.stats as stats

print("SVM metrics")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, svm_y_test), 2)) 
print("Mean squared error =", round(sm.mean_squared_error(y_test, svm_y_test), 2)) 
print("Median absolute error =", round(sm.median_absolute_error(y_test, svm_y_test), 2)) 
print("Explain variance score =", round(sm.explained_variance_score(y_test, svm_y_test), 2)) 
print("R2 score =", round(sm.r2_score(y_test, svm_y_test), 2))
print("Z score =",  np.average(stats.zscore(svm_y_test)))

  y = column_or_1d(y, warn=True)


SVM metrics
Mean absolute error = 0.1
Mean squared error = 0.02
Median absolute error = 0.08
Explain variance score = -0.0
R2 score = -0.22
Z score = -1.9335075643435957e-16


In [103]:

# importing the libraries
from keras.models import Sequential
from keras.layers import Dense

In [104]:
clear_session()

In [106]:
# create ANN model
model = Sequential()
model.add(layers.Dense(10, input_dim=x_train.shape[1], activation='relu'))
model.add(layers.Dense(4))


model.add(layers.Dense(1, activation='sigmoid'))


#model.compile(loss = 'binary_crossentropy', optimizer='rmsprop' metrics = ['accuracy'], )
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])



In [108]:
# Fitting the ANN to the Training set
history = model.fit(x_train, y_train, batch_size=10, epochs=100, verbose= 1, validation_data=(x_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [111]:
nn_y_test = model.predict(x_test)

print("SVM metrics")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, nn_y_test), 2)) 
print("Mean squared error =", round(sm.mean_squared_error(y_test, nn_y_test), 2)) 
print("Median absolute error =", round(sm.median_absolute_error(y_test, nn_y_test), 2)) 
print("Explain variance score =", round(sm.explained_variance_score(y_test, nn_y_test), 2)) 
print("R2 score =", round(sm.r2_score(y_test, nn_y_test), 2))
print("Z score =",  np.average(stats.zscore(nn_y_test)))

SVM metrics
Mean absolute error = 0.04
Mean squared error = 0.02
Median absolute error = 0.0
Explain variance score = -0.09
R2 score = -0.12
Z score = -4.344237e-09


In [109]:
loss_and_accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training Loss and Accuracy:", loss_and_accuracy)

loss_and_accuracy = model.evaluate(x_test, y_test, verbose=False)
print("Testing Loss and Accuracy:", loss_and_accuracy)

Training Loss and Accuracy: [0.06900038570165634, 0.12372665852308273]
Testing Loss and Accuracy: [0.22466275095939636, 0.11082474142313004]


In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:

import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [None]:
# Defining a function to find the best parameters for ANN
def FunctionFindBestParams(x_train, y_train, x_test, y_test):
    
    # Defining the list of hyper parameters to try
    #batch_size_list=[5, 10, 15, 20]
    batch_size_list=[5,10]
    #epoch_list  =   [5, 10, 50, 100]
    epoch_list = [5,10]
    
    import pandas as pd
    SearchResultsData=pd.DataFrame(columns=['TrialNumber', 'Parameters', 'Accuracy'])
    
    # initializing the trials
    TrialNumber=0
    for batch_size_trial in batch_size_list:
        for epochs_trial in epoch_list:
            TrialNumber+=1
            # create ANN model
            model = Sequential()
            # Defining the first layer of the model
            model.add(Dense(units=5, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
 
            # Defining the Second layer of the model
            model.add(Dense(units=5, kernel_initializer='normal', activation='relu'))
 
            # The output neuron is a single fully connected node 
            # Since we will be predicting a single number
            model.add(Dense(1, kernel_initializer='normal'))
 
            # Compiling the model
            model.compile(loss='mean_squared_error', optimizer='adam')
 
            # Fitting the ANN to the Training set
            model.fit(x_train, y_train ,batch_size = batch_size_trial, epochs = epochs_trial, verbose=0)
 
            error = y_test-model.predict(x_test)
            error = np.nan_to_num(error, nan=0, posinf=0, neginf=0)
            MAPE = np.mean(100 * (np.abs(error) / y_test))

            
            
            # printing the results of the current iteration
            print(TrialNumber, 'Parameters:','batch_size:', batch_size_trial,'-', 'epochs:',epochs_trial, 'Accuracy:', 100-MAPE)
            
            SearchResultsData.append(pd.DataFrame(data=[[TrialNumber, str(batch_size_trial)+'-'+str(epochs_trial), 100-MAPE]],
                                                                    columns=['TrialNumber', 'Parameters', 'Accuracy'] ))
    return(SearchResultsData)

In [None]:
print(y_train)

In [None]:
# Calling the function
ResultsData=FunctionFindBestParams(x_train, y_train, x_test, y_test)

In [None]:
ResultsData.plot(x='Parameters', y='Accuracy', figsize=(15,4), kind='line')