In [1]:
import pickle
import joblib
import os 
import random

import pandas as pd
import numpy as np

from numpy import mean, std
from joblib import dump, load
from train_test_split import train_test_split_custom
from datetime import timedelta
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, Normalizer

# HERE YOU HAVE TO CHOOSE BPI_Challenge_2012 or BPI_Challenge_2017
chosen_dataset = 'BPI_Challenge_2017'

# RFC for next event prediction

In [31]:
# extract the 1-3 lags
df = pd.read_csv(f'../data/{chosen_dataset}_naive.csv')[:1000]

df['concept:name - lag_1'] = df.groupby('case:concept:name')['concept:name'].shift(1).fillna('absent')
df['concept:name - lag_2'] = df.groupby('case:concept:name')['concept:name'].shift(2).fillna('absent')
df['concept:name - lag_3'] = df.groupby('case:concept:name')['concept:name'].shift(3).fillna('absent')
df['concept:name - lag_4'] = df.groupby('case:concept:name')['concept:name'].shift(4).fillna('absent')
df['concept:name - lag_5'] = df.groupby('case:concept:name')['concept:name'].shift(5).fillna('absent')
df['concept:name - lag_6'] = df.groupby('case:concept:name')['concept:name'].shift(6).fillna('absent')
df['concept:name - lag_7'] = df.groupby('case:concept:name')['concept:name'].shift(7).fillna('absent')
df['concept:name - lag_8'] = df.groupby('case:concept:name')['concept:name'].shift(8).fillna('absent')
df['concept:name - lag_9'] = df.groupby('case:concept:name')['concept:name'].shift(9).fillna('absent')
df['concept:name - lag_10'] = df.groupby('case:concept:name')['concept:name'].shift(10).fillna('absent')

# define target
df['next concept:name'] = df.groupby('case:concept:name')['concept:name'].shift(-1).fillna('END')

# # Split the DataFrame 
# df = df.iloc[:10000]

df

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,...,concept:name - lag_2,concept:name - lag_3,concept:name - lag_4,concept:name - lag_5,concept:name - lag_6,concept:name - lag_7,concept:name - lag_8,concept:name - lag_9,concept:name - lag_10,next concept:name
0,0,0,0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304,...,absent,absent,absent,absent,absent,absent,absent,absent,absent,A_Submitted
1,1,1,1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352,...,absent,absent,absent,absent,absent,absent,absent,absent,absent,W_Handle leads
2,2,2,2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774,...,A_Create Application,absent,absent,absent,absent,absent,absent,absent,absent,W_Handle leads
3,3,3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392,...,A_Submitted,A_Create Application,absent,absent,absent,absent,absent,absent,absent,W_Complete application
4,4,4,4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403,...,W_Handle leads,A_Submitted,A_Create Application,absent,absent,absent,absent,absent,absent,A_Concept
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,995,657,Obtained,User_43,W_Complete application,Workflow,Workitem_1533178514,resume,2016-01-04 08:04:37.242,...,W_Complete application,W_Complete application,W_Complete application,A_Concept,W_Complete application,W_Handle leads,W_Handle leads,A_Submitted,A_Create Application,END
996,996,996,4600,Created,User_19,A_Create Application,Application,Application_1446096185,complete,2016-01-04 08:08:21.242,...,absent,absent,absent,absent,absent,absent,absent,absent,absent,W_Complete application
997,997,997,4601,Created,User_19,W_Complete application,Workflow,Workitem_656543675,schedule,2016-01-04 08:08:21.255,...,absent,absent,absent,absent,absent,absent,absent,absent,absent,W_Complete application
998,998,998,4602,Obtained,User_19,W_Complete application,Workflow,Workitem_1808202340,start,2016-01-04 08:08:21.261,...,A_Create Application,absent,absent,absent,absent,absent,absent,absent,absent,A_Concept


In [32]:
df.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'EventID', 'lifecycle:transition',
       'time:timestamp', 'case:LoanGoal', 'case:ApplicationType',
       'case:concept:name', 'case:RequestedAmount', 'FirstWithdrawalAmount',
       'NumberOfTerms', 'Accepted', 'MonthlyCost', 'Selected', 'CreditScore',
       'OfferedAmount', 'OfferID', 'start_timestamp', 'time:timestamp diff',
       'timestamp_date', 'day_of_the_week', 'Weekday', 'working_hours',
       'vacation_day', 'next time:timestamp', 'next time:timestamp naive',
       'next concept:name naive', 'concept:name - lag_1',
       'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4',
       'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7',
       'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10',
       'next concept:name'],
      dtype='object')

In [33]:
# Prepare data

df_train, df_test = train_test_split_custom(df=df, test_size=0.2, lags=True)

columns = ['concept:name' , 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10', 'next concept:name']
label_encoders = {}
for column in columns:
        label_encoder = LabelEncoder()
        df_test[column] = label_encoder.fit_transform(df_test[column])
        df_train[column] = label_encoder.fit_transform(df_train[column])
        df[column] = label_encoder.fit_transform(df[column])
        label_encoders[column] = label_encoder

X_train = df_train[['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']]
X_test = df_test[['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']]

y_train = df_train[['next concept:name']]
y_test = df_test[['next concept:name']]

print(f"""
    inputs: {[col for col in X_test.columns]} \n
    target: {[col for col in y_test.columns]}
""")

if not os.path.exists('../model_weights/random_forest.pkl'):

    rf_clf = RandomForestClassifier(n_jobs=-1)

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [23, 30, 40],
        'max_depth': [15, 20, 25],
        'min_samples_split': [2, 10, 15],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform nested cross-validation
    inner_cv = KFold(n_splits=5, shuffle=False, random_state=None)  # 5-fold inner cross-validation

    grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, scoring='accuracy', cv=inner_cv)
    grid_search.fit(X_train, y_train.values.ravel())

    # Get the best model
    best_model = grid_search.best_estimator_

    # Print the results

    print(f"""
        Next label prediction:\n
        Score on the test set {best_model.score(X_test, y_test)}
        Best parameters: {grid_search.best_params_}
    """)

    # Save the best model to a file using pickle
    # Create the folder for model weights
    os.makedirs('../model_weights', exist_ok=True)
    model_filename = '../model_weights/random_forest.pkl'

    with open(model_filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
        
else: 
    with open('../model_weights/random_forest.pkl', 'rb') as f:
        best_model = pickle.load(f)

# Make predictions on the dataset for adding new column
df['next concept:name rfc'] = label_encoder.inverse_transform(best_model.predict(df[['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']]))

for column in columns:
        df[column] = label_encoders[column].inverse_transform(df[column])

df


    ######################################## TRAIN TEST INFO #######################################

      Train set ends with 2016-01-03 17:49:37.384

      Test set starts with: 2016-01-03 18:50:16.638

    ################################################################################################

    

    inputs: ['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10'] 

    target: ['next concept:name']



Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,...,concept:name - lag_3,concept:name - lag_4,concept:name - lag_5,concept:name - lag_6,concept:name - lag_7,concept:name - lag_8,concept:name - lag_9,concept:name - lag_10,next concept:name,next concept:name rfc
0,0,0,0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304,...,absent,absent,absent,absent,absent,absent,absent,absent,A_Submitted,A_Incomplete
1,1,1,1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352,...,absent,absent,absent,absent,absent,absent,absent,absent,W_Handle leads,W_Call after offers
2,2,2,2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774,...,absent,absent,absent,absent,absent,absent,absent,absent,W_Handle leads,W_Call after offers
3,3,3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392,...,A_Create Application,absent,absent,absent,absent,absent,absent,absent,W_Complete application,O_Sent (online only)
4,4,4,4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403,...,A_Submitted,A_Create Application,absent,absent,absent,absent,absent,absent,A_Concept,O_Sent (online only)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,995,657,Obtained,User_43,W_Complete application,Workflow,Workitem_1533178514,resume,2016-01-04 08:04:37.242,...,W_Complete application,W_Complete application,A_Concept,W_Complete application,W_Handle leads,W_Handle leads,A_Submitted,A_Create Application,END,O_Sent (online only)
996,996,996,4600,Created,User_19,A_Create Application,Application,Application_1446096185,complete,2016-01-04 08:08:21.242,...,absent,absent,absent,absent,absent,absent,absent,absent,W_Complete application,A_Incomplete
997,997,997,4601,Created,User_19,W_Complete application,Workflow,Workitem_656543675,schedule,2016-01-04 08:08:21.255,...,absent,absent,absent,absent,absent,absent,absent,absent,W_Complete application,O_Sent (online only)
998,998,998,4602,Obtained,User_19,W_Complete application,Workflow,Workitem_1808202340,start,2016-01-04 08:08:21.261,...,absent,absent,absent,absent,absent,absent,absent,absent,A_Concept,A_Concept


In [34]:
df.to_csv(f'../data/{chosen_dataset}_rfc_xgboost.csv')

# XGBoost for time till next event prediction

In [25]:
df = pd.read_csv(f'../data/{chosen_dataset}_rfc_xgboost.csv')

# counting elapsed time
df['elapsed time:timestamp'] = df['time:timestamp diff'].shift(-1) 
df['elapsed time:timestamp'] = pd.to_timedelta(df['elapsed time:timestamp'])
df['elapsed time:timestamp'] = df['elapsed time:timestamp'].apply(lambda x: x.total_seconds()).fillna(-0.01)
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'])
df['CreditScore'] = df['CreditScore'].fillna(0)

# lags for time between events in past 

df['elapsed time:timestamp - lag_1'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(1).fillna(-0.00000001)
df['elapsed time:timestamp - lag_2'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(2).fillna(-0.00000001)
df['elapsed time:timestamp - lag_3'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(3).fillna(-0.00000001)
df['elapsed time:timestamp - lag_4'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(4).fillna(-0.00000001)
df['elapsed time:timestamp - lag_5'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(5).fillna(-0.00000001)
df['elapsed time:timestamp - lag_6'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(6).fillna(-0.00000001)
df['elapsed time:timestamp - lag_7'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(7).fillna(-0.00000001)
df['elapsed time:timestamp - lag_8'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(8).fillna(-0.00000001)
df['elapsed time:timestamp - lag_9'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(9).fillna(-0.00000001)
df['elapsed time:timestamp - lag_10'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(10).fillna(-0.00000001)


# preprocess the columns before fitting
preprocessors = {}
for column in df.columns:
    if column == 'time:timestamp':
        df['year'] = df['time:timestamp'].dt.year
        df['month'] = df['time:timestamp'].dt.month
        df['day'] = df['time:timestamp'].dt.day
        df['hour'] = df['time:timestamp'].dt.hour

    elif column in ['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10', 'next concept:name rfc', 'org:resource']:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        preprocessors[column] = le

    else:
        continue
    
# split the data on train and test dataframes 
df_train, df_test = train_test_split_custom(df=df, lags=False, test_size=0.2)

# define the input and outputs for the model before put in the train 

X = df[['concept:name',  # this X is for making prediction on the dataframe for saving results in the csv
                    'concept:name - lag_1', 
                    'concept:name - lag_2', 
                    'concept:name - lag_3', 
                    'concept:name - lag_4', 
                    'concept:name - lag_5', 
                    'concept:name - lag_6', 
                    'concept:name - lag_7', 
                    'concept:name - lag_8', 
                    'concept:name - lag_9', 
                    'concept:name - lag_10', 
                    'next concept:name rfc', 
                    'year', 
                    'month', 
                    'day', 
                    'hour',
                    'CreditScore',
                    'org:resource',
                    'elapsed time:timestamp - lag_1', 
                    'elapsed time:timestamp - lag_2', 
                    'elapsed time:timestamp - lag_3',
                    'elapsed time:timestamp - lag_4',
                    'elapsed time:timestamp - lag_5',
                    'elapsed time:timestamp - lag_6',
                    'elapsed time:timestamp - lag_7',
                    'elapsed time:timestamp - lag_8',
                    'elapsed time:timestamp - lag_9',
                    'elapsed time:timestamp - lag_10']]

X_train = df_train[['concept:name', 
                    'concept:name - lag_1', 
                    'concept:name - lag_2', 
                    'concept:name - lag_3', 
                    'concept:name - lag_4', 
                    'concept:name - lag_5', 
                    'concept:name - lag_6', 
                    'concept:name - lag_7', 
                    'concept:name - lag_8', 
                    'concept:name - lag_9', 
                    'concept:name - lag_10',  
                    'next concept:name rfc', 
                    'year', 
                    'month', 
                    'day', 
                    'hour',
                    'CreditScore', 
                    'org:resource',
                    'elapsed time:timestamp - lag_1',
                    'elapsed time:timestamp - lag_2',
                    'elapsed time:timestamp - lag_3',
                    'elapsed time:timestamp - lag_4',
                    'elapsed time:timestamp - lag_5',
                    'elapsed time:timestamp - lag_6',
                    'elapsed time:timestamp - lag_7',
                    'elapsed time:timestamp - lag_8',
                    'elapsed time:timestamp - lag_9',
                    'elapsed time:timestamp - lag_10']]

X_test = df_test[['concept:name', 
                    'concept:name - lag_1', 
                    'concept:name - lag_2', 
                    'concept:name - lag_3', 
                    'concept:name - lag_4', 
                    'concept:name - lag_5', 
                    'concept:name - lag_6', 
                    'concept:name - lag_7', 
                    'concept:name - lag_8', 
                    'concept:name - lag_9', 
                    'concept:name - lag_10',  
                    'next concept:name rfc', 
                    'year', 
                    'month', 
                    'day', 
                    'hour',
                    'CreditScore',
                    'org:resource',
                    'elapsed time:timestamp - lag_1', 
                    'elapsed time:timestamp - lag_2', 
                    'elapsed time:timestamp - lag_3',
                    'elapsed time:timestamp - lag_4',
                    'elapsed time:timestamp - lag_5',
                    'elapsed time:timestamp - lag_6',
                    'elapsed time:timestamp - lag_7',
                    'elapsed time:timestamp - lag_8',
                    'elapsed time:timestamp - lag_9',
                    'elapsed time:timestamp - lag_10']]

y_train = df_train['elapsed time:timestamp']

y_test = df_test[['elapsed time:timestamp']]

print(f"""
    inputs: {[col for col in X_test.columns]} \n
    target: {[col for col in y_test.columns]}
""")
# Define the parameter grid

param_grid = {
    'n_estimators': [85],
    'max_depth': [7],
    'learning_rate': [0.099]
}
# Initialize the model
model = XGBRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the GridSearchCV using the transformed training set
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(f"best params: {best_params}")

# Save the best model
model_filename = '../model_weights/xgboost.joblib'
dump(best_estimator, model_filename)
        
# Predict on the test set using the best estimator
y_pred_test = np.abs(best_estimator.predict(X_test))

# Evaluate the model on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f'Test RMSE: {rmse_test}')

r2_score_value = r2_score(y_test, y_pred_test)
print(f'R² score: {r2_score_value}')

# Update the dataframe with predictions from the best model
df['elapsed time:timestamp XGBoost'] = np.abs(np.where(best_estimator.predict(X) < 0, np.abs(best_estimator.predict(X)) / 10000, best_estimator.predict(X) / 1000))
df['elapsed time:timestamp'] = df['elapsed time:timestamp'].mask(df['elapsed time:timestamp'] < 0)
df = df.drop(columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'year', 'month', 'day', 'hour', 'elapsed time:timestamp - lag_10', 'elapsed time:timestamp - lag_9', 'elapsed time:timestamp - lag_8', 'elapsed time:timestamp - lag_7', 'elapsed time:timestamp - lag_6', 'elapsed time:timestamp - lag_5', 'elapsed time:timestamp - lag_4', 'elapsed time:timestamp - lag_3', 'elapsed time:timestamp - lag_2', 'elapsed time:timestamp - lag_1', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10'], errors='ignore')
df['elapsed time:timestamp'] = pd.to_timedelta(df['elapsed time:timestamp'], unit='seconds')
df['elapsed time:timestamp XGBoost'] = pd.to_timedelta(df['elapsed time:timestamp XGBoost'], unit='seconds')
df['next time:timestamp XGBoost'] = df['time:timestamp'] + df['elapsed time:timestamp XGBoost']
df = df.drop(columns=['elapsed time:timestamp XGBoost', 'elapsed time:timestamp'], errors='ignore')

# Decode categorical columns
for column in df.columns:
    if column in ['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10', 'next concept:name rfc', 'org:resource']:
        le = preprocessors[column]
        df[column] = le.inverse_transform(df[column])
    else:
        continue 

df.to_csv(f'../data/{chosed_dataset}_rfc_xgboost.csv')


    ######################################## TRAIN TEST INFO #######################################

      Train set ends with 2016-01-03 17:49:37.384000

      Test set starts with: 2016-01-03 18:50:16.638000

    ################################################################################################

    

    inputs: ['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10', 'next concept:name rfc', 'year', 'month', 'day', 'hour', 'CreditScore', 'org:resource', 'elapsed time:timestamp - lag_1', 'elapsed time:timestamp - lag_2', 'elapsed time:timestamp - lag_3', 'elapsed time:timestamp - lag_4', 'elapsed time:timestamp - lag_5', 'elapsed time:timestamp - lag_6', 'elapsed time:timestamp - lag_7', 'elapsed time:timestamp - lag_8', 'elapsed time:timestamp - lag_9', 'elapsed time:t

best params: {'learning_rate': 0.099, 'max_depth': 7, 'n_estimators': 85}
Test RMSE: 26051.70642027279
R² score: -0.09438168509858569


# Traces prediction

In [59]:
df = pd.read_csv(f'../data/BPI_Challenge_2017_rfc_xgboost.csv')[:400000]

In [60]:
events_types = df['next concept:name'].unique()

In [61]:
events_types

array(['A_Submitted', 'W_Handle leads', 'W_Complete application',
       'A_Concept', 'A_Accepted', 'O_Create Offer', 'O_Created',
       'O_Sent (online only)', 'W_Call after offers', 'A_Complete',
       'O_Cancelled', 'O_Sent (mail and online)',
       'W_Validate application', 'A_Validating',
       'W_Call incomplete files', 'A_Incomplete', 'A_Cancelled',
       'O_Returned', 'END', 'A_Denied', 'O_Accepted', 'A_Pending',
       'O_Refused', 'W_Assess potential fraud',
       'W_Personal Loan collection'], dtype=object)

In [62]:
df_traces = pd.read_json(f'../data/traces_event_log_{chosen_dataset}.json').tail(1000)

In [63]:
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 0.3,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,...,day_of_the_week,Weekday,working_hours,vacation_day,next time:timestamp,next time:timestamp naive,next concept:name naive,next concept:name,next concept:name rfc,next time:timestamp XGBoost
0,0,0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304,Existing loan takeover,...,Friday,1,1,1,2016-01-01 09:51:15.352,2016-01-01 09:51:15.344,A_Submitted,A_Submitted,A_Submitted,2016-01-01 09:51:15.505508164
1,1,1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352,Existing loan takeover,...,Friday,1,1,1,2016-01-01 09:51:15.774,2016-01-01 09:51:15.652,W_Handle leads,W_Handle leads,W_Handle leads,2016-01-01 09:51:16.147479476
2,2,2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774,Existing loan takeover,...,Friday,1,1,1,2016-01-01 09:52:36.392,2016-01-01 09:51:49.854,W_Handle leads,W_Handle leads,W_Handle leads,2016-01-01 09:51:59.999872040
3,3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392,Existing loan takeover,...,Friday,1,1,1,2016-01-01 09:52:36.403,2016-01-01 09:53:10.472,W_Complete application,W_Complete application,W_Complete application,2016-01-01 09:52:36.733504365
4,4,4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403,Existing loan takeover,...,Friday,1,1,1,2016-01-01 09:52:36.413,2016-01-01 09:52:40.113,A_Concept,A_Concept,A_Concept,2016-01-01 09:52:37.091013017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,399995,399995,Obtained,User_3,W_Call incomplete files,Workflow,Workitem_604155133,resume,2016-05-26 12:00:30.475,Existing loan takeover,...,Thursday,1,1,1,2016-05-26 12:03:31.546,2016-05-26 12:01:40.985,W_Call incomplete files,W_Call incomplete files,W_Call incomplete files,2016-05-26 12:01:12.854745483
399996,399996,399996,statechange,User_49,O_Sent (mail and online),Offer,OfferState_606841832,complete,2016-05-26 12:00:34.311,"Other, see explanation",...,Thursday,1,1,1,2016-05-27 07:55:43.270,2016-05-26 12:00:34.331,W_Call after offers,W_Call after offers,W_Call after offers,2016-05-26 12:01:18.229994904
399997,399997,399997,statechange,User_2,O_Sent (online only),Offer,OfferState_2125726975,complete,2016-05-26 12:00:56.629,Unknown,...,Thursday,1,1,1,2016-05-26 12:00:56.719,2016-05-26 12:05:15.059,W_Complete application,W_Complete application,W_Complete application,2016-05-26 12:00:57.974597506
399998,399998,399998,Deleted,User_2,W_Complete application,Workflow,Workitem_1467622843,complete,2016-05-26 12:00:56.719,Unknown,...,Thursday,1,1,1,2016-05-26 12:00:56.726,2016-05-26 12:01:00.429,W_Complete application,W_Call after offers,W_Call after offers,2016-05-26 12:01:07.857289452


In [64]:
# encoding of the events
label_encoder = LabelEncoder()
encoded_event_types = label_encoder.fit_transform(np.append(events_types, ['END', 'absent', 'A_Create Application', 'W_Shortened completion ']))
end_token = label_encoder.transform(['END'])[0]
print(encoded_event_types, end_token)

[ 8 23 22  3  0 13 14 18 20  2 12 17 26  9 21  6  1 16 10  5 11  7 15 19
 24 10 27  4 25] 10


In [65]:
# Prepare data

df_train, df_test = train_test_split_custom(df=df, test_size=0.2, lags=True)

df['concept:name - lag_1'] = df.groupby('case:concept:name')['concept:name'].shift(1).fillna('absent')
df['concept:name - lag_2'] = df.groupby('case:concept:name')['concept:name'].shift(2).fillna('absent')
df['concept:name - lag_3'] = df.groupby('case:concept:name')['concept:name'].shift(3).fillna('absent')
df['concept:name - lag_4'] = df.groupby('case:concept:name')['concept:name'].shift(4).fillna('absent')
df['concept:name - lag_5'] = df.groupby('case:concept:name')['concept:name'].shift(5).fillna('absent')
df['concept:name - lag_6'] = df.groupby('case:concept:name')['concept:name'].shift(6).fillna('absent')
df['concept:name - lag_7'] = df.groupby('case:concept:name')['concept:name'].shift(7).fillna('absent')
df['concept:name - lag_8'] = df.groupby('case:concept:name')['concept:name'].shift(8).fillna('absent')
df['concept:name - lag_9'] = df.groupby('case:concept:name')['concept:name'].shift(9).fillna('absent')
df['concept:name - lag_10'] = df.groupby('case:concept:name')['concept:name'].shift(10).fillna('absent')

columns = ['concept:name' , 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10', 'next concept:name']

for column in columns:
        df_test[column] = label_encoder.transform(df_test[column])
        df_train[column] = label_encoder.transform(df_train[column])
        df[column] = label_encoder.transform(df[column])
        

X_train = df_train[['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']]
X_test = df_test[['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']]

y_train = df_train[['next concept:name']]
y_test = df_test[['next concept:name']]

print(f"""
    inputs: {[col for col in X_test.columns]} \n
    target: {[col for col in y_test.columns]}
""")

if not os.path.exists('../model_weights/random_forest_trace.pkl'):

    rf_clf = RandomForestClassifier(n_jobs=-1)

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [23, 30, 40],
        'max_depth': [15, 20, 25],
        'min_samples_split': [2, 10, 15],
        'min_samples_leaf': [1, 2, 4]
    }

    inner_cv = KFold(n_splits=5, shuffle=False, random_state=None)  

    grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, scoring='accuracy', cv=inner_cv)
    grid_search.fit(X_train, y_train.values.ravel())

    # Get the best model
    rfc_model = grid_search.best_estimator_

    # Print the results

    print(f"""
        Next label prediction in trace:\n
        Score on the test set {rfc_model.score(X_test, y_test)}
        Best parameters: {grid_search.best_params_}
    """)

    # Save the best model to a file using pickle
    # Create the folder for model weights
    os.makedirs('../model_weights', exist_ok=True)
    model_filename = '../model_weights/random_forest_trace.pkl'

    with open(model_filename, 'wb') as model_file:
        pickle.dump(rfc_model, model_file)
        
else: 
    with open('../model_weights/random_forest_trace.pkl', 'rb') as f:
        rfc_model = pickle.load(f)

for column in columns:
    df[column] = label_encoder.inverse_transform(df[column])

# counting elapsed time
df['elapsed time:timestamp'] = df['time:timestamp diff'].shift(-1) 
df['elapsed time:timestamp'] = pd.to_timedelta(df['elapsed time:timestamp'])
df['elapsed time:timestamp'] = df['elapsed time:timestamp'].apply(lambda x: x.total_seconds()).fillna(-0.01)
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'])

# lags for time between events in past 

df['elapsed time:timestamp - lag_1'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(1).fillna(-0.00000001)
df['elapsed time:timestamp - lag_2'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(2).fillna(-0.00000001)
df['elapsed time:timestamp - lag_3'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(3).fillna(-0.00000001)
df['elapsed time:timestamp - lag_4'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(4).fillna(-0.00000001)
df['elapsed time:timestamp - lag_5'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(5).fillna(-0.00000001)
df['elapsed time:timestamp - lag_6'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(6).fillna(-0.00000001)
df['elapsed time:timestamp - lag_7'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(7).fillna(-0.00000001)
df['elapsed time:timestamp - lag_8'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(8).fillna(-0.00000001)
df['elapsed time:timestamp - lag_9'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(9).fillna(-0.00000001)
df['elapsed time:timestamp - lag_10'] = df.groupby(by='case:concept:name')['elapsed time:timestamp'].shift(10).fillna(-0.00000001)

# lags for event
df['concept:name - lag_1'] = df.groupby('case:concept:name')['concept:name'].shift(1).fillna('absent')
df['concept:name - lag_2'] = df.groupby('case:concept:name')['concept:name'].shift(2).fillna('absent')
df['concept:name - lag_3'] = df.groupby('case:concept:name')['concept:name'].shift(3).fillna('absent')
df['concept:name - lag_4'] = df.groupby('case:concept:name')['concept:name'].shift(4).fillna('absent')
df['concept:name - lag_5'] = df.groupby('case:concept:name')['concept:name'].shift(5).fillna('absent')
df['concept:name - lag_6'] = df.groupby('case:concept:name')['concept:name'].shift(6).fillna('absent')
df['concept:name - lag_7'] = df.groupby('case:concept:name')['concept:name'].shift(7).fillna('absent')
df['concept:name - lag_8'] = df.groupby('case:concept:name')['concept:name'].shift(8).fillna('absent')
df['concept:name - lag_9'] = df.groupby('case:concept:name')['concept:name'].shift(9).fillna('absent')
df['concept:name - lag_10'] = df.groupby('case:concept:name')['concept:name'].shift(10).fillna('absent')

# preprocess the columns before fitting
for column in df.columns:
    if column == 'time:timestamp':
        df['year'] = df['time:timestamp'].dt.year
        df['month'] = df['time:timestamp'].dt.month
        df['day'] = df['time:timestamp'].dt.day
        df['hour'] = df['time:timestamp'].dt.hour

    elif column in ['next concept:name rfc', 'concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']:
        df[column] = label_encoder.transform(df[column])

    else:
        continue
    
# split the data on train and test dataframes 
df_train, df_test = train_test_split_custom(df=df, lags=False, test_size=0.2)

# define the input and outputs for the model before put in the train 

X = df[['next concept:name rfc', 'concept:name',  # this X is for making prediction on the dataframe for saving results in the csv
                    'concept:name - lag_1', 
                    'concept:name - lag_2', 
                    'concept:name - lag_3', 
                    'concept:name - lag_4', 
                    'concept:name - lag_5', 
                    'concept:name - lag_6', 
                    'concept:name - lag_7', 
                    'concept:name - lag_8', 
                    'concept:name - lag_9', 
                    'concept:name - lag_10', 
                    'elapsed time:timestamp - lag_1', 
                    'elapsed time:timestamp - lag_2', 
                    'elapsed time:timestamp - lag_3',
                    'elapsed time:timestamp - lag_4',
                    'elapsed time:timestamp - lag_5',
                    'elapsed time:timestamp - lag_6',
                    'elapsed time:timestamp - lag_7',
                    'elapsed time:timestamp - lag_8',
                    'elapsed time:timestamp - lag_9',
                    'elapsed time:timestamp - lag_10']]

X_train = df_train[['next concept:name rfc', 'concept:name', 
                    'concept:name - lag_1', 
                    'concept:name - lag_2', 
                    'concept:name - lag_3', 
                    'concept:name - lag_4', 
                    'concept:name - lag_5', 
                    'concept:name - lag_6', 
                    'concept:name - lag_7', 
                    'concept:name - lag_8', 
                    'concept:name - lag_9', 
                    'concept:name - lag_10',  
                    'elapsed time:timestamp - lag_1',
                    'elapsed time:timestamp - lag_2',
                    'elapsed time:timestamp - lag_3',
                    'elapsed time:timestamp - lag_4',
                    'elapsed time:timestamp - lag_5',
                    'elapsed time:timestamp - lag_6',
                    'elapsed time:timestamp - lag_7',
                    'elapsed time:timestamp - lag_8',
                    'elapsed time:timestamp - lag_9',
                    'elapsed time:timestamp - lag_10']]

X_test = df_test[['next concept:name rfc', 'concept:name', 
                    'concept:name - lag_1', 
                    'concept:name - lag_2', 
                    'concept:name - lag_3', 
                    'concept:name - lag_4', 
                    'concept:name - lag_5', 
                    'concept:name - lag_6', 
                    'concept:name - lag_7', 
                    'concept:name - lag_8', 
                    'concept:name - lag_9', 
                    'concept:name - lag_10',  
                    'elapsed time:timestamp - lag_1', 
                    'elapsed time:timestamp - lag_2', 
                    'elapsed time:timestamp - lag_3',
                    'elapsed time:timestamp - lag_4',
                    'elapsed time:timestamp - lag_5',
                    'elapsed time:timestamp - lag_6',
                    'elapsed time:timestamp - lag_7',
                    'elapsed time:timestamp - lag_8',
                    'elapsed time:timestamp - lag_9',
                    'elapsed time:timestamp - lag_10']]

y_train = df_train['elapsed time:timestamp']

y_test = df_test[['elapsed time:timestamp']]

print(f"""
    inputs: {[col for col in X_test.columns]} \n
    target: {[col for col in y_test.columns]}
""")
# Define the parameter grid

param_grid = {
    'n_estimators': [85],
    'max_depth': [7],
    'learning_rate': [0.099]
}
# Initialize the model
model = XGBRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the GridSearchCV using the transformed training set
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
xgboost_trace = grid_search.best_estimator_

print(f"best params: {best_params}")

# Save the best model
model_filename = '../model_weights/xgboost_trace.joblib'
dump(xgboost_trace, model_filename)
        
# Predict on the test set using the best estimator
y_pred_test = np.abs(xgboost_trace.predict(X_test))

# Evaluate the model on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f'Test RMSE: {rmse_test}')

r2_score_value = r2_score(y_test, y_pred_test)
print(f'R² score: {r2_score_value}')

# Update the dataframe with predictions from the best model
df['elapsed time:timestamp XGBoost'] = np.abs(np.where(xgboost_trace.predict(X) < 0, np.abs(xgboost_trace.predict(X)) / 10000, xgboost_trace.predict(X) / 1000))
df['elapsed time:timestamp'] = df['elapsed time:timestamp'].mask(df['elapsed time:timestamp'] < 0)
df = df.drop(columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'year', 'month', 'day', 'hour', 'elapsed time:timestamp - lag_10', 'elapsed time:timestamp - lag_9', 'elapsed time:timestamp - lag_8', 'elapsed time:timestamp - lag_7', 'elapsed time:timestamp - lag_6', 'elapsed time:timestamp - lag_5', 'elapsed time:timestamp - lag_4', 'elapsed time:timestamp - lag_3', 'elapsed time:timestamp - lag_2', 'elapsed time:timestamp - lag_1', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10'], errors='ignore')
df['elapsed time:timestamp'] = pd.to_timedelta(df['elapsed time:timestamp'], unit='seconds')
df['elapsed time:timestamp XGBoost'] = pd.to_timedelta(df['elapsed time:timestamp XGBoost'], unit='seconds')
df['next time:timestamp XGBoost'] = df['time:timestamp'] + df['elapsed time:timestamp XGBoost']
df = df.drop(columns=['elapsed time:timestamp XGBoost', 'elapsed time:timestamp'], errors='ignore')

# Decode categorical columns
for column in df.columns:
    if column in ['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']:
        df[column] = label_encoder.inverse_transform(df[column])
    else:
        continue 


    ######################################## TRAIN TEST INFO #######################################

      Train set ends with 2016-04-27 11:57:51.650

      Test set starts with: 2016-04-27 13:53:37.971

    ################################################################################################

    

    inputs: ['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10'] 

    target: ['next concept:name']


    ######################################## TRAIN TEST INFO #######################################

      Train set ends with 2016-04-27 11:57:51.650000

      Test set starts with: 2016-04-27 13:53:37.971000

    ################################################################################################

    

    inputs: ['next concept:name rfc', 'concept:name', 'co

In [198]:
predicted_traces_list = []
case_concept_names_list = []

with open('../model_weights/random_forest_trace.pkl', 'rb') as f:
        rfc_model = pickle.load(f)

for index, row in df_traces.iterrows():

      case_concept_name = row['case:concept:name']
      df_trace = pd.DataFrame(row['trace'])

      df_trace['concept:name - lag_1'] = df_trace['concept:name'].shift(1).fillna('absent')
      df_trace['concept:name - lag_2'] = df_trace['concept:name'].shift(2).fillna('absent')
      df_trace['concept:name - lag_3'] = df_trace['concept:name'].shift(3).fillna('absent')
      df_trace['concept:name - lag_4'] = df_trace['concept:name'].shift(4).fillna('absent')
      df_trace['concept:name - lag_5'] = df_trace['concept:name'].shift(5).fillna('absent')
      df_trace['concept:name - lag_6'] = df_trace['concept:name'].shift(6).fillna('absent')
      df_trace['concept:name - lag_7'] = df_trace['concept:name'].shift(7).fillna('absent')
      df_trace['concept:name - lag_8'] = df_trace['concept:name'].shift(8).fillna('absent')
      df_trace['concept:name - lag_9'] = df_trace['concept:name'].shift(9).fillna('absent')
      df_trace['concept:name - lag_10'] = df_trace['concept:name'].shift(10).fillna('absent')

      # find out the number of rows in the trace
      num_rows = len(df_trace)

      # Choose the upper bound for the random integer
      upper_bound = min(num_rows, 10)

      # Generate a random integer within the specified range
      random_index = random.randint(1, upper_bound)

      # slice the trace on random integer
      df_trace = df_trace.iloc[:random_index]
      
      # encode categorical columns in the trace 
      columns = ['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10']
      for column in columns:
            df_trace[column] = label_encoder.transform(df_trace[column])
            
      
      event_rfc = df_trace[columns].iloc[[-1]].values

      # build the suffix dataframe for concept:name
      predicted_suffix = pd.DataFrame({'concept:name': [], 'time:timestamp': []})
      operator = True
      while operator:
            window = pd.DataFrame([event_rfc[:, :11][0].tolist()], columns=['concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10'])
            prediction_next_event = rfc_model.predict(window)
            event_rfc = np.insert(event_rfc, 0, prediction_next_event, axis=1)

            if (event_rfc[0,0] == end_token) and event_rfc.shape[1] > 15:
                  operator = False

            if event_rfc.shape[1] >= 80:
                  operator = False
                  
      # build the suffix dataframe for concept:name
      predicted_suffix['concept:name'] = event_rfc.tolist()[0][::-1]
      predicted_suffix['time:timestamp'] = [np.nan] * len(event_rfc.tolist()[0][::-1])

      # join the suffix dataframe with the trace
      df_trace = pd.concat([df_trace, predicted_suffix], ignore_index=True, axis=0)

      # define the next event
      df_trace['next concept:name rfc'] = df_trace['concept:name'].shift(-1)

      # encode categorical columns in the next event
      df_trace.loc[len(df_trace)-1, 'next concept:name rfc'] = label_encoder.transform(['END'])
      encoding_for_absent = label_encoder.transform(['absent'])[0]

      # computing lags for concept:name again 
      df_trace['concept:name - lag_2'] = df_trace['concept:name'].shift(2).fillna(encoding_for_absent)
      df_trace['concept:name - lag_3'] = df_trace['concept:name'].shift(3).fillna(encoding_for_absent)
      df_trace['concept:name - lag_1'] = df_trace['concept:name'].shift(1).fillna(encoding_for_absent)
      df_trace['concept:name - lag_4'] = df_trace['concept:name'].shift(4).fillna(encoding_for_absent)
      df_trace['concept:name - lag_5'] = df_trace['concept:name'].shift(5).fillna(encoding_for_absent)
      df_trace['concept:name - lag_6'] = df_trace['concept:name'].shift(6).fillna(encoding_for_absent)
      df_trace['concept:name - lag_7'] = df_trace['concept:name'].shift(7).fillna(encoding_for_absent)
      df_trace['concept:name - lag_8'] = df_trace['concept:name'].shift(8).fillna(encoding_for_absent)
      df_trace['concept:name - lag_9'] = df_trace['concept:name'].shift(9).fillna(encoding_for_absent)
      df_trace['concept:name - lag_10'] = df_trace['concept:name'].shift(10).fillna(encoding_for_absent)

      # counting elapsed time
      df_trace['time:timestamp'] = pd.to_datetime(df_trace['time:timestamp'], format='mixed')
      df_trace['time:timestamp diff'] = df_trace['time:timestamp'].diff()
      df_trace['elapsed time:timestamp'] = df_trace['time:timestamp diff'].shift(-1) 
      df_trace['elapsed time:timestamp'] = pd.to_timedelta(df_trace['elapsed time:timestamp'])
      df_trace['elapsed time:timestamp'] = df_trace['elapsed time:timestamp'].apply(lambda x: x.total_seconds()).fillna(-0.01)

      # counting lags for elapsed time again 
      df_trace['elapsed time:timestamp - lag_1'] = df_trace['elapsed time:timestamp'].shift(1).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_2'] = df_trace['elapsed time:timestamp'].shift(2).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_3'] = df_trace['elapsed time:timestamp'].shift(3).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_4'] = df_trace['elapsed time:timestamp'].shift(4).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_5'] = df_trace['elapsed time:timestamp'].shift(5).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_6'] = df_trace['elapsed time:timestamp'].shift(6).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_7'] = df_trace['elapsed time:timestamp'].shift(7).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_8'] = df_trace['elapsed time:timestamp'].shift(8).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_9'] = df_trace['elapsed time:timestamp'].shift(9).fillna(-0.00000001)
      df_trace['elapsed time:timestamp - lag_10'] = df_trace['elapsed time:timestamp'].shift(10).fillna(-0.00000001)

      # calculating the time predictions for the trace and suffix
      index = random_index
      xgboost_inputs = df_trace.drop(columns=['time:timestamp diff', 'time:timestamp', 'elapsed time:timestamp'], axis=1).loc[index]
      while index < len(df_trace) -1:
            xgboost_inputs = xgboost_inputs[['next concept:name rfc', 'concept:name', 'concept:name - lag_1', 'concept:name - lag_2', 'concept:name - lag_3', 'concept:name - lag_4', 'concept:name - lag_5', 'concept:name - lag_6', 'concept:name - lag_7', 'concept:name - lag_8', 'concept:name - lag_9', 'concept:name - lag_10', 'elapsed time:timestamp - lag_1', 'elapsed time:timestamp - lag_2', 'elapsed time:timestamp - lag_3', 'elapsed time:timestamp - lag_4', 'elapsed time:timestamp - lag_5', 'elapsed time:timestamp - lag_6', 'elapsed time:timestamp - lag_7', 'elapsed time:timestamp - lag_8', 'elapsed time:timestamp - lag_9', 'elapsed time:timestamp - lag_10']]
            df_trace.loc[index, 'time:timestamp'] = df_trace.loc[index-1, 'time:timestamp'] + pd.to_timedelta(np.abs(np.where(xgboost_trace.predict(pd.DataFrame(xgboost_inputs).transpose())[0] < 0, np.abs(xgboost_trace.predict(pd.DataFrame(xgboost_inputs).transpose())[0]) / 10000, xgboost_trace.predict(pd.DataFrame(xgboost_inputs).transpose())[0] / 1000)), unit='s')
            index = index + 1
      
      # take only needed columns
      df_trace = df_trace[['concept:name', 'time:timestamp']]

      # convert time to string format
      df_trace['time:timestamp'] = df_trace['time:timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

      # cut off the end token 
      df_trace = df_trace.drop(df_trace.index[-1])
      df_trace['concept:name'] = label_encoder.inverse_transform(df_trace['concept:name'])

      # append the trace to the list with corresponding case:concept:name
      predicted_traces_list.append(df_trace)
      case_concept_names_list.append(case_concept_name)     
      
      # save the final predictions
      df_final_predictions = pd.DataFrame({'case:concept:name': case_concept_names_list, 'predicted_traces': predicted_traces_list})

In [199]:
df_final_predictions.to_json(f'../data/trace_predictions_{chosen_dataset}.json', orient='records')

In [193]:
df_final_predictions['predicted_traces'].loc[0]

Unnamed: 0,concept:name,time:timestamp
0,A_Create Application,2016-03-07 15:34:00.886000+00:00
1,A_Concept,2016-03-07 15:34:00.888000+00:00
2,W_Complete application,2016-03-07 15:34:00.895000+00:00
3,W_Complete application,2016-03-07 15:34:00.896000+00:00
4,W_Complete application,2016-03-07 15:34:24.622000+00:00
...,...,...
79,W_Call incomplete files,2016-03-07 16:36:29.167117150+00:00
80,W_Call incomplete files,2016-03-07 16:37:18.827718712+00:00
81,W_Call incomplete files,2016-03-07 16:38:08.488320274+00:00
82,W_Call incomplete files,2016-03-07 16:38:58.148921836+00:00
