In [52]:
import pickle
import joblib

import pandas as pd

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from numpy import mean, std
from sklearn.model_selection import cross_val_score

In [53]:
path_to_data_folder =  '/Users/alexraudvee/Desktop/TU_e/year_2/Q3Y2_Process_Mining/process_mining_data'

In [54]:
# extract the 1-10 lags

df_2017 = pd.read_csv(f"{path_to_data_folder}/BPI_2017_withtime_XG.csv")

df_2017['concept:name - lag_1'] = df_2017['concept:name'].shift(1)
df_2017['concept:name - lag_2'] = df_2017['concept:name'].shift(2)
df_2017['concept:name - lag_3'] = df_2017['concept:name'].shift(3)

df_2017 = df_2017.loc[3:]

# define target
df_2017['concept:name next'] = df_2017['concept:name'].shift(-1)

df_2017 = df_2017.drop(df_2017.index[-1])

In [55]:
# only for testing 
# total_rows = df_2017.shape[0]

# # Calculate the index to divide the DataFrame 
# split_index = total_rows // 2

# # Split the DataFrame into two halves
# df_2017 = df_2017.iloc[:split_index, :]

df_2017

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,time_to_next_event_seconds,predicted_time_to_next_event_seconds,elapsed_time_from_start,predicted_start_time,predicted_step,predicted_time_to_next_event_seconds_XG,concept:name - lag_1,concept:name - lag_2,concept:name - lag_3,concept:name next
3,3.0,1,0,22,2,701704,6,3,5,1,...,0 days 00:00:00.010000,0 days 00:00:35.170000,33119,128,7,0 days 00:00:15,22.0,8.0,4.0,21.0
4,4.0,0,0,21,2,630431,3,4,5,1,...,0 days 00:00:00.010000,0 days 00:00:00.010000,33126,127,6,0 days 00:00:05.500000,22.0,22.0,8.0,3.0
5,5.0,4,0,3,0,169679,1,5,5,1,...,4 days 23:34:08.080000,2 days 01:50:47.740000,33133,126,0,4 days 22:54:39.812500,21.0,22.0,22.0,4.0
6,6.0,0,0,4,0,219156,1,6,7,1,...,0 days 00:00:00.050000,0 days 00:00:00.040000,0,112,1,0 days 00:00:53.700001,3.0,21.0,22.0,8.0
7,7.0,4,0,8,0,130913,1,7,7,1,...,0 days 00:00:00.190000,0 days 00:00:00.300000,49,113,3,0 days 00:00:11.200000,4.0,3.0,21.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201084,1201084.0,4,148,7,0,47129,1,1200969,11,1,...,0 days 00:00:00,0 days 00:00:00,999898,42,5,0 days 00:01:17.500000,10.0,20.0,6.0,25.0
1201085,1201085.0,1,148,25,2,708416,0,1200970,11,1,...,0 days 00:00:00.010000,0 days 00:00:00.020000,999899,44,5,0 days 00:01:00.799999,7.0,10.0,20.0,11.0
1201086,1201086.0,4,148,11,1,323681,1,1200971,11,1,...,0 days 00:00:00.010000,0 days 00:00:00.010000,999900,45,5,0 days 00:01:17.500000,25.0,7.0,10.0,11.0
1201087,1201087.0,4,148,11,1,250417,1,1200972,11,1,...,0 days 00:00:00.010000,0 days 00:00:00.010000,999901,43,5,0 days 00:01:17.500000,11.0,25.0,7.0,11.0


In [56]:
df_2017.columns

Index(['Unnamed: 0', 'Action', 'org:resource', 'concept:name', 'EventOrigin',
       'EventID', 'lifecycle:transition', 'time:timestamp', 'case:LoanGoal',
       'case:ApplicationType', 'case:concept:name', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'OfferID',
       'timestamp_difference', 'event_seq', 'time_to_next_event',
       'time_to_next_event_seconds', 'predicted_time_to_next_event_seconds',
       'elapsed_time_from_start', 'predicted_start_time', 'predicted_step',
       'predicted_time_to_next_event_seconds_XG', 'concept:name - lag_1',
       'concept:name - lag_2', 'concept:name - lag_3', 'concept:name next'],
      dtype='object')

In [58]:
# Prepare data
X = df_2017[['concept:name - lag_2', 'concept:name - lag_1', 'concept:name']]
y = df_2017['concept:name next']

# Use LabelEncoder to convert string labels to numerical values
label_encoder = LabelEncoder()
X['concept:name - lag_1'] = label_encoder.fit_transform(X['concept:name - lag_1'])
X['concept:name - lag_2'] = label_encoder.fit_transform(X['concept:name - lag_2'])
X['concept:name'] = label_encoder.fit_transform(X['concept:name'])
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define the Random Forest Classifier
rf_clf = RandomForestClassifier(n_jobs=-1)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [21, 22, 23],
    'max_depth': [18, 19, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform nested cross-validation
outer_cv = KFold(n_splits=3, shuffle=False, random_state=None)  # 3-fold outer cross-validation
inner_cv = KFold(n_splits=5, shuffle=False, random_state=None)  # 5-fold inner cross-validation


grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, scoring='accuracy', cv=inner_cv)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

df_2017['predicted_event_next_rfc'] = label_encoder.inverse_transform(best_model.predict(X))

# decode labels
df_2017['concept:name next'] = label_encoder.inverse_transform(df_2017['concept:name next'])
df_2017['concept:name next'] = label_encoder.inverse_transform(df_2017['predicted_event_next_rfc'])

# Print the results
print(f"Best params: {grid_search.best_params_}")
print(f'Nested Cross-Validation Accuracy: {grid_search.best_score_:.4f}')
print(best_model.score(X_test, y_test))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['concept:name - lag_1'] = label_encoder.fit_transform(X['concept:name - lag_1'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['concept:name - lag_2'] = label_encoder.fit_transform(X['concept:name - lag_2'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['concept:name'] = label_encoder.fit_t

In [45]:
X = df_2017[['concept:name - lag_2', 'concept:name - lag_1', 'concept:name']]

model = joblib.load("random_forest_model.pkl")

# Add predictions as a new column to the DataFrame
df_2017['predicted_event_next_rfc'] = label_encoder.inverse_transform(model.predict(X))

In [50]:
df_2017['concept:name next'] = label_encoder.inverse_transform(df_2017['concept:name next'])

AttributeError: 'float' object has no attribute 'round'

In [49]:
df_2017

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,predicted_time_to_next_event_seconds,elapsed_time_from_start,predicted_start_time,predicted_step,predicted_time_to_next_event_seconds_XG,concept:name - lag_1,concept:name - lag_2,concept:name - lag_3,concept:name next,predicted_event_next_rfc
3,3.0,1,0,22,2,701704,6,3,5,1,...,0 days 00:00:35.170000,33119,128,7,0 days 00:00:15,22.0,8.0,4.0,21.0,21.0
4,4.0,0,0,21,2,630431,3,4,5,1,...,0 days 00:00:00.010000,33126,127,6,0 days 00:00:05.500000,22.0,22.0,8.0,3.0,3.0
5,5.0,4,0,3,0,169679,1,5,5,1,...,2 days 01:50:47.740000,33133,126,0,4 days 22:54:39.812500,21.0,22.0,22.0,4.0,4.0
6,6.0,0,0,4,0,219156,1,6,7,1,...,0 days 00:00:00.040000,0,112,1,0 days 00:00:53.700001,3.0,21.0,22.0,8.0,8.0
7,7.0,4,0,8,0,130913,1,7,7,1,...,0 days 00:00:00.300000,49,113,3,0 days 00:00:11.200000,4.0,3.0,21.0,22.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201084,1201084.0,4,148,7,0,47129,1,1200969,11,1,...,0 days 00:00:00,999898,42,5,0 days 00:01:17.500000,10.0,20.0,6.0,25.0,25.0
1201085,1201085.0,1,148,25,2,708416,0,1200970,11,1,...,0 days 00:00:00.020000,999899,44,5,0 days 00:01:00.799999,7.0,10.0,20.0,11.0,25.0
1201086,1201086.0,4,148,11,1,323681,1,1200971,11,1,...,0 days 00:00:00.010000,999900,45,5,0 days 00:01:17.500000,25.0,7.0,10.0,11.0,25.0
1201087,1201087.0,4,148,11,1,250417,1,1200972,11,1,...,0 days 00:00:00.010000,999901,43,5,0 days 00:01:17.500000,11.0,25.0,7.0,11.0,25.0


In [None]:
# Save the best model to a file
model_filename = 'random_forest_model.joblib'
joblib.dump(best_model, model_filename)

# Save the best model to a file using pickle
model_filename = 'random_forest_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

### Only for test purposes, do not run it

In [35]:
# create dataset
X = df_2017[['concept:name - lag_2', 'concept:name - lag_1', 'concept:name']]
y = df_2017[['concept:name next']]

# Use LabelEncoder to convert string labels to numerical values
label_encoder = LabelEncoder()
X['concept:name - lag_1'] = label_encoder.fit_transform(X['concept:name - lag_1'])
X['concept:name - lag_2'] = label_encoder.fit_transform(X['concept:name - lag_2'])
X['concept:name'] = label_encoder.fit_transform(X['concept:name'])
y = label_encoder.fit_transform(y)

# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=False, random_state=None)
# enumerate splits
outer_results = list()
for train_ix, test_ix in cv_outer.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    # configure the cross-validation procedure
    cv_inner = KFold(n_splits=3, shuffle=False, random_state=None)
    # define the model
    model = RandomForestClassifier(n_jobs=1)
    # define search
    search = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=cv_inner, refit=True)
    # execute search
    result = search.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    # evaluate the model
    acc = accuracy_score(y_test, yhat)
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['concept:name - lag_1'] = label_encoder.fit_transform(X['concept:name - lag_1'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['concept:name - lag_2'] = label_encoder.fit_transform(X['concept:name - lag_2'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['concept:name'] = label_encoder.fit_t

>acc=0.980, est=0.950, cfg={'max_depth': 19, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 21}
>acc=0.960, est=0.952, cfg={'max_depth': 19, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 21}
>acc=0.960, est=0.952, cfg={'max_depth': 19, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 19}
>acc=0.950, est=0.953, cfg={'max_depth': 18, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 21}
>acc=0.960, est=0.952, cfg={'max_depth': 18, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 19}
>acc=0.960, est=0.952, cfg={'max_depth': 19, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}
>acc=0.940, est=0.954, cfg={'max_depth': 18, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 21}
>acc=0.920, est=0.957, cfg={'max_depth': 19, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}
>acc=0.919, est=0.955, cfg={'max_depth': 18, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimat

In [14]:
# Save the best model to a file
model_filename = 'random_forest_model.joblib'
joblib.dump(best_model, model_filename)

# Save the best model to a file using pickle
model_filename = 'random_forest_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [18]:
df_2017

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,predicted_time_to_next_event_seconds,elapsed_time_from_start,predicted_start_time,predicted_step,predicted_time_to_next_event_seconds_XG,concept:name - lag_1,concept:name - lag_2,concept:name - lag_3,concept:name next,predicted_next_event_rfc
3,3.0,1,0,22,2,701704,6,3,5,1,...,0 days 00:00:35.170000,33119,128,7,0 days 00:00:15,22.0,8.0,4.0,21.0,21
4,4.0,0,0,21,2,630431,3,4,5,1,...,0 days 00:00:00.010000,33126,127,6,0 days 00:00:05.500000,22.0,22.0,8.0,3.0,3
5,5.0,4,0,3,0,169679,1,5,5,1,...,2 days 01:50:47.740000,33133,126,0,4 days 22:54:39.812500,21.0,22.0,22.0,4.0,4
6,6.0,0,0,4,0,219156,1,6,7,1,...,0 days 00:00:00.040000,0,112,1,0 days 00:00:53.700001,3.0,21.0,22.0,8.0,8
7,7.0,4,0,8,0,130913,1,7,7,1,...,0 days 00:00:00.300000,49,113,3,0 days 00:00:11.200000,4.0,3.0,21.0,22.0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201085,1201085.0,1,148,25,2,708416,0,1200970,11,1,...,0 days 00:00:00.020000,999899,44,5,0 days 00:01:00.799999,7.0,10.0,20.0,11.0,25
1201086,1201086.0,4,148,11,1,323681,1,1200971,11,1,...,0 days 00:00:00.010000,999900,45,5,0 days 00:01:17.500000,25.0,7.0,10.0,11.0,25
1201087,1201087.0,4,148,11,1,250417,1,1200972,11,1,...,0 days 00:00:00.010000,999901,43,5,0 days 00:01:17.500000,11.0,25.0,7.0,11.0,25
1201088,1201088.0,4,148,11,1,316489,1,1200973,11,1,...,0 days 00:00:00.010000,999902,47,5,0 days 00:01:17.500000,11.0,11.0,25.0,11.0,19
