In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
import random


# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so prefer the new name when it exists
# and fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

### Loading data

In [2]:
# Location of the BPI Challenge 2012 CSV exports (machine-specific absolute path)
path = r"/home/dazai/Documents/Process mining/Data/BPI 2012/"
df_train = pd.read_csv(f'{path}BPI_Challenge_2012-training.csv')
df_test = pd.read_csv(f'{path}BPI_Challenge_2012-test.csv')


# Stack the training and test exports into a single event log
df_data = pd.concat([df_train, df_test])

# Remove whitespace at the beginning and end of column names before any
# column is looked up by name
df_data.columns = df_data.columns.str.strip()

# Keep only the calendar date of each event
df_data['event time:timestamp'] = pd.to_datetime(df_data['event time:timestamp'], dayfirst=True).dt.date

# sort_values returns a new frame, so the result must be assigned back for the
# sort to take effect. The key is a date without a time component, so a stable
# sort is used to keep the original relative order of events on the same day.
df_data = df_data.sort_values(by=['event time:timestamp'], kind='stable')
df_data.reset_index(inplace=True, drop=True)
# df_data.drop(['case description', 'event org:resource'], axis=1, inplace=True)



### Label Encoding 
NOTE: To make the sliding window model work, we will use label encoding for 'event concept:name' so that there is a single outcome variable.  <br><br>

In [6]:
# Fit the encoder on the distinct activity names, then encode those same names
Y = df_data['event concept:name'].unique()
label_encoder = LabelEncoder().fit(Y)
integer_encoded = label_encoder.transform(Y)

# Spot-check: draw five activity names (with replacement) and show their codes
randomYs = random.choices(Y, k=5)
print(randomYs)
label_encoder.transform(randomYs)

['W_Beoordelen fraude', 'O_SENT', 'W_Nabellen incomplete dossiers', 'W_Wijzigen contractgegevens', 'O_DECLINED']


array([18, 15, 20, 23, 13])

In [None]:
# Dummy-encode the lifecycle transition (drop_first=True omits the first level)
df_dummies_lifecycle = pd.get_dummies(
    df_data['event lifecycle:transition'],
    prefix='Lifecycle',
    drop_first=True,
)

# Swap the raw lifecycle column for its dummy columns
df_encoded = df_data.drop(columns='event lifecycle:transition').join(df_dummies_lifecycle)

# Replace activity names with the integer codes of the fitted label encoder
df_encoded['event concept:name'] = label_encoder.transform(df_data['event concept:name'])
df_encoded

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event time:timestamp,Lifecycle_SCHEDULE,Lifecycle_START
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,9,2011-10-01,0,0
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,6,2011-10-01,0,0
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,7,2011-10-01,0,0
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,19,2011-10-01,1,0
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,9,2011-10-01,0,0
...,...,...,...,...,...,...,...,...
262195,54666343743523,213276,2012-02-27T14:12:41.868+01:00,15000,20,2012-03-14,0,1
262196,54666343743524,213276,2012-02-27T14:12:41.868+01:00,15000,20,2012-03-14,0,0
262197,49495203119136,209595,2012-02-15T10:10:36.503+01:00,13000,21,2012-03-14,0,1
262198,52342766436386,211624,2012-02-21T23:38:40.044+01:00,35000,20,2012-03-14,0,1


### Implementation of One-Hot encoding

In [None]:
# Dummy-encode the activity name (drop_first=True omits the first level)
df_dummies_name = pd.get_dummies(
    df_data['event concept:name'],
    prefix='Event Name',
    drop_first=True,
)

# Keep every column except the last three, then append both sets of dummies
cols = list(df_data.columns[:-3])
df_ohencoded = df_data.loc[:, cols].join(df_dummies_name).join(df_dummies_lifecycle)

In [None]:
# Display the one-hot encoded frame as the cell output
df_ohencoded

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,Event Name_A_ACTIVATED,Event Name_A_APPROVED,Event Name_A_CANCELLED,Event Name_A_DECLINED,Event Name_A_FINALIZED,Event Name_A_PARTLYSUBMITTED,...,Event Name_O_SENT_BACK,Event Name_W_Afhandelen leads,Event Name_W_Beoordelen fraude,Event Name_W_Completeren aanvraag,Event Name_W_Nabellen incomplete dossiers,Event Name_W_Nabellen offertes,Event Name_W_Valideren aanvraag,Event Name_W_Wijzigen contractgegevens,Lifecycle_SCHEDULE,Lifecycle_START
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262195,54666343743523,213276,2012-02-27T14:12:41.868+01:00,15000,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
262196,54666343743524,213276,2012-02-27T14:12:41.868+01:00,15000,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
262197,49495203119136,209595,2012-02-15T10:10:36.503+01:00,13000,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
262198,52342766436386,211624,2012-02-21T23:38:40.044+01:00,35000,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


### Implementation of sliding window transformation
Note: For every row in the dataset, the activity name of the CURRENT row is the Y value. The x values consist of ALL columns of the current row without the current Y, plus ALL columns from the LAST row including the last Y. The first and last rows of the dataset are therefore discarded, as they have either no input or no output.

In [None]:
# Convert the event IDs to strings to prevent rounding; they can be converted back if needed
df_encoded['eventID'] = df_encoded['eventID'].astype('str')


# TODO: look into cyclic encoding for months
# TODO: implement the sliding window per trace

In [None]:
# Target column: the activity code found one row further down. The final row
# has nothing after it, so its Y is NaN. All other columns act as predictors.
df_encoded['Y'] = df_encoded['event concept:name'].shift(-1)
display(df_encoded)

# Restrict to the columns needed for event prediction, selected by position
wanted_index = [3,4,6,7,8]
cols = [name for position, name in enumerate(df_encoded.columns.tolist()) if position in wanted_index]
display(df_encoded[cols])


Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event time:timestamp,Lifecycle_SCHEDULE,Lifecycle_START,Y
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,9,2011-10-01,0,0,6.0
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,6,2011-10-01,0,0,7.0
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,7,2011-10-01,0,0,19.0
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,19,2011-10-01,1,0,9.0
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,9,2011-10-01,0,0,6.0
...,...,...,...,...,...,...,...,...,...
262195,54666343743523,213276,2012-02-27T14:12:41.868+01:00,15000,20,2012-03-14,0,1,20.0
262196,54666343743524,213276,2012-02-27T14:12:41.868+01:00,15000,20,2012-03-14,0,0,21.0
262197,49495203119136,209595,2012-02-15T10:10:36.503+01:00,13000,21,2012-03-14,0,1,20.0
262198,52342766436386,211624,2012-02-21T23:38:40.044+01:00,35000,20,2012-03-14,0,1,20.0


Unnamed: 0,case AMOUNT_REQ,event concept:name,Lifecycle_SCHEDULE,Lifecycle_START,Y
0,20000,9,0,0,6.0
1,20000,6,0,0,7.0
2,20000,7,0,0,19.0
3,20000,19,1,0,9.0
4,5000,9,0,0,6.0
...,...,...,...,...,...
262195,15000,20,0,1,20.0
262196,15000,20,0,0,21.0
262197,13000,21,0,1,20.0
262198,35000,20,0,1,20.0


In [None]:
# Predictor columns are everything in cols apart from the trailing Y.
# `current` holds each row's own values, `last` the values of the row before it;
# the first and last rows are trimmed from both.
current = df_encoded.loc[:, cols[:-1]].iloc[1:-1]
last = df_encoded.loc[:, cols[:-1]].shift(1).iloc[1:-1]
display(current, last)

Unnamed: 0,case AMOUNT_REQ,event concept:name,Lifecycle_SCHEDULE,Lifecycle_START
1,20000,6,0,0
2,20000,7,0,0
3,20000,19,1,0
4,5000,9,0,0
5,5000,6,0,0
...,...,...,...,...
262194,2000,20,0,0
262195,15000,20,0,1
262196,15000,20,0,0
262197,13000,21,0,1


Unnamed: 0,case AMOUNT_REQ,event concept:name,Lifecycle_SCHEDULE,Lifecycle_START
1,20000.0,9.0,0.0,0.0
2,20000.0,6.0,0.0,0.0
3,20000.0,7.0,0.0,0.0
4,20000.0,19.0,1.0,0.0
5,5000.0,9.0,0.0,0.0
...,...,...,...,...
262194,2000.0,22.0,1.0,0.0
262195,2000.0,20.0,0.0,0.0
262196,15000.0,20.0,0.0,1.0
262197,15000.0,20.0,0.0,0.0


Note: The Y value in this case is the actual next event. The input X (x1, x2, x3, ...) consists of all columns from the last row and all columns from the current row, excluding Y. The Y value from the last row acts as a predictor x for the current row when predicting Y, the next event.

In [None]:
# Targets: the next-event code for exactly the rows kept in `current`
# (the first and last rows are trimmed, matching the iloc[1:-1] slices).
Y = df_encoded['Y'].to_numpy()[1:-1]

# Each sample is the current row's predictors followed by the previous row's
# predictors. The second operand must come from `last`; pairing `current`
# with itself would only duplicate the current row and ignore the window.
X = [cur + prev for cur, prev in zip(current.to_numpy().tolist(), last.to_numpy().tolist())]


# To do: sliding window implementation with bigger window