In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from aux_functions import split_data
# from aux_functions import compute_levensthein_distance
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim
import torch.nn as nn
import torch
import tqdm as tqdm
from sklearn.decomposition import PCA
from torch.masked import masked_tensor

In [46]:
def compute_levensthein_distance(predicted, actual):
    """
    calculating the distance between the two lists of activities
    :param predicted: list of activities
    :param actual: list of activities
    :return: distance between the two lists
    """
    # if the actual list is empty, return the length of the predicted list
    if len(actual) == 0:
        return len(predicted)

    # if the predicted list is empty, return the length of the actual list
    if len(predicted) == 0:
        return len(actual)

    # creating a matrix with the size of the two lists
    matrix = [[0 for _ in range(len(actual) + 1)] for _ in range(len(predicted) + 1)]

    # filling the first row and the first column of the matrix
    for i in range(len(predicted) + 1):
        matrix[i][0] = i
    for j in range(len(actual) + 1):
        matrix[0][j] = j

    # filling the matrix
    for i in range(1, len(predicted) + 1):
        for j in range(1, len(actual) + 1):
            if predicted[i - 1] == actual[j - 1]:
                matrix[i][j] = matrix[i - 1][j - 1]
            else:
                matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + 1)

    return matrix[len(predicted)][len(actual)]

In [3]:
df = pd.read_csv('data/preprocessed/BPI_Challenge_2012.csv')

In [4]:
df.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,position,next_concept:name,next_timestamp
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,2,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,3,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,4,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00
4,112.0,COMPLETE,A_SUBMITTED,2011-10-01 08:08:58.256000+00:00,2011-10-01 08:08:58.256000+00:00,173691,5000,1,A_PARTLYSUBMITTED,2011-10-01 08:09:02.195000+00:00


In [5]:
df[df['case:concept:name'] == 173697]

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,position,next_concept:name,next_timestamp
10,112.0,COMPLETE,A_SUBMITTED,2011-10-01 08:11:08.866000+00:00,2011-10-01 08:11:08.865000+00:00,173697,15000,1,A_PARTLYSUBMITTED,2011-10-01 08:11:09.035000+00:00
11,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 08:11:09.035000+00:00,2011-10-01 08:11:08.865000+00:00,173697,15000,2,A_DECLINED,2011-10-01 08:11:46.420000+00:00
14,112.0,COMPLETE,A_DECLINED,2011-10-01 08:11:46.420000+00:00,2011-10-01 08:11:08.865000+00:00,173697,15000,3,No_Activity,


In [6]:
# Removing the sequences with length of more than 50
df = df[df['case:concept:name'].isin(df.groupby('case:concept:name').size()[df.groupby('case:concept:name').size() <= 10].index)]

In [7]:
# creating a label encoder
le = {}
le[0] = 'No_Activity'
i = 1 
for label in df['concept:name'].unique():
    le[i] = label
    i += 1
le

{0: 'No_Activity',
 1: 'A_SUBMITTED',
 2: 'A_PARTLYSUBMITTED',
 3: 'A_DECLINED',
 4: 'A_PREACCEPTED',
 5: 'W_Completeren aanvraag',
 6: 'W_Afhandelen leads',
 7: 'A_CANCELLED',
 8: 'W_Beoordelen fraude',
 9: 'A_ACCEPTED'}

In [8]:
# Encoding the activities
df['int:concept:name'] = df['concept:name'].apply(lambda x: [k for k, v in le.items() if v == x][0])
df['int:next_concept:name'] = df['next_concept:name'].apply(lambda x: [k for k, v in le.items() if v == x][0])
df.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,position,next_concept:name,next_timestamp,int:concept:name,int:next_concept:name
10,112.0,COMPLETE,A_SUBMITTED,2011-10-01 08:11:08.866000+00:00,2011-10-01 08:11:08.865000+00:00,173697,15000,1,A_PARTLYSUBMITTED,2011-10-01 08:11:09.035000+00:00,1,2
11,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 08:11:09.035000+00:00,2011-10-01 08:11:08.865000+00:00,173697,15000,2,A_DECLINED,2011-10-01 08:11:46.420000+00:00,2,3
14,112.0,COMPLETE,A_DECLINED,2011-10-01 08:11:46.420000+00:00,2011-10-01 08:11:08.865000+00:00,173697,15000,3,No_Activity,,3,0
15,112.0,COMPLETE,A_SUBMITTED,2011-10-01 08:15:39.894000+00:00,2011-10-01 08:15:39.894000+00:00,173700,5000,1,A_PARTLYSUBMITTED,2011-10-01 08:15:40.102000+00:00,1,2
16,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 08:15:40.102000+00:00,2011-10-01 08:15:39.894000+00:00,173700,5000,2,A_DECLINED,2011-10-01 08:16:21.037000+00:00,2,3


In [9]:
# encoder = LabelEncoder()

# # Fit and transform the 'concept:name' column
# encoded_concept_names = encoder.fit_transform(df['concept:name'])
# # Adjusting labels to start from 1 instead of 0
# encoded_concept_names += 1

# # Fit and transform the 'next_concept:name' column
# encoded_next_concept_names = encoder.fit_transform(df['next_concept:name'])
# # Adjusting labels to start from 1 instead of 0
# encoded_next_concept_names += 1

# # Assign the encoded values back to the DataFrame
# df['int:concept:name'] = encoded_concept_names
# df['int:next_concept:name'] = encoded_next_concept_names

In [10]:
# # one hot encoding the 'concept:name'
# concept_name_copy = df['concept:name'].copy()
# df = pd.get_dummies(df, columns=['concept:name'], dtype=int)
# df['concept:name'] = concept_name_copy
# df

In [11]:
# one hot encode the 'life_cycle:transition' column
df = pd.get_dummies(df, columns=['lifecycle:transition'])
df['lifecycle:transition_COMPLETE'] = df['lifecycle:transition_COMPLETE'].astype(int)
df['lifecycle:transition_START'] = df['lifecycle:transition_START'].astype(int)
df['lifecycle:transition_SCHEDULE'] = df['lifecycle:transition_SCHEDULE'].astype(int)

df.head(1)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,position,next_concept:name,next_timestamp,int:concept:name,int:next_concept:name,lifecycle:transition_COMPLETE,lifecycle:transition_SCHEDULE,lifecycle:transition_START
10,112.0,A_SUBMITTED,2011-10-01 08:11:08.866000+00:00,2011-10-01 08:11:08.865000+00:00,173697,15000,1,A_PARTLYSUBMITTED,2011-10-01 08:11:09.035000+00:00,1,2,1,0,0


In [12]:
columns_to_keep = ['int:concept:name','case:concept:name', 'position','int:next_concept:name', 'lifecycle:transition_COMPLETE', 'lifecycle:transition_SCHEDULE', 'lifecycle:transition_START']
df = df[columns_to_keep]
df.head()

Unnamed: 0,int:concept:name,case:concept:name,position,int:next_concept:name,lifecycle:transition_COMPLETE,lifecycle:transition_SCHEDULE,lifecycle:transition_START
10,1,173697,1,2,1,0,0
11,2,173697,2,3,1,0,0
14,3,173697,3,0,1,0,0
15,1,173700,1,2,1,0,0
16,2,173700,2,3,1,0,0


In [13]:
train, test = split_data(df, 0.8)

Original size: 31452
Train size: 25135
Test size: 6263
Ratio: 0.8005286960952926
Dropped cases in both sets: 8
Dropped rows from dataset: 54


In [14]:
train.head()

Unnamed: 0,int:concept:name,case:concept:name,position,int:next_concept:name,lifecycle:transition_COMPLETE,lifecycle:transition_SCHEDULE,lifecycle:transition_START
10,1,173697,1,2,1,0,0
11,2,173697,2,3,1,0,0
14,3,173697,3,0,1,0,0
15,1,173700,1,2,1,0,0
16,2,173700,2,3,1,0,0


In [15]:
# Generating all the possible prefixes and suffixes of different length in the train set
prefixes = []
suffixes = []
for case in train['case:concept:name'].unique():
    case_df = train[train['case:concept:name'] == case]
    for i in range(1, len(case_df)):
        prefixes.append(case_df.iloc[:i].values)
        suffixes.append(list(case_df.iloc[i:]['int:concept:name'].values))

In [16]:
# Generating all the possible prefixes and suffixes of different length in the test set
prefixes_test = []
suffixes_test = []
for case in test['case:concept:name'].unique():
    case_df = test[test['case:concept:name'] == case]
    for i in range(1, len(case_df)):
        prefixes_test.append(case_df.iloc[:i].values)
        suffixes_test.append(list(case_df.iloc[i:]['int:concept:name'].values))

In [17]:
prefixes[6], suffixes[6]

(array([[     1, 173703,      1,      2,      1,      0,      0],
        [     2, 173703,      2,      4,      1,      0,      0],
        [     4, 173703,      3,      5,      1,      0,      0]],
       dtype=int64),
 [5, 5, 5, 5, 7, 5])

In [18]:
# Dropping the 'case:concept:name' column and in:next_concept:name column from the train and test suffixes
# Which are the 2nd and 4th columns
prefixes = [np.delete(prefix, [1,3], axis=1) for prefix in prefixes]
prefixes_test = [np.delete(prefix, [1,3], axis=1) for prefix in prefixes_test]

In [19]:
# Add padding to the prefix nd suffix to make every sequence of the same length (the length of the longest sequence) TRAIN
# We add the padding before the sequence in prefixes and after the sequence in suffixes
max_length = df['case:concept:name'].value_counts().max()
padding_dim = len(prefixes[0][0])
for i in range(len(prefixes)):
    padding_prefix = np.zeros((max_length - len(prefixes[i]), padding_dim))
    padding_suffix = [0] * (max_length - len(suffixes[i]))
    prefixes[i] = np.concatenate([padding_prefix, prefixes[i]], axis=0)
    suffixes[i] = np.concatenate([suffixes[i], padding_suffix], axis=0)

In [20]:
# Add padding to the prefix nd suffix to make every sequence of the same length (the length of the longest sequence) TEST
# We add the padding before the sequence in prefixes and after the sequence in suffixes
max_length = df['case:concept:name'].value_counts().max()
padding_dim = len(prefixes_test[0][0])
for i in range(len(prefixes_test)):
    padding_prefix = np.zeros((max_length - len(prefixes_test[i]), padding_dim))
    padding_suffix = [0] * (max_length - len(suffixes_test[i]))
    prefixes_test[i] = np.concatenate([padding_prefix, prefixes_test[i]], axis=0)
    suffixes_test[i] = np.concatenate([suffixes_test[i], padding_suffix], axis=0)

In [21]:
class LSDM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSDM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n.squeeze(0))
        return out

In [22]:
# num_categories = len(df['int:concept:name'].unique())
# embedding_dim = 32  

In [23]:
# num_categories = len(df['int:concept:name'].unique())
# embedding_dim = 32  

In [24]:
# Define the dimensions
input_dim = len(prefixes[0][0])  
hidden_dim = 64
output_dim = len(suffixes[0])  

In [25]:
# Initialize the model
model = LSDM(input_dim, hidden_dim, output_dim)

In [26]:
prefixes[7]

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0.],
       [2., 2., 1., 0., 0.],
       [4., 3., 1., 0., 0.],
       [5., 4., 0., 1., 0.]])

In [27]:
# Convert data to PyTorch tensors
prefixes_tensor = torch.tensor(np.array(prefixes), dtype=torch.float32)
suffixes_tensor = torch.tensor(np.array(suffixes), dtype=torch.float32)

In [28]:
# mask_prefix = torch.tensor(prefixes_tensor != -1)
# masked_prefix_tensor = masked_tensor(prefixes_tensor, mask_prefix)

# mask_suffix = torch.tensor(suffixes_tensor != -1)
# masked_suffix_tensor = masked_tensor(suffixes_tensor, mask_suffix)

In [29]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [30]:
# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(prefixes_tensor)
    loss = criterion(outputs, suffixes_tensor)
    loss.backward()
    optimizer.step()
    
    # Update the progress bar
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 7.4221
Epoch [2/10], Loss: 6.9822
Epoch [3/10], Loss: 6.4991
Epoch [4/10], Loss: 5.8582
Epoch [5/10], Loss: 4.9802
Epoch [6/10], Loss: 4.0425
Epoch [7/10], Loss: 3.5712
Epoch [8/10], Loss: 3.4671
Epoch [9/10], Loss: 3.4849
Epoch [10/10], Loss: 3.5351


In [31]:
prefixes_array_test = np.array(prefixes_test, dtype=np.float32)

In [32]:
# Convert data to PyTorch tensors
prefixes_test_tensor = torch.tensor(prefixes_array_test, dtype=torch.float32)
suffixes_test_tensor = torch.tensor(suffixes_test, dtype=torch.float32)

  suffixes_test_tensor = torch.tensor(suffixes_test, dtype=torch.float32)


In [33]:
# Set the model to evaluation mode
model.eval()

# Run the test data through the model
with torch.no_grad():
    test_outputs = model(prefixes_test_tensor)
    test_loss = criterion(test_outputs, suffixes_test_tensor)

print(f'Test Loss: {test_loss.item():.4f}')

Test Loss: 3.6383


In [34]:
# set below 1 to zero and round the numbers
test_outputs[test_outputs < 1] = 0
test_outputs = torch.round(test_outputs)

In [35]:
# Comparing one instance from predictions and the actual values
print(test_outputs[0])
print(suffixes_test_tensor[0])

tensor([5., 4., 3., 2., 1., 0., 0., 0., 0., 0.])
tensor([2., 6., 6., 6., 6., 6., 6., 3., 6., 0.])


In [36]:
# Another comparison
print(test_outputs[2000])
print(suffixes_test_tensor[2000])

tensor([5., 4., 3., 2., 1., 0., 0., 0., 0., 0.])
tensor([6., 6., 6., 6., 3., 6., 0., 0., 0., 0.])


In [37]:
test_outputs[69], suffixes_test_tensor[69]

(tensor([5., 4., 3., 2., 1., 0., 0., 0., 0., 0.]),
 tensor([5., 5., 7., 5., 0., 0., 0., 0., 0., 0.]))

In [38]:
test_outputs[3000], suffixes_test_tensor[453]

(tensor([5., 4., 3., 2., 1., 0., 0., 0., 0., 0.]),
 tensor([6., 3., 6., 0., 0., 0., 0., 0., 0., 0.]))

In [39]:
test_outputs[200], suffixes_test_tensor[200]

(tensor([5., 4., 3., 2., 1., 0., 0., 0., 0., 0.]),
 tensor([2., 3., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [40]:
# mapping the predictions to the original labels
label_predictions = [le[int(i)] for i in test_outputs[2000]]
label_actual = [le[int(i)] for i in suffixes_test_tensor[2000]]
label_predictions, label_actual

(['W_Completeren aanvraag',
  'A_PREACCEPTED',
  'A_DECLINED',
  'A_PARTLYSUBMITTED',
  'A_SUBMITTED',
  'No_Activity',
  'No_Activity',
  'No_Activity',
  'No_Activity',
  'No_Activity'],
 ['W_Afhandelen leads',
  'W_Afhandelen leads',
  'W_Afhandelen leads',
  'W_Afhandelen leads',
  'A_DECLINED',
  'W_Afhandelen leads',
  'No_Activity',
  'No_Activity',
  'No_Activity',
  'No_Activity'])

In [41]:
compute_levensthein_distance(['label_predictions'], ['label_actual'])

3424


1

In [42]:
# for i in range(len(test_outputs)):
#     label_predictions = [le[int(i)] for i in test_outputs[i]]
#     label_actual = [le[int(i)] for i in suffixes_test_tensor[i]]
#     print(f'Predicted: {label_predictions}')
#     print(f'Actual: {label_actual}')
#     print(f'Levenshtein Distance: {compute_levensthein_distance(label_predictions, label_actual)}')
#     print('-------------------------------------')

In [48]:
# minn = 100
# id = 0
# for i in range(len(test_outputs)):
#     label_predictions = [le[int(i)] for i in test_outputs[i]]
#     label_actual = [le[int(i)] for i in suffixes_test_tensor[i]]
#     distance = compute_levensthein_distance(label_predictions, label_actual)
#     if distance < minn:
#         minn = distance
#         id = i
# id, minn

(91, 3)

In [47]:
sum = 0
for i in range(len(test_outputs)):
    label_predictions = [le[int(i)] for i in test_outputs[i]]
    label_actual = [le[int(i)] for i in suffixes_test_tensor[i]]
    sum += compute_levensthein_distance(label_predictions, label_actual)

sum / len(test_outputs)

4.990903577926016