# Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from keras.models import Model
from keras.utils import to_categorical
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing.sequence import pad_sequences
from pm4py.objects.conversion.log import converter as log_converter

# Data Preparation

In [2]:
# Load the dataset.
# The data directory can be overridden through the PROCESS_MINING_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original location is used.
data_path = os.environ.get(
    'PROCESS_MINING_DATA_DIR',
    '/Users/xuenichen/Desktop/Process_Mining_1-main/data')
chosed_dataset = 'BPI_Challenge_2017'
dataframe = pd.read_csv(f'{data_path}/train_test_{chosed_dataset}.csv')
dataframe['time:timestamp'] = pd.to_datetime(dataframe['time:timestamp'])

# Calculate average duration between events.
# Order events chronologically inside each case so that the following row of
# a case is the following event of that case.
dataframe = dataframe.sort_values(['case:concept:name', 'time:timestamp'])

# Timestamp of the next event of the same case (NaT for a case's last event).
dataframe['next_timestamp'] = dataframe.groupby('case:concept:name')['time:timestamp'].shift(-1)

# Gap to the next event, in minutes (NaN for the last event of each case).
dataframe['duration'] = (dataframe['next_timestamp'] - dataframe['time:timestamp']).dt.total_seconds() / 60.0

# Mean gap in minutes; Series.mean() skips the NaN entries by default.
average_duration = dataframe['duration'].mean()

# Preprocess and Encode

In [3]:
# Convert the CSV file into an event log
parameters = {
    "case_id_glue": "case:concept:name",
    "activity_key": "concept:name",
    "timestamp_key": "time:timestamp"
}
event_log = log_converter.apply(
    dataframe, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)

# Encode activities: fit the encoder on every activity label found in the log.
activity_encoder = LabelEncoder()
all_activities = [event['concept:name'] for trace in event_log for event in trace]

activity_encoder.fit(all_activities)

# Encode all activities in the log (in place: each event's 'concept:name'
# is replaced by its integer code).
# The label -> code table is built once; calling transform() separately for
# every single event is far slower and produces exactly the same codes.
activity_codes = dict(zip(activity_encoder.classes_,
                          activity_encoder.transform(activity_encoder.classes_)))
for trace in event_log:
    for event in trace:
        event['concept:name'] = activity_codes[event['concept:name']]

# Build and Train the Seq2Seq Model

In [4]:
def build_seq2seq_model(input_vocab_size, output_vocab_size, latent_dim=256):
    """Assemble an LSTM encoder-decoder network over integer token sequences.

    Args:
        input_vocab_size: Vocabulary size given to the encoder embedding.
        output_vocab_size: Vocabulary size given to the decoder embedding;
            also the width of the final softmax layer.
        latent_dim: Embedding dimension and number of LSTM units, used for
            both the encoder and the decoder.

    Returns:
        An uncompiled Keras ``Model`` that takes
        ``[encoder_tokens, decoder_tokens]`` (both of variable length) and
        outputs, for every decoder step, a softmax distribution over
        ``output_vocab_size`` classes.
    """
    # Encoder: embed the source tokens and keep only the final LSTM states;
    # the encoder's own output is not used.
    source_tokens = Input(shape=(None,))
    source_vectors = Embedding(input_vocab_size, latent_dim)(source_tokens)
    _, final_h, final_c = LSTM(latent_dim, return_state=True)(source_vectors)

    # Decoder: starts from the encoder's final states and emits one hidden
    # vector per step; its own final states are discarded.
    target_tokens = Input(shape=(None,))
    target_vectors = Embedding(output_vocab_size, latent_dim)(target_tokens)
    decoder = LSTM(latent_dim, return_sequences=True, return_state=True)
    hidden_sequence, _, _ = decoder(
        target_vectors, initial_state=[final_h, final_c])

    # Per-step class probabilities.
    token_probabilities = Dense(
        output_vocab_size, activation='softmax')(hidden_sequence)

    return Model([source_tokens, target_tokens], token_probabilities)

In [5]:
# Create input and target sequences from the encoded event log.
# For each trace the input is every event but the last, and the target is the
# same trace shifted one step ahead (every event but the first).
input_sequences = [[event['concept:name']
                    for event in trace[:-1]] for trace in event_log]
target_sequences = [[event['concept:name']
                     for event in trace[1:]] for trace in event_log]

# Case identifier of every trace, in the same order as the sequences above.
# It is split together with the arrays so that the rows of the test set stay
# attributable to the case they came from.
case_ids = [trace.attributes['concept:name'] for trace in event_log]

# Pad sequences and prepare data for training.
# NOTE(review): pad_sequences pads with 0 by default, and 0 is also the code
# LabelEncoder assigns to its first class, so padded steps cannot be told
# apart from that activity and are counted in the loss and the accuracy.
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')

target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

num_activities = len(activity_encoder.classes_)
# One-hot targets, as required by the categorical_crossentropy loss below.
target_sequences = to_categorical(target_sequences, num_classes=num_activities)

# Split data into training and testing sets.
# A fixed seed makes the split identical on every run of the notebook.
SPLIT_SEED = 42
(input_train, input_test,
 target_train, target_test,
 case_ids_train, case_ids_test) = train_test_split(
    input_sequences, target_sequences, case_ids,
    test_size=0.2, random_state=SPLIT_SEED)

# Build the network; encoder and decoder share the same activity vocabulary.
model = build_seq2seq_model(input_vocab_size=num_activities, output_vocab_size=num_activities)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# The decoder is fed the same sequence as the encoder; the target is that
# sequence shifted by one step, i.e. the next activity at every position.
model.fit([input_train, input_train], target_train, batch_size=64, epochs=10, validation_split=0.2)

Epoch 1/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 746ms/step - accuracy: 0.9090 - loss: 0.3562 - val_accuracy: 0.9727 - val_loss: 0.0797
Epoch 2/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 765ms/step - accuracy: 0.9736 - loss: 0.0762 - val_accuracy: 0.9739 - val_loss: 0.0738
Epoch 3/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 789ms/step - accuracy: 0.9742 - loss: 0.0720 - val_accuracy: 0.9741 - val_loss: 0.0718
Epoch 4/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 802ms/step - accuracy: 0.9746 - loss: 0.0697 - val_accuracy: 0.9767 - val_loss: 0.0613
Epoch 5/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 760ms/step - accuracy: 0.9783 - loss: 0.0577 - val_accuracy: 0.9798 - val_loss: 0.0526
Epoch 6/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 782ms/step - accuracy: 0.9804 - loss: 0.0507 - val_accuracy: 0.9806 - val_loss: 0.0498
Epoc

<keras.src.callbacks.history.History at 0x2d2fac5d0>

# Generate Predictions and Timestamps and Save Them to a New File

In [6]:
# Per-step class probabilities for the held-out inputs; argmax picks the most
# likely activity code at every position.
predictions = model.predict([input_test, input_test])
predicted_sequences = np.argmax(predictions, axis=-1)

# Inverse transform to get the activity names back
predicted_activities = [activity_encoder.inverse_transform(seq) for seq in predicted_sequences]

# Generate timestamps for each predicted activity based on the average duration.
# Every predicted trace starts at the earliest timestamp of the dataset and
# advances by `average_duration` minutes per step. The start time is the same
# for all traces, so it is looked up once instead of rescanning the whole
# timestamp column for every step of every trace.
first_timestamp = dataframe['time:timestamp'].min()
predicted_timestamps = []
for activities in predicted_activities:
    timestamps = [first_timestamp + timedelta(minutes=i*average_duration) for i in range(len(activities))]

    predicted_timestamps.append([ts.strftime('%Y-%m-%d %H:%M:%S') for ts in timestamps])

# Prepare and save the DataFrame.
# NOTE(review): these are simply the first N case IDs in log order, whereas
# input_test was drawn by a shuffled train_test_split, so the IDs are not
# guaranteed to belong to the rows they label here. The case IDs need to be
# carried through the split to fix this - confirm and adjust.
test_case_ids = [trace.attributes['concept:name'] for trace in event_log][:len(predicted_activities)]

df_predictions = pd.DataFrame({
    'case:concept:name': test_case_ids,
    'predicted_trace': ['; '.join(activities) for activities in predicted_activities],
    'predicted_timestamps': ['; '.join(timestamps) for timestamps in predicted_timestamps]
})
df_predictions.to_csv(f'{data_path}/Seq2Seq_predictions_{chosed_dataset}.csv', index=False)

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 221ms/step
