# Libraries

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, TimeDistributed
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import numpy as np
import tensorflow as tf
from datetime import timedelta
from sklearn.model_selection import train_test_split

# Data Preparation

In [2]:
# Load the dataset
# The data directory can be overridden through the PROCESS_MINING_DATA_DIR
# environment variable so the notebook is not tied to one machine's home
# directory; when the variable is unset the original location is used.
data_path = os.environ.get(
    'PROCESS_MINING_DATA_DIR',
    '/Users/xuenichen/Desktop/Process_Mining_1-main/data',
)
chosed_dataset = 'BPI_Challenge_2017'
# os.path.join avoids a doubled separator if the override ends with a slash.
data = pd.read_csv(os.path.join(data_path, f'train_test_{chosed_dataset}.csv'))

# Preprocessing

In [3]:
# --- Timestamps and event ordering -------------------------------------------
# Parse the event timestamps, then order the log by case and, within each
# case, chronologically.
data['time:timestamp'] = pd.to_datetime(data['time:timestamp'])
data.sort_values(by=['case:concept:name', 'time:timestamp'], inplace=True)

# --- Activity label encoding -------------------------------------------------
# The vocabulary is every activity present in the log plus the label used as
# the start token, so 'A_Create Application' always has a code.
all_activities = [*data['concept:name'].unique(), 'A_Create Application']
activity_encoder = LabelEncoder().fit(all_activities)
data['activity_encoded'] = activity_encoder.transform(data['concept:name'])

# --- One integer sequence per case -------------------------------------------
sequences = data.groupby('case:concept:name')['activity_encoded'].apply(list)

# Longest trace, plus one slot reserved for the start token.
max_seq_length = 1 + max(map(len, sequences))

# Right-pad every trace to the common length.
# NOTE(review): pad_sequences pads with 0 by default and LabelEncoder codes
# start at 0, so padding looks identical to the first encoded activity --
# confirm this is intended.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_seq_length)

# --- Start token -------------------------------------------------------------
# A single column holding the start-token code is prepended to every row.
start_activity_code = activity_encoder.transform(['A_Create Application'])[0]
start_tokens = np.full((len(padded_sequences), 1), start_activity_code)
padded_sequences = np.concatenate((start_tokens, padded_sequences), axis=1)

# --- Model input / target ----------------------------------------------------
# X drops the last column; Y is the same matrix shifted left by one step and
# one-hot encoded over the encoder's classes.
X = padded_sequences[:, :-1]
Y = to_categorical(
    padded_sequences[:, 1:],
    num_classes=len(activity_encoder.classes_),
)

# Building and Training the Seq2Seq Model

In [4]:
# Define the Seq2Seq model architecture
def build_seq2seq(input_dim, seq_len, embedding_dim=64, lstm_dim=256):
    """Assemble an LSTM encoder-decoder over integer-coded sequences.

    Args:
        input_dim: Vocabulary size; used as the embedding input dimension and
            as the width of the softmax output layer.
        seq_len: Length of both the encoder and the decoder input sequences.
        embedding_dim: Size of each embedding vector.
        lstm_dim: Number of units in the encoder and decoder LSTMs.

    Returns:
        An uncompiled Keras ``Model`` that takes ``[encoder_input,
        decoder_input]`` and outputs, for every decoder timestep, a softmax
        distribution over ``input_dim`` classes.
    """
    # Encoder: only its final hidden and cell states are used downstream.
    enc_tokens = Input(shape=(seq_len,))
    enc_embedded = Embedding(input_dim=input_dim, output_dim=embedding_dim)(enc_tokens)
    _, final_h, final_c = LSTM(lstm_dim, return_state=True)(enc_embedded)

    # Decoder: starts from the encoder's final states and emits one hidden
    # vector per timestep.
    dec_tokens = Input(shape=(seq_len,))
    dec_embedded = Embedding(input_dim=input_dim, output_dim=embedding_dim)(dec_tokens)
    dec_hidden, _, _ = LSTM(lstm_dim, return_sequences=True, return_state=True)(
        dec_embedded, initial_state=[final_h, final_c]
    )

    # Per-timestep classification head.
    token_probs = TimeDistributed(Dense(input_dim, activation='softmax'))(dec_hidden)

    return Model([enc_tokens, dec_tokens], token_probs)


# Instantiate and compile the model
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

seq_len = X.shape[1]
input_dim = len(activity_encoder.classes_)
seq2seq_model = build_seq2seq(input_dim, seq_len)
seq2seq_model.compile(optimizer=Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])

# Split the data into training and validation sets
assert X.shape[0] == Y.shape[0], "inputs and targets must have the same number of rows"
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train the model
# The same array is passed as both the encoder input and the decoder input.
# NOTE(review): this lets the encoder see the whole sequence whose next steps
# the decoder is asked to predict -- confirm this is the intended setup.
history = seq2seq_model.fit(
    [X_train, X_train], Y_train,
    batch_size=64,
    epochs=5,
    validation_data=([X_val, X_val], Y_val)
)

Epoch 1/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 681ms/step - accuracy: 0.7621 - loss: 1.0986 - val_accuracy: 0.8957 - val_loss: 0.3403
Epoch 2/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 720ms/step - accuracy: 0.9026 - loss: 0.3098 - val_accuracy: 0.9361 - val_loss: 0.2081
Epoch 3/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 742ms/step - accuracy: 0.9436 - loss: 0.1877 - val_accuracy: 0.9600 - val_loss: 0.1353
Epoch 4/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 744ms/step - accuracy: 0.9610 - loss: 0.1301 - val_accuracy: 0.9635 - val_loss: 0.1122
Epoch 5/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 779ms/step - accuracy: 0.9637 - loss: 0.1103 - val_accuracy: 0.9652 - val_loss: 0.1018
Epoch 6/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 770ms/step - accuracy: 0.9653 - loss: 0.1019 - val_accuracy: 0.9673 - val_loss: 0.0963
Epoch 7/10
[1m69/69[

In [5]:
# Function to generate a sequence of activities
def generate_full_sequence(model, input_seq, activity_encoder, max_length):
    """Greedily decode one activity sequence with the seq2seq model.

    Decoding starts from the 'A_Create Application' token. At every step the
    model is run on the tokens generated so far and the most probable class at
    the previous position is appended, until ``max_length - 1`` tokens have
    been produced (there is no early stop).

    Args:
        model: Object exposing ``predict_on_batch([encoder_input,
            decoder_input])`` that returns an array indexable as
            ``[sample, timestep, class]``.
        input_seq: Encoder input for a single case (one row).
        activity_encoder: Fitted encoder providing ``transform`` and
            ``inverse_transform`` between activity labels and integer codes.
        max_length: Width of the decoder input buffer.

    Returns:
        The ``max_length - 1`` decoded activity labels, as returned by
        ``activity_encoder.inverse_transform``.
    """
    start_token = activity_encoder.transform(['A_Create Application'])[0]

    # Token ids are integers, so keep the decoder buffer integer-typed.
    decoder_input = np.zeros((1, max_length), dtype='int32')
    decoder_input[0, 0] = start_token

    output_seq = []
    for step in range(1, max_length):
        # predict_on_batch runs a single forward pass. Model.predict is meant
        # for large inputs and adds substantial per-call overhead, which is
        # prohibitive when it is invoked once per generated token.
        step_probs = model.predict_on_batch([input_seq, decoder_input])
        # The output at position step - 1 is the distribution for the token
        # at position `step`.
        next_code = int(np.argmax(step_probs[0, step - 1, :]))
        output_seq.append(next_code)
        decoder_input[0, step] = next_code

    return activity_encoder.inverse_transform(output_seq)

# This cell ran for more than 1 hour without producing any output. At first it showed the ms/step progress, but after about 20 minutes VS Code crashed, so I switched to this approach. Someone needs to run it and check the results.

In [6]:
# Predict sequences for each case
predicted_sequences = {}
case_ids = sequences.index.tolist()

# Row i of X is looked up for the i-th case id below; fail fast if the two
# ever fall out of step.
assert len(case_ids) == X.shape[0], "case ids and encoder inputs are misaligned"

# Decoding calls the model once per generated step for every case, so this
# loop is slow. Report progress periodically instead of running silently.
progress_every = 100
total_cases = len(case_ids)

for i, case_id in enumerate(case_ids):
    input_seq = X[i:i+1]  # Slice (not index) to keep the batch dimension
    predicted_sequence = generate_full_sequence(seq2seq_model, input_seq, activity_encoder, max_seq_length)
    predicted_sequences[case_id] = predicted_sequence

    cases_done = i + 1
    if cases_done % progress_every == 0 or cases_done == total_cases:
        print(f'Decoded {cases_done}/{total_cases} cases', flush=True)

In [None]:
# Format the predictions into a DataFrame

# Find the last known timestamp for each case to use as the initial timestamp for predictions
last_timestamps = data.groupby('case:concept:name')['time:timestamp'].last()

# Build one small frame per case and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all accumulated rows on every iteration.
case_frames = []
for case_id, predicted_seq in predicted_sequences.items():
    initial_timestamp = last_timestamps[case_id]
    # Generate timestamps for each predicted activity, spaced one hour apart.
    # NOTE(review): the offset starts at 0, so the first predicted event gets
    # the same timestamp as the last known event -- confirm this is intended.
    timestamps = [initial_timestamp + timedelta(hours=i) for i in range(len(predicted_seq))]
    case_df = pd.DataFrame({
        'case:concept:name': [case_id] * len(predicted_seq),
        'concept:name': predicted_seq,
        'time:timestamp': timestamps
    })
    case_frames.append(case_df)

if case_frames:
    predictions_df = pd.concat(case_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty, correctly-labelled frame instead.
    predictions_df = pd.DataFrame(columns=['case:concept:name', 'concept:name', 'time:timestamp'])

# Save the formatted predictions to a CSV file
predictions_csv_path = f'{data_path}/Seq2Seq_predictions_{chosed_dataset}.csv'
predictions_df.to_csv(predictions_csv_path, index=False)