# Libraries

In [10]:
import json
import os
from datetime import timedelta
from datetime import datetime

import numpy as np
import pandas as pd
from keras.layers import Input, LSTM, Dense, Embedding
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from pm4py.objects.conversion.log import converter as log_converter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Data Preparation

In [2]:
# Load the dataset.
# The data directory can be overridden through the PROCESS_MINING_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used.
data_path = os.environ.get(
    'PROCESS_MINING_DATA_DIR',
    '/Users/xuenichen/Desktop/Process_Mining_1-main/data')
chosed_dataset = 'BPI_Challenge_2017'
dataframe = pd.read_csv(f'{data_path}/train_test_{chosed_dataset}.csv')
dataframe['time:timestamp'] = pd.to_datetime(dataframe['time:timestamp'])

# Calculate average duration between events.
# Events are ordered by case and then by time so that shift(-1) within a case
# yields the timestamp of the event that directly follows.
dataframe = dataframe.sort_values(['case:concept:name', 'time:timestamp'])

dataframe['next_timestamp'] = dataframe.groupby('case:concept:name')['time:timestamp'].shift(-1)

# Gap to the following event in minutes. The last event of every case has no
# successor, so its duration is NaN.
dataframe['duration'] = (dataframe['next_timestamp'] - dataframe['time:timestamp']).dt.total_seconds() / 60.0

# mean() skips the NaN gaps of the final events.
average_duration = dataframe['duration'].mean()

# Fail early with a clear message: a NaN average (no case has two or more
# events) cannot be turned into a timedelta when timestamps are generated.
if pd.isna(average_duration):
    raise ValueError(
        'Cannot compute the average event duration: no case contains '
        'two or more events.')

# Preprocess and Encode

In [3]:
# Convert the CSV file into an event log
parameters = {
    "case_id_glue": "case:concept:name",
    "activity_key": "concept:name",
    "timestamp_key": "time:timestamp"
}
event_log = log_converter.apply(
    dataframe, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)

# Encode activities: fit the encoder on every activity label found in the log.
activity_encoder = LabelEncoder()
all_activities = [event['concept:name'] for trace in event_log for event in trace]

activity_encoder.fit(all_activities)

# Encode all activities in the log.
# The id of a label is its position in `classes_`, so a plain dict lookup gives
# the same ids as `activity_encoder.transform` without paying the cost of one
# scikit-learn call per event.
activity_to_id = {
    activity: index for index, activity in enumerate(activity_encoder.classes_)
}
for trace in event_log:
    for event in trace:
        event['concept:name'] = activity_to_id[event['concept:name']]

# Build and Train the Seq2Seq Model

In [4]:
def build_seq2seq_model(input_vocab_size, output_vocab_size, latent_dim=256):
    """Assemble an (uncompiled) LSTM encoder-decoder model.

    Args:
        input_vocab_size: Number of distinct ids accepted by the encoder
            embedding.
        output_vocab_size: Number of distinct ids accepted by the decoder
            embedding; also the width of the softmax output.
        latent_dim: Size of both embeddings and of both LSTM layers.

    Returns:
        A Keras ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and
        producing, for every decoder time step, a softmax distribution over
        ``output_vocab_size`` classes.
    """
    # Encoder: only the final hidden and cell states are kept; the encoder's
    # own output is discarded.
    source_ids = Input(shape=(None,))
    source_vectors = Embedding(input_vocab_size, latent_dim)(source_ids)
    _, final_hidden, final_cell = LSTM(latent_dim, return_state=True)(source_vectors)

    # Decoder: starts from the encoder's final states and emits one output per
    # time step.
    target_ids = Input(shape=(None,))
    target_vectors = Embedding(output_vocab_size, latent_dim)(target_ids)
    step_outputs, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(
        target_vectors, initial_state=[final_hidden, final_cell])

    # Per-step class probabilities.
    step_probabilities = Dense(output_vocab_size, activation='softmax')(step_outputs)

    return Model([source_ids, target_ids], step_probabilities)

In [5]:
# Create input and target sequences from the encoded event log.
# The input drops the last event of a trace and the target drops the first, so
# target[t] is the activity that follows input[t].
input_sequences = [[event['concept:name'] for event in trace[:-1]] for trace in event_log]

target_sequences = [[event['concept:name'] for event in trace[1:]] for trace in event_log]

# Pad sequences and prepare data for training.
# NOTE(review): pad_sequences is called without an explicit padding value (the
# library default is 0), while the label encoding also starts at id 0. Padded
# steps therefore look like a real activity, and since the model applies no
# masking they are included in the loss and accuracy reported below.
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')

target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

# One-hot targets, as required by categorical_crossentropy.
target_sequences = to_categorical(target_sequences, num_classes=len(activity_encoder.classes_))

# Split data into training and testing sets.
# A fixed random_state makes the split (and therefore the reported metrics and
# the exported predictions) reproducible across kernel restarts.
input_train, input_test, target_train, target_test = train_test_split(
    input_sequences, target_sequences, test_size=0.2, random_state=42)

# Encoder and decoder share the same activity vocabulary.
model = build_seq2seq_model(input_vocab_size=len(activity_encoder.classes_), output_vocab_size=len(activity_encoder.classes_))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the same array is passed as encoder input and decoder input,
# so the encoder reads the whole input sequence, including the events the
# decoder is asked to predict at earlier steps. Confirm this is intended.
model.fit([input_train, input_train], target_train, batch_size=64, epochs=10, validation_split=0.2)

Epoch 1/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 770ms/step - accuracy: 0.9107 - loss: 0.3486 - val_accuracy: 0.9731 - val_loss: 0.0792
Epoch 2/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 831ms/step - accuracy: 0.9732 - loss: 0.0772 - val_accuracy: 0.9741 - val_loss: 0.0726
Epoch 3/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 775ms/step - accuracy: 0.9741 - loss: 0.0723 - val_accuracy: 0.9744 - val_loss: 0.0703
Epoch 4/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 800ms/step - accuracy: 0.9742 - loss: 0.0709 - val_accuracy: 0.9760 - val_loss: 0.0658
Epoch 5/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 786ms/step - accuracy: 0.9758 - loss: 0.0659 - val_accuracy: 0.9770 - val_loss: 0.0604
Epoch 6/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 773ms/step - accuracy: 0.9774 - loss: 0.0591 - val_accuracy: 0.9782 - val_loss: 0.0564
Epoc

<keras.src.callbacks.history.History at 0x17f7e80d0>

# Generate Predictions and Timestamps and Save Them to a New File

In [11]:
# Predict on the test split; the same array feeds both model inputs.
predictions = model.predict([input_test, input_test])
predicted_sequences = predictions.argmax(axis=-1)

# Inverse transform to get the activity names back
predicted_activities = list(map(activity_encoder.inverse_transform, predicted_sequences))

# Fixed starting point of the synthetic timeline.
# NOTE(review): this date is hard-coded; it is not derived from the dataset's
# minimum timestamp.
base_timestamp = datetime(2016, 1, 1)

predicted_timestamps = []
for trace_activities in predicted_activities:
    n_events = len(trace_activities)
    cursor = base_timestamp
    stamps = []
    for _ in range(n_events):
        stamps.append(cursor.strftime('%Y-%m-%d %H:%M:%S'))
        # Advance by the average gap between events, unless that would reach
        # year 2262 or later.
        # NOTE(review): on overflow the cursor jumps back to 2018-01-01 rather
        # than repeating the last timestamp, so timestamps inside a trace can
        # go backwards. Confirm the intended behaviour.
        candidate = cursor + timedelta(minutes=average_duration)
        cursor = candidate if candidate.year < 2262 else datetime(2018, 1, 1)

    predicted_timestamps.append(stamps)
    # The next trace starts one further trace-length after this one ended.
    base_timestamp = cursor + timedelta(minutes=average_duration * n_events)

# Serialize every trace as a JSON list of {activity, timestamp} records.
structured_traces = [
    json.dumps([
        {'concept:name': name, 'time:timestamp': stamp}
        for name, stamp in zip(trace_activities, stamps)
    ])
    for trace_activities, stamps in zip(predicted_activities, predicted_timestamps)
]

# FIXME(review): these are the first len(predicted_activities) case ids in log
# order. `input_test` comes from train_test_split, so unless that split
# preserves log order, row i here is not the case that produced prediction i.
# The case ids need to be carried through the split to line up.
all_case_ids = [trace.attributes['concept:name'] for trace in event_log]
test_case_ids = all_case_ids[:len(predicted_activities)]

# Prepare and save the DataFrame
df_predictions = pd.DataFrame({
    'case:concept:name': test_case_ids,
    'trace': structured_traces
})

# Output file sits next to the input data.
adjusted_output_path = os.path.join(data_path, f'Seq2Seq_predictions_{chosed_dataset}.csv')

df_predictions.to_csv(adjusted_output_path, index=False)

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 214ms/step
