In [1]:
!pip install tensorflow



In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from sklearn.preprocessing import MinMaxScaler

# Directory holding the input CSVs and receiving the prediction CSVs.
# Set the PROCESS_MINING_DATA_DIR environment variable to run the notebook on a
# machine other than the original author's; the previous path stays the default.
data_path = os.environ.get(
    'PROCESS_MINING_DATA_DIR',
    '/Users/xuenichen/Desktop/Process_Mining_1-main/data',
)
# Event log to model; input files are read as '<split>_<chosed_dataset>.csv'.
chosed_dataset = 'BPI_Challenge_2017'
# Splits that the training loop processes in turn.
datasets = ['train', 'test', 'train_test']

In [2]:
def prepare_sequences(dataframe, tokenizer=None):
    """Turn an event log into next-event training samples.

    Every sample is a prefix of a case (the activity ids seen so far). Its
    targets are the id of the next activity and the seconds elapsed since the
    previous event of the same case.

    Rows are grouped by the 'case:concept:name' column when it is present, so a
    prefix never runs across a case boundary and the number of samples stays
    linear in the number of events. Without that column the whole frame is
    treated as a single trace. Row order is used as-is (nothing is sorted).

    Args:
        dataframe: Event log with 'concept:name' and 'time:timestamp' columns
            and, optionally, 'case:concept:name'.
        tokenizer: Fitted Keras ``Tokenizer`` to reuse. When ``None`` a new one
            is fitted on this frame's activities.

    Returns:
        Tuple ``(X, y_activity, y_time, tokenizer, vocab_size,
        max_sequence_len, scaler)``:
            X: int array ``(n_samples, max_sequence_len)``, left-padded with 0.
            y_activity: 1-D int array with the next activity id per sample;
                0 marks an activity the tokenizer does not know.
            y_time: float array ``(n_samples, 1)``, min-max scaled.
            tokenizer: the tokenizer that produced the ids.
            vocab_size: ``len(tokenizer.word_index) + 1`` (id 0 is padding).
            max_sequence_len: length of the longest prefix.
            scaler: ``MinMaxScaler`` fitted on this frame's time targets.

    Raises:
        ValueError: if no trace has at least two events, i.e. there is nothing
            to predict.
    """
    activities = dataframe['concept:name'].astype(str).tolist()
    # utc=True keeps mixed UTC offsets from degrading the column to object
    # dtype; the differences taken below do not depend on the time zone.
    timestamps = pd.to_datetime(dataframe['time:timestamp'], utc=True).values

    if 'case:concept:name' in dataframe.columns:
        case_keys = dataframe['case:concept:name'].astype(str).tolist()
    else:
        case_keys = [''] * len(activities)

    # One-element lists make the tokenizer treat each activity name as ONE
    # token. Passing raw strings would split names such as
    # 'A_Create Application' on the space and yield ragged id lists.
    activity_tokens = [[name] for name in activities]
    if tokenizer is None:
        # lower=False keeps word_index keyed by the original activity names.
        tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', lower=False)
        tokenizer.fit_on_texts(activity_tokens)
    # A tokenizer without an OOV token returns [] for unknown names -> id 0.
    activity_ids = [ids[0] if ids else 0
                    for ids in tokenizer.texts_to_sequences(activity_tokens)]

    X, y_activity, y_time = [], [], []
    history = {}        # case key -> activity ids seen so far
    previous_time = {}  # case key -> timestamp of the case's latest event
    for case, activity_id, timestamp in zip(case_keys, activity_ids, timestamps):
        prefix = history.setdefault(case, [])
        if prefix:  # the first event of a case has no prefix to predict from
            X.append(list(prefix))
            y_activity.append(activity_id)
            y_time.append((timestamp - previous_time[case]) / np.timedelta64(1, 's'))
        prefix.append(activity_id)
        previous_time[case] = timestamp

    if not X:
        raise ValueError(
            'prepare_sequences needs at least one trace with two or more events'
        )

    max_sequence_len = max(len(prefix) for prefix in X)
    X = pad_sequences(X, maxlen=max_sequence_len, padding='pre')
    y_activity = np.array(y_activity)
    y_time = np.array(y_time, dtype=float).reshape(-1, 1)

    scaler = MinMaxScaler()
    y_time = scaler.fit_transform(y_time)

    vocab_size = len(tokenizer.word_index) + 1

    return X, y_activity, y_time, tokenizer, vocab_size, max_sequence_len, scaler

In [3]:
# Model building
def build_model(vocab_size, max_sequence_len):
    """Build and compile a two-headed encoder-decoder LSTM.

    The encoder embeds its token ids and its final LSTM state initialises the
    decoder LSTM. Two dense heads sit on the decoder's per-timestep outputs.

    Args:
        vocab_size: Number of token ids (embedding rows and softmax width).
        max_sequence_len: Length of both integer input sequences.

    Returns:
        A compiled ``Model`` taking ``[encoder_tokens, decoder_tokens]``, each
        of shape ``(batch, max_sequence_len)``, and producing one prediction
        per timestep (the decoder uses ``return_sequences=True``):
            activity_output: ``(batch, max_sequence_len, vocab_size)`` softmax.
            time_output: ``(batch, max_sequence_len, 1)``, ReLU (non-negative).
        Targets passed to ``fit`` must match these shapes.
    """
    encoder_inputs = Input(shape=(max_sequence_len,))
    enc_emb = Embedding(vocab_size, 100)(encoder_inputs)
    encoder_lstm = LSTM(100, return_state=True)
    # Only the final hidden/cell state is used; the encoder output is discarded.
    _, state_h, state_c = encoder_lstm(enc_emb)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(max_sequence_len,))
    dec_emb_layer = Embedding(vocab_size, 100)
    dec_emb = dec_emb_layer(decoder_inputs)
    decoder_lstm = LSTM(100, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
    activity_output = Dense(vocab_size, activation='softmax')(decoder_outputs)
    time_output = Dense(1, activation='relu')(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], [activity_output, time_output])
    model.compile(
        optimizer='adam',
        loss=['categorical_crossentropy', 'mse'],
        # One metric list per output: accuracy for the activity head, MSE for
        # the time head. A flat ['acc', 'mse'] list is interpreted differently
        # across Keras versions for multi-output models.
        metrics=[['acc'], ['mse']],
    )

    return model

In [5]:
# Greedy trace generation
def generate_activity_sequence(model, tokenizer, scaler, max_sequence_len, start_activity='A_Create Application', max_len=10):
    """Roll the model forward greedily, starting from ``start_activity``.

    At each step the ids generated so far are left-padded to
    ``max_sequence_len`` and fed to the model as both inputs. The most
    probable activity is appended, and the predicted time (mapped back through
    ``scaler.inverse_transform``) is added to the running total.

    Generation stops when ``max_len`` activities have been produced, when the
    model predicts id 0 (padding), or when the predicted activity is 'END'.

    Args:
        model: Object whose ``predict([tokens, tokens])`` returns
            ``(activity_probs, scaled_time)``. Outputs may be per timestep
            (rank 3, the last step is used) or one per input row (rank 2).
        tokenizer: Provides ``word_index`` (name -> id) and ``index_word``
            (id -> name).
        scaler: Fitted scaler exposing ``inverse_transform``.
        max_sequence_len: Input length the model expects.
        start_activity: First activity of the trace. If the tokenizer does not
            know it, generation is seeded with id 0.
        max_len: Maximum number of activities in the returned trace.

    Returns:
        ``(output_sequence, current_time)``: the list of activity names
        (starting with ``start_activity``) and the summed predicted time as a
        float.
    """
    sequence = [tokenizer.word_index.get(start_activity, 0)]
    output_sequence = [start_activity]
    current_time = 0.0

    for _ in range(max_len - 1):
        padded_sequence = pad_sequences([sequence], maxlen=max_sequence_len, padding='pre')

        # verbose=0: predict is called once per step; without it every call
        # prints its own progress bar.
        pred_activity, pred_time = model.predict([padded_sequence, padded_sequence], verbose=0)
        pred_activity = np.asarray(pred_activity)
        pred_time = np.asarray(pred_time)

        # Rank-3 output is (batch, timestep, ...): with left padding the last
        # timestep belongs to the newest event. Rank-2 output already holds a
        # single prediction per row.
        activity_probs = pred_activity[0, -1] if pred_activity.ndim == 3 else pred_activity[0]
        scaled_time = pred_time[0, -1] if pred_time.ndim == 3 else pred_time[0]

        next_activity_index = int(np.argmax(activity_probs))
        if next_activity_index == 0:  # id 0 is padding -> nothing left to emit
            break
        next_activity = tokenizer.index_word.get(next_activity_index, 'UNK')
        output_sequence.append(next_activity)
        sequence.append(next_activity_index)
        current_time += float(scaler.inverse_transform(
            np.asarray(scaled_time).reshape(-1, 1))[0, 0])
        if next_activity == 'END':
            break

    return output_sequence, current_time


def save_predictions_with_sequences(dataset_name, model, tokenizer, scaler, vocab_size, max_sequence_len, chosed_dataset, num_cases=100):
    """Generate a predicted trace and write it to a CSV under ``data_path``.

    The trace comes from ``generate_activity_sequence`` with its default start
    activity. That call does not depend on the case id and is deterministic,
    so it is made once and the result is repeated for every row instead of
    being recomputed per case.

    Args:
        dataset_name: Split name, used in the output file name.
        model, tokenizer, scaler, max_sequence_len: Forwarded to
            ``generate_activity_sequence``.
        vocab_size: Unused; kept so existing callers keep working.
        chosed_dataset: Dataset name, used in the output file name.
        num_cases: Number of rows to write; case ids are the placeholders
            ``1..num_cases`` (adjust to the real case ids of your data).

    Side effects:
        Writes ``Seq2Seq_traces_<dataset_name>_<chosed_dataset>.csv`` into the
        module-level ``data_path`` directory and prints the file name.
    """
    case_ids = list(range(1, num_cases + 1))  # Example case IDs; adjust based on your data

    predicted_trace, total_time = generate_activity_sequence(model, tokenizer, scaler, max_sequence_len)
    trace_text = ', '.join(predicted_trace)

    output_df = pd.DataFrame({
        'case:concept:name': case_ids,
        'predicted_trace': [trace_text] * len(case_ids),
        'total_timestamp': [total_time] * len(case_ids),
    })

    output_filename = f'Seq2Seq_traces_{dataset_name}_{chosed_dataset}.csv'
    output_df.to_csv(os.path.join(data_path, output_filename), index=False)
    print(f"Saved predictions to {output_filename}")

In [6]:
# The tokenizer is fitted on the first dataset and then reused, so activity ids
# stay the same across all splits.
tokenizer = None

for dataset_name in datasets:
    dataframe = pd.read_csv(os.path.join(data_path, f'{dataset_name}_{chosed_dataset}.csv'))
    X, y_activity, y_time, tokenizer, vocab_size, max_sequence_len, scaler = prepare_sequences(dataframe, tokenizer)
    y_activity_categorical = tf.keras.utils.to_categorical(y_activity, num_classes=vocab_size)

    model = build_model(vocab_size, max_sequence_len)

    # The targets hold ONE value per sample, shapes (n, vocab_size) and (n, 1).
    # A head that emits one prediction per timestep has a rank-3 output, and
    # fitting it against these targets fails with a shape/rank error.
    # Train through a view that keeps only the last timestep. The view is built
    # from the same layers, so fitting it updates `model`'s weights.
    last_step_outputs = [
        output[:, -1] if len(output.shape) == 3 else output
        for output in model.outputs
    ]
    training_model = Model(model.inputs, last_step_outputs)
    # Same optimizer and losses as build_model; one metric per head.
    training_model.compile(
        optimizer='adam',
        loss=['categorical_crossentropy', 'mse'],
        metrics=[['acc'], ['mse']],
    )
    training_model.fit([X, X], [y_activity_categorical, y_time], batch_size=64, epochs=10, validation_split=0.2)

    save_predictions_with_sequences(dataset_name, model, tokenizer, scaler, vocab_size, max_sequence_len, chosed_dataset)

  dataframe = pd.read_csv(os.path.join(


: 