In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [15]:
data = pd.read_csv("encoded_sequences.csv")

# Ensure that column names are correctly labeled
assert 'caseid' in data.columns and 'sequence' in data.columns, "Columns are not named correctly!"

sequences = [eval(seq) for seq in data['sequence']]

# Extract unique event types
num_classes = len(set([item for sublist in sequences for item in sublist]))

# Display the first few rows of the dataset
data.head()



Unnamed: 0,caseid,sequence
0,173688,"[9, 6, 7, 19, 19, 0, 14, 5, 12, 15, 21, 19, 21..."
1,173691,"[9, 6, 7, 19, 19, 19, 19, 0, 5, 14, 12, 15, 21..."
2,173694,"[9, 6, 7, 19, 19, 19, 19, 19, 19, 19, 19, 0, 5..."
3,173697,"[9, 6, 4]"
4,173700,"[9, 6, 4]"


In [18]:
# Cell 3: Prepare data
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequence_length = max([len(seq) for seq in sequences])
X_train = pad_sequences(sequences, maxlen=sequence_length, padding='post')



In [23]:
# Cell 4: Define the Generator

generator = keras.Sequential([
    keras.layers.Embedding(input_dim=num_classes, output_dim=128, input_length=sequence_length),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(num_classes, activation="softmax"))
], name="generator")

generator.compile(loss="categorical_crossentropy", optimizer="adam")




In [24]:
# Cell 5: Define the Discriminator

discriminator = keras.Sequential([
    keras.layers.Embedding(input_dim=num_classes, output_dim=128, input_length=sequence_length),
    keras.layers.GRU(128, return_sequences=False),  # Only the last state is needed
    keras.layers.Dense(1, activation='sigmoid')
], name="discriminator")

discriminator.compile(loss="binary_crossentropy", optimizer="adam")


In [26]:
# Cell 6: Adversarial Training Loop

for epoch in range(100):  # Adjust the number of epochs as needed.
    
    # Generate fake sequences
    X_fake_logits = generator.predict(X_train)
    X_fake = np.argmax(X_fake_logits, axis=-1)  # Convert softmax outputs to discrete event values
    
    # Train Discriminator
    y_real = np.ones((len(X_train), 1))
    y_fake = np.zeros((len(X_fake), 1))

    X_dis = np.concatenate([X_train, X_fake])
    y_dis = np.concatenate([y_real, y_fake])

    d_loss = discriminator.train_on_batch(X_dis, y_dis)
    
    # Train Generator
    discriminator.trainable = False

    adversarial_model = keras.Sequential([generator, discriminator])
    adversarial_model.compile(loss="binary_crossentropy", optimizer="adam")

    y_mislabeled = np.ones((len(X_train), 1))
    g_loss = adversarial_model.train_on_batch(X_train, y_mislabeled)
    
    discriminator.trainable = True

    print(f"Epoch: {epoch}, D Loss: {d_loss}, G Loss: {g_loss}")




ValueError: Exception encountered when calling layer "discriminator" (type Sequential).

Input 0 of layer "gru_6" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 175, 24, 128)

Call arguments received by layer "discriminator" (type Sequential):
  • inputs=tf.Tensor(shape=(None, 175, 24), dtype=float32)
  • training=None
  • mask=None

In [5]:
# Hyperparameters play a crucial role in determining the performance of our model.
# It's often a good practice to list them at one place for easy tuning.

EMBEDDING_DIM = 128  # Size of the embedding for each event
HIDDEN_DIM = 128     # Size of the hidden layers in RNNs
MAX_SEQUENCE_LENGTH = 50  # Adjust this based on the maximum sequence length in your data
BATCH_SIZE = 64      # Batch size for training


In [6]:
# The Generator is responsible for creating sequences. 
# We use an LSTM based architecture to capture the temporal dependencies of the process event logs.

def build_generator():
    model = keras.Sequential()
    model.add(layers.Embedding(input_dim=25, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)) 
    model.add(layers.LSTM(HIDDEN_DIM, return_sequences=True))
    model.add(layers.TimeDistributed(layers.Dense(25, activation="softmax")))
    return model

generator = build_generator()
generator.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 128)           3200      
                                                                 
 lstm (LSTM)                 (None, 50, 128)           131584    
                                                                 
 time_distributed (TimeDistr  (None, 50, 25)           3225      
 ibuted)                                                         
                                                                 
Total params: 138,009
Trainable params: 138,009
Non-trainable params: 0
_________________________________________________________________


In [7]:
# The Discriminator's job is to differentiate between real and generated sequences.
# Just like the generator, we use an LSTM based architecture.

def build_discriminator():
    model = keras.Sequential()
    model.add(layers.Embedding(input_dim=25, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(layers.LSTM(HIDDEN_DIM))
    model.add(layers.Dense(1, activation="sigmoid"))
    return model

discriminator = build_discriminator()
discriminator.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 128)           3200      
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 134,913
Trainable params: 134,913
Non-trainable params: 0
_________________________________________________________________


In [9]:
generator = keras.Sequential([
    keras.layers.Embedding(input_dim=num_classes, output_dim=128, input_length=sequence_length),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(sequence_length)  # Remove the softmax activation
], name="generator")

generator.compile(loss="mse", optimizer="adam")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x244b982c950>

In [11]:
# We also pre-train the discriminator to help it distinguish between real and fake sequences early on.

X_fake = generator.predict(X_train)
X_fake = np.argmax(X_fake, axis=-1)  # Convert probabilities to event indices

y_real = np.ones((len(X_train), 1))
y_fake = np.zeros((len(X_fake), 1))

X_dis = np.concatenate([X_train, X_fake])
y_dis = np.concatenate([y_real, y_fake])

discriminator.compile(loss="binary_crossentropy", optimizer="adam")
discriminator.fit(X_dis, y_dis, batch_size=BATCH_SIZE, epochs=1)




<keras.callbacks.History at 0x244c0e63990>

In [None]:
# Further steps might include:
# - Incorporating domain-specific rules or constraints into the generation process.
# - Using a more complex model architecture or newer models like Transformers.
# - Implementing techniques like Reinforcement Learning to further guide the generation process.

In [13]:
# This is the core of the GAN. 
# We alternately train the Discriminator to distinguish real sequences from generated ones and the Generator to fool the Discriminator.

for epoch in range(100):  # Adjust the number of epochs as needed.
    
    # Train Discriminator
    X_fake = generator.predict(X_train)
    X_fake_indices = np.argmax(X_fake, axis=-1)  # Convert probabilities to event indices

    y_real = np.ones((len(X_train), 1))
    y_fake = np.zeros((len(X_fake_indices), 1))

    X_dis = np.concatenate([X_train, X_fake_indices])
    y_dis = np.concatenate([y_real, y_fake])

    d_loss = discriminator.train_on_batch(X_dis, y_dis)
    
    # Train Generator
    # We will create an adversarial model for this.
    discriminator.trainable = False  # Freeze the discriminator

    adversarial_model = keras.Sequential([generator, discriminator])
    adversarial_model.compile(loss="binary_crossentropy", optimizer="adam")

    y_mislabeled = np.ones((len(X_train), 1))  # We want the generator to produce sequences that the discriminator thinks are real
    g_loss = adversarial_model.train_on_batch(X_train, y_mislabeled)
    
    # Unfreeze the discriminator
    discriminator.trainable = True

    print(f"Epoch: {epoch}, D Loss: {d_loss}, G Loss: {g_loss}")




ValueError: Exception encountered when calling layer "sequential_1" (type Sequential).

Input 0 of layer "lstm_1" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 50, 25, 128)

Call arguments received by layer "sequential_1" (type Sequential):
  • inputs=tf.Tensor(shape=(None, 50, 25), dtype=float32)
  • training=None
  • mask=None