## Mini Proc Model with MiniAE to demonstrate the use of LTN in an Autoencoder
1. Generate the data
2. Train the Autoencoder
3. Modify the AE with LTN

## Mini Proc Model
### Process

e1 -> e2 -> e3 -> e4 -> e5

event attributes = name, user

event_names = ["Create SC", "Approve SC", "Create PO", "Approve PO", "Pay"]

user_names = ["Dev", "Chantal", "Seokju", "Jonas", "Kaly"]

### Valid traces
1. ["Create SC", "Approve SC", "Create PO", "Approve PO", "Pay"]
1. ["Create SC", "Create PO", "Approve SC", "Approve PO", "Pay"]

### Event User Mapping
1. "Create SC" : "Dev", "Chantal" 
2. "Approve SC" : "Kaly"
3. "Create PO" : "Dev", "Jonas"
4. "Approve PO" : "Kaly"
5. "Pay" : "Seokju"

### Data
1. Traces = 1000
2. p_anomaly = 0.3 # This means that the probability that a given trace is anomalous is 0.3
3. Anomaly types:
    1. Control flow: irregular flow ordering
    2. Attribute: Wrong attributes assigned

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from pprint import pprint
import itertools
import pickle

np.random.seed(0)

In [3]:
import tensorflow as tf
from tensorflow import keras
import ltn
# List the GPUs TensorFlow can see and let it allocate GPU memory on demand
# instead of reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
if physical_devices:
    # Memory growth has to be configured the same way on every visible GPU,
    # so apply it to all of them rather than only to the first device.
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)
    print("GPU found")
    print("Memory growth set")
else:
    print("No GPU found")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU found
Memory growth set


In [4]:
class Event(dict):
    """
    A single event of a case, stored as a plain mapping.

    Keys used in this notebook:
    :name: name of the event
    :user: user who performed the event
    :case_id: id of the case the event belongs to
    """
class Case(list):
    """
    A case is a list (presumably of the events that belong to one case).
    """

In [48]:
class Dataset():
    """
    Synthetic event log generator for the mini procurement process.

    Every case has 5 events; every event is a dict with the keys
    ``name``, ``user`` and ``case_id``. Three groups of cases are generated, in
    this order: valid cases, control flow anomalies (wrong event order) and
    attribute anomalies (event performed by a user who is not allowed to).

    The reshape constants used by the feature properties (5 events per case,
    3 raw columns, 5 event names, 5 user names) match the process defined in
    ``__init__``.
    """

    def __init__(self, max_cases=None, anomaly_probabilty=None):
        """
        :max_cases: number of cases requested (1000 when omitted)
        :anomaly_probabilty: share of the requested cases generated as anomalous (0.3 when omitted)
        """
        self.event_names = set(["Create SC", "Approve SC", "Create PO", "Approve PO", "Pay"])
        self.user_names = set(["Dev", "Chantal", "Seokju", "Jonas", "Kaly"])
        self.valid_traces = [
            ["Create SC", "Approve SC", "Create PO", "Approve PO", "Pay"],
        ]
        self.invalid_traces_control_flow = [
            ["Approve SC", "Create SC", "Create PO", "Approve PO", "Pay"],
            ["Create SC", "Approve SC", "Approve PO", "Create PO", "Pay"],
            ["Create SC", "Approve SC", "Create PO", "Pay", "Approve PO"],
            ["Create PO", "Approve PO", "Create SC", "Approve SC", "Pay"]
        ]
        self.event_user_mapping = {
            "Create SC": ["Dev", "Chantal"],
            "Approve SC": ["Kaly"],
            "Create PO": ["Dev", "Jonas"],
            "Approve PO": ["Kaly"],
            "Pay": ["Seokju"]
        }
        self.event_user_mapping_inv = {
            "Dev": ["Create SC", "Create PO"],
            "Chantal": ["Create SC"],
            "Kaly": ["Approve SC", "Approve PO"],
            "Jonas": ["Create PO"],
            "Seokju": ["Pay"]
        }
        self.anomaly_probability = 0.3 if anomaly_probabilty is None else anomaly_probabilty
        self.max_cases = 1000 if max_cases is None else max_cases
        # Number of generated cases per group: [valid, control flow, attribute].
        # Kept as a list from the start so the attribute never changes type.
        self.actual_cases = []
        self.raw_dataset = []
        self.case_id_counter = 0
        self.encoders = {}
        for attribute, values in (("name", self.event_names), ("user", self.user_names)):
            encoder = LabelEncoder()
            encoder.fit(list(values))
            self.encoders[attribute] = encoder
            codes = encoder.transform(encoder.classes_)
            pairs = [(str(c), str(t)) for c, t in zip(encoder.classes_, codes)]
            print(f"encoders[{attribute}]:\n{pairs}")

    def _generate_cases(self, traces, num_traces, pick_user):
        """
        Generate an equal number of cases for every trace in ``traces``.

        :traces: list of traces, each trace being a list of event names
        :num_traces: number of cases requested; split evenly over ``traces``,
                     so up to ``len(traces) - 1`` cases are lost to rounding
        :pick_user: callable mapping an event name to the user of the event
        :return: (flat list of event dicts, number of cases generated)
        """
        events = []
        num_cases = 0
        cases_per_trace = num_traces // len(traces)
        for trace in traces:
            for _ in range(cases_per_trace):
                for event_name in trace:
                    events.append({
                        "name": event_name,
                        "user": pick_user(event_name),
                        "case_id": self.case_id_counter,
                    })
                self.case_id_counter += 1
                num_cases += 1
        return events, num_cases

    def create_valid_traces(self, num_traces):
        """
        Generate valid cases: valid event order and an allowed user per event.

        :return: (flat list of event dicts, number of cases generated)
        """
        return self._generate_cases(
            self.valid_traces,
            num_traces,
            lambda event_name: np.random.choice(self.event_user_mapping[event_name]),
        )

    def create_invalid_traces_control_flow(self, num_traces):
        """
        Generate control flow anomalies: wrong event order, allowed users.

        :return: (flat list of event dicts, number of cases generated)
        """
        return self._generate_cases(
            self.invalid_traces_control_flow,
            num_traces,
            lambda event_name: np.random.choice(self.event_user_mapping[event_name]),
        )

    def create_invalid_traces_attribute(self, num_traces):
        """
        Generate attribute anomalies: valid event order, but every event is
        performed by a user that is not allowed to perform it.

        :return: (flat list of event dicts, number of cases generated)
        """
        # Sorted so the candidates have a fixed order; iterating a set of
        # strings directly depends on hash randomisation and would make the
        # seeded random choice differ between interpreter sessions.
        wrong_users = {
            event_name: sorted(set(self.user_names) - set(allowed_users))
            for event_name, allowed_users in self.event_user_mapping.items()
        }
        return self._generate_cases(
            self.valid_traces,
            num_traces,
            lambda event_name: np.random.choice(wrong_users[event_name]),
        )

    def create_dataset(self, max_cases=None, anomaly_probabilty=None):
        """
        (Re)generate the raw dataset.

        Any previously generated events are discarded, so calling this method
        repeatedly always leaves ``raw_dataset``, ``case_id_counter`` and
        ``actual_cases`` describing the same, single generation run.

        :max_cases: overrides (and updates) ``self.max_cases`` when given
        :anomaly_probabilty: overrides (and updates) ``self.anomaly_probability`` when given
        """
        if max_cases is None:
            max_cases = self.max_cases
        else:
            self.max_cases = max_cases
        if anomaly_probabilty is None:
            anomaly_probabilty = self.anomaly_probability
        else:
            self.anomaly_probability = anomaly_probabilty
        num_anomalous_cases = int(max_cases * anomaly_probabilty)
        num_normal_cases = max_cases - num_anomalous_cases

        # Start from a clean slate so the three attributes below stay consistent.
        self.raw_dataset = []
        self.case_id_counter = 0
        self.actual_cases = []
        # The anomalous share is split evenly between both anomaly types.
        plan = (
            (self.create_valid_traces, num_normal_cases),
            (self.create_invalid_traces_control_flow, num_anomalous_cases // 2),
            (self.create_invalid_traces_attribute, num_anomalous_cases // 2),
        )
        for create_cases, requested in plan:
            events, actual_count = create_cases(requested)
            self.raw_dataset += events
            self.actual_cases.append(actual_count)

    @property
    def raw_dataset_as_df(self):
        """One row per event with the columns name, user, case_id."""
        return pd.DataFrame(self.raw_dataset)

    @property
    def raw_dataset_as_np_array(self):
        """Raw events as an array of shape (cases, 5 events, 3 columns)."""
        return self.raw_dataset_as_df.to_numpy().reshape(-1, 5, 3)

    @property
    def encoded_features(self):
        """
        Label encoded features, one row per case, shape (cases, 10):
        name and user code alternate for the 5 events of the case.
        """
        events = self.raw_dataset_as_np_array[:, :, 0:2].reshape(-1, 2)  # name, user
        encoded_names = self.encoders["name"].transform(events[:, 0]).reshape(-1, 1)
        encoded_users = self.encoders["user"].transform(events[:, 1]).reshape(-1, 1)
        return np.hstack((encoded_names, encoded_users)).reshape(-1, 10)

    def _ensure_one_hot_encoders(self):
        """
        Create ``self.one_hot_encoder`` on first use and return it.

        The encoders are fitted on the complete range of label codes instead of
        on the generated data, so the one hot width does not depend on which
        values happen to occur in the dataset, and the inverse methods work
        without the feature property having been read first.
        """
        if not getattr(self, "one_hot_encoder", None):
            from sklearn.preprocessing import OneHotEncoder

            self.one_hot_encoder = {}
            for attribute in ("name", "user"):
                all_codes = np.arange(len(self.encoders[attribute].classes_)).reshape(-1, 1)
                self.one_hot_encoder[attribute] = OneHotEncoder(sparse_output=False).fit(all_codes)
        return self.one_hot_encoder

    @property
    def one_hot_encoded_features(self):
        """
        One hot encoded features, one row per case, shape (cases, 50):
        per event 5 columns for the name followed by 5 columns for the user.
        """
        events = self.encoded_features.reshape(-1, 2)
        one_hot_encoder = self._ensure_one_hot_encoders()
        one_hot_name = one_hot_encoder["name"].transform(events[:, 0].reshape(-1, 1))
        one_hot_user = one_hot_encoder["user"].transform(events[:, 1].reshape(-1, 1))
        return np.hstack((one_hot_name, one_hot_user)).reshape(-1, 50)

    @property
    def one_hot_encoded_features_2d(self):
        """One hot encoded features reshaped to (cases, 10, 5)."""
        return self.one_hot_encoded_features.reshape(-1, 10, 5)

    @property
    def x_one_hot_y_int(self):
        """Tuple of (one hot features (cases, 50), label encoded features (cases, 10))."""
        # encoded_features is already (cases, 10); return the array itself,
        # not its bound ``reshape`` method.
        return self.one_hot_encoded_features, self.encoded_features

    @property
    def x_one_hot_2d_y_int_2d(self):
        """Tuple of (one hot features (cases, 10, 5), label encoded features (cases, 10, 1))."""
        return self.one_hot_encoded_features_2d, self.encoded_features.reshape(-1, 10, 1)

    @property
    def x_one_hot_2d_y_int(self):
        """Tuple of (one hot features (cases, 10, 5), label encoded features (cases, 10))."""
        # reshape is not strictly needed here because it's already (-1, 10)
        return self.one_hot_encoded_features_2d, self.encoded_features.reshape(-1, 10)

    def inverse_one_hot_encoded_features_to_int(self, one_hot_encoded_data):
        """
        Decode one hot encoded data back to label codes.

        A fully one hot encoded case has 50 columns: for each of the 5 events
        5 name columns followed by 5 user columns. The input may hold one case
        or several; the result has shape (cases, 10).
        """
        one_hot_encoder = self._ensure_one_hot_encoders()
        one_row_one_event = one_hot_encoded_data.reshape(-1, 10)
        name_columns = one_row_one_event[:, 0:5]
        user_columns = one_row_one_event[:, 5:10]
        decoded_names = one_hot_encoder["name"].inverse_transform(name_columns).reshape(-1, 1)
        decoded_users = one_hot_encoder["user"].inverse_transform(user_columns).reshape(-1, 1)
        return np.hstack((decoded_names, decoded_users)).reshape(-1, 10)

    def inverse_one_hot_encoded_features_to_string(self, one_hot_encoded_data):
        """
        Decode one hot encoded data back to event and user names.

        The data is first decoded to label codes and then to strings; the
        result has shape (cases, 10) with name and user alternating.
        """
        codes = self.inverse_one_hot_encoded_features_to_int(one_hot_encoded_data).reshape(-1, 2)
        decoded_names = self.encoders["name"].inverse_transform(codes[:, 0]).reshape(-1, 1)
        decoded_users = self.encoders["user"].inverse_transform(codes[:, 1]).reshape(-1, 1)
        return np.hstack((decoded_names, decoded_users)).reshape(-1, 10)
    
# Generate the synthetic event log and run a quick encode/decode sanity check.
synth_dataset = Dataset(max_cases=1000, anomaly_probabilty=0.3)
synth_dataset.create_dataset()
print(f"Cases Distribution: {synth_dataset.actual_cases}")
one_hot_flat = synth_dataset.one_hot_encoded_features
print(f"one_hot_flat shape: {one_hot_flat.shape}")
print(f"Raw features test:\n{synth_dataset.raw_dataset_as_df[:5]}")
first_case_encoded = synth_dataset.encoded_features[0]
print(f"Encoded Features test:\n{first_case_encoded}")
first_case_decoded = synth_dataset.inverse_one_hot_encoded_features_to_int(one_hot_flat[0])
print(f"De encoding test:\n{first_case_decoded}")
# Decoding the one hot vector of a case must give back its label codes;
# verify it instead of relying on comparing the two printed arrays by eye.
assert np.array_equal(first_case_decoded[0], first_case_encoded), "one hot round trip is not lossless"


encoders[name]:
[('Approve PO', '0'), ('Approve SC', '1'), ('Create PO', '2'), ('Create SC', '3'), ('Pay', '4')]
encoders[user]:
[('Chantal', '0'), ('Dev', '1'), ('Jonas', '2'), ('Kaly', '3'), ('Seokju', '4')]
Cases Distribution: [700, 148, 150]
one_hot_flat shape: (998, 50)
Raw features test:
         name    user  case_id
0   Create SC     Dev        0
1  Approve SC    Kaly        0
2   Create PO   Jonas        0
3  Approve PO    Kaly        0
4         Pay  Seokju        0
Encoded Features test:
[3 1 1 3 2 2 0 3 4 4]
De encoding test:
[[3 1 1 3 2 2 0 3 4 4]]


In [11]:
def model_fn(dataset: Dataset, hidden_layers=2, hidden_size_factor=.2, noise=1.0,
             dropout_rate=0.5, learning_rate=0.0001):
    """
    Build and compile a flat denoising autoencoder for the one hot encoded cases.

    :dataset: dataset whose ``one_hot_encoded_features`` define the input/output size
    :hidden_layers: number of hidden Dense layers
    :hidden_size_factor: hidden layer size as a fraction of the input size;
                         either one number for all layers or a list/tuple with
                         one factor per hidden layer
    :noise: standard deviation of the Gaussian noise added to the input during
            training, or None to skip the noise layer (1.0 is the numeric value
            of the ``True`` that used to be passed here)
    :dropout_rate: dropout rate applied after every hidden layer
    :learning_rate: learning rate of the Adam optimizer
    :return: (compiled model, features, features) - the features are returned
             twice because the autoencoder target equals its input
    """
    from tensorflow.keras.layers import Input, Dense, Dropout, GaussianNoise
    from tensorflow.keras.models import Model
    from tensorflow.keras.optimizers import Adam

    features = dataset.one_hot_encoded_features

    # Parameters
    input_size = features.shape[1]

    # Input layer (named model_input so the builtin ``input`` is not shadowed)
    model_input = Input(shape=(input_size,), name='input')
    x = model_input

    # Noise layer
    if noise is not None:
        x = GaussianNoise(noise)(x)

    # Hidden layers
    for i in range(hidden_layers):
        if isinstance(hidden_size_factor, (list, tuple)):
            factor = hidden_size_factor[i]
        else:
            factor = hidden_size_factor
        # Never create a layer without units, however small the factor is.
        units = max(1, int(input_size * factor))
        x = Dense(units, activation='relu', name=f'hid{i + 1}')(x)
        x = Dropout(dropout_rate)(x)

    # Output layer
    output = Dense(input_size, activation='sigmoid', name='output')(x)

    # Build model
    model = Model(inputs=model_input, outputs=output)

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=learning_rate, beta_2=0.99),
        loss='mean_squared_error',
    )

    return model, features, features

In [12]:
# Build the flat autoencoder; the two feature arrays model_fn also returns are not needed here.
dae_model, _, _ = model_fn(synth_dataset)

In [13]:
# Print the layer overview of the flat autoencoder.
dae_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 50)]              0         
                                                                 
 gaussian_noise_1 (GaussianN  (None, 50)               0         
 oise)                                                           
                                                                 
 hid1 (Dense)                (None, 10)                510       
                                                                 
 dropout_2 (Dropout)         (None, 10)                0         
                                                                 
 hid2 (Dense)                (None, 10)                110       
                                                                 
 dropout_3 (Dropout)         (None, 10)                0         
                                                           

In [44]:
# Keras takes the validation split from the last rows of the data, before
# shuffling. The dataset is ordered valid -> control flow anomalies ->
# attribute anomalies, so without permuting the cases first the validation
# set would contain anomalous cases only.
# A local generator is used so the global NumPy random state is left untouched.
shuffled_case_order = np.random.default_rng(0).permutation(len(one_hot_flat))
one_hot_flat_shuffled = one_hot_flat[shuffled_case_order]
history = dae_model.fit(
    one_hot_flat_shuffled, one_hot_flat_shuffled,
    epochs=70,
    batch_size=100,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


In [45]:
# Reconstruct the first case with the flat autoencoder and show input and output.
test_vector = one_hot_flat[:1]  # keeps the leading batch axis: shape (1, 50)
pprint(test_vector)
op = dae_model.predict(test_vector)
pprint(op)

array([[0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 1.]])
array([[0.2739193 , 0.24642536, 0.2337602 , 0.85556823, 0.36841056,
        0.5256383 , 0.50858957, 0.2868952 , 0.14678694, 0.4635234 ,
        0.22361268, 0.69030535, 0.27700204, 0.16933021, 0.29169515,
        0.28178638, 0.31617442, 0.23962119, 0.77750623, 0.20826949,
        0.42237478, 0.20493293, 0.52529866, 0.15338881, 0.1778515 ,
        0.29552883, 0.46022958, 0.45655587, 0.2667199 , 0.3328998 ,
        0.7781956 , 0.28047216, 0.41250315, 0.3798826 , 0.30386594,
        0.52685326, 0.43211424, 0.24897757, 0.70690864, 0.21838123,
        0.24589202, 0.18628019, 0.26893368, 0.13125217, 0.6247224 ,
        0.24578662, 0.18616524, 0.41130507, 0.27672476, 0.70805293]],
      dtype=float32)


In [39]:
# Shape of the reconstruction held in `op` (assigned by a predict cell).
print(op.shape)

(1, 50)


In [40]:
def model_2d_fn(dataset: Dataset, hidden_layers=2, hidden_size_factor=.2, noise=1.0,
                dropout_rate=0.5, learning_rate=0.0001):
    """
    Build and compile a denoising autoencoder with two dimensional input and output.

    The (rows, columns) input is flattened, passed through the hidden layers
    and reconstructed by one sigmoid Dense head per row; the heads are
    concatenated and reshaped back to (rows, columns). There is no sampling
    layer, so this is not a variational autoencoder.

    :dataset: dataset whose ``one_hot_encoded_features_2d`` define the input/output shape
    :hidden_layers: number of hidden Dense layers
    :hidden_size_factor: hidden layer size as a fraction of the flattened input
                         size; either one number for all layers or a list/tuple
                         with one factor per hidden layer
    :noise: standard deviation of the Gaussian noise added to the input during
            training, or None to skip the noise layer (1.0 is the numeric value
            of the ``True`` that used to be passed here)
    :dropout_rate: dropout rate applied after every hidden layer
    :learning_rate: learning rate of the Adam optimizer
    :return: (compiled model, features, features) - the features are returned
             twice because the autoencoder target equals its input
    """
    from tensorflow.keras.layers import Input, Dense, Dropout, GaussianNoise, Reshape, Flatten, Concatenate
    from tensorflow.keras.models import Model
    from tensorflow.keras.optimizers import Adam

    features = dataset.one_hot_encoded_features_2d

    # Parameters
    input_size_1 = features.shape[1]
    input_size_2 = features.shape[2]
    flat_size = input_size_1 * input_size_2

    # Input layer (named model_input so the builtin ``input`` is not shadowed)
    model_input = Input(shape=(input_size_1, input_size_2, ), name='input')
    x = Flatten()(model_input)

    # Noise layer
    if noise is not None:
        x = GaussianNoise(noise)(x)

    # Hidden layers
    for i in range(hidden_layers):
        if isinstance(hidden_size_factor, (list, tuple)):
            factor = hidden_size_factor[i]
        else:
            factor = hidden_size_factor
        # Never create a layer without units, however small the factor is.
        units = max(1, int(flat_size * factor))
        x = Dense(units, activation='relu', name=f'hid{i + 1}')(x)
        x = Dropout(dropout_rate)(x)

    # Output layer: one head per input row
    row_outputs = [
        Dense(input_size_2, activation='sigmoid', name=f'output_{i}')(x)
        for i in range(input_size_1)
    ]
    outputs = Concatenate(axis=1)(row_outputs)
    outputs = Reshape((input_size_1, input_size_2))(outputs)

    # Build model
    model = Model(inputs=model_input, outputs=outputs)

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=learning_rate, beta_2=0.99),
        loss='mean_squared_error',
    )

    return model, features, features

In [41]:
# Build the 2D autoencoder (the returned feature arrays are not needed here) and print its layer overview.
dae_model_2d, _, _ = model_2d_fn(synth_dataset)
dae_model_2d.summary()


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 10, 5)]      0           []                               
                                                                                                  
 flatten_2 (Flatten)            (None, 50)           0           ['input[0][0]']                  
                                                                                                  
 gaussian_noise_4 (GaussianNois  (None, 50)          0           ['flatten_2[0][0]']              
 e)                                                                                               
                                                                                                  
 hid1 (Dense)                   (None, 10)           510         ['gaussian_noise_4[0][0]'] 

In [42]:
# Compute the 2D features once; every access to the property re-encodes the dataset.
one_hot_2d = synth_dataset.one_hot_encoded_features_2d
# Keras takes the validation split from the last rows of the data, before
# shuffling. The dataset is ordered valid -> control flow anomalies ->
# attribute anomalies, so without permuting the cases first the validation
# set would contain anomalous cases only.
# A local generator is used so the global NumPy random state is left untouched.
one_hot_2d_shuffled = one_hot_2d[np.random.default_rng(0).permutation(len(one_hot_2d))]
history_2d = dae_model_2d.fit(
    one_hot_2d_shuffled, one_hot_2d_shuffled,
    epochs=60,
    batch_size=100,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [43]:
# Reconstruct the first case with the 2D autoencoder and show input and output.
test_vector_2d = synth_dataset.one_hot_encoded_features_2d[:1]  # shape (1, 10, 5)
pprint(test_vector_2d)
op = dae_model_2d.predict(test_vector_2d)
pprint(op)

array([[[0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.]]])
array([[[0.48761427, 0.5015451 , 0.4773597 , 0.52389514, 0.49006116],
        [0.5001155 , 0.5040638 , 0.48521015, 0.50037163, 0.46817517],
        [0.50422937, 0.5264716 , 0.46972108, 0.49066883, 0.5076579 ],
        [0.4751231 , 0.47617364, 0.49726123, 0.5187941 , 0.49022898],
        [0.5045805 , 0.4825806 , 0.5179023 , 0.4880969 , 0.47701532],
        [0.48949397, 0.4944772 , 0.5036503 , 0.4638604 , 0.49131078],
        [0.5261544 , 0.47858143, 0.48469475, 0.48980522, 0.48459852],
        [0.48354363, 0.46886715, 0.4872648 , 0.5188754 , 0.4859896 ],
        [0.47369137, 0.48774937, 0.48405817, 0.46121997, 0.4997255 ],
        [0.47668332, 0.48584104, 0.5075947 , 0.4818354 , 0.52130926]