In [0]:
#!pip install pm4py

In [0]:
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
import pandas as pd
from pyspark.sql.functions import col, struct, explode, collect_list,lit,array,split, expr
from pyspark.sql.types import StructType, StructField, StringType, ArrayType,FloatType,DoubleType,IntegerType,TimestampType
from time import sleep
from pyspark.sql.functions import udf
from pyspark.sql import DataFrame,SparkSession
import pyspark.sql.functions as F 
from pyspark.sql.functions import current_timestamp
import datetime

Create model

In [0]:
class AlphabetService:
    """Bidirectional mapping between activity labels and one-character codes.

    Codes are handed out in code-point order starting at "A" (chr(65)); the
    first label seen becomes "A", the second "B", and so on. Both directions
    of the mapping are kept so codes can be translated back to labels.
    """

    # Characters that are never handed out as codes. "-" separates the codes
    # inside a node id, and "%", "_" and "\" are the wildcard / escape
    # characters of SQL LIKE, which this notebook uses for prefix matches on
    # node ids. Without this guard the 28th and 31st distinct labels would be
    # encoded as "\" and "_".
    _RESERVED_CODES = frozenset("-%_\\")

    def __init__(self):
        self.activity_to_alphabet = {}
        self.alphabet_to_activity = {}
        # One below ord("A"); incremented before every new assignment.
        self.char_counter = 64

    def alphabetize(self, label):
        """Return the code for `label`, assigning the next free code on first use.

        Args:
            label: hashable activity label.

        Returns:
            The one-character code mapped to `label`.
        """
        if label not in self.activity_to_alphabet:
            self.char_counter += 1
            # Skip code points that would be misread by downstream string handling.
            while chr(self.char_counter) in self._RESERVED_CODES:
                self.char_counter += 1
            code = chr(self.char_counter)
            self.activity_to_alphabet[label] = code
            self.alphabet_to_activity[code] = label
        return self.activity_to_alphabet[label]

    def clear(self):
        """Forget every mapping and restart code assignment at "A"."""
        self.activity_to_alphabet.clear()
        self.alphabet_to_activity.clear()
        self.char_counter = 64


def process_trace(trace, nodes,depth, current_node='root'):
    """Insert one trace into the prefix tree held in `nodes` and return `nodes`.

    Node ids are the "-"-joined path of labels from the root, e.g. "root-A-B".
    Each node record stores its label, parent id, level, its direct children
    (ids and labels) and `nth_children`: tuples
    (node_id, label, level, events_between) for descendants reached within the
    sliding window of preceding nodes.

    Args:
        trace: iterable of event dicts; only the 'alphabetized_label' key is read.
        nodes: dict mapping node id to node record; mutated in place.
        depth: once more than `depth` preceding nodes are tracked, the oldest
            is dropped before the new node is recorded in their `nth_children`.
        current_node: node id the first event is attached to.

    Returns:
        The same `nodes` dict that was passed in.
    """
    # Create the root only once. Re-creating it on every call (as the previous
    # version did) discarded the nth_children collected from earlier traces.
    nodes.setdefault('root', {'label': 'root', 'parent': None, 'level':0, 'direct_children': set(),'direct_children_labels':set(),'nth_children':set()})

    last_node = current_node
    # NOTE(review): the window always starts at 'root', even when current_node
    # is a different node - confirm this is intended for non-root starts.
    previous_n_nodes = ['root']

    for event in trace:
        event_name = event['alphabetized_label']
        node_id = f"{last_node}-{event_name}"
        # One "-" per step below the root; unlike a length-based formula this
        # does not depend on every label being exactly one character long.
        level = node_id.count("-")
        if node_id not in nodes:
            nodes[node_id] = {'label': event_name, 'parent': last_node, 'level': level, 'direct_children': set(),'direct_children_labels':set(),'nth_children':set()}

        # Register the new node with each ancestor still inside the window.
        if len(previous_n_nodes)>depth:
            previous_n_nodes = previous_n_nodes[1:]
        for node in previous_n_nodes:
            # Labels strictly between the ancestor and the new node; an
            # adjacent pair yields () rather than a tuple holding one empty string.
            events_between = tuple(node_id[len(node) + 1:].split("-")[:-1])
            nodes[node]['nth_children'].add((node_id, event_name, level, events_between))
        # NOTE(review): direct children are not recorded on 'root' itself;
        # kept as in the original - confirm whether root should list them.
        if last_node != 'root':
            nodes[last_node]['direct_children'].add(node_id)
            nodes[last_node]['direct_children_labels'].add(event_name)
        previous_n_nodes.append(node_id)
        last_node = node_id

    return nodes

def convert_sets_to_lists(obj):
    """Return `obj` with sets replaced by lists, descending through dict values.

    A set becomes a list of its elements (the elements themselves are not
    converted). A dict is rebuilt with every value converted recursively.
    Anything else, including lists and tuples, is returned unchanged.
    """
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            converted[key] = convert_sets_to_lists(value)
        return converted
    return list(obj) if isinstance(obj, set) else obj

# Load the sample event log and keep only activity name and case id per event.
event_log = pm4py.read_xes("/tmp/BPI_2012_1k_sample.xes")
dataframe = log_converter.apply(event_log, variant=log_converter.Variants.TO_DATA_FRAME)
labels_trace = dataframe[["concept:name", "case:concept:name"]]
grouped_traces = labels_trace.groupby("case:concept:name", sort=False)

# Build the prefix tree: one process_trace call per case, all sharing `nodes`
# and one AlphabetService so a given activity always gets the same code.
alphabet_service = AlphabetService()
nodes = {}
for trace, group in grouped_traces:
    # assign() returns a new frame, so the groupby slice is never written to
    # (the in-place column assignment triggered SettingWithCopyWarning).
    group = group.assign(alphabetized_label=group["concept:name"].apply(alphabet_service.alphabetize))
    nodes = process_trace(group.to_dict('records'), nodes, 3)

# Activity name -> one-character code lookup as a Spark DataFrame.
labels = alphabet_service.activity_to_alphabet
schema = StructType([
    StructField("event", StringType(), True),
    StructField("label", StringType(), True)
])
df_labels = spark.createDataFrame(list(labels.items()), schema=schema)

# One record per tree node; sets are turned into lists before handing them to Spark.
data = [{"node_id": node_id, **convert_sets_to_lists(node_data)} for node_id, node_data in nodes.items()]

# Explicit schema for the node records; `schema` is rebound here on purpose,
# the label schema above is no longer needed.
schema = StructType([
    StructField("node_id", StringType(), True),
    StructField("label", StringType(), True),
    StructField("level", IntegerType(), True),
    StructField("direct_children", ArrayType(StringType()), True),
    StructField("direct_children_labels", ArrayType(StringType()), True),
    StructField("nth_children", ArrayType(
        StructType([
            StructField("node_id", StringType(), True),
            StructField("label", StringType(), True),
            StructField("level", IntegerType(), True),
            StructField("events_between", ArrayType(StringType()), True)
        ])
    ), True),
    StructField("parent", StringType(), True)
])
df_nodes = spark.createDataFrame(data, schema=schema)

In [0]:
# Make the model nodes and the label lookup queryable from SQL cells
# as iws_model and iws_labels.
df_nodes.createOrReplaceTempView("iws_model")
df_labels.createOrReplaceTempView("iws_labels")


In [0]:
%sql
-- Input table: one row per observed event (activity name, timestamp, trace id).
-- CREATE OR REPLACE discards any rows already stored under this name.
CREATE OR REPLACE TABLE iws_event (event STRING, time_stamp TIMESTAMP, trace_id STRING);

-- Alignment state table for the nth_children-based solution.
CREATE OR REPLACE TABLE iws_state
(trace_id STRING, ts TIMESTAMP, current_node STRING,current_id STRING,cost_of_alignment INTEGER,previous_events STRING, trace STRING, execution_sequence STRING,event_level INTEGER,current_event_level INTEGER,current_node_level INTEGER);
--event level to filter out the latest alignments later

In [0]:
# Streaming read of iws_event with a 1 minute watermark on time_stamp,
# exposed to SQL under the name "events".
event_df = spark.readStream.table("iws_event").withWatermark("time_stamp", "1 minute")
event_df.createOrReplaceTempView("events")

In [0]:
%scala
import scala.collection.mutable.ArrayBuffer

/** Aligns a sequence of log events against a model path.
  *
  * The model path is given as a string of one-character events optionally
  * separated by "-" (e.g. "-A-B"); the separators are removed before aligning.
  * Only synchronous moves (cost 0) and log-only / model-only moves (cost 1
  * each) are considered; there is no substitution move.
  *
  * @param modelEvents model path; null is treated as an empty path
  * @param eventArray  log events in order; null is treated as empty
  * @return the moves in order, each as (event, move type, cost, position):
  *         move type is "sync", "log" or "model"; position is the 1-based
  *         index into the log for "sync"/"log" moves and into the model
  *         path for "model" moves
  */
def calculateAlignmentCost(modelEvents: String, eventArray: Array[String]): Array[(String, String, Int, Int)] = {
  // "".split("") returns Array(""), which used to add a spurious cost-1
  // "model" move with an empty event whenever the model path was empty.
  val newEvents = Option(modelEvents).getOrElse("").replace("-", "").split("").filter(_.nonEmpty)
  val logEvents = Option(eventArray).getOrElse(Array.empty[String])
  val n = logEvents.length
  val m = newEvents.length
  // dp(i)(j): cost of aligning the first i log events with the first j model events.
  val dp = Array.tabulate(n + 1, m + 1)((i, j) => if (i == 0) j else if (j == 0) i else 0)

  // calculate the matrix
  for (i <- 1 to n) {
    for (j <- 1 to m) {
      if (logEvents(i - 1) == newEvents(j - 1)) {
        dp(i)(j) = dp(i - 1)(j - 1)
      } else {
        dp(i)(j) = math.min(dp(i - 1)(j) + 1, dp(i)(j - 1) + 1)
      }
    }
  }

  // track back from the bottom-right corner, building the alignment front to back
  val alignment = ArrayBuffer[(String, String, Int, Int)]()
  var i = n
  var j = m

  while (i > 0 && j > 0) {
    if (logEvents(i - 1) == newEvents(j - 1)) {
      alignment.prepend((logEvents(i - 1), "sync", 0, i))
      i -= 1
      j -= 1
    } else if (dp(i)(j) == dp(i - 1)(j) + 1) {
      alignment.prepend((logEvents(i - 1), "log", 1, i))
      i -= 1
    } else {
      alignment.prepend((newEvents(j - 1), "model", 1, j))
      j -= 1
    }
  }

  // handle any elements left over once one of the two sequences is exhausted
  while (i > 0) {
    alignment.prepend((logEvents(i - 1), "log", 1, i))
    i -= 1
  }
  while (j > 0) {
    alignment.prepend((newEvents(j - 1), "model", 1, j))
    j -= 1
  }

  alignment.toArray
}

// Registering the UDF
// Makes calculateAlignmentCost callable from SQL under the same name; the SQL
// in this notebook reads the returned tuples through their _1 .. _4 fields.
spark.udf.register("calculateAlignmentCost", (modelEvents: String, eventArray: Array[String]) => calculateAlignmentCost(modelEvents, eventArray))


Solution 1: Using complete output

In [0]:
%sql
-- State table for the "complete output" solution: it is read by the
-- POSSIBLE_ALIGNMENTS view and is the sink of the complete-mode streaming query.
CREATE OR REPLACE TABLE state_test_comp
(trace_id STRING, event_array ARRAY<STRUCT<time_stamp TIMESTAMP,label STRING>>,len INTEGER,previous_id STRING,cost_of_alignment INTEGER,previous_events STRING,previous_alignment ARRAY<STRUCT<event String,move_type String>>,event_level INTEGER,current_event_level INTEGER,current_node_level INTEGER);

In [0]:
%sql
-- POSSIBLE_ALIGNMENTS: for every row of state_test_comp, explodes the events
-- that come after current_event_level and joins each of them to model nodes that
-- either extend previous_id (node_id LIKE previous_id%, same label, bounded level)
-- or are previous_id itself. Cost, event history and alignment are then extended
-- with the result of calculate_alignment.
-- NOTE(review): calculate_alignment is not defined in the visible cells (only
-- calculateAlignmentCost is registered) - confirm it is registered elsewhere and
-- that its result exposes event / move_type / cost fields.
CREATE OR REPLACE TEMP VIEW POSSIBLE_ALIGNMENTS AS (
SELECT trace_id, node_id as current_id ,cost_of_alignment + ARRAY_MAX(calc_alignment.cost) AS cost_of_alignment,concat(previous_events,concat_ws("",current_events)) as previous_events,CONCAT(
                    previous_alignment,
                    TRANSFORM(
                        calc_alignment,
                            x -> named_struct('event', x.event, 'move_type', x.move_type)
                        )
                 ) AS alignment,current_event_level + size(current_events) as event_level,event_index,time_stamp,incoming_label,level as current_node_level,current_events FROM (
SELECT *,calculate_alignment(previous_id,node_id,current_events) as calc_alignment 
FROM (SELECT DISTINCT *, idx AS event_index, 
                col.time_stamp as time_stamp,
                col.label as incoming_label,
                slice(event_array,current_event_level+1,len - current_event_level).label as current_events  
                from state_test_comp
                LATERAL VIEW posexplode(slice(event_array,current_event_level+1,len-current_event_level)) AS idx, col) f
                JOIN iws_model m ON (m.node_id LIKE CONCAT(f.previous_id, '%') AND
                m.level < f.current_node_level + event_index + 3 AND
                m.label = f.incoming_label) OR
                m.node_id = f.previous_id) );

-- stream_test_alignm: per (trace_id, current_id) keeps the single best row of
-- POSSIBLE_ALIGNMENTS - highest event_level first, then lowest cost, then longest
-- previous_events. current_event_level is the maximum event_level of the trace.
CREATE OR REPLACE temp VIEW stream_test_alignm AS SELECT DISTINCT 
                trace_id,
                current_id,
                previous_events,
                alignment,
                cost_of_alignment,
                event_level,
                max_event_level as current_event_level,
                current_node_level
FROM   (
        SELECT *,
        Max(event_level) OVER (partition BY trace_id) AS max_event_level,
        Row_number() OVER (partition BY trace_id,current_id ORDER BY event_level DESC,cost_of_alignment ASC, Len(previous_events) DESC) rn
        FROM POSSIBLE_ALIGNMENTS)
WHERE rn = 1

In [0]:
# Streaming query in complete output mode. Per trace it collects the events
# (joined to iws_labels) into an array sorted by (time_stamp, label), left-joins
# the best known alignment from stream_test_alignm and falls back to the initial
# state ('root', cost 0, level 0) for traces without one. The result is written
# to the Delta table state_test_comp.
spark.sql("""SELECT
            e.trace_id as trace_id,
            e.event_array as event_array,
            size(e.event_array) as len,
            COALESCE(r.current_id, 'root') AS previous_id,
            COALESCE(r.cost_of_alignment, 0) AS cost_of_alignment,
            COALESCE(r.previous_events, '') AS previous_events,
            COALESCE(r.alignment, ARRAY(struct("" AS event, "" AS move_type))) AS previous_alignment,
            COALESCE(r.event_level, 0) AS event_level,
            COALESCE(r.current_event_level, 0) AS current_event_level,
            COALESCE(r.current_node_level, 0) AS current_node_level
            FROM (
                SELECT trace_id,
                        array_sort(collect_list(struct(time_stamp, label))) AS event_array
                    FROM 
                        events e
                    JOIN 
                        iws_labels l ON e.event = l.event
                    GROUP BY trace_id) e
            LEFT JOIN stream_test_alignm r ON e.trace_id = r.trace_id
""").writeStream.format("delta").outputMode("complete").option("checkpointLocation", "/tmp/delta/state_append_30079/").toTable("state_test_comp")


foreachBatch method

In [0]:
%sql
-- State table for the foreachBatch solution; process_batch appends one row per
-- candidate alignment, tagged with the micro-batch id.
CREATE OR REPLACE TABLE state_test_batch
(trace_id STRING, ts TIMESTAMP,current_id STRING,previous_events STRING,event_level INTEGER,current_node_level INTEGER,label STRING,alignment ARRAY<STRUCT<event String,move_type String>>,cost_of_alignment INTEGER,event_array ARRAY<STRING>,event_index INTEGER,batch_id INTEGER);

In [0]:
%sql
-- stream_test_alignm_batch: per (trace_id, current_id) keeps the single best row
-- of state_test_batch - highest event_level first, then lowest cost, then longest
-- previous_events. current_event_level is the maximum event_level of the trace.
CREATE OR REPLACE temp VIEW stream_test_alignm_batch AS SELECT DISTINCT 
                trace_id,
                current_id,
                previous_events,
                alignment,
                cost_of_alignment,
                event_level,
                max_event_level as current_event_level,
                current_node_level,
                rn,
                event_array,
                event_index,
                batch_id
FROM   (
        SELECT *,
        Max(event_level) OVER (partition BY trace_id) AS max_event_level,
        Row_number() OVER (partition BY trace_id,current_id ORDER BY event_level DESC,cost_of_alignment ASC, Len(previous_events) DESC) rn
        FROM state_test_batch)
WHERE rn = 1

In [0]:
%sql
-- Inspection only: show the raw events currently stored in iws_event.
select * from iws_event

In [0]:
def process_batch(df: DataFrame, batch_id: int):
    """foreachBatch handler: align the events of one micro-batch and append the result.

    The micro-batch is registered as the temp view `streaming_data`. Its events
    are grouped per trace, joined with the best known state from
    stream_test_alignm_batch and with candidate nodes of iws_model, and the
    calculateAlignmentCost UDF is run for every remaining candidate. The rows
    are appended to the Delta table state_test_batch. An empty micro-batch is
    skipped.

    Args:
        df: micro-batch DataFrame; the query reads its event, time_stamp and
            trace_id columns.
        batch_id: micro-batch id, stored in the batch_id column of every row.

    NOTE(review): a later cell defines process_batch again; whichever
    definition ran last is the one a new stream picks up.
    """

    if not df.isEmpty():
        df.createOrReplaceTempView("streaming_data")

        # The cost added per candidate is the SUM of the per-move costs returned
        # by the UDF (field _3 is 0 or 1 per move). The previous ARRAY_MAX capped
        # the increment at 1 no matter how many log/model moves were needed.
        result_df = df.sparkSession.sql("""
            WITH FIRST_BD AS (
            SELECT *,substr(node_id FROM len(previous_id) + 1) model_sub, CASE WHEN len(substr(node_id FROM len(previous_id) + 1)) = 0 THEN 0 ELSE _event_index END as event_index  FROM (
            SELECT
                e.trace_id AS trace_id,
                idx AS _event_index,
                col.time_stamp AS time_stamp,
                col.label AS incoming_label,
                e.event_array.label AS event_array,
                size(e.event_array) AS len,
                COALESCE(r.current_id, 'root') AS previous_id,
                COALESCE(r.cost_of_alignment, 0) AS cost_of_alignment,
                COALESCE(r.previous_events, '') AS previous_events,
                substr(concat_ws("", e.event_array.label), greatest(idx-3,0), idx + 1) AS trace_suffix,
                COALESCE(r.alignment, ARRAY(struct("" AS event, "" AS move_type))) AS previous_alignment,
                COALESCE(r.event_level, 0) AS event_level,
                COALESCE(r.current_event_level, 0) AS current_event_level,
                COALESCE(r.current_node_level, 0) AS current_node_level
            FROM (
                SELECT 
                    trace_id,
                    array_sort(collect_list(struct(time_stamp, label))) AS event_array
                FROM 
                    streaming_data e 
                JOIN 
                    iws_labels l ON e.event = l.event
                GROUP BY trace_id
            ) e
            LEFT JOIN stream_test_alignm_batch r ON e.trace_id = r.trace_id 
            LATERAL VIEW posexplode(e.event_array) AS idx, col
        ) f
        JOIN iws_model m ON (m.node_id LIKE CONCAT(f.previous_id, '%') 
            AND m.level < f.current_node_level  + _event_index + len - len(trace_suffix) + 2
            AND m.label = f.incoming_label) 
            OR (m.node_id = f.previous_id AND len = _event_index + 1)),
MaxEventIndexPerTrace AS (
    SELECT
        trace_id as _trace_id,
        MAX(event_index) AS max_event_index
    FROM FIRST_BD
    GROUP BY trace_id
),
BASE_DATA AS (SELECT * FROM FIRST_BD LEFT JOIN MaxEventIndexPerTrace on FIRST_BD.trace_id = MaxEventIndexPerTrace._trace_id WHERE event_index > greatest(max_event_index-4,0))

SELECT 
      trace_id,
                time_stamp as ts,
                node_id as current_id,
                concat(previous_events,concat_ws("",event_array)) as previous_events,
                current_event_level + len as event_level,
                level as current_node_level,
                label,
                CONCAT(
                    previous_alignment,
                    TRANSFORM(
                        calc_alignment,
                            x -> named_struct('event', x._1, 'move_type', x._2)
                        )
                 ) AS alignment,
                cost_of_alignment + aggregate(calc_alignment._3, 0, (acc, x) -> acc + x) AS cost_of_alignment,
                event_array,
                event_index
    FROM (
        SELECT *,calculateAlignmentCost(substr(node_id FROM len(previous_id) + 1),event_array) as calc_alignment 
        FROM BASE_DATA
            ) 
        """)
        result_df = result_df.withColumn("batch_id",F.lit(batch_id))
        # Write the results of the SQL query to a Delta table
        # NOTE(review): checkpointLocation is a Structured Streaming option; on
        # this batch write it presumably has no effect. If checkpointing is
        # wanted it belongs on the writeStream that calls this handler.
        result_df.write.format("delta").mode("append").option("checkpointLocation", "/tmp/delta/state_append_30038/").saveAsTable("state_test_batch")

# Set up the write stream using foreachBatch
# NOTE(review): streaming_df_batch_4 is not defined in the visible cells -
# confirm it exists (or which stream is meant) before running this cell.
query = streaming_df_batch_4.writeStream.foreachBatch(process_batch).start()
#query.awaitTermination()

Solution that tries to reduce the number of UDF calculations; however, the cross-join is probably too slow.

In [0]:
def process_batch(df, batch_id):
    """foreachBatch handler variant that runs the alignment UDF on fewer candidates.

    The micro-batch is registered as the temp view `streaming_data` and the
    candidate (trace, model node) rows are built as BASE_DATA. The SUFFIXES CTE
    keeps only candidates whose model_sub is not extended by another candidate
    of the same trace and previous_id (no other model_sub starts with
    model_sub + '-'). calculateAlignmentCost is called for those candidates
    only (INTERMEDIATE), and the resulting alignments are joined back onto all
    candidates and trimmed per candidate (UPDATED_ALIGNMENTS). The cost added
    is the sum of the per-move costs. Rows are appended to state_test_batch
    tagged with batch_id; an empty micro-batch is skipped.

    Args:
        df: micro-batch DataFrame; the query reads its event, time_stamp and
            trace_id columns.
        batch_id: micro-batch id, stored in the batch_id column of every row.

    NOTE(review): this redefines process_batch from an earlier cell; whichever
    definition ran last is the one a new stream picks up.
    NOTE(review): SUFFIXES is built from an inner self-join on differing
    model_sub values, so a candidate with no sibling candidate appears to drop
    out and gets a NULL alignment - confirm this is intended.
    """

    if not df.isEmpty():
        df.createOrReplaceTempView("streaming_data")

        result_df = df.sparkSession.sql("""
            WITH FIRST_BD AS (
            SELECT *,substr(node_id FROM len(previous_id) + 1) model_sub, CASE WHEN len(substr(node_id FROM len(previous_id) + 1)) = 0 THEN 0 ELSE _event_index END as event_index  FROM (
            SELECT
                e.trace_id AS trace_id,
                idx AS _event_index,
                col.time_stamp AS time_stamp,
                col.label AS incoming_label,
                e.event_array.label AS event_array,
                size(e.event_array) AS len,
                COALESCE(r.current_id, 'root') AS previous_id,
                COALESCE(r.cost_of_alignment, 0) AS cost_of_alignment,
                COALESCE(r.previous_events, '') AS previous_events,
                substr(concat_ws("", e.event_array.label), greatest(idx-3,0), idx + 1) AS trace_suffix,
                COALESCE(r.alignment, ARRAY(struct("" AS event, "" AS move_type))) AS previous_alignment,
                COALESCE(r.event_level, 0) AS event_level,
                COALESCE(r.current_event_level, 0) AS current_event_level,
                COALESCE(r.current_node_level, 0) AS current_node_level
            FROM (
                SELECT 
                    trace_id,
                    array_sort(collect_list(struct(time_stamp, label))) AS event_array
                FROM 
                    streaming_data e 
                JOIN 
                    iws_labels l ON e.event = l.event
                GROUP BY trace_id
            ) e
            LEFT JOIN stream_test_alignm_batch r ON e.trace_id = r.trace_id 
            LATERAL VIEW posexplode(e.event_array) AS idx, col
        ) f
        JOIN iws_model m ON (m.node_id LIKE CONCAT(f.previous_id, '%') 
            AND m.level < f.current_node_level  + _event_index + 2
            AND m.label = f.incoming_label) 
            OR (m.node_id = f.previous_id AND len = _event_index + 1)
    ),
MaxEventIndexPerTrace AS (
    SELECT
        trace_id as _trace_id,
        MAX(event_index) AS max_event_index
    FROM FIRST_BD
    GROUP BY trace_id
),
BASE_DATA AS (SELECT * FROM FIRST_BD LEFT JOIN MaxEventIndexPerTrace on FIRST_BD.trace_id = MaxEventIndexPerTrace._trace_id WHERE event_index > greatest(max_event_index-4,0)),

    SUFFIXES AS (
        SELECT  * FROM (
            SELECT  
                a.previous_id, 
                a.trace_id, 
                a.node_id AS candidate,
                MAX(CASE WHEN b.model_sub LIKE CONCAT(a.model_sub, '-%') THEN 1 ELSE 0 END) AS is_covered
            FROM BASE_DATA a
            JOIN BASE_DATA b ON a.model_sub <> b.model_sub 
                AND a.trace_id = b.trace_id 
                AND a.previous_id = b.previous_id
            GROUP BY a.trace_id, a.previous_id,a.node_id
        ) WHERE IS_COVERED = 0
    ),
    INTERMEDIATE AS (
        SELECT 
            trace_id, 
            node_id AS current_id,
            TRANSFORM(
                calc_alignment,
                x -> named_struct('event', x._1, 'move_type', x._2,"_cost",x._3, "event_index", x._4)
            ) AS alignment,
            model_sub
        FROM (
            SELECT 
                b.trace_id, 
                b.node_id, 
                b.previous_id, 
                b.model_sub,
                calculateAlignmentCost(
                    b.model_sub, 
                    event_array
                ) AS calc_alignment  
            FROM BASE_DATA b 
            INNER JOIN SUFFIXES s ON b.trace_id = s.trace_id 
                AND b.previous_id = s.previous_id 
                AND b.node_id = s.candidate
        )
    ),
    UPDATED_BASE_DATA AS (
        SELECT 
            b.*,
            i.alignment 
        FROM BASE_DATA b
        LEFT JOIN INTERMEDIATE i ON b.trace_id = i.trace_id 
            AND i.current_id LIKE CONCAT("%",b.model_sub, '%') 
    ),
UPDATED_ALIGNMENTS AS(SELECT len(model_sub)/2,trace_id,time_stamp,previous_events,current_event_level,len,event_array,label,previous_alignment,cost_of_alignment,event_index,node_id,level,TRANSFORM(
                    alignment,
                        x -> CASE WHEN x.event_index > event_index+1 OR len(model_sub) = 0 and x.move_type in ("log","sync") THEN named_struct('event', x.event, 'move_type', "log","_cost",1,"index",x.event_index)
                        WHEN x.move_type in ("model","sync") and len(model_sub)/2 < x.event_index THEN named_struct('event',"",'move_type',"",'_cost',0,"index",x.event_index)
                        else named_struct('event', x.event, 'move_type', x.move_type,"_cost",x._cost,"index",x.event_index) END)
                         AS new_alignment FROM UPDATED_BASE_DATA)

      SELECT  trace_id,
                time_stamp as ts,
                node_id as current_id,
                concat(previous_events,concat_ws("",event_array)) as previous_events,
                current_event_level + len as event_level,
                level as current_node_level,
                label,
                TRANSFORM(
                        new_alignment,
                            x -> named_struct('event', x.event, 'move_type', x.move_type)
                        
                 ) AS alignment,
                cost_of_alignment + aggregate(new_alignment._cost, 0, (acc, x) -> acc + x) AS cost_of_alignment,
                event_array,
                event_index FROM UPDATED_ALIGNMENTS 
""")
        # Tag every row with the micro-batch that produced it.
        result_df = result_df.withColumn("batch_id",F.lit(batch_id))

        # NOTE(review): checkpointLocation is a Structured Streaming option; on
        # this batch write it presumably has no effect - confirm.
        result_df.write.format("delta").mode("append").option("checkpointLocation", "/tmp/delta/state_append_30033/").saveAsTable("state_test_batch")

# Start the foreachBatch stream with the process_batch defined in this cell.
# NOTE(review): streaming_df_batch is not defined in the visible cells -
# confirm it exists (or which stream is meant) before running this cell.
query = streaming_df_batch.writeStream.foreachBatch(process_batch).start()


In [0]:
%sql
-- Inspection only: show everything the foreachBatch handlers have appended so far.
select * from state_test_batch 

Using session_window

In [0]:
%sql
-- Event input table for the session_window solution (same columns as iws_event).
CREATE OR REPLACE TABLE iws_event_session (event STRING, time_stamp TIMESTAMP, trace_id STRING);

In [0]:
# Streaming read of the session event table.
event_df_session = spark.readStream.table("iws_event_session").withWatermark("time_stamp", "0 seconds")
# Stamp each row with the processing time. This now builds on event_df_session;
# it previously started from event_df (the iws_event stream), which silently
# discarded the read above and sessionised the wrong table.
event_df_session = event_df_session.withColumn("batch_ts", current_timestamp())
# Per trace, group rows into sessions on batch_ts (5 minute gap) and collect
# the events of each session as an array sorted by (time_stamp, event).
event_df_session = event_df_session \
    .withWatermark("batch_ts", "1 minute") \
    .groupBy("trace_id", F.session_window("batch_ts", "5 minutes")) \
    .agg(F.array_sort(F.collect_list(struct("time_stamp", "event"))).alias("sorted_events"))
event_df_session.createOrReplaceTempView("events_session")

In [0]:
%sql
-- Inspection only: show the current contents of the events_session view.
select * from events_session

In [0]:

# Rebinds event_df_session to the plain (un-aggregated) stream of
# iws_event_session and re-registers it as "events_session", replacing the
# session-aggregated view of the same name registered in an earlier cell.
event_df_session = spark.readStream.table("iws_event_session").withWatermark("time_stamp", "0 seconds")
event_df_session.createOrReplaceTempView("events_session")

In [0]:
%sql
-- State table for the session_window solution; sink of the append-mode
-- streaming query and source of the stream_test_alignm_session view.
CREATE OR REPLACE TABLE state_test_session
(trace_id STRING, ts STRUCT<ts1 ARRAY<TIMESTAMP>,cur_ts TIMESTAMP>,time_window STRUCT<start Timestamp,end Timestamp>,current_id STRING,previous_events STRING,event_level INTEGER,current_event_level INTEGER,current_node_level INTEGER,labels ARRAY<STRING>,len INTEGER,node_id STRING,label STRING,alignment ARRAY<STRUCT<event String,move_type String>>,cost_of_alignment INTEGER);

In [0]:
%sql
-- stream_test_alignm_session: per (trace_id, node_id) keeps the single best row
-- of state_test_session - highest event_level first, then lowest cost, then
-- longest previous_events - and exposes node_id as current_id.
-- NOTE(review): max_event_level is computed in the subquery but not selected;
-- the sibling views expose it as current_event_level - confirm which is intended.
CREATE OR REPLACE temp VIEW stream_test_alignm_session AS SELECT DISTINCT 
                trace_id,
                node_id AS current_id,
                previous_events,
                alignment,
                cost_of_alignment,
                event_level,
                current_event_level,
                current_node_level,
                rn
FROM   (
        SELECT *,
        Max(current_event_level) OVER (partition BY trace_id) AS max_event_level,
        Row_number() OVER (partition BY trace_id,node_id ORDER BY event_level DESC,cost_of_alignment ASC, Len(previous_events) DESC) rn
        FROM state_test_session)
WHERE rn = 1

In [0]:
# Streaming query in append mode for the session_window solution. Events of
# events_session are grouped per trace and per 1 minute session window on
# time_stamp, joined with the best known state from stream_test_alignm_session
# (initial state 'root', cost 0 for new traces) and with candidate nodes of
# iws_model. The alignment is extended with calculate_alignment and the rows
# are appended to the Delta table state_test_session.
# NOTE(review): calculate_alignment is not defined in the visible cells (only
# calculateAlignmentCost is registered) - confirm it is registered elsewhere.
spark.sql("""SELECT trace_id , ts ,time_window,current_id,previous_events ,event_level ,current_event_level ,current_node_level ,labels,len ,node_id ,label ,concat(align,transform(
    alignm, 
    x -> named_struct('event', x.event, 'move_type', x.move_type)
  )) as alignment, cost_+array_max(alignm.cost) as cost_of_alignment 
  FROM(
SELECT trace_id,STRUCT(ts as ts1,current_timestamp() as cur_ts) as ts,time_window ,current_id ,cost_of_alignment as cost_ ,concat(previous_events,concat_ws("",labels)) AS previous_events, calculate_alignment(current_id,node_id,labels) as alignm,alignment as align ,current_event_level + len as event_level ,current_event_level + len as current_event_level ,level as current_node_level ,labels ,len ,node_id ,m.label
FROM (
    SELECT trace_id,
           ts,
           event_index,
           label,
           time_window,
           current_id,
           cost_of_alignment,
           previous_events,
           alignment,
           event_level,
           current_event_level,
           current_node_level,
           labels,
           size(labels) as len
    FROM (
        SELECT e.trace_id,
               idx AS event_index, 
               time_window,
               col.label as label, 
               e.event_arr.label as labels,
               e.event_arr.time_stamp as ts,
               COALESCE(r.current_id, 'root') AS current_id, 
               COALESCE(cost_of_alignment, 0) AS cost_of_alignment, 
               COALESCE(r.previous_events, '') AS previous_events, 
               COALESCE(r.alignment,array(struct("" as event, "" as move_type)))  as alignment,
               COALESCE(event_level, 0) AS event_level, 
               COALESCE(current_event_level, 0) AS current_event_level, 
               COALESCE(current_node_level, 0) AS current_node_level
        FROM (
            SELECT trace_id,
                   array_sort(collect_list(struct(time_stamp, label))) AS event_arr, session_window(time_stamp, '1 minute') AS time_window
            FROM events_session e 
            JOIN iws_labels l ON e.event = l.event
            GROUP BY trace_id, time_window
        ) e 
        LEFT JOIN stream_test_alignm_session r ON e.trace_id = r.trace_id
        LATERAL VIEW posexplode(event_arr)  AS idx, col
    ) q
) f
JOIN iws_model m ON (m.node_id LIKE CONCAT(f.current_id, '%')
                 AND m.level < f.current_node_level + f.event_index + 3
AND m.label = f.label) or m.node_id = f.current_id) -- no match then log moves
""").writeStream.format("delta").outputMode("append").option("checkpointLocation","/tmp/delta/state_append_30029/").toTable("state_test_session")

In [0]:
%sql
-- Inspection only: show the best state row per trace and node.
select * from stream_test_alignm_session

In [0]:
%sql
-- Inspection only: labelled session events of trace_4, newest first.
select * from iws_event_session e join iws_labels l on e.event = l.event where trace_id = "trace_4" order by time_stamp desc

Add events to iws_event

In [0]:
# Schema of the event files (same columns as the iws_event table).
# Note: this rebinds the module-level name `schema` used by earlier cells.
schema = StructType([
    StructField("event", StringType(), True),
    StructField("time_stamp", TimestampType(), True),
    StructField("trace_id", StringType(), True)
])
# Streaming read of parquet files. 'path_to_data' is a placeholder and must be
# replaced with the real input location before running.
streaming_df_batch_5 = spark.readStream \
    .format("parquet").schema(schema) \
    .load('path_to_data')

dummy data

In [0]:
%sql
-- Hand-written dummy model: a small prefix tree root -> A -> B -> {X, D -> C -> {D, C}}.
-- One row per node with its direct children (labels and ids), the descendants
-- reachable within a few steps (nth_children, with the labels skipped in between)
-- and the node level.
-- NOTE(review): an earlier cell registers a temp view that is also named iws_model;
-- presumably unqualified references resolve to that temp view while it exists -
-- confirm which of the two the queries should read.
CREATE OR REPLACE TABLE iws_model AS
SELECT "root" AS node_id, "-" AS label, array("A") AS children_labels, array("root-A") AS children_id, 
    array(
        struct("root-A" AS node_id, "A" AS label, 1 as level, array() as events_between),
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array("A") as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("A","B") as events_between), 
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("A","B") as events_between)
    ) AS nth_children, 0 as level
UNION ALL 
SELECT "root-A", "A", array("B"), array("root-A-B"), 
    array(
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array() as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("B","D") as events_between)
    ),1
UNION ALL 
SELECT "root-A-B", "B", array("X", "D"), array("root-A-B-X", "root-A-B-D"), 
    array(
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array() as events_between), -- "A-B-C" now "A-B-X"
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array() as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("D") as events_between)
    ),2
UNION ALL 
SELECT "root-A-B-X", "X", array(), array(),
    array(),3
UNION ALL 
SELECT "root-A-B-D", "D", array("C"), array("root-A-B-D-C"), 
    array(
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array() as events_between),
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array("C") as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array("C") as events_between)
    ),3
UNION ALL 
SELECT "root-A-B-D-C", "C", array("D", "C"), array("root-A-B-D-C-D", "root-A-B-D-C-C"), 
    array(
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array() as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array() as events_between)
    ),4
UNION ALL 
SELECT "root-A-B-D-C-D", "D", array(), array(), array(),5
UNION ALL 
SELECT "root-A-B-D-C-C", "C", array(), array(), array(),5;




In [0]:
%sql
-- Recreates the (empty) event input table for the dummy-data experiments.
CREATE OR REPLACE TABLE iws_event 
(event STRING,time_stamp TIMESTAMP,trace_id STRING);
---SELECT "A" event, CURRENT_TIMESTAMP() time_stamp, "trace_id_0" trace_id;


-- Event table with an additional `processed` column.
CREATE OR REPLACE TABLE iws_event_state
(event STRING, time_stamp TIMESTAMP, trace_id STRING,processed STRING);

-- Identity label mapping for the dummy activities A, B, C, D and X.
CREATE OR REPLACE TABLE iws_labels 
SELECT "A" AS event, "A"  AS label
UNION ALL
SELECT "B","B" 
UNION ALL
SELECT "C","C" 
UNION ALL
SELECT "D","D" 
UNION ALL
SELECT "X","X" ;

-- Alignment state table. Unlike the definition near the top of the notebook,
-- this one has no current_event_level column.
CREATE OR REPLACE TABLE iws_state
(trace_id STRING, ts TIMESTAMP, current_node STRING,current_id STRING,cost_of_alignment INTEGER,previous_events STRING, trace STRING, execution_sequence STRING,event_level INTEGER,current_node_level INTEGER);
--event level to filter out the latest alignments later

In [0]:
import random 

# Activities of the dummy model and the (currently unused) trace id range.
activities = ["A", "B", "C", "D", "X"]
trace_id_lower, trace_id_upper = 0, 9


def insert_event(event=None, trace_id=None):
    """Insert one event with the current timestamp into iws_event.

    Args:
        event: activity name; a falsy value picks a random entry of `activities`.
        trace_id: numeric suffix of the trace id; a falsy value falls back to 0,
            so the row is stored under 'trace_id_0'.
    """
    chosen_event = event or random.choice(activities)
    chosen_trace = trace_id or 0
    statement = f"INSERT INTO iws_event SELECT '{chosen_event}', CURRENT_TIMESTAMP(), 'trace_id_{chosen_trace}'"
    spark.sql(statement)

In [0]:
%sql
-- latest_state: per (trace_id, current_id) keeps the single best row of iws_state -
-- highest event_level first, then lowest cost, then longest previous_events.
-- current_event_level is the maximum event_level seen for the trace.
-- The former write_ts column was dropped from the select list: neither definition
-- of iws_state in this notebook nor the stream writing to it provides such a
-- column, so the view could not be resolved.
CREATE OR REPLACE TEMP VIEW latest_state AS 
SELECT DISTINCT trace_id, ts, current_node, current_id, cost_of_alignment,previous_events,trace,execution_sequence,event_level, max_event_level as current_event_level,current_node_level,rn FROM (
SELECT *, row_number() OVER (PARTITION BY trace_id,current_id order by event_level desc,cost_of_alignment asc, len(previous_events) desc) rn,
max(event_level) OVER (PARTITION BY trace_id) AS max_event_level FROM iws_state
) WHERE rn = 1 --and event_level > current_event_level -2

In [0]:
# When a state older than the latest event is extended, the cost grows by the
# difference in event levels; the number of skips (">>") in between is the same number.
# current_event_level -> which event the state stems from, in case the state is reused later
# event_level         -> how many events of the trace have been seen so far
# current_node_level  -> level of the model node, used in the cost calculation
#
# Streaming query in append mode. Every incoming event is joined with the latest
# state of its trace and with the model node that state points to. For each
# nth_children entry whose label equals the event one "model move" candidate is
# produced, plus one candidate that stays on the current node and appends ">>"
# to the execution sequence. The candidates are exploded into rows and appended
# to iws_state.
spark.sql("""
SELECT trace_id,ts,exploded.*,previous_events FROM(
SELECT trace_id,ts,og_event,concat(model_moves,exploded_struct) as sub_exploded,previous_events FROM(
SELECT 
      e.trace_id, 
      e.time_stamp as ts,
      e.og_event,
      concat(e.previous_events,e.event) as previous_events,
      transform(
        filter(m.nth_children, x -> x.label = e.event),
        x -> struct(e.event as current_node, x.node_id as current_id, e.cost_of_alignment + abs(x.level - current_node_level-1) + current_event_level - event_level as cost_of_alignment,  concat(coalesce(trace,"root"),repeat(">>",abs(x.level - current_node_level-1)),e.event) as trace,CONCAT(execution_sequence, coalesce(CONCAT_WS('', x.events_between)),e.event) as execution_sequence, current_event_level+1 as event_level, x.level as current_node_level)
      ) as model_moves,
      array(struct(e.current_node as current_node, m.node_id as current_id, e.cost_of_alignment + e.current_event_level - e.event_level + 1 as cost_of_alignment, concat(trace, e.event) as trace, concat(execution_sequence, ">>")as execution_sequence, e.current_event_level + 1 as event_level, current_node_level))
      AS exploded_struct
    FROM 
      (SELECT 
      e.trace_id,
      e.time_stamp,
      e.event as og_event, 
      l.label as event, 
      COALESCE(r.current_node, "-") as current_node, 
      COALESCE(r.current_id, "root") as current_id, 
      coalesce(cost_of_alignment, 0) as cost_of_alignment, 
      coalesce(r.previous_events,"") as previous_events , 
      coalesce(trace,"") as trace, 
      coalesce(execution_sequence,"") as execution_sequence, 
      coalesce(event_level, 0) as event_level, 
      coalesce(current_event_level, 0) as current_event_level, 
      coalesce(current_node_level,0) as current_node_level FROM events e LEFT JOIN latest_state r ON e.trace_id = r.trace_id JOIN iws_labels l on e.event = l.event) e 
    JOIN iws_model m ON e.current_id = m.node_id ))
  LATERAL VIEW explode(sub_exploded) t AS exploded
""").writeStream.format("delta").outputMode("append").option("checkpointLocation","/tmp/delta/state_append_69/").toTable("iws_state")