In [0]:
from pyspark.sql.functions import col, struct, explode, collect_list,lit,array,split, expr
from pyspark.sql.types import StructType, StructField, StringType, ArrayType,FloatType,DoubleType,IntegerType,TimestampType
from time import sleep
from pyspark.sql.functions import udf
from pyspark.sql import DataFrame,SparkSession
import pyspark.sql.functions as F 
from pyspark.sql.functions import current_timestamp
import datetime

In [0]:
%sql
-- Process-model lookup table: one row per node of the model tree.
--   node_id          path-style id built from the labels on the way down from "root"
--   label            activity label of the node ("-" for the synthetic root)
--   children_labels  labels of the direct children
--   children_id      node_ids of the direct children
--   nth_children     selected descendants of the node (direct children and deeper nodes);
--                    each entry holds the descendant's node_id, label, absolute level and
--                    events_between = the labels lying between this node and the descendant
--   level            depth of the node in the tree (root = 0)
-- Only the first SELECT names its columns; the later UNION ALL branches are positional.
CREATE OR REPLACE TABLE iws_model AS
-- root (level 0): looks ahead to levels 1-3
SELECT "root" AS node_id, "-" AS label, array("A") AS children_labels, array("root-A") AS children_id, 
    array(
        struct("root-A" AS node_id, "A" AS label, 1 as level, array() as events_between),
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array("A") as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("A","B") as events_between), 
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("A","B") as events_between)
    ) AS nth_children, 0 as level
UNION ALL 
-- root-A (level 1): looks ahead to levels 2-4
SELECT "root-A", "A", array("B"), array("root-A-B"), 
    array(
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array() as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("B","D") as events_between)
    ),1
UNION ALL 
-- root-A-B (level 2): looks ahead to levels 3-4
-- NOTE(review): root and root-A list three levels ahead, this node only two (the level-5
-- nodes under root-A-B-D-C are absent) -- confirm whether that omission is intentional.
SELECT "root-A-B", "B", array("X", "D"), array("root-A-B-X", "root-A-B-D"), 
    array(
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array() as events_between), -- "A-B-C" now "A-B-X"
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array() as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("D") as events_between)
    ),2
UNION ALL 
-- root-A-B-X (level 3): leaf, no children and no descendants
SELECT "root-A-B-X", "X", array(), array(),
    array(),3
UNION ALL 
-- root-A-B-D (level 3): looks ahead to levels 4-5
SELECT "root-A-B-D", "D", array("C"), array("root-A-B-D-C"), 
    array(
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array() as events_between),
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array("C") as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array("C") as events_between)
    ),3
UNION ALL 
-- root-A-B-D-C (level 4): looks ahead to level 5
SELECT "root-A-B-D-C", "C", array("D", "C"), array("root-A-B-D-C-D", "root-A-B-D-C-C"), 
    array(
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array() as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array() as events_between)
    ),4
UNION ALL 
-- level-5 leaves
SELECT "root-A-B-D-C-D", "D", array(), array(), array(),5
UNION ALL 
SELECT "root-A-B-D-C-C", "C", array(), array(), array(),5;




num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Raw event log: one row per observed event of a trace.
-- Example seed row (disabled): SELECT "A" event, CURRENT_TIMESTAMP() time_stamp, "trace_id_0" trace_id;
CREATE OR REPLACE TABLE iws_event (
  event      STRING,
  time_stamp TIMESTAMP,
  trace_id   STRING
);

-- Same columns as iws_event plus a "processed" marker.
CREATE OR REPLACE TABLE iws_event_state (
  event      STRING,
  time_stamp TIMESTAMP,
  trace_id   STRING,
  processed  STRING
);

-- Event-name -> model-label mapping; every event maps to itself here.
CREATE OR REPLACE TABLE iws_labels AS
SELECT "A" AS event, "A" AS label
UNION ALL
SELECT "B", "B"
UNION ALL
SELECT "C", "C"
UNION ALL
SELECT "D", "D"
UNION ALL
SELECT "X", "X";

-- Alignment states, one row per candidate state of a trace.
CREATE OR REPLACE TABLE iws_state (
  trace_id             STRING,
  ts                   TIMESTAMP,
  current_node         STRING,
  current_id           STRING,
  cost_of_alignment    INTEGER,
  trace                STRING,
  execution_sequence   STRING,
  event_level          INTEGER,  -- used later to filter out the latest alignments
  current_event_level  INTEGER,
  current_node_level   INTEGER,
  previous_events      STRING
);

In [0]:
import random 

# Activity labels that may be inserted, and the (currently unused) bounds for random trace ids.
activities = ["A","B","C","D","X"]
trace_id_lower = 0
trace_id_upper = 9

def insert_event(event=None, trace_id=None):
    """Append one event row to the ``iws_event`` table.

    Args:
        event: Activity label to insert. When omitted (or empty) a label is drawn at
            random from ``activities``.
        trace_id: Numeric suffix of the trace; the stored id is ``trace_id_<trace_id>``.
            When omitted it defaults to 0.

    The row's ``time_stamp`` is set by ``CURRENT_TIMESTAMP()`` at insert time.

    Note:
        Both values are interpolated verbatim into the SQL text, so they must not contain
        single quotes. Pass trusted demo values only.
    """
    if not event:
        event = random.choice(activities)
    # Compare against None explicitly: 0 is a valid trace id and must not be mistaken for
    # "not supplied" (this matters as soon as the random fallback below is re-enabled).
    if trace_id is None:
        trace_id = 0 #random.randint(trace_id_lower, trace_id_upper)
    spark.sql(f"INSERT INTO iws_event SELECT '{event}', CURRENT_TIMESTAMP(), 'trace_id_{trace_id}'")

In [0]:
# Stream the iws_event table with a 1-minute watermark on time_stamp and register the
# stream as the temp view "events".
event_df = (
    spark.readStream
    .table("iws_event")
    .withWatermark("time_stamp", "1 minute")
)
event_df.createOrReplaceTempView("events")

In [0]:
%sql
-- latest_state: for each (trace_id, current_id) keep the single best row of iws_state,
-- ranked by newest event_level, then lowest cost_of_alignment, then longest previous_events.
-- current_event_level is overwritten with the highest event_level seen for the trace.
CREATE OR REPLACE TEMP VIEW latest_state AS
SELECT DISTINCT
  trace_id,
  ts,
  current_node,
  current_id,
  cost_of_alignment,
  previous_events,
  trace,
  execution_sequence,
  event_level,
  max_event_level AS current_event_level,
  current_node_level,
  rn
FROM (
  SELECT
    *,
    row_number() OVER (
      PARTITION BY trace_id, current_id
      ORDER BY event_level DESC, cost_of_alignment ASC, len(previous_events) DESC
    ) rn,
    max(event_level) OVER (PARTITION BY trace_id) AS max_event_level
  FROM iws_state
) ranked
WHERE rn = 1 -- disabled extra filter: and event_level > current_event_level -2

In [0]:
# Streaming alignment step: for every incoming event, extend the known states of its trace
# and append the resulting candidate states to the iws_state table.
#
# Query structure (innermost first):
#   1. "events" is LEFT JOINed to latest_state on trace_id and joined to iws_labels to map the
#      raw event to its label. A trace without any state gets defaults: current_node "-",
#      current_id "root", cost 0, empty strings, and 0 for all three level columns.
#   2. Each row is joined to iws_model on current_id = node_id.
#   3. model_moves: one struct per nth_children entry whose label matches the event. The state
#      moves to that descendant; the cost grows by the number of skipped levels
#      abs(x.level - current_node_level - 1) plus (current_event_level - event_level), the same
#      number of ">>" markers is inserted into trace, and events_between plus the event are
#      appended to execution_sequence.
#   4. exploded_struct: one struct that keeps current_node / current_id unchanged; the cost
#      grows by 1 plus (current_event_level - event_level), the event is appended to trace and
#      ">>" is appended to execution_sequence.
#   5. Both arrays are concatenated and exploded into one output row per candidate state.
#
# Author's notes on the cost and level columns:
#   - If a state comes from an older event, the cost should include the difference in event
#     levels; the number of ">>" skips in between is that same number.
#   - current_event_level: tracks which event the state stems from, in case the state is
#     used again in the future.
#   - event_level: tracks the current event level (how many events there have been).
#   - current_node_level: the level of the node in the model, used in the cost calculation.
#
# NOTE(review): the filter uses `x.label LIKE e.event`, i.e. the event label acts as a LIKE
#   pattern; this equals `=` only while labels contain no % or _ wildcards -- confirm intent.
# NOTE(review): coalesce(trace,"root") appears unable to fall back to "root", because the
#   inner query already coalesces trace to "" -- confirm.
# NOTE(review): e.og_event is selected but not used by the outer queries.
#
# The result is written in append mode as a Delta stream into iws_state, with its checkpoint
# at /tmp/delta/state_append_120/.
spark.sql("""
SELECT trace_id,ts,exploded.*,previous_events FROM(
SELECT trace_id,ts,concat(model_moves,exploded_struct) as sub_exploded,previous_events FROM(
SELECT 
      e.trace_id, 
      e.time_stamp as ts,
      e.og_event,
      concat(e.previous_events,e.event) as previous_events,
      transform(
        filter(m.nth_children, x -> x.label LIKE e.event),
        x -> struct(e.event as current_node, x.node_id as current_id, e.cost_of_alignment + abs(x.level - current_node_level-1) + current_event_level - event_level as cost_of_alignment,  concat(coalesce(trace,"root"),repeat(">>",abs(x.level - current_node_level-1)),e.event) as trace,CONCAT(execution_sequence, coalesce(CONCAT_WS('', x.events_between)),e.event) as execution_sequence, current_event_level+1 as event_level, current_event_level + 1 as current_event_level,x.level as current_node_level)
      ) as model_moves,
      array(struct(e.current_node as current_node, m.node_id as current_id, e.cost_of_alignment + e.current_event_level - e.event_level + 1 as cost_of_alignment, concat(trace, e.event) as trace, concat(execution_sequence, ">>")as execution_sequence, e.current_event_level + 1 as event_level, current_event_level + 1 as current_event_level, current_node_level))
      AS exploded_struct
    FROM 
      (SELECT e.trace_id,e.time_stamp,e.event as og_event, l.label as event, COALESCE(r.current_node, "-") as current_node, COALESCE(r.current_id, "root") as current_id, coalesce(cost_of_alignment, 0) as cost_of_alignment, coalesce(r.previous_events,"") as previous_events , coalesce(trace,"") as trace, coalesce(execution_sequence,"") as execution_sequence, coalesce(event_level, 0) as event_level, coalesce(current_event_level, 0) as current_event_level,coalesce(current_node_level,0) as current_node_level FROM events e LEFT JOIN latest_state r ON e.trace_id = r.trace_id JOIN iws_labels l on e.event = l.event) e 
    JOIN iws_model m ON e.current_id = m.node_id ))
  LATERAL VIEW explode(sub_exploded) t AS exploded
""").writeStream.format("delta").outputMode("append").option("checkpointLocation","/tmp/delta/state_append_120/").toTable("iws_state")

Out[12]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f96b042cac0>

In [0]:
# Replay the demo trace on trace 3: the inserted events are, in order, A X B A C.
# A 10-second pause separates consecutive inserts (none before the first or after the last).
# Still to do: insert different events with different traces.
demo_trace = "AXBAC"
for demo_step, demo_event in enumerate(demo_trace):
    if demo_step > 0:
        sleep(10)
    insert_event(demo_event, 3)

In [0]:
%sql
-- All rows that latest_state currently holds for trace_id_3.
SELECT *
FROM latest_state
WHERE trace_id = "trace_id_3"

trace_id,ts,current_node,current_id,cost_of_alignment,previous_events,trace,execution_sequence,event_level,current_event_level,current_node_level,rn
trace_id_3,2024-06-07T04:12:12.544+0000,-,root,5,AXBAC,AXBAC,>>>>>>>>>>,5,5,0,1
trace_id_3,2024-06-07T04:12:12.544+0000,A,root-A,4,AXBAC,AXBAC,A>>>>>>>>,5,5,1,1
trace_id_3,2024-06-07T04:12:12.544+0000,B,root-A-B,3,AXBAC,AXBAC,A>>B>>>>,5,5,2,1
trace_id_3,2024-06-07T04:12:12.544+0000,C,root-A-B-D-C,3,AXBAC,AXBA>>C,A>>B>>DC,5,5,4,1
trace_id_3,2024-06-07T04:12:12.544+0000,X,root-A-B-X,4,AXBAC,A>>XBAC,ABX>>>>>>,5,5,3,1


In [0]:
%sql
-- Final optimal prefix alignments: the latest_state rows of trace_id_3 whose
-- cost_of_alignment equals the minimum cost found for that trace.
SELECT
  trace_id,
  current_id,
  previous_events,
  trace,
  execution_sequence,
  cost_of_alignment
FROM latest_state
WHERE trace_id = "trace_id_3"
  AND (trace_id, cost_of_alignment) IN (
    SELECT trace_id, MIN(cost_of_alignment)
    FROM latest_state
    GROUP BY trace_id
  );

trace_id,current_id,previous_events,trace,execution_sequence,cost_of_alignment
trace_id_3,root-A-B,AXBAC,AXBAC,A>>B>>>>,3
trace_id_3,root-A-B-D-C,AXBAC,AXBA>>C,A>>B>>DC,3
