In [0]:
from pyspark.sql.functions import col, struct, explode, collect_list,lit,array,split, expr
from pyspark.sql.types import StructType, StructField, StringType, ArrayType,FloatType,DoubleType,IntegerType,TimestampType
from time import sleep
from pyspark.sql.functions import udf
from pyspark.sql import DataFrame,SparkSession
import pyspark.sql.functions as F 
from pyspark.sql.functions import current_timestamp
import datetime

In [0]:
%sql
-- iws_model: static description of the process-model tree used for alignment.
-- One row per node. Columns, in the order given by the first SELECT:
--   node_id         dash-joined path from the root, e.g. "root-A-B"
--   label           activity label of the node ("-" for the root)
--   children_labels labels of the direct children
--   children_id     node ids of the direct children
--   nth_children    array of descendant structs (node_id, label, level, events_between),
--                   where events_between holds the labels on the path between this
--                   node and the descendant
--   level           depth of the node (root = 0)
-- The streaming alignment query visible in this notebook reads only node_id, label and level.
-- NOTE(review): root and root-A list descendants up to three levels below themselves,
-- while root-A-B lists only two levels (no level 5 entries) - confirm this is intended.
CREATE OR REPLACE TABLE iws_model AS
SELECT "root" AS node_id, "-" AS label, array("A") AS children_labels, array("root-A") AS children_id, 
    array(
        struct("root-A" AS node_id, "A" AS label, 1 as level, array() as events_between),
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array("A") as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("A","B") as events_between), 
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("A","B") as events_between)
    ) AS nth_children, 0 as level
UNION ALL 
SELECT "root-A", "A", array("B"), array("root-A-B"), 
    array(
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array() as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("B","D") as events_between)
    ),1
UNION ALL 
SELECT "root-A-B", "B", array("X", "D"), array("root-A-B-X", "root-A-B-D"), 
    array(
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array() as events_between), -- this child was previously "A-B-C" and is now "A-B-X"
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array() as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("D") as events_between)
    ),2
UNION ALL 
-- Leaf node: no children and no descendants.
SELECT "root-A-B-X", "X", array(), array(),
    array(),3
UNION ALL 
SELECT "root-A-B-D", "D", array("C"), array("root-A-B-D-C"), 
    array(
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array() as events_between),
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array("C") as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array("C") as events_between)
    ),3
UNION ALL 
SELECT "root-A-B-D-C", "C", array("D", "C"), array("root-A-B-D-C-D", "root-A-B-D-C-C"), 
    array(
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array() as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array() as events_between)
    ),4
UNION ALL 
-- Leaf nodes at level 5.
SELECT "root-A-B-D-C-D", "D", array(), array(), array(),5
UNION ALL 
SELECT "root-A-B-D-C-C", "C", array(), array(), array(),5;




num_affected_rows,num_inserted_rows


In [0]:
%sql
-- iws_event: raw event log (event name, event time, trace id). In this notebook it is
-- filled by the Python helper insert_event and read back as a stream.
CREATE OR REPLACE TABLE iws_event 
(event STRING,time_stamp TIMESTAMP,trace_id STRING);
-- Disabled example of seeding the table with one row, kept for reference:
---SELECT "A" event, CURRENT_TIMESTAMP() time_stamp, "trace_id_0" trace_id;


-- iws_event_state: same columns as iws_event plus a processed flag.
-- NOTE(review): not referenced by any other cell visible in this notebook - confirm it is still needed.
CREATE OR REPLACE TABLE iws_event_state
(event STRING, time_stamp TIMESTAMP, trace_id STRING,processed STRING);
-- iws_labels: lookup from raw event name to model label (identity mapping for A, B, C, D, X).
-- The streaming query joins incoming events against it.
CREATE OR REPLACE TABLE iws_labels
SELECT "A" AS event, "A"  AS label
UNION ALL
SELECT "B","B" 
UNION ALL
SELECT "C","C" 
UNION ALL
SELECT "D","D" 
UNION ALL
SELECT "X","X" ;

-- iws_state: per-trace alignment state.
-- NOTE(review): not referenced by any other cell visible in this notebook - the streaming
-- query writes to state_test_session instead. Confirm whether this table is obsolete.
CREATE OR REPLACE TABLE iws_state
(trace_id STRING, ts TIMESTAMP, current_node STRING,current_id STRING,cost_of_alignment INTEGER,previous_events STRING, trace STRING, execution_sequence STRING,event_level
INTEGER,current_node_level INTEGER);
-- event_level is stored so that the latest alignments can be filtered out later.

In [0]:
import random

# Labels the generator may pick from when no explicit event is supplied.
activities = ["A", "B", "C", "D", "X"]
# Bounds for randomly generated trace ids (only used by the disabled randint call below).
trace_id_lower = 0
trace_id_upper = 9


def insert_event(event=None, trace_id=None):
    """Append one event row to the ``iws_event`` table.

    Args:
        event: Activity label to insert. Any falsy value (``None``, ``""``)
            is replaced by a random pick from ``activities``.
        trace_id: Suffix used to build the ``trace_id_<n>`` identifier. Any
            falsy value (``None``, ``0``) resolves to ``0``.

    The row is stamped with ``CURRENT_TIMESTAMP()`` by Spark at insert time.
    """
    chosen_event = event if event else random.choice(activities)
    # Random trace ids are disabled for now: random.randint(trace_id_lower, trace_id_upper)
    chosen_trace = trace_id if trace_id else 0
    statement = f"INSERT INTO iws_event SELECT '{chosen_event}', CURRENT_TIMESTAMP(), 'trace_id_{chosen_trace}'"
    spark.sql(statement)

In [0]:
# Read iws_event as a stream, declare a zero-delay watermark on the event time,
# and expose the result to SQL cells under the name `events_session`.
event_df_session = (
    spark.readStream
    .table("iws_event")
    .withWatermark("time_stamp", "0 seconds")
)
event_df_session.createOrReplaceTempView("events_session")

In [0]:
%scala
import scala.collection.mutable.ArrayBuffer

/**
 * Aligns a sequence of observed events against a path of model events using an
 * insert/delete-only edit distance (no substitutions).
 *
 * @param modelEvents dash-separated model labels, e.g. "-A-B". Dashes are stripped and
 *                    every remaining character is treated as one model label, so labels
 *                    must be single characters. Null, empty and dash-only strings all
 *                    mean "no model events".
 * @param eventArray  observed event labels in trace order. Null is treated as empty.
 * @return one tuple per alignment move, in order: (label, moveType, cost, index), where
 *         moveType is "sync" (cost 0), "log" (cost 1, event only in the trace) or
 *         "model" (cost 1, event only in the model). index is the 1-based position in
 *         the trace for "sync"/"log" moves and in the model for "model" moves.
 */
def calculateAlignmentCost(modelEvents: String, eventArray: Array[String]): Array[(String, String, Int, Int)] = {
  // Spark hands SQL NULLs to the UDF as null references; treat them as empty inputs.
  val traceEvents: Array[String] = if (eventArray == null) Array.empty[String] else eventArray
  val strippedModel: String = if (modelEvents == null) "" else modelEvents.replace("-", "")
  // "".split("") yields Array("") on the JVM, which would otherwise show up as a phantom
  // model move with an empty label, so the empty case is handled explicitly.
  val newEvents: Array[String] =
    if (strippedModel.isEmpty) Array.empty[String] else strippedModel.split("")

  val n = traceEvents.length
  val m = newEvents.length

  // dp(i)(j) = minimal number of log/model moves needed to align the first i trace
  // events with the first j model events. Row 0 and column 0 are the trivial cases.
  val dp = Array.tabulate(n + 1, m + 1)((i, j) => if (i == 0) j else if (j == 0) i else 0)
  for (i <- 1 to n; j <- 1 to m) {
    dp(i)(j) =
      if (traceEvents(i - 1) == newEvents(j - 1)) dp(i - 1)(j - 1)
      else math.min(dp(i - 1)(j), dp(i)(j - 1)) + 1
  }

  // Walk back from the bottom-right corner. Moves are appended in reverse order and the
  // buffer is flipped once at the end, which avoids repeated front insertions.
  val reversedMoves = ArrayBuffer[(String, String, Int, Int)]()
  var i = n
  var j = m
  while (i > 0 && j > 0) {
    if (traceEvents(i - 1) == newEvents(j - 1)) {
      reversedMoves += ((traceEvents(i - 1), "sync", 0, i))
      i -= 1
      j -= 1
    } else if (dp(i)(j) == dp(i - 1)(j) + 1) {
      reversedMoves += ((traceEvents(i - 1), "log", 1, i))
      i -= 1
    } else {
      reversedMoves += ((newEvents(j - 1), "model", 1, j))
      j -= 1
    }
  }
  // At most one of the two loops below runs: whatever is left over at the start of the
  // trace becomes log moves, whatever is left over at the start of the model becomes
  // model moves. With an empty model this yields one log move per trace event.
  while (i > 0) {
    reversedMoves += ((traceEvents(i - 1), "log", 1, i))
    i -= 1
  }
  while (j > 0) {
    reversedMoves += ((newEvents(j - 1), "model", 1, j))
    j -= 1
  }

  reversedMoves.reverse.toArray
}

// Expose calculateAlignmentCost to Spark SQL under the same name, so SQL cells and
// spark.sql(...) strings can call it as calculateAlignmentCost(modelEvents, eventArray).
spark.udf.register("calculateAlignmentCost", calculateAlignmentCost _)


In [0]:
%sql
-- state_test_session: sink table that the streaming alignment query appends to.
-- Each row is one candidate alignment of a trace against one model node (node_id):
--   ts                  struct of the event timestamps in the session and the processing time
--   time_window         session window the events were grouped into
--   current_id          node the trace was at before this session was processed
--   previous_events     concatenated labels of all events seen so far for the trace
--   labels / len        labels of the events in this session and their count
--   alignment           accumulated list of (event, move_type) moves
--   cost_of_alignment   accumulated alignment cost
-- The column order and types have to match the SELECT list of the streaming query that writes here.
CREATE OR REPLACE TABLE state_test_session
(trace_id STRING, ts STRUCT<ts1 ARRAY<TIMESTAMP>,cur_ts TIMESTAMP>,time_window STRUCT<start Timestamp,end Timestamp>,current_id STRING,previous_events STRING,event_level INTEGER,current_event_level INTEGER,current_node_level INTEGER,labels ARRAY<STRING>,len INTEGER,node_id STRING,label STRING,alignment ARRAY<STRUCT<event String,move_type String>>,cost_of_alignment INTEGER);

In [0]:
%sql
-- Current alignment state per (trace_id, node_id): of all rows written to
-- state_test_session, keep the one with the highest event_level, then the lowest
-- cost_of_alignment, then the longest previous_events. The node_id of that row is
-- exposed as current_id so the streaming query can continue from it.
CREATE OR REPLACE TEMP VIEW stream_test_alignm_session AS
WITH ranked_states AS (
    SELECT
        *,
        MAX(current_event_level) OVER (PARTITION BY trace_id) AS max_event_level,
        ROW_NUMBER() OVER (
            PARTITION BY trace_id, node_id
            ORDER BY event_level DESC, cost_of_alignment ASC, LEN(previous_events) DESC
        ) AS rn
    FROM state_test_session
)
SELECT DISTINCT
    trace_id,
    node_id AS current_id,
    previous_events,
    alignment,
    cost_of_alignment,
    event_level,
    current_event_level,
    current_node_level,
    rn
FROM ranked_states
WHERE rn = 1

In [0]:
# Streaming alignment job. Reading the SQL from the innermost query outwards:
#   1. Join the streaming view `events_session` with `iws_labels` and group the events by
#      trace_id and a '1 minute' session_window, collecting the (time_stamp, label) pairs
#      of each session into a sorted array `event_arr`.
#   2. LEFT JOIN the current state per trace from `stream_test_alignm_session`. Traces
#      without state start at node 'root' with cost 0, levels 0, empty previous_events
#      and an alignment that holds a single empty (event, move_type) placeholder struct.
#   3. posexplode `event_arr`, giving one row per event of the session (event_index, label).
#   4. Join `iws_model` to find candidate target nodes: either a node whose id starts with
#      the current id, whose label equals the event label and whose level is below
#      current_node_level + event_index + 3 + 2, or the current node itself (which
#      results in log moves only).
#      NOTE(review): the `+ 3 + 2` look-ahead bound is unexplained - confirm and name it.
#   5. Call the calculateAlignmentCost UDF with the part of the candidate node_id that
#      follows current_id and the session's labels, append the resulting moves to the
#      stored alignment and add their summed cost to the stored cost.
#      event_level and current_event_level are both set to current_event_level + len.
# The result is appended to the Delta table `state_test_session`.
# NOTE(review): the checkpoint location is a hard-coded /tmp path - consider moving it
# to a configuration cell.
spark.sql("""SELECT trace_id , ts ,time_window,current_id,previous_events ,event_level ,current_event_level ,current_node_level ,labels,len ,node_id ,label ,concat(align,transform(
    alignm, 
    x -> named_struct('event', x._1, 'move_type', x._2)
  )) as alignment, cost_+aggregate(alignm._3, 0, (acc, x) -> acc + x) as cost_of_alignment 
  FROM(
SELECT trace_id,STRUCT(ts as ts1,current_timestamp() as cur_ts) as ts,time_window ,current_id ,cost_of_alignment as cost_ ,concat(previous_events,concat_ws("",labels)) AS previous_events, calculateAlignmentCost(substr(node_id FROM len(current_id) + 1),labels) as alignm,alignment as align ,current_event_level + len as event_level ,current_event_level + len as current_event_level ,level as current_node_level ,labels ,len ,node_id ,m.label
FROM (
    SELECT trace_id,
           ts,
           event_index,
           label,
           time_window,
           current_id,
           cost_of_alignment,
           previous_events,
           alignment,
           event_level,
           current_event_level,
           current_node_level,
           labels,
           size(labels) as len
    FROM (
        SELECT e.trace_id,
               idx AS event_index, 
               time_window,
               col.label as label, 
               e.event_arr.label as labels,
               e.event_arr.time_stamp as ts,
               COALESCE(r.current_id, 'root') AS current_id, 
               COALESCE(cost_of_alignment, 0) AS cost_of_alignment, 
               COALESCE(r.previous_events, '') AS previous_events, 
               COALESCE(r.alignment,array(struct("" as event, "" as move_type)))  as alignment,
               COALESCE(event_level, 0) AS event_level, 
               COALESCE(current_event_level, 0) AS current_event_level, 
               COALESCE(current_node_level, 0) AS current_node_level
        FROM (
            SELECT trace_id,
                   array_sort(collect_list(struct(time_stamp, label))) AS event_arr, session_window(time_stamp, '1 minute') AS time_window
            FROM events_session e 
            JOIN iws_labels l ON e.event = l.event
            GROUP BY trace_id, time_window
        ) e 
        LEFT JOIN stream_test_alignm_session r ON e.trace_id = r.trace_id
        LATERAL VIEW posexplode(event_arr)  AS idx, col
    ) q
) f
JOIN iws_model m ON (m.node_id LIKE CONCAT(f.current_id, '%')
                 AND m.level < f.current_node_level + f.event_index + 3 + 2
AND m.label = f.label) or m.node_id = f.current_id) -- no match then log moves
""").writeStream.format("delta").outputMode("append").option("checkpointLocation","/tmp/delta/state_append_30029/").toTable("state_test_session")

Out[8]: <pyspark.sql.streaming.query.StreamingQuery at 0x7fcab05bd4f0>

In [0]:
# Replay the demo trace for trace_id_1 in two bursts: A, X, B and then A, C, so the
# events inserted so far are AXBAC. Other traces can be simulated by passing a
# different trace id to insert_event.
for _label in ("A", "X", "B"):
    insert_event(_label, 1)

# Wait 70 seconds, which is longer than the '1 minute' session_window gap used by the
# streaming query, before sending the second burst.
sleep(70)

for _label in ("A", "C"):
    insert_event(_label, 1)

In [0]:
# One more event is required to close the open session window and make the stream
# produce output for the events inserted above.
# NOTE(review): presumably this works because the stream runs in append mode with a
# "0 seconds" watermark, so a window is only emitted once a later event moves the
# watermark past the window end - confirm against the Structured Streaming docs.
insert_event("A",1) # this event itself stays in a new, still-open window

In [0]:
%sql
-- Show the current alignment state of every model node reached by trace_id_1.
SELECT *
FROM stream_test_alignm_session
WHERE trace_id = "trace_id_1"

trace_id,current_id,previous_events,alignment,cost_of_alignment,event_level,current_event_level,current_node_level,rn
trace_id_1,root,AXBAC,"List(List(, ), List(A, log), List(X, log), List(B, log), List(A, log), List(C, log))",5,5,5,0,1
trace_id_1,root-A,AXBAC,"List(List(, ), List(A, sync), List(X, log), List(B, log), List(A, log), List(C, log))",4,5,5,1,1
trace_id_1,root-A-B,AXBAC,"List(List(, ), List(A, sync), List(X, log), List(B, sync), List(A, log), List(C, log))",3,5,5,2,1
trace_id_1,root-A-B-D-C,AXBAC,"List(List(, ), List(A, sync), List(X, log), List(B, sync), List(D, model), List(A, log), List(C, sync))",3,5,5,4,1
trace_id_1,root-A-B-D-C-C,AXBAC,"List(List(, ), List(A, sync), List(X, log), List(B, sync), List(D, model), List(C, model), List(A, log), List(C, sync))",4,5,5,5,1
trace_id_1,root-A-B-X,AXBAC,"List(List(, ), List(A, sync), List(B, model), List(X, sync), List(B, log), List(A, log), List(C, log))",4,5,5,3,1


In [0]:
%sql
-- Select the final optimal prefix alignments: for trace_id_1, keep the state rows whose
-- cost equals the minimum cost of that trace. slice(..., 2, ...) drops the first
-- alignment element, which is the empty placeholder struct added for new traces.
SELECT
    trace_id,
    current_id,
    previous_events,
    slice(alignment, 2, size(alignment)),
    cost_of_alignment
FROM stream_test_alignm_session
WHERE trace_id = "trace_id_1"
  AND (trace_id, cost_of_alignment) IN (
      SELECT trace_id, MIN(cost_of_alignment)
      FROM stream_test_alignm_session
      GROUP BY trace_id
  );

trace_id,current_id,previous_events,"slice(alignment, 2, size(alignment))",cost_of_alignment
trace_id_1,root-A-B,AXBAC,"List(List(A, sync), List(X, log), List(B, sync), List(A, log), List(C, log))",3
trace_id_1,root-A-B-D-C,AXBAC,"List(List(A, sync), List(X, log), List(B, sync), List(D, model), List(A, log), List(C, sync))",3
