In [0]:
from pyspark.sql.functions import col, struct, explode, collect_list,lit,array,split, expr
from pyspark.sql.types import StructType, StructField, StringType, ArrayType,FloatType,DoubleType,IntegerType,TimestampType
from time import sleep
from pyspark.sql.functions import udf
from pyspark.sql import DataFrame,SparkSession
import pyspark.sql.functions as F 
from pyspark.sql.functions import current_timestamp
import datetime

In [0]:
%sql
-- Builds the process-model table `iws_model`: one row per node of a small model tree.
-- Columns, in SELECT order (names come from the aliases of the first SELECT):
--   node_id          path-style identifier of the node, e.g. "root-A-B"
--   label            activity label of the node ("-" for the root)
--   children_labels  labels of the direct children
--   children_id      node ids of the direct children
--   nth_children     descendant nodes as structs (node_id, label, level, events_between),
--                    where events_between lists the labels listed between this node and that descendant
--   level            depth of the node (0 for the root)
CREATE OR REPLACE TABLE iws_model AS
SELECT "root" AS node_id, "-" AS label, array("A") AS children_labels, array("root-A") AS children_id, 
    array(
        struct("root-A" AS node_id, "A" AS label, 1 as level, array() as events_between),
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array("A") as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("A","B") as events_between), 
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("A","B") as events_between)
    ) AS nth_children, 0 as level
UNION ALL 
-- Node A (level 1); the remaining SELECTs rely on positional column matching with the first one.
SELECT "root-A", "A", array("B"), array("root-A-B"), 
    array(
        struct("root-A-B" AS node_id, "B" AS label, 2 as level, array() as events_between),
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array("B") as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("B","D") as events_between)
    ),1
UNION ALL 
-- Node B (level 2) branches into X and D.
SELECT "root-A-B", "B", array("X", "D"), array("root-A-B-X", "root-A-B-D"), 
    array(
        struct("root-A-B-X" AS node_id, "X" AS label, 3 as level, array() as events_between), -- "A-B-C" now "A-B-X"
        struct("root-A-B-D" AS node_id, "D" AS label, 3 as level, array() as events_between),
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array("D") as events_between)
    ),2
UNION ALL 
-- Node X (level 3) is a leaf: no children.
SELECT "root-A-B-X", "X", array(), array(),
    array(),3
UNION ALL 
-- Node D (level 3) continues to C.
SELECT "root-A-B-D", "D", array("C"), array("root-A-B-D-C"), 
    array(
        struct("root-A-B-D-C" AS node_id, "C" AS label, 4 as level, array() as events_between),
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array("C") as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array("C") as events_between)
    ),3
UNION ALL 
-- Node C (level 4) branches into D and C.
SELECT "root-A-B-D-C", "C", array("D", "C"), array("root-A-B-D-C-D", "root-A-B-D-C-C"), 
    array(
        struct("root-A-B-D-C-D" AS node_id, "D" AS label, 5 as level, array() as events_between),
        struct("root-A-B-D-C-C" AS node_id, "C" AS label, 5 as level, array() as events_between)
    ),4
UNION ALL 
-- Level-5 leaves.
SELECT "root-A-B-D-C-D", "D", array(), array(), array(),5
UNION ALL 
SELECT "root-A-B-D-C-C", "C", array(), array(), array(),5;




num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Source table for the stream: one row per observed event (event name, event time, trace id).
-- It is created empty; rows are added by INSERT statements elsewhere in the notebook.
CREATE OR REPLACE TABLE iws_event 
(event STRING,time_stamp TIMESTAMP,trace_id STRING);
---SELECT "A" event, CURRENT_TIMESTAMP() time_stamp, "trace_id_0" trace_id;


-- Same columns as iws_event plus a `processed` flag column.
-- NOTE(review): not referenced by the other cells shown in this notebook - confirm it is still needed.
CREATE OR REPLACE TABLE iws_event_state
(event STRING, time_stamp TIMESTAMP, trace_id STRING,processed STRING);
-- Lookup table mapping a raw event name to its model label (identity mapping for A, B, C, D, X).
CREATE OR REPLACE TABLE iws_labels
SELECT "A" AS event, "A"  AS label
UNION ALL
SELECT "B","B" 
UNION ALL
SELECT "C","C" 
UNION ALL
SELECT "D","D" 
UNION ALL
SELECT "X","X" ;

-- Per-trace alignment state table.
-- NOTE(review): not referenced by the other cells shown in this notebook - confirm it is still needed.
CREATE OR REPLACE TABLE iws_state
(trace_id STRING, ts TIMESTAMP, current_node STRING,current_id STRING,cost_of_alignment INTEGER,previous_events STRING, trace STRING, execution_sequence STRING,event_level
INTEGER,current_node_level INTEGER);
-- event_level is stored so that the latest alignments can be selected later

In [0]:
import random

# Activity labels a random event is drawn from.
activities = ["A","B","C","D","X"]
# Bounds for randomly generated trace ids (only used by the disabled randint fallback below).
trace_id_lower = 0
trace_id_upper = 9


def insert_event(event=None, trace_id=None):
    """Insert one event row, stamped with the current time, into `iws_event`.

    Args:
        event: Activity label to insert. Any falsy value (None, "") is replaced
            by a random pick from `activities`.
        trace_id: Suffix of the trace identifier; the stored value is
            ``trace_id_<trace_id>``. Any falsy value falls back to 0.

    NOTE(review): the fallback is truthiness-based, so an explicit ``0`` is
    indistinguishable from "not supplied". That is harmless while the fallback
    itself is 0, but matters if the random fallback is re-enabled.
    """
    chosen_event = event or random.choice(activities)
    chosen_trace = trace_id or 0  # random.randint(trace_id_lower, trace_id_upper)
    spark.sql(
        f"INSERT INTO iws_event SELECT '{chosen_event}', CURRENT_TIMESTAMP(), 'trace_id_{chosen_trace}'"
    )

In [0]:
# Stream rows from the `iws_event` table, using `time_stamp` as event time with a
# 1-minute watermark, and expose the stream to SQL as the temp view `events`.
event_df = (
    spark.readStream
    .table("iws_event")
    .withWatermark("time_stamp", "1 minute")
)
event_df.createOrReplaceTempView("events")

In [0]:
%scala
import scala.collection.mutable.ArrayBuffer

/**
 * Aligns a sequence of observed events against a path of model labels.
 *
 * The model path is given as a string of single-character labels, optionally
 * separated by '-' (e.g. "-A-B"); the separators are stripped before aligning.
 * The alignment minimises the number of non-synchronous moves, where only
 * insertions and deletions are allowed (no substitutions).
 *
 * @param modelEvents model labels as a string, with or without '-' separators
 * @param eventArray  observed event labels, in order
 * @return one tuple per move, in alignment order: (label, moveType, cost, position) where
 *         - moveType is "sync" (event matches model, cost 0), "log" (event only, cost 1)
 *           or "model" (model label only, cost 1);
 *         - position is the 1-based index into `eventArray` for "sync"/"log" moves and
 *           the 1-based index into the model path for "model" moves.
 */
def calculateAlignmentCost(modelEvents: String, eventArray: Array[String]): Array[(String, String, Int, Int)] = {
  val modelPath = modelEvents.replace("-", "")

  if (modelPath.isEmpty) {
    // Nothing to align against: every observed event is a log move.
    // The check is done on the stripped path because "".split("") yields Array(""),
    // which would otherwise show up as a phantom model move with an empty label.
    eventArray.zipWithIndex.map { case (event, index) => (event, "log", 1, index + 1) }
  } else {
    val newEvents = modelPath.split("")
    val n = eventArray.length
    val m = newEvents.length

    // dp(i)(j) = minimal cost of aligning the first i events with the first j model labels.
    val dp = Array.tabulate(n + 1, m + 1)((i, j) => if (i == 0) j else if (j == 0) i else 0)

    for (i <- 1 to n; j <- 1 to m) {
      dp(i)(j) =
        if (eventArray(i - 1) == newEvents(j - 1)) dp(i - 1)(j - 1)
        else math.min(dp(i - 1)(j), dp(i)(j - 1)) + 1
    }

    // Walk back from the bottom-right corner; prepending to an immutable List is O(1).
    var alignment = List.empty[(String, String, Int, Int)]
    var i = n
    var j = m

    while (i > 0 && j > 0) {
      if (eventArray(i - 1) == newEvents(j - 1)) {
        alignment = (eventArray(i - 1), "sync", 0, i) :: alignment
        i -= 1
        j -= 1
      } else if (dp(i)(j) == dp(i - 1)(j) + 1) {
        alignment = (eventArray(i - 1), "log", 1, i) :: alignment
        i -= 1
      } else {
        alignment = (newEvents(j - 1), "model", 1, j) :: alignment
        j -= 1
      }
    }

    // Whatever is left over at the start of either sequence has no counterpart.
    while (i > 0) {
      alignment = (eventArray(i - 1), "log", 1, i) :: alignment
      i -= 1
    }
    while (j > 0) {
      alignment = (newEvents(j - 1), "model", 1, j) :: alignment
      j -= 1
    }

    alignment.toArray
  }
}

// Make the aligner callable from Spark SQL as calculateAlignmentCost(modelEvents, eventArray).
spark.udf.register("calculateAlignmentCost", calculateAlignmentCost _)


In [0]:
%sql
-- Append-only state table with one row per (trace, candidate model node) alignment.
-- `alignment` holds the moves as (event, move_type) structs and `batch_id` records the
-- micro-batch that produced the row.
CREATE OR REPLACE TABLE state_test_batch_dummy
(trace_id STRING, ts TIMESTAMP,current_id STRING,previous_events STRING,event_level INTEGER,current_node_level INTEGER,label STRING,alignment ARRAY<STRUCT<event String,move_type String>>,cost_of_alignment INTEGER,event_array ARRAY<STRING>,event_index INTEGER,batch_id INTEGER);

In [0]:
%sql
-- Latest/best state per (trace_id, current_id) from state_test_batch_dummy.
-- For every trace and model node the rows are ranked by highest event_level, then
-- lowest cost_of_alignment, then longest previous_events; only the top row is kept.
-- current_event_level is the highest event_level seen for the whole trace.
CREATE OR REPLACE TEMP VIEW stream_test_alignm_batch_dummy AS
WITH ranked_states AS (
    SELECT
        *,
        MAX(event_level) OVER (PARTITION BY trace_id) AS max_event_level,
        ROW_NUMBER() OVER (
            PARTITION BY trace_id, current_id
            ORDER BY event_level DESC, cost_of_alignment ASC, LEN(previous_events) DESC
        ) AS rn
    FROM state_test_batch_dummy
)
SELECT DISTINCT
    trace_id,
    current_id,
    previous_events,
    alignment,
    cost_of_alignment,
    event_level,
    max_event_level AS current_event_level,
    current_node_level,
    rn,
    event_array,
    event_index,
    batch_id
FROM ranked_states
WHERE rn = 1

In [0]:
def process_batch(df: DataFrame, batch_id: int) -> None:
    """Extend the stored alignments with the events of one streaming micro-batch.

    Intended as a ``foreachBatch`` callback. For a non-empty batch it:

    1. registers the batch as the temp view ``streaming_data``;
    2. groups the batch per ``trace_id`` into a time-ordered array of labels
       (events are mapped to labels through ``iws_labels``);
    3. left-joins the current state per trace from ``stream_test_alignm_batch_dummy``
       (a trace without state starts at ``root`` with cost 0);
    4. joins candidate nodes from ``iws_model`` and calls the
       ``calculateAlignmentCost`` SQL UDF on the model-path suffix below the
       previous node and the batch's events;
    5. appends the resulting rows, tagged with ``batch_id``, to the Delta table
       ``state_test_batch_dummy``.

    Args:
        df: The micro-batch; the SQL reads its ``event``, ``time_stamp`` and
            ``trace_id`` columns.
        batch_id: Identifier of the micro-batch, stored in the ``batch_id`` column.

    Empty batches are skipped without touching the state table.
    """

    if not df.isEmpty():
        df.createOrReplaceTempView("streaming_data")

        # Run on the batch's own session so the temp view registered above is visible.
        result_df = df.sparkSession.sql("""
            WITH FIRST_BD AS (
            SELECT *,substr(node_id FROM len(previous_id) + 1) model_sub, CASE WHEN len(substr(node_id FROM len(previous_id) + 1)) = 0 THEN 0 ELSE _event_index END as event_index  FROM (
            SELECT
                e.trace_id AS trace_id,
                idx AS _event_index,
                col.time_stamp AS time_stamp,
                col.label AS incoming_label,
                e.event_array.label AS event_array,
                size(e.event_array) AS len,
                COALESCE(r.current_id, 'root') AS previous_id,
                COALESCE(r.cost_of_alignment, 0) AS cost_of_alignment,
                COALESCE(r.previous_events, '') AS previous_events,
                COALESCE(r.alignment, ARRAY(struct("" AS event, "" AS move_type))) AS previous_alignment,
                COALESCE(r.event_level, 0) AS event_level,
                COALESCE(r.current_event_level, 0) AS current_event_level,
                COALESCE(r.current_node_level, 0) AS current_node_level
            FROM (
                SELECT 
                    trace_id,
                    array_sort(collect_list(struct(time_stamp, label))) AS event_array
                FROM 
                    streaming_data e 
                JOIN 
                    iws_labels l ON e.event = l.event
                GROUP BY trace_id
            ) e
            LEFT JOIN stream_test_alignm_batch_dummy r ON e.trace_id = r.trace_id 
            LATERAL VIEW posexplode(e.event_array) AS idx, col
        ) f
        JOIN iws_model m ON (m.node_id LIKE CONCAT(f.previous_id, '%') 
            AND m.level < f.current_node_level  + _event_index + 3 + 2
            AND m.label = f.incoming_label) 
            OR (m.node_id = f.previous_id AND len = _event_index + 1)),
MaxEventIndexPerTrace AS (
    SELECT
        trace_id as _trace_id,
        MAX(event_index) AS max_event_index
    FROM FIRST_BD
    GROUP BY trace_id
),
BASE_DATA AS (SELECT * FROM FIRST_BD LEFT JOIN MaxEventIndexPerTrace on FIRST_BD.trace_id = MaxEventIndexPerTrace._trace_id WHERE event_index >= greatest(max_event_index-4,0))

SELECT 
      trace_id,
                time_stamp as ts,
                node_id as current_id,
                concat(previous_events,concat_ws("",event_array)) as previous_events,
                current_event_level + len as event_level,
                level as current_node_level,
                label,
                CONCAT(
                    previous_alignment,
                    TRANSFORM(
                        calc_alignment,
                            x -> named_struct('event', x._1, 'move_type', x._2)
                        )
                 ) AS alignment,
                cost_of_alignment + aggregate(calc_alignment._3, 0, (acc, x) -> acc + x) AS cost_of_alignment,
                event_array,
                event_index
    FROM (
        SELECT *,calculateAlignmentCost(substr(node_id FROM len(previous_id) + 1),event_array) as calc_alignment 
        FROM BASE_DATA
            ) 
        """)
        result_df = result_df.withColumn("batch_id",F.lit(batch_id))
        # Append the new alignment rows to the Delta state table.
        # No checkpointLocation here: that is a Structured Streaming option configured
        # on writeStream; a batch DataFrameWriter does not use it.
        result_df.write.format("delta").mode("append").saveAsTable("state_test_batch_dummy")

# Start the streaming query: every micro-batch of `event_df` is handed to process_batch.
# NOTE(review): no checkpointLocation is configured on this stream - confirm that
# restarting from scratch on every run is intended.
query = (
    event_df.writeStream
    .foreachBatch(process_batch)
    .start()
)
#query.awaitTermination()

In [0]:
insert_event("C",0)
# Call insert_event with other events and trace ids to feed more data into the stream.
# Events inserted so far for trace 0, in order: A, X, B, A, C.

In [0]:
%sql
-- Inspect the current best state per (trace, model node).
SELECT *
FROM stream_test_alignm_batch_dummy

trace_id,current_id,previous_events,alignment,cost_of_alignment,event_level,current_event_level,current_node_level,rn,event_array,event_index,batch_id
trace_id_0,root,AXBAC,"List(List(, ), List(A, log), List(X, log), List(B, log), List(A, log), List(C, log))",5,5,5,0,1,List(C),0,4
trace_id_0,root-A,AXBAC,"List(List(, ), List(A, log), List(X, log), List(B, log), List(A, sync), List(C, log))",4,5,5,1,1,List(C),0,4
trace_id_0,root-A-B,AXBAC,"List(List(, ), List(A, sync), List(X, log), List(B, sync), List(A, log), List(C, log))",3,5,5,2,1,List(C),0,4
trace_id_0,root-A-B-D-C,AXBAC,"List(List(, ), List(A, sync), List(X, log), List(B, sync), List(A, log), List(D, model), List(C, sync))",3,5,5,4,1,List(C),0,4
trace_id_0,root-A-B-D-C-C,AXBAC,"List(List(, ), List(A, sync), List(X, log), List(B, sync), List(A, log), List(D, model), List(C, model), List(C, sync))",4,5,5,5,1,List(C),0,4
trace_id_0,root-A-B-X,AXBAC,"List(List(, ), List(A, sync), List(B, model), List(X, sync), List(B, log), List(A, log), List(C, log))",4,5,5,3,1,List(C),0,4


In [0]:
%sql
-- Final optimal prefix alignments: for each trace keep the rows whose cost_of_alignment
-- equals the trace's minimum (ties are all returned).
-- slice(alignment, 2, ...) skips the first array element, the empty ("", "") placeholder
-- that a trace's alignment starts with.
SELECT trace_id,current_id,previous_events,slice(alignment, 2, size(alignment)),cost_of_alignment
FROM stream_test_alignm_batch_dummy
WHERE (trace_id, cost_of_alignment) IN (
    SELECT trace_id, MIN(cost_of_alignment)
    FROM stream_test_alignm_batch_dummy
    GROUP BY trace_id
);
-- selects the final optimal prefix alignments

trace_id,current_id,previous_events,"slice(alignment, 2, size(alignment))",cost_of_alignment
trace_id_0,root-A-B,AXBAC,"List(List(A, sync), List(X, log), List(B, sync), List(A, log), List(C, log))",3
trace_id_0,root-A-B-D-C,AXBAC,"List(List(A, sync), List(X, log), List(B, sync), List(A, log), List(D, model), List(C, sync))",3
