In [27]:
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Start (or reuse) a Spark session that has the flux-capacitor jar on `spark.jars`.
spark = (
    SparkSession.builder
    .config("spark.jars", "target/flux-capacitor-1.jar")
    .getOrCreate()
)

# Handle on the JVM-side `cccs.fluxcapacitor.FluxCapacitor.invoke`, reached through
# the session's JVM gateway; the example cells call it with Java DataFrames.
flux_stateful_function = spark._sc._jvm.cccs.fluxcapacitor.FluxCapacitor.invoke

# Passed as the third argument of every `invoke` call in this notebook.
# NOTE(review): presumably the bloom filter capacity -- confirm against the JVM API.
bloom_capacity = 200_000

def print_df(df, rule_name="rule1"):
    """Show `df` with an extra column listing the tags of one rule that are true.

    The `host_id` column is dropped, a `human_readable` column is added that
    holds the keys of `sigma[rule_name]` whose value is true, and that column
    is placed immediately before the last column so it sits next to `sigma`
    in the tables produced by this notebook.

    Args:
        df: PySpark DataFrame with a `sigma` column of type
            map<string, map<string, boolean>>.
        rule_name: Key of `sigma` to summarise. Defaults to "rule1", the rule
            name used by every example in this notebook.

    Returns:
        None. The table is printed with `show(truncate=False)`.
    """
    df = df.drop("host_id")

    # Item lookup is what attribute access on a Column resolves to, so the
    # default reproduces the former `F.col("sigma").rule1` exactly.
    rule_tags = F.col("sigma")[rule_name]
    true_tags = F.map_keys(F.map_filter(rule_tags, lambda _tag, is_set: is_set))
    df = df.withColumn("human_readable", true_tags)

    # `withColumn` leaves `human_readable` in place if it already exists and
    # appends it otherwise; either way, move it to the second-to-last position.
    columns = df.columns
    columns.remove("human_readable")
    columns.insert(len(columns) - 1, "human_readable")

    df.select(columns).show(truncate=False)

In [28]:
# Rule under test: `rule1` uses the `temporal` action over the tags f1, f2 and f3.
# This spec does not set an `ordered` key.
spec = """
    rules:
        - rulename: rule1
          action: temporal
          tags:
            - name: f1
            - name: f2
            - name: f3
    """

# Six events for host1, one second apart. f2, f3 and f1 are each set to true on
# a separate event (in that order); the events in between carry all three tags
# as false.
df_input = spark.sql("""
    select
        *
    from
    values
    (TIMESTAMP '2022-12-30 00:00:01', 'host1', map('rule1', map('f2', true))),
    (TIMESTAMP '2022-12-30 00:00:02', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false))),
    (TIMESTAMP '2022-12-30 00:00:03', 'host1', map('rule1', map('f3', true))),
    (TIMESTAMP '2022-12-30 00:00:04', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false))),
    (TIMESTAMP '2022-12-30 00:00:05', 'host1', map('rule1', map('f1', true))),
    (TIMESTAMP '2022-12-30 00:00:06', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false)))
    t(timestamp, host_id, sigma)
    """)

# Show the input, run it through the JVM `FluxCapacitor.invoke`, then show the result.
# NOTE(review): "host_id" is presumably the grouping key and the trailing `True`
# a flag of the JVM API -- neither meaning is visible in this notebook; confirm.
print_df(df_input)
jdf = flux_stateful_function(df_input._jdf, "host_id", bloom_capacity, spec, True)
# Wrap the JVM-side result in a PySpark DataFrame so `print_df` can display it.
df_output = DataFrame(jdf, spark)
print_df(df_output)

+-------------------+--------------+--------------------------------------------------+
|timestamp          |human_readable|sigma                                             |
+-------------------+--------------+--------------------------------------------------+
|2022-12-30 00:00:01|[f2]          |{rule1 -> {f2 -> true}}                           |
|2022-12-30 00:00:02|[]            |{rule1 -> {f1 -> false, f2 -> false, f3 -> false}}|
|2022-12-30 00:00:03|[f3]          |{rule1 -> {f3 -> true}}                           |
|2022-12-30 00:00:04|[]            |{rule1 -> {f1 -> false, f2 -> false, f3 -> false}}|
|2022-12-30 00:00:05|[f1]          |{rule1 -> {f1 -> true}}                           |
|2022-12-30 00:00:06|[]            |{rule1 -> {f1 -> false, f2 -> false, f3 -> false}}|
+-------------------+--------------+--------------------------------------------------+

+-------------------+--------------+--------------------------------------------------+
|timestamp          |human_read

In [29]:
# Rule under test: `rule1` uses the `temporal` action with `ordered: true`
# over the tags f1, f2 and f3.
spec = """
    rules:
        - rulename: rule1
          action: temporal
          ordered: true
          tags:
            - name: f1
            - name: f2
            - name: f3
    """

# Twelve events for host1, one second apart. The odd-second events set exactly
# one tag to true, in the sequence f2, f3, f1, f2, f1, f3; the even-second
# events carry all three tags as false.
df_input = spark.sql("""
    select
        *
    from
    values
    (TIMESTAMP '2022-12-30 00:00:01', 'host1', map('rule1', map('f2', true))),
    (TIMESTAMP '2022-12-30 00:00:02', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false))),
    (TIMESTAMP '2022-12-30 00:00:03', 'host1', map('rule1', map('f3', true))),
    (TIMESTAMP '2022-12-30 00:00:04', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false))),
    (TIMESTAMP '2022-12-30 00:00:05', 'host1', map('rule1', map('f1', true))),
    (TIMESTAMP '2022-12-30 00:00:06', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false))),
    (TIMESTAMP '2022-12-30 00:00:07', 'host1', map('rule1', map('f2', true))),
    (TIMESTAMP '2022-12-30 00:00:08', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false))),
    (TIMESTAMP '2022-12-30 00:00:09', 'host1', map('rule1', map('f1', true))),
    (TIMESTAMP '2022-12-30 00:00:10', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false))),
    (TIMESTAMP '2022-12-30 00:00:11', 'host1', map('rule1', map('f3', true))),
    (TIMESTAMP '2022-12-30 00:00:12', 'host1', map('rule1', map('f1', false, 'f2', false, 'f3', false)))
    t(timestamp, host_id, sigma)
    """)
# Show the input, run it through the JVM `FluxCapacitor.invoke`, then show the result.
# NOTE(review): "host_id" is presumably the grouping key and the trailing `True`
# a flag of the JVM API -- neither meaning is visible in this notebook; confirm.
print_df(df_input)
jdf = flux_stateful_function(df_input._jdf, "host_id", bloom_capacity, spec, True)
# Wrap the JVM-side result in a PySpark DataFrame so `print_df` can display it.
df_output = DataFrame(jdf, spark)
print_df(df_output)

+-------------------+--------------+--------------------------------------------------+
|timestamp          |human_readable|sigma                                             |
+-------------------+--------------+--------------------------------------------------+
|2022-12-30 00:00:01|[f2]          |{rule1 -> {f2 -> true}}                           |
|2022-12-30 00:00:02|[]            |{rule1 -> {f1 -> false, f2 -> false, f3 -> false}}|
|2022-12-30 00:00:03|[f3]          |{rule1 -> {f3 -> true}}                           |
|2022-12-30 00:00:04|[]            |{rule1 -> {f1 -> false, f2 -> false, f3 -> false}}|
|2022-12-30 00:00:05|[f1]          |{rule1 -> {f1 -> true}}                           |
|2022-12-30 00:00:06|[]            |{rule1 -> {f1 -> false, f2 -> false, f3 -> false}}|
|2022-12-30 00:00:07|[f2]          |{rule1 -> {f2 -> true}}                           |
|2022-12-30 00:00:08|[]            |{rule1 -> {f1 -> false, f2 -> false, f3 -> false}}|
|2022-12-30 00:00:09|[f1]       

In [30]:
# Rule under test: `rule1` uses the `parent` action. The spec names `pid` as the
# child column and `parent_pid` as the parent column, and lists only the `pf` tag.
spec = """
    rules:
        - rulename: rule1
          action: parent
          child: pid
          parent: parent_pid
          tags:
            - name: pf
    """

# Seven events for host1. Each row has a pid, a parent_pid ('' where none is
# given) and a `rule1` map holding two flags, pf and cf. Note that cf appears in
# the data but is not listed under the rule's tags in the spec above.
df_input = spark.sql("""
    select
        *
    from
    values
    (TIMESTAMP '2022-12-30 00:00:01', 'host1', 'pid100', '', map('rule1', map('pf', true, 'cf', false))),
    (TIMESTAMP '2022-12-30 00:00:02', 'host1', 'pid200', '', map('rule1', map('pf', false, 'cf', false))),
    (TIMESTAMP '2022-12-30 00:00:03', 'host1', 'pid300', 'pid100', map('rule1', map('pf', false, 'cf', false))),
    (TIMESTAMP '2022-12-30 00:00:04', 'host1', 'pid400', 'pid200', map('rule1', map('pf', false, 'cf', true))),
    (TIMESTAMP '2022-12-30 00:00:05', 'host1', 'pid500', '', map('rule1', map('pf', true, 'cf', false))),
    (TIMESTAMP '2022-12-30 00:00:06', 'host1', 'pid600', 'pid500', map('rule1', map('pf', false, 'cf', true))),
    (TIMESTAMP '2022-12-30 00:00:07', 'host1', 'pid700', 'pid600', map('rule1', map('pf', false, 'cf', true)))
    t(timestamp, host_id, pid, parent_pid, sigma)
    """)

# Show the input, run it through the JVM `FluxCapacitor.invoke`, then show the result.
# NOTE(review): "host_id" is presumably the grouping key and the trailing `True`
# a flag of the JVM API -- neither meaning is visible in this notebook; confirm.
print_df(df_input)
jdf = flux_stateful_function(df_input._jdf, "host_id", bloom_capacity, spec, True)
# Wrap the JVM-side result in a PySpark DataFrame so `print_df` can display it.
df_output = DataFrame(jdf, spark)
print_df(df_output)

+-------------------+------+----------+--------------+-------------------------------------+
|timestamp          |pid   |parent_pid|human_readable|sigma                                |
+-------------------+------+----------+--------------+-------------------------------------+
|2022-12-30 00:00:01|pid100|          |[pf]          |{rule1 -> {pf -> true, cf -> false}} |
|2022-12-30 00:00:02|pid200|          |[]            |{rule1 -> {pf -> false, cf -> false}}|
|2022-12-30 00:00:03|pid300|pid100    |[]            |{rule1 -> {pf -> false, cf -> false}}|
|2022-12-30 00:00:04|pid400|pid200    |[cf]          |{rule1 -> {pf -> false, cf -> true}} |
|2022-12-30 00:00:05|pid500|          |[pf]          |{rule1 -> {pf -> true, cf -> false}} |
|2022-12-30 00:00:06|pid600|pid500    |[cf]          |{rule1 -> {pf -> false, cf -> true}} |
|2022-12-30 00:00:07|pid700|pid600    |[cf]          |{rule1 -> {pf -> false, cf -> true}} |
+-------------------+------+----------+--------------+----------------

In [31]:
# Rule under test: `rule1` uses the `ancestor` action. The spec names `pid` as the
# child column and `parent_pid` as the parent column, and lists only the `pf` tag.
spec = """
    rules:
        - rulename: rule1
          action: ancestor
          child: pid
          parent: parent_pid
          tags:
            - name: pf
    """

# A three-event chain for host1: pid500 (pf true, no parent given), pid600
# (parent pid500, both flags false) and pid700 (parent pid600, cf true).
df_input = spark.sql("""
    select
        *
    from
    values
    (TIMESTAMP '2022-12-30 00:00:05', 'host1', 'pid500', '', map('rule1', map('pf', true, 'cf', false))),
    (TIMESTAMP '2022-12-30 00:00:06', 'host1', 'pid600', 'pid500', map('rule1', map('pf', false, 'cf', false))),
    (TIMESTAMP '2022-12-30 00:00:07', 'host1', 'pid700', 'pid600', map('rule1', map('pf', false, 'cf', true)))
    t(timestamp, host_id, pid, parent_pid, sigma)
    """)

# Show the input, run it through the JVM `FluxCapacitor.invoke`, then show the result.
# NOTE(review): "host_id" is presumably the grouping key and the trailing `True`
# a flag of the JVM API -- neither meaning is visible in this notebook; confirm.
print_df(df_input)
jdf = flux_stateful_function(df_input._jdf, "host_id", bloom_capacity, spec, True)
# Wrap the JVM-side result in a PySpark DataFrame so `print_df` can display it.
df_output = DataFrame(jdf, spark)
print_df(df_output)

+-------------------+------+----------+--------------+-------------------------------------+
|timestamp          |pid   |parent_pid|human_readable|sigma                                |
+-------------------+------+----------+--------------+-------------------------------------+
|2022-12-30 00:00:05|pid500|          |[pf]          |{rule1 -> {pf -> true, cf -> false}} |
|2022-12-30 00:00:06|pid600|pid500    |[]            |{rule1 -> {pf -> false, cf -> false}}|
|2022-12-30 00:00:07|pid700|pid600    |[cf]          |{rule1 -> {pf -> false, cf -> true}} |
+-------------------+------+----------+--------------+-------------------------------------+

+-------------------+------+----------+--------------+------------------------------------+
|timestamp          |pid   |parent_pid|human_readable|sigma                               |
+-------------------+------+----------+--------------+------------------------------------+
|2022-12-30 00:00:05|pid500|          |[pf]          |{rule1 -> {pf -> t