In [1]:
import pyspark.sql.functions as F
from datetime import datetime
import time
from pyspark.sql import DataFrame
import os

from pyspark.sql import SparkSession


def flux_capacitor(input_df, spec, group_key: str = "group_key", bloom_capacity: int = 10000):
    """Invoke the Scala ``cccs.fluxcapacitor.FluxCapacitor.invoke`` function.

    The call goes through the JVM gateway of the module-level ``spark``
    session, so that session must exist before this function is called.

    Args:
        input_df: PySpark DataFrame; its underlying Java DataFrame
            (``input_df._jdf``) is handed to the Scala function.
        spec: Specification text, forwarded to the Scala function unchanged.
        group_key: Forwarded as the second argument of the Scala function.
            Defaults to ``"group_key"``, the value that used to be hard-coded.
            Presumably the name of the column used to group rows -- confirm
            against the Scala signature.
        bloom_capacity: Forwarded as the third argument of the Scala function.
            Defaults to ``10000``, the value that used to be hard-coded.
            Presumably the capacity of a bloom filter on the Scala side --
            confirm against the Scala implementation.

    Returns:
        A PySpark ``DataFrame`` wrapping the Java DataFrame returned by the
        Scala function.
    """
    invoke = spark._sc._jvm.cccs.fluxcapacitor.FluxCapacitor.invoke
    jdf = invoke(input_df._jdf, group_key, bloom_capacity, spec)
    # Wrap the Java DataFrame so the caller gets a regular PySpark DataFrame.
    return DataFrame(jdf, spark)


# Get (or create) the Spark session, with the flux-capacitor jar set in
# the 'spark.jars' configuration entry.
spark = SparkSession.builder.config(
    'spark.jars', './target/flux-capacitor-1.jar'
).getOrCreate()

# Explicit column layout applied to the CSV input (no schema inference).
schema = """
    timestamp timestamp,
    host_id integer, 
    id string, 
    parent_id string, 
    captured_folder_colname string, 
    Name string, 
    ImagePath string, 
    Commandline string
"""

# Read the event files as CSV; all reader options are supplied in one call.
df = (
    spark.read
    .format('csv')
    .schema(schema)
    .options(
        header=True,
        ignoreLeadingWhiteSpace=True,
        emptyValue="",
        nullValue="none",
    )
    .load("file:///home/jovyan/work/flux-capacitor/inputs/events")
)

# Show what was loaded.
print("The data read from the file is")
df.printSchema()
df.show(truncate=False)

# Register the rows under the name the tagging query selects from.
df.createOrReplaceTempView("processinfo")



# Tag every row of the "processinfo" view.
#
# The inner select keeps all input columns and adds one boolean column per
# rule condition (the cr1_*/pr1_*, cr2_*/pr2_*, cr3_*/pr3_* columns); rows
# whose host_id is null are dropped. The outer select aliases host_id as
# group_key and packs the boolean columns into a nested map column named
# `sigma` (rule name -> tag name -> boolean).
#
# The query is a raw Python string (r"""), so Python passes every backslash
# through untouched.
# NOTE(review): the runs of 4 / 8 backslashes look intended to survive Spark
# SQL string un-escaping plus LIKE / regex escaping and end up matching one /
# two literal backslashes -- confirm against the session's parser settings.
# NOTE(review): the two pr3_* tags use `rlike`, while every other pattern
# uses the case-insensitive `ilike` -- confirm that difference is intended.
df_tagged = spark.sql(r"""--start-sparksql
select
    timestamp,
    host_id, 
    id, 
    parent_id, 
    captured_folder_colname, 
    Name, 
    ImagePath, 
    Commandline,
    -- distribute processing of each host by this key
    host_id as group_key,
    -- regroup each rule's tags in a map (ruleName -> Tags)
    map(
        'rule1', map('cr1_selection', cr1_selection,
                    'cr1_filter_empty', cr1_filter_empty,
                    'cr1_filter', cr1_filter,
                    'cr1_filter_localserver_fp', cr1_filter_localserver_fp,
                    'pr1_filter_iexplorer', pr1_filter_iexplorer,
                    'pr1_filter_msiexec_syswow64', pr1_filter_msiexec_syswow64,
                    'pr1_filter_msiexec_system32', pr1_filter_msiexec_system32),
        'rule2', map('cr2_filter_provider', cr2_filter_provider,
                    'cr2_filter_git', cr2_filter_git,
                    'pr2_selection', pr2_selection,
                    'pr2_filter_git', pr2_filter_git),
        'rule3', map('cr3_selection_other', cr3_selection_other,
                    'cr3_selection_atexec', cr3_selection_atexec,
                    'pr3_selection_other', pr3_selection_other,
                    'pr3_selection_atexec', pr3_selection_atexec)
       ) as sigma

from (
    select
        *,
    
        -- rule 1
        ImagePath ilike '%\\\\rundll32.exe' as cr1_selection,
        Commandline is null as cr1_filter_empty,
        Commandline ilike '%.dll%' OR Commandline = '' as cr1_filter,
        Commandline ilike '% -localserver %' as cr1_filter_localserver_fp,

        (   ImagePath ilike '%:\\\\Program Files\\\\Internet Explorer\\\\iexplore.exe'
            AND CommandLine ilike '%.cpl%')
        as pr1_filter_iexplorer,
        (   ImagePath ilike '%:\\\\Windows\\\\SysWOW64\\\\msiexec.exe'
            AND CommandLine ilike 'C:\\\\Windows\\\\syswow64\\\\MsiExec.exe -Embedding%')
        as pr1_filter_msiexec_syswow64,
        (   ImagePath ilike '%:\\\\Windows\\\\System32\\\\msiexec.exe' AND
            CommandLine ilike 'C:\\\\Windows\\\\system32\\\\MsiExec.exe -Embedding%')
        as pr1_filter_msiexec_system32,

        -- rule 2
        -- using name instead of Provider_Name
        Name = 'SystemTraceProvider-Process' as cr2_filter_provider,
        ImagePath ilike '%\\\\git.exe' as cr2_filter_git,
        ImagePath ilike '%\\\\conhost.exe' as pr2_selection,
        CommandLine ilike '% show %' as pr2_filter_git,

        -- rule 3
        ImagePath rlike('.*(\\\\wmiprvse.exe|\\\\mmc.exe|\\\\explorer.exe|\\\\services.exe)') as pr3_selection_other,

        (CommandLine ilike '%cmd.exe%'
        and CommandLine ilike '%/Q%'
        and CommandLine ilike '%/c%'
        and CommandLine ilike '%\\\\\\\\127.0.0.1\\\\%'
        and CommandLine ilike '%&1%') as cr3_selection_other,

        CommandLine rlike('svchost.exe -k netsvcs|taskeng.exe') as pr3_selection_atexec,

        (CommandLine ilike '%cmd.exe%'
        and CommandLine ilike '%/C%'
        and CommandLine ilike '%Windows\\\\Temp\\\\%'
        and CommandLine ilike '%&1%') as cr3_selection_atexec


    from
        processinfo
    where
        host_id is not null
)
--end-sparksql
""")


# Display the tagged rows, including the new group_key and sigma columns.
print("Tagged input row is")
df_tagged.printSchema()
df_tagged.show(truncate=False)


# Specification handed to the flux capacitor: for each rule, the tags listed
# under it use the "parent" action, linked through parent_id / id.
flux_update_spec = """
rules:
    - rulename: rule1
      description: proc_creation_win_run_executable_invalid_extension
      action: parent
      tags:
        - name: pr1_filter_iexplorer
        - name: pr1_filter_msiexec_syswow64
        - name: pr1_filter_msiexec_system32
      parent: parent_id
      child: id
    
    - rulename: rule2
      description: proc_creation_win_susp_conhost
      action: parent
      tags:
        - name: pr2_selection
        - name: pr2_filter_git
      parent: parent_id
      child: id

    - rulename: rule3
      description: proc_creation_win_impacket_lateralization
      action: parent
      tags:
        - name: pr3_selection_other
        - name: pr3_selection_atexec
      parent: parent_id
      child: id
"""
# Announce the specification and echo it back, one per line.
print(
    "The flux capacitor will cache tags according to this specification",
    flux_update_spec,
    sep="\n",
)

# Run the tagged rows through the stateful flux capacitor function.
df_flux_out = flux_capacitor(df_tagged, flux_update_spec)

print("The flux capacitor caches but also propagates historical tags. A row's parent tags will be set on a child's row")
df_flux_out.printSchema()
df_flux_out.show(truncate=False)

# Make the output available to the rule-evaluation query by name.
df_flux_out.createOrReplaceTempView("flux_capacitor_output")

# Now that we have the historical tags (for example parent tags)
# we can evaluate rules which combine tags from the current row and its parent.
#
# The innermost select computes one boolean per rule (rule1, rule2, rule3)
# from the tags stored in the `sigma` map; the next level appends that boolean
# to the rule's own map under the key 'final_result'.
#
# Every tag name looked up below must match a key of the `sigma` map exactly.
# With default (non-ANSI) settings a lookup of a key that is not in the map
# does not fail: it evaluates to NULL, and the rule then silently never
# evaluates to true.
results = spark.sql(r"""--start-sparksql
select
    *
from (
select
    timestamp,
    host_id, 
    id, 
    parent_id, 
    captured_folder_colname, 
    Name, 
    ImagePath, 
    Commandline,
    group_key,
    map( -- store each resulting rule into its corresponding rule's map
        'rule1', map_concat(sigma.rule1, map('final_result', rule1)),
        'rule2', map_concat(sigma.rule2, map('final_result', rule2)),
        'rule3', map_concat(sigma.rule3, map('final_result', rule3))
    ) as sigma

from (
    select
        *,
    
        -- rule 1 -> condition: selection and not 1 of filter*
        sigma.rule1.cr1_selection AND NOT (  
                                 sigma.rule1.cr1_filter
                              OR sigma.rule1.cr1_filter_empty
                              OR sigma.rule1.cr1_filter_localserver_fp
                              OR sigma.rule1.pr1_filter_iexplorer
                              OR sigma.rule1.pr1_filter_msiexec_syswow64
                              OR sigma.rule1.pr1_filter_msiexec_system32)
        as rule1,

        -- rule 2 -> condition: selection and not 1 of filter_*
        sigma.rule2.pr2_selection AND NOT (sigma.rule2.cr2_filter_provider 
                              OR sigma.rule2.pr2_filter_git)
        as rule2,

        -- rule 3 -> condition: 1 of selection_*
        (sigma.rule3.cr3_selection_other AND sigma.rule3.pr3_selection_other)
        OR
        (sigma.rule3.cr3_selection_atexec AND sigma.rule3.pr3_selection_atexec)
        as rule3
    from
        flux_capacitor_output
    )
)
-- where
--     (sigma.rule1.final_result 
--      OR sigma.rule2.final_result
--      OR sigma.rule3.final_result
--     )
order by
    timestamp

--end-sparksql
""")

print("After the flux capacitor we can evaluate the condition of each rule and store that result inside the sigma map")
results.printSchema()
results.show(truncate=False)

# Narrow the output down to rule1's verdict per event.
print("Looking at the final result of rule1, we see a hit")
results.select("timestamp", "sigma.rule1.final_result").show()

The data read from the file is
root
 |-- timestamp: timestamp (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- captured_folder_colname: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- ImagePath: string (nullable = true)
 |-- Commandline: string (nullable = true)

+-------------------+-------+---+---------+-----------------------+----+------------------------------+----------------------+
|timestamp          |host_id|id |parent_id|captured_folder_colname|Name|ImagePath                     |Commandline           |
+-------------------+-------+---+---------+-----------------------+----+------------------------------+----------------------+
|2022-12-25 00:00:01|1000   |1  |0        |folderA                |    |                              |                      |
|2022-12-25 00:00:02|1000   |2  |0        |folderA                |    |                              |abc git show is he