# Create silver tables


In [None]:
# Notebook-wide configuration shared by every cell below.
# Catalog used as the first part of the three-part table names
# (<catalog>.<schema>.<table>) that are read and written.
__CATALOG = "btv_dc30"
# Schema the raw source tables are read from.
__BRONZE_SCHEMA = "bronze"
# Schema the derived tables are registered in.
__SILVER_SCHEMA = "silver"
# NOTE(review): not referenced by any of the cells shown in this notebook.
__BRONZE_BUCKET = "btv-dc30-bronze-t7rrh"
# S3 bucket used in the `path` option of every silver Delta write.
__SILVER_BUCKET = "btv-dc30-silver-t7rrh"

# Create Sysmon Silver tables

This code reads Sysmon event data from a bronze Delta table and splits it by event ID, using a predefined mapping of event IDs to human-readable event names. For each event type, it filters the bronze data down to the relevant columns, renames some fields for clarity (for example, `rule.name` becomes `rule`), promotes every `winlog` field that has at least one non-null value to a top-level column, and drops the original nested `winlog` column together with a few bookkeeping fields (`api`, `channel`, `event_id`, `opcode`, `provider_name`, `version`). It then writes each resulting DataFrame as a separate Delta table in S3 under the silver schema, named `sysmon_<event_name>` with the event name converted to snake_case. This process effectively splits the raw event data into distinct, structured tables for easier analysis by event category.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, from_json
from pyspark.sql.types import StructType, StructField, StringType, MapType
import re


# Lookup table from Sysmon event ID to a CamelCase event name.
# The IDs are kept as strings because they are compared directly against
# `winlog.event_id` (presumably stored as a string in bronze; confirm).
# The names are later converted to snake_case to build the silver table names.
sysmon_event_names = {
    "1": "ProcessCreation",
    "2": "ProcessChangedAFileCreationTime",
    "3": "NetworkConnection",
    "4": "SysmonServiceStateChanged",
    "5": "ProcessTerminated",
    "6": "DriverLoaded",
    "7": "ImageLoaded",
    "8": "CreateRemoteThread",
    "9": "RawAccessRead",
    "10": "ProcessAccess",
    "11": "FileCreate",
    "12": "RegistryEventObjectCreateAndDelete",
    "13": "RegistryEventValueSet",
    "14": "RegistryEventKeyAndValueRename",
    "15": "FileCreateStreamHash",
    "16": "ServiceConfigurationChange",
    "17": "PipeEventPipeCreated",
    "18": "PipeEventPipeConnected",
    "19": "WmiEventFilterActivityDetected",
    "20": "WmiEventConsumerActivityDetected",
    "21": "WmiEventConsumerToFilterActivityDetected",
    "22": "DnsEventDnsQuery",
    "23": "FileDeleteArchived",
    "24": "ClipboardChange",
    "25": "ProcessTampering",
    "26": "FileDeleteDetected",
    "27": "FileBlockExecutable",
    "28": "FileBlockShredding",
    "29": "FileExecutableDetected",
    "255": "Error"
}


# Load the bronze Sysmon table, discarding the columns that none of the
# selections below use.
bronze_df = (
    spark.read.table(f"{__CATALOG}.{__BRONZE_SCHEMA}.sysmon")
    .drop("@timestamp", "@version", "agent", "ecs", "message", "log", "tags", "host")
)

# Event IDs whose DataFrame additionally selects the "file" column, and
# those that additionally select the "registry" column.
file_event_ids = {'2', '6', '7', '11', '15', '17', '18'}
registry_event_ids = {'12', '13', '14'}

# Build one (lazy) filtered DataFrame per Sysmon event name.
dfs_by_event_name = {}
for event_id, event_name in sysmon_event_names.items():
    # Optional columns that only apply to some event types.
    extra_cols = []
    if event_id in file_event_ids:
        extra_cols.append(col("file").alias("file"))
    if event_id in registry_event_ids:
        extra_cols.append(col("registry").alias("registry"))

    # Columns common to every event type, followed by the optional ones.
    base_cols = [
        col("winlog"),
        col("rule.name").alias("rule"),
        col("process"),
        col("event"),
        *extra_cols,
    ]

    events_for_id = bronze_df.filter(col("winlog.event_id") == event_id)
    dfs_by_event_name[event_name] = events_for_id.select(*base_cols)


# Flatten every per-event DataFrame: promote the `winlog` struct fields that
# hold at least one non-null value to top-level columns and drop the rest.
#
# Columns that only some event types carry (added conditionally when the
# per-event DataFrames were built). They are passed through when present so
# that they are not silently lost by the final select.
optional_passthrough_cols = ("file", "registry")

clean_dfs_by_event_name = {}
for table_name, df in dfs_by_event_name.items():
    # Names of the fields nested inside the `winlog` struct.
    columns_field = df.schema["winlog"]
    field_names = [f.name for f in columns_field.dataType.fields]

    # Find which fields are entirely null. `count(column)` only counts
    # non-null values, so a single aggregation yields the non-null count of
    # every field at once instead of launching one Spark job per field.
    non_null_fields = []
    if field_names:  # `agg()` needs at least one expression
        non_null_counts = df.agg(
            *[
                count(col(f"winlog.{f}")).alias(f"c{i}")
                for i, f in enumerate(field_names)
            ]
        ).collect()[0]
        # keep only fields that have at least one non-null
        non_null_fields = [
            f for f, n in zip(field_names, non_null_counts) if n > 0
        ]

    # Event-specific columns actually present in this DataFrame.
    passthrough = [col(c) for c in optional_passthrough_cols if c in df.columns]

    # Select the useful winlog fields plus the shared and optional columns,
    # then remove bookkeeping fields that are not wanted in silver.
    clean_df = df.select(
        *[col(f"winlog.{f}").alias(f) for f in non_null_fields],
        col("rule"),
        col("process"),
        col("event"),
        *passthrough,
    ).drop("api","channel", "event_id", "opcode", "provider_name", "version")

    clean_dfs_by_event_name[table_name] = clean_df



# Any character followed by a capitalised word (uppercase + lowercase run).
_CAPITALISED_WORD = re.compile(r'(.)([A-Z][a-z]+)')
# A lowercase letter or digit directly followed by an uppercase letter.
_LOWER_THEN_UPPER = re.compile(r'([a-z0-9])([A-Z])')


def camel_to_snake(name: str) -> str:
    """Convert a CamelCase identifier to snake_case.

    Underscores are inserted in two passes (before capitalised words, then
    between a lowercase letter/digit and an uppercase letter) and the result
    is lower-cased.
    """
    words_split = _CAPITALISED_WORD.sub(r'\1_\2', name)
    fully_split = _LOWER_THEN_UPPER.sub(r'\1_\2', words_split)
    return fully_split.lower()


# Persist each cleaned Sysmon DataFrame as its own Delta table named
# sysmon_<snake_case event name>, stored under the silver bucket.
for event_name, df in clean_dfs_by_event_name.items():
    table_name = camel_to_snake(event_name)
    silver_table = f"sysmon_{table_name}"
    print (silver_table)

    writer = (
        df.write.format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .option("path", f"s3://{__SILVER_BUCKET}/sysmon/{table_name}")
    )
    writer.saveAsTable(f"{__CATALOG}.{__SILVER_SCHEMA}.{silver_table}")



# Create Osquery silver tables

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import (
    StructType, 
    StructField, 
    StringType, 
    LongType, 
    IntegerType, 
    BooleanType, 
    MapType
)


# Bronze osquery rows; only "json" is read below ("host" is selected but
# not used by this cell).
bronze_df = (
    spark.read.table(f"{__CATALOG}.{__BRONZE_SCHEMA}.osquery")
    .select("host", "json")
)

# Every distinct query name present in the data.
osquery_names = [
    row["name"] for row in bronze_df.select("json.name").distinct().collect()
]

# One (lazy) DataFrame per query name, keeping the result metadata and the
# nested `columns` payload.
osq_by_table_name = {}
for query_name in osquery_names:
    rows_for_query = bronze_df.filter(col("json.name") == query_name)
    osq_by_table_name[query_name] = rows_for_query.select(
        col("json.action").alias("update"),
        col("json.hostIdentifier").alias("hostIdentifier"),
        col("json.unixTime").alias("unixTime"),
        col("json.columns").alias("columns"),
    )



In [None]:
from pyspark.sql.functions import col, count, when
import re

# Flatten every per-query DataFrame: promote the `columns` struct fields that
# hold at least one non-null value to top-level columns and drop the rest.
clean_osq_by_table_name = {}
for table_name, df in osq_by_table_name.items():
    # Names of the fields nested inside the `columns` struct.
    columns_field = df.schema["columns"]
    field_names = [f.name for f in columns_field.dataType.fields]

    # Find which fields are entirely null. `count(column)` only counts
    # non-null values, so a single aggregation yields the non-null count of
    # every field at once instead of launching one Spark job per field.
    non_null_fields = []
    if field_names:  # `agg()` needs at least one expression
        non_null_counts = df.agg(
            *[
                count(col(f"columns.{f}")).alias(f"c{i}")
                for i, f in enumerate(field_names)
            ]
        ).collect()[0]
        # keep only fields that have at least one non-null
        non_null_fields = [
            f for f, n in zip(field_names, non_null_counts) if n > 0
        ]

    # Select the populated payload fields followed by the result metadata.
    clean_df = df.select(
        *[col(f"columns.{f}").alias(f) for f in non_null_fields],
        col("update").alias("update"),
        col("hostIdentifier").alias("hostIdentifier"),
        col("unixTime").alias("unixTime")
    )

    clean_osq_by_table_name[table_name] = clean_df



# Matches "pack_" followed by two lowercase words that each end in "_";
# every occurrence is removed from the table name.
pack_prefix_pattern = re.compile(r'pack_([a-z]+(_)){2}')

# Persist each cleaned osquery DataFrame as its own Delta table named
# osquery_<normalised query name>, stored under the silver bucket.
for table_name, df in clean_osq_by_table_name.items():
    # Replace dashes with underscores, then strip the pack prefix.
    underscored_name = table_name.replace('-', '_')
    tn = pack_prefix_pattern.sub('', underscored_name)

    writer = (
        df.write.format("delta")
        .mode("overwrite")
        .option("path", f"s3://{__SILVER_BUCKET}/osquery/{tn}")
        .option("mergeSchema", "true")
    )
    writer.saveAsTable(f"{__CATALOG}.{__SILVER_SCHEMA}.osquery_{tn}")


# Create Zeek silver tables

In [None]:
# Copy every bronze Zeek table into the silver schema unchanged.
zeek_prefix = "zeek_"

tables = spark.catalog.listTables(f"{__CATALOG}.{__BRONZE_SCHEMA}")

# Keep only tables named "zeek_<sourcetype>". Requiring the trailing
# underscore guarantees that a sourcetype can always be extracted below.
zeek_bronze_tables = [t.name for t in tables if t.name.startswith(zeek_prefix)]

for table_name in zeek_bronze_tables:
    # Strip only the leading prefix; slicing (rather than splitting on
    # "zeek_") keeps sourcetypes intact even if they contain "zeek_" again.
    sourcetype = table_name[len(zeek_prefix):]
    print(sourcetype)

    bronze_df = spark.read.table(f"{__CATALOG}.{__BRONZE_SCHEMA}.{table_name}")

    # The silver table keeps the bronze table name; the data is stored under
    # the silver bucket in a folder named after the sourcetype.
    bronze_df.write.format("delta") \
        .mode("overwrite") \
        .option("path", f"s3://{__SILVER_BUCKET}/zeek/{sourcetype}") \
        .option("mergeSchema", "true") \
        .saveAsTable(f"{__CATALOG}.{__SILVER_SCHEMA}.{table_name}")




broker
capture_loss
cluster
conn
dce_rpc
dns
dpd
files
http
kerberos
known_certs
known_services
notice
ntlm
ntp
pe
rdp
reporter
smb_files
smb_mapping
smtp
software
ssl
stats
tunnel
weird
x509


# Create Wineventlogs silver tables

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, from_json
from pyspark.sql.types import StructType, StructField, StringType, MapType
from pyspark.sql.functions import map_keys, explode, col
import re


# Read from bronze table, excluding events emitted by the Sysmon provider.
# NOTE(review): rows whose `winlog` map has no "provider_name" entry likely
# compare as NULL and are therefore filtered out as well; confirm intended.
bronze_wineventlogs_df = spark.read.table(f"{__CATALOG}.{__BRONZE_SCHEMA}.wineventlogs") \
    .filter(col("winlog.provider_name") != "Microsoft-Windows-Sysmon") \
    .select(
        col("event"),
        col("winlog"),
    )

# Collect every key that occurs in any row's `winlog` map. Sorting makes the
# column order of the silver table deterministic across runs, since the order
# returned by distinct().collect() is not guaranteed.
keys_df = bronze_wineventlogs_df.select(explode(map_keys(col("winlog"))).alias("key"))
unique_keys_list = sorted(row["key"] for row in keys_df.distinct().collect())

# Turn each map key into its own column. The value is looked up by key on the
# map column instead of splicing the key into a dotted column path, so keys
# containing "." or other special characters are not mis-parsed as nested
# field paths.
winlog_map = col("winlog")
flatten_df = bronze_wineventlogs_df.select(
    *[winlog_map[field].alias(field) for field in unique_keys_list],
    col("event"),
).drop("api","channel", "event_id", "opcode", "provider_name", "version")


# Persist the flattened events as a single silver Delta table.
flatten_df.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .option("path", f"s3://{__SILVER_BUCKET}/wineventlogs") \
    .saveAsTable(f"{__CATALOG}.{__SILVER_SCHEMA}.wineventlogs")



root
 |-- event: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- winlog: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

