# Create bronze tables


In [0]:
__CATALOG = "btv_dc30"
__BRONZE_SCHEMA = "bronze"
__SILVER_SCHEMA = "silver"
__BRONZE_BUCKET = "btv-dc30-bronze-t7rrh"
__SILVER_BUCKET = "btv-dc30-silver-t7rrh"

# Create Sysmon bronze tables

In [0]:
sysmon_event_names = {
    "1": "ProcessCreation",
    "2": "ProcessChangedAFileCreationTime",
    "3": "NetworkConnection",
    "4": "SysmonServiceStateChanged",
    "5": "ProcessTerminated",
    "6": "DriverLoaded",
    "7": "ImageLoaded",
    "8": "CreateRemoteThread",
    "9": "RawAccessRead",
    "10": "ProcessAccess",
    "11": "FileCreate",
    "12": "RegistryEventObjectCreateAndDelete",
    "13": "RegistryEventValueSet",
    "14": "RegistryEventKeyAndValueRename",
    "15": "FileCreateStreamHash",
    "16": "ServiceConfigurationChange",
    "17": "PipeEventPipeCreated",
    "18": "PipeEventPipeConnected",
    "19": "WmiEventFilterActivityDetected",
    "20": "WmiEventConsumerActivityDetected",
    "21": "WmiEventConsumerToFilterActivityDetected",
    "22": "DnsEventDnsQuery",
    "23": "FileDeleteArchived",
    "24": "ClipboardChange",
    "25": "ProcessTampering",
    "26": "FileDeleteDetected",
    "27": "FileBlockExecutable",
    "28": "FileBlockShredding",
    "29": "FileExecutableDetected",
    "255": "Error"
}


for sysmon_event in dbutils.fs.ls(f"s3://{__BRONZE_BUCKET}/sysmon"):
    tn = sysmon_event.name.rstrip('/')
    print (f"{__CATALOG}.{__BRONZE_SCHEMA}.sysmon_{tn}")
    
    df = spark.read.json(sysmon_event.path)

    df.write.format("delta")\
        .mode("overwrite")\
        .option("mergeSchema", "true") \
        .saveAsTable(f"{__CATALOG}.{__BRONZE_SCHEMA}.sysmon_{tn}")


btv_dc30.bronze.sysmon_ClipboardChange
btv_dc30.bronze.sysmon_CreateRemoteThread
btv_dc30.bronze.sysmon_DnsEventDnsQuery
btv_dc30.bronze.sysmon_DriverLoaded
btv_dc30.bronze.sysmon_Error
btv_dc30.bronze.sysmon_FileCreate
btv_dc30.bronze.sysmon_FileCreateStreamHash
btv_dc30.bronze.sysmon_ImageLoaded
btv_dc30.bronze.sysmon_NetworkConnection
btv_dc30.bronze.sysmon_PipeEventPipeConnected
btv_dc30.bronze.sysmon_PipeEventPipeCreated
btv_dc30.bronze.sysmon_ProcessAccess
btv_dc30.bronze.sysmon_ProcessChangedAFileCreationTime
btv_dc30.bronze.sysmon_ProcessCreation
btv_dc30.bronze.sysmon_ProcessTampering
btv_dc30.bronze.sysmon_ProcessTerminated
btv_dc30.bronze.sysmon_RegistryEventObjectCreateAndDelete
btv_dc30.bronze.sysmon_RegistryEventValueSet
btv_dc30.bronze.sysmon_SysmonServiceStateChanged


# Create Zeek bronze tables

In [0]:
for zeek_event in dbutils.fs.ls(f"s3://{__BRONZE_BUCKET}/zeek"):
    tn = zeek_event.name.rstrip('/')
    print (f"{__CATALOG}.{__BRONZE_SCHEMA}.zeek_{tn}")
    
    df = spark.read.json(zeek_event.path)

    df.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable(f"{__CATALOG}.{__BRONZE_SCHEMA}.zeek_{tn.replace('-','_')}")

btv_dc30.bronze.zeek_cluster
btv_dc30.bronze.zeek_conn
btv_dc30.bronze.zeek_dce_rpc
btv_dc30.bronze.zeek_dns
btv_dc30.bronze.zeek_dpd
btv_dc30.bronze.zeek_files
btv_dc30.bronze.zeek_http
btv_dc30.bronze.zeek_kerberos
btv_dc30.bronze.zeek_known_certs
btv_dc30.bronze.zeek_known_services
btv_dc30.bronze.zeek_notice
btv_dc30.bronze.zeek_ntlm
btv_dc30.bronze.zeek_ntp
btv_dc30.bronze.zeek_pe
btv_dc30.bronze.zeek_rdp
btv_dc30.bronze.zeek_reporter
btv_dc30.bronze.zeek_smb_files
btv_dc30.bronze.zeek_smb_mapping
btv_dc30.bronze.zeek_smtp
btv_dc30.bronze.zeek_software
btv_dc30.bronze.zeek_ssl
btv_dc30.bronze.zeek_stats
btv_dc30.bronze.zeek_tunnel
btv_dc30.bronze.zeek_weird


# Create Osquery bronze tables

In [0]:
import re

for osq_event in dbutils.fs.ls(f"s3://{__BRONZE_BUCKET}/osquery"):
    tn = re.sub(r'pack_([a-z]+(_)){2}', '', f"{osq_event.name.rstrip('/').replace('-', '_')}")
    print (f"{__CATALOG}.{__BRONZE_SCHEMA}.osquery_{tn}")


    df = spark.read.json(osq_event.path)

    
    df.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable(f"{__CATALOG}.{__BRONZE_SCHEMA}.osquery_{tn}")


btv_dc30.bronze.osquery_cpu_time
btv_dc30.bronze.osquery_iptables
btv_dc30.bronze.osquery_memory_info
btv_dc30.bronze.osquery_device_nodes
btv_dc30.bronze.osquery_smbios_tables
btv_dc30.bronze.osquery_iptables
btv_dc30.bronze.osquery_last
btv_dc30.bronze.osquery_listening_ports
btv_dc30.bronze.osquery_logged_in_users
btv_dc30.bronze.osquery_mounts
btv_dc30.bronze.osquery_open_sockets
btv_dc30.bronze.osquery_kernel_modules
btv_dc30.bronze.osquery_mounts
btv_dc30.bronze.osquery_osquery_info
btv_dc30.bronze.osquery_schedule
btv_dc30.bronze.osquery_kernel_modules
btv_dc30.bronze.osquery_process_events
btv_dc30.bronze.osquery_runtime_perf
btv_dc30.bronze.osquery_socket_events
btv_dc30.bronze.osquery_syslog_events


# Create hmail bronze tables

In [0]:
print (f"{__CATALOG}.{__BRONZE_SCHEMA}.hmail")

df = spark.read.json(f"s3://{__BRONZE_BUCKET}/hmail")

df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable(f"{__CATALOG}.{__BRONZE_SCHEMA}.hmail")

btv_dc30.bronze.hmail


# Create Windows Event logs bronze tables

In [0]:
for win_event in dbutils.fs.ls(f"s3://{__BRONZE_BUCKET}/wineventlogs"):
    tn = win_event.name.rstrip('/').replace("/", "_").replace("-", "_")
    print (f"{__CATALOG}.{__BRONZE_SCHEMA}.wineventlogs_{tn}")
    
    df = spark.read.json(win_event.path)

    df.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable(f"{__CATALOG}.{__BRONZE_SCHEMA}.wineventlogs_{tn}")



btv_dc30.bronze.wineventlogs_Application
btv_dc30.bronze.wineventlogs_Microsoft_Windows_PowerShell
btv_dc30.bronze.wineventlogs_Microsoft_Windows_SMBServer
btv_dc30.bronze.wineventlogs_Microsoft_Windows_Sysmon
btv_dc30.bronze.wineventlogs_Microsoft_Windows_TerminalServices_LocalSessionManager
btv_dc30.bronze.wineventlogs_Microsoft_Windows_TerminalServices_RemoteConnectionManager
btv_dc30.bronze.wineventlogs_Microsoft_Windows_WMI_Activity


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-5031483627650517>, line 9[0m
[1;32m      3[0m [38;5;28mprint[39m ([38;5;124mf[39m[38;5;124m"[39m[38;5;132;01m{[39;00m__CATALOG[38;5;132;01m}[39;00m[38;5;124m.[39m[38;5;132;01m{[39;00m__BRONZE_SCHEMA[38;5;132;01m}[39;00m[38;5;124m.wineventlogs_[39m[38;5;132;01m{[39;00mtn[38;5;132;01m}[39;00m[38;5;124m"[39m)
[1;32m      5[0m df [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39mjson(win_event[38;5;241m.[39mpath)
[1;32m      7[0m df[38;5;241m.[39mwrite[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m) \
[1;32m      8[0m     [38;5;241m.[39mmode([38;5;124m"[39m[38;5;124moverwrite[39m[38;5;124m"[39m) \
[0;32m----> 9[0m     [38;5;241m.[39msaveAsTable([38;5;124mf[39m[38;5;124m"[39m[38;5;132;01m{[39;00m__CATAL

# Verify table creation


In [0]:
tables = spark.sql("SHOW TABLES IN btv_dc30.bronze").filter("tableName LIKE '%'").collect()

for t in tables:
    table_name = f"btv_dc30.bronze.{t.tableName}"
    print(f"Query table: {table_name}")
    spark.sql(f"""SELECT * FROM {table_name} LIMIT 3;""")


Query table: btv_dc30.bronze.hmail
Query table: btv_dc30.bronze.osquery_cpu_time
Query table: btv_dc30.bronze.osquery_device_nodes
Query table: btv_dc30.bronze.osquery_iptables
Query table: btv_dc30.bronze.osquery_kernel_modules
Query table: btv_dc30.bronze.osquery_last
Query table: btv_dc30.bronze.osquery_listening_ports
Query table: btv_dc30.bronze.osquery_logged_in_users
Query table: btv_dc30.bronze.osquery_memory_info
Query table: btv_dc30.bronze.osquery_mounts
Query table: btv_dc30.bronze.osquery_open_sockets
Query table: btv_dc30.bronze.osquery_osquery_info
Query table: btv_dc30.bronze.osquery_process_events
Query table: btv_dc30.bronze.osquery_runtime_perf
Query table: btv_dc30.bronze.osquery_schedule
Query table: btv_dc30.bronze.osquery_smbios_tables
Query table: btv_dc30.bronze.osquery_socket_events
Query table: btv_dc30.bronze.osquery_syslog_events
Query table: btv_dc30.bronze.sysmon_clipboardchange
Query table: btv_dc30.bronze.sysmon_createremotethread
Query table: btv_dc30.