# Template Quest Notebook

In [2]:
# Import necessary modules; add to this cell as needed
# Provided PySpark examples

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType

## Part 1: Load the Sample Dataset

In [3]:
# Create the Spark session for this notebook (getOrCreate returns the active one if present)
spark = SparkSession.builder.appName("cyberquest").getOrCreate()

# Turn on Spark SQL's case-sensitivity option for this session
spark.conf.set("spark.sql.caseSensitive", True)

In [10]:
# Bronze layer: read the raw JSON export unchanged, letting Spark infer the schema
df_bronze = spark.read.format("json").load("data/sysmon_spearphish_cribl.json")

# Quick orientation before parsing: row count, inferred schema, and a small sample
bronze_row_count = df_bronze.count()
print(f"Total records loaded: {bronze_row_count}")
df_bronze.printSchema()

# toPandas() on a tiny slice renders as a table, which is easier to read than .show()
(
    df_bronze
    .limit(3)
    .toPandas()
)

Total records loaded: 28017
root
 |-- Computer: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- EventCode: string (nullable = true)
 |-- User: string (nullable = true)
 |-- _raw: string (nullable = true)
 |-- _time: double (nullable = true)
 |-- cribl_breaker: string (nullable = true)
 |-- cribl_pipe: string (nullable = true)
 |-- host: string (nullable = true)
 |-- source: string (nullable = true)



Unnamed: 0,Computer,Description,EventCode,User,_raw,_time,cribl_breaker,cribl_pipe,host,source
0,win-host-ctus-attack-range-212,,23,NT AUTHORITY\SYSTEM,"{""Name"":""'Microsoft-Windows-Sysmon'"",""Guid"":""'...",1674818000.0,fallback,Splunk_UF_Windows_XML_WEC_WEF_Sysmon,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...
1,win-dc-ctus-attack-range-460.attackrange.local,,23,NT AUTHORITY\SYSTEM,"{""Name"":""'Microsoft-Windows-Sysmon'"",""Guid"":""'...",1674818000.0,fallback,Splunk_UF_Windows_XML_WEC_WEF_Sysmon,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...
2,win-host-ctus-attack-range-212,,23,NT AUTHORITY\SYSTEM,"{""Name"":""'Microsoft-Windows-Sysmon'"",""Guid"":""'...",1674818000.0,fallback,Splunk_UF_Windows_XML_WEC_WEF_Sysmon,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...


### Dataset Overview

I believe it is always good practice to understand the data I will be working with. This is analogous to identifying assets and gaining visibility into data classification before starting any assessment.

### Source:
- Windows endpoints --> Sysmon Logs (DNS) --> Splunk UF --> Cribl --> SIEM / Data Lake

In simple words, the dataset contains Windows Sysmon logs collected from Windows endpoints that were shipped via Splunk Universal Forwarder to Cribl, which orchestrates, normalizes, and routes them to the detection platform SIEM / Data Lake.


**File:** `data/sysmon_spearphish_cribl.json`  
**Format:** JSONL (low memory requirements, best for processing and streaming line by line, fault tolerant)  
**Time Range:** January 27, 2023 | 11:19:47 to 11:34:46 UTC | Approximately 15 minutes  
**Environment:** Looks like a simulated attack range (attackrange.local)

### Data Structure

Each record has two layers:
- Outer (added by Cribl during forwarding)
- Inner (_raw: which contains the actual sysmon event data)

### Some Initial Considerations

- EventCode (22) is basically a DNS event, and this will be a key event considering the provided hypothesis.
- Reason: Captures important fields like:
    - 'Image' (gives info on which process initiated the DNS lookup),
    - 'QueryName' (gives info on the domain that was queried),
    - 'QueryResults' (gives info on what IPs it resolved to)  


I think this is a good starting point. Under normal circumstances, Microsoft Office applications like winword.exe or excel.exe shouldn't be generating DNS requests on their own. So when we observe that behavior, it stands out as a potential anomaly. It may indicate that a malicious macro was executed, which then initiated outbound communication, possibly marking the early stages of C2 beaconing.


I could think of two ways of parsing the data from the JSON:
1) Using get_json_object
2) Using from_json

I chose **get_json_object** over from_json because `_raw` contains multiple EventCodes, each with completely different fields. Using get_json_object allows me to extract only the fields I need, rather than defining one massive schema (required for from_json) that would most likely return nulls for most fields, since `_raw` is a mix of EventCodes.

In [11]:
# Silver layer: parse the raw data into something relevant and usable

# Outer (envelope) columns -> silver column names; insertion order sets the output column order
outer_columns = {
    "Computer": "computer",
    "EventCode": "event_code",
    "User": "user",
    "_time": "epoch_time",
    "host": "cribl_host",
    "source": "log_source",
}

# JSON paths inside the inner _raw payload -> silver column names
raw_json_fields = {
    "$.UtcTime": "event_time",
    "$.Image": "process_image",
    "$.ProcessId": "process_id",
    "$.ProcessGuid": "process_guid",
    "$.QueryName": "dns_query",
    "$.QueryResults": "dns_results",
    "$.Channel": "channel",
    "$.RuleName": "rule_name",
}

df_silver = df_bronze.select(
    # Outer columns are renamed only
    *[F.col(source_name).alias(silver_name) for source_name, silver_name in outer_columns.items()],
    # Inner fields are pulled from the _raw JSON string; get_json_object yields strings
    *[
        F.get_json_object(F.col("_raw"), json_path).alias(silver_name)
        for json_path, silver_name in raw_json_fields.items()
    ],
)

# Register a temp view so downstream detection logic can query the silver layer with SQL
df_silver.createOrReplaceTempView("sysmon_silver")

silver_row_count = df_silver.count()
print(f"Silver layer records: {silver_row_count}")
df_silver.printSchema()

Silver layer records: 28017
root
 |-- computer: string (nullable = true)
 |-- event_code: string (nullable = true)
 |-- user: string (nullable = true)
 |-- epoch_time: double (nullable = true)
 |-- cribl_host: string (nullable = true)
 |-- log_source: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- process_image: string (nullable = true)
 |-- process_id: string (nullable = true)
 |-- process_guid: string (nullable = true)
 |-- dns_query: string (nullable = true)
 |-- dns_results: string (nullable = true)
 |-- channel: string (nullable = true)
 |-- rule_name: string (nullable = true)



In [12]:
# Provided PySpark example: show the first five silver rows as a pandas table
(
    df_silver
    .limit(5)
    .toPandas()
)

Unnamed: 0,computer,event_code,user,epoch_time,cribl_host,log_source,event_time,process_image,process_id,process_guid,dns_query,dns_results,channel,rule_name
0,win-host-ctus-attack-range-212,23,NT AUTHORITY\SYSTEM,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.591,C:\Program Files\SplunkUniversalForwarder\bin\...,3764.0,{72106695-9B97-63D3-6D00-00000000BD02},,,Microsoft-Windows-Sysmon/Operational,-
1,win-dc-ctus-attack-range-460.attackrange.local,23,NT AUTHORITY\SYSTEM,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.410,C:\Program Files\SplunkUniversalForwarder\bin\...,3928.0,{45AAC21C-9BAA-63D3-7B00-00000000BC02},,,Microsoft-Windows-Sysmon/Operational,-
2,win-host-ctus-attack-range-212,23,NT AUTHORITY\SYSTEM,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.008,C:\Program Files\Amazon\SSM\amazon-ssm-agent.exe,1880.0,{72106695-9B85-63D3-1B00-00000000BD02},,,Microsoft-Windows-Sysmon/Operational,-
3,win-dc-ctus-attack-range-460.attackrange.local,10,,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.238,,,,,,Microsoft-Windows-Sysmon/Operational,-
4,win-dc-ctus-attack-range-460.attackrange.local,10,,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.061,,,,,,Microsoft-Windows-Sysmon/Operational,-


In [13]:
# Provided PySpark SQL example: same preview, queried through the sysmon_silver temp view
(
    spark
    .sql("""
SELECT *
FROM sysmon_silver
LIMIT 5
""")
    .toPandas()
)

Unnamed: 0,computer,event_code,user,epoch_time,cribl_host,log_source,event_time,process_image,process_id,process_guid,dns_query,dns_results,channel,rule_name
0,win-host-ctus-attack-range-212,23,NT AUTHORITY\SYSTEM,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.591,C:\Program Files\SplunkUniversalForwarder\bin\...,3764.0,{72106695-9B97-63D3-6D00-00000000BD02},,,Microsoft-Windows-Sysmon/Operational,-
1,win-dc-ctus-attack-range-460.attackrange.local,23,NT AUTHORITY\SYSTEM,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.410,C:\Program Files\SplunkUniversalForwarder\bin\...,3928.0,{45AAC21C-9BAA-63D3-7B00-00000000BC02},,,Microsoft-Windows-Sysmon/Operational,-
2,win-host-ctus-attack-range-212,23,NT AUTHORITY\SYSTEM,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.008,C:\Program Files\Amazon\SSM\amazon-ssm-agent.exe,1880.0,{72106695-9B85-63D3-1B00-00000000BD02},,,Microsoft-Windows-Sysmon/Operational,-
3,win-dc-ctus-attack-range-460.attackrange.local,10,,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.238,,,,,,Microsoft-Windows-Sysmon/Operational,-
4,win-dc-ctus-attack-range-460.attackrange.local,10,,1674818000.0,f2c75b47cbb7,/tmp/cribldata/in/sysmon_office_doc_abuses_rel...,2023-01-27 11:19:47.061,,,,,,Microsoft-Windows-Sysmon/Operational,-


## Part 2: Detection Engineering

### My Approach

#### Provided Hypothesis:




## Part 3: Additional Steps

### Part 3.1: Normalization

### Part 3.2: Alert Table

### Part 3.3: Enrichment

## Summary

Summarize your submission here, comments are helpful to add throughout your code as well.

### Resources / References Used:
- https://www.ultimatewindowssecurity.com/securitylog/encyclopedia/event.aspx?eventid=90022
- 