# Install Python dependencies 

In [1]:
!pip3 install --quiet pyspark==3.5.0
!pip3 install --quiet delta-spark==3.0.0
!pip3 install --quiet minio==7.2.0

# Download dataset and unzip

In [2]:
%%bash

curl -s https://btv.cachefly.net/DC30/Obsidian/obsidian_logs_DC30_v2.02.zip \
    --output /tmp/obsidian_logs_DC30_v2.02.zip
    
unzip -P "obsidian is never going to give you up" /tmp/obsidian_logs_DC30_v2.02.zip

Archive:  /tmp/obsidian_logs_DC30_v2.02.zip
   creating: filebeat/
  inflating: filebeat/hmail-2022.02.12.log  
  inflating: filebeat/hmail-2022.02.19.log  
  inflating: filebeat/osquery-2022.02.12.log  
  inflating: filebeat/osquery-2022.02.19.log  
  inflating: filebeat/sysmon-2022.02.12.log  
  inflating: filebeat/sysmon-2022.02.19.log  
  inflating: filebeat/wineventlogs-2022.02.12.log  
  inflating: filebeat/wineventlogs-2022.02.19.log  
   creating: zeek/
   creating: zeek/2022_02_19/
  inflating: zeek/2022_02_19/conn-summary.20_00_00-21_00_00.log  
  inflating: zeek/2022_02_19/weird.19_00_00-20_00_00.log  
  inflating: zeek/2022_02_19/dns.22_00_00-23_00_00.log  
  inflating: zeek/2022_02_19/smb_files.23_00_00-23_20_00.log  
  inflating: zeek/2022_02_19/notice.21_00_00-22_00_00.log  
  inflating: zeek/2022_02_19/notice.23_00_00-23_20_00.log  
  inflating: zeek/2022_02_19/capture_loss.23_00_00-23_20_00.log  
  inflating: zeek/2022_02_19/dns.23_00_00-23_20_00.log  
  inflating: zee

# Upload Osquery log files to Minio (S3)

In [3]:
from minio.error import S3Error
from minio import Minio
import os.path
import glob
import os

MINIO_ENDPOINT = str()
for name, value in os.environ.items():
    if name.endswith("_MINIO_PORT"):
        MINIO_ENDPOINT=value.replace("tcp://", "")
print(MINIO_ENDPOINT)

client = Minio(
    MINIO_ENDPOINT,
    access_key="analyst-svcacct",
    secret_key="analyst123",
    secure=False
)

found = client.bucket_exists("fallback")
print(found)

for log_file in glob.glob("filebeat/osquery-*.log"):
    client.fput_object(
        "logs-bronze", 
        f"osquery/{os.path.basename(log_file)}", 
        log_file,
    )
    print(
        log_file, "successfully uploaded as object",
        f"logs-bronze/osquery/{os.path.basename(log_file)} to bucket logs-bronze", 
    )

for log_file in glob.glob("filebeat/sysmon-*.log"):
    client.fput_object(
        "logs-bronze",
        f"sysmon/{os.path.basename(log_file)}", 
        log_file,
    )
    print(
        log_file, "successfully uploaded as object",
        f"logs-bronze/sysmon/{os.path.basename(log_file)} to bucket logs-bronze", 
    )

for log_file in glob.glob("filebeat/hmail-*.log"):
    client.fput_object(
        "logs-bronze",
        f"hmail/{os.path.basename(log_file)}", 
        log_file,
    )
    print(
        log_file, "successfully uploaded as object",
        f"logs-bronze/hmail/{os.path.basename(log_file)} to bucket logs-bronze", 
    )

for log_file in glob.glob("filebeat/wineventlogs-*.log"):
    client.fput_object(
        "logs-bronze",
        f"windows/{os.path.basename(log_file)}", 
        log_file,
    )
    print(
        log_file, "successfully uploaded as object",
        f"logs-bronze/wineventlogs/{os.path.basename(log_file)} to bucket logs-bronze", 
    )

for log_file in glob.glob("zeek/*/*.log"):
    client.fput_object(
        "logs-bronze",
        f"zeek/{os.path.basename(log_file)}", 
        log_file,
    )
    print(
        log_file, "successfully uploaded as object",
        f"logs-bronze/zeek/{os.path.basename(log_file)} to bucket logs-bronze", 
    )


10.152.183.94:9000
True
filebeat/osquery-2022.02.19.log successfully uploaded as object logs-bronze/osquery/osquery-2022.02.19.log to bucket logs-bronze
filebeat/osquery-2022.02.12.log successfully uploaded as object logs-bronze/osquery/osquery-2022.02.12.log to bucket logs-bronze
filebeat/sysmon-2022.02.12.log successfully uploaded as object logs-bronze/sysmon/sysmon-2022.02.12.log to bucket logs-bronze
filebeat/sysmon-2022.02.19.log successfully uploaded as object logs-bronze/sysmon/sysmon-2022.02.19.log to bucket logs-bronze
filebeat/hmail-2022.02.19.log successfully uploaded as object logs-bronze/hmail/hmail-2022.02.19.log to bucket logs-bronze
filebeat/hmail-2022.02.12.log successfully uploaded as object logs-bronze/hmail/hmail-2022.02.12.log to bucket logs-bronze
filebeat/wineventlogs-2022.02.19.log successfully uploaded as object logs-bronze/wineventlogs/wineventlogs-2022.02.19.log to bucket logs-bronze
filebeat/wineventlogs-2022.02.12.log successfully uploaded as object logs-br

# Download dependencies for Spark

In [4]:
%%bash
# find /usr/local/spark/jars/ -name 'hadoop-client-api-*.jar' | awk -F- '{print $4}' | grep -Eo '([0-9]\.)+[0-9]'
export HADOOP_VERSION="3.3.4"
export AWS_VERSION="1.12.599"
export DELTA_VESION="3.0.0"
export SCALA_VERSION="2.12"

curl -s -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar
curl -s -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar
curl -s -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_VERSION}/aws-java-sdk-core-${AWS_VERSION}.jar
curl -s -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_VERSION}/aws-java-sdk-s3-${AWS_VERSION}.jar
curl -s -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/${AWS_VERSION}/aws-java-sdk-dynamodb-${AWS_VERSION}.jar
curl -s -O https://repo1.maven.org/maven2/io/delta/delta-spark_${SCALA_VERSION}/${DELTA_VESION}/delta-spark_${SCALA_VERSION}-${DELTA_VESION}.jar
curl -s -O https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_VESION}/delta-storage-${DELTA_VESION}.jar