# Create PySpark session

In [1]:
from delta import *
import pyspark
import os


# Versions of the libraries whose JARs are loaded into the Spark session.
# Keeping each version in one place avoids mismatched JARs when upgrading.
HADOOP_VERSION = "3.3.4"
AWS_SDK_VERSION = "1.12.599"
DELTA_VERSION = "3.0.0"

# JAR file names handed to Spark through the `spark.jars` setting.
# The order is kept exactly as it was originally listed.
exampleJars = [
    f"hadoop-aws-{HADOOP_VERSION}.jar",
    f"aws-java-sdk-core-{AWS_SDK_VERSION}.jar",
    f"aws-java-sdk-s3-{AWS_SDK_VERSION}.jar",
    f"delta-storage-{DELTA_VERSION}.jar",
    f"delta-spark_2.12-{DELTA_VERSION}.jar",
    f"aws-java-sdk-dynamodb-{AWS_SDK_VERSION}.jar",
    f"hadoop-common-{HADOOP_VERSION}.jar",
]


# Discover the Spark master from the environment: every variable whose name
# ends with SPARK_MASTER_SVC_SERVICE_HOST contributes a candidate host.
# If several variables match, the last one seen wins; if none match, the
# endpoint is left as an empty string.
spark_master_hosts = [
    env_value
    for env_name, env_value in os.environ.items()
    if env_name.endswith("SPARK_MASTER_SVC_SERVICE_HOST")
]
SPARK_ENDPOINT = (
    f"spark://{spark_master_hosts[-1]}:7077" if spark_master_hosts else str()
)
print(f"SPARK_ENDPOINT: {SPARK_ENDPOINT}")


# Discover the MinIO address from the environment: every variable whose name
# ends with _MINIO_PORT contributes a candidate, with any "tcp://" text
# stripped from its value. If several variables match, the last one seen wins;
# if none match, the endpoint is left as an empty string.
minio_addresses = [
    env_value.replace("tcp://", "")
    for env_name, env_value in os.environ.items()
    if env_name.endswith("_MINIO_PORT")
]
MINIO_ENDPOINT = minio_addresses[-1] if minio_addresses else str()
print(f"MINIO_ENDPOINT: {MINIO_ENDPOINT}")


# Object-store credentials. They are read from the standard AWS environment
# variables when those are set, so real secrets do not have to be written into
# (and committed with) the notebook. The fallbacks are the demo values this
# notebook used before; replace them via the environment for anything real.
S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "analyst")
S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "analyst123")

# Build the Spark session configuration: cluster connection, extra JARs,
# Delta Lake SQL support, and S3A access to the MinIO endpoint.
builder = (
    pyspark.sql.SparkSession.builder.appName("Myapp")
    # Sets the Spark master URL to connect to.
    .master(SPARK_ENDPOINT)
    # JARs on disk to load into our Spark session
    .config("spark.jars", ",".join(exampleJars))
    # k8s service for Jupyter driver
    .config("spark.driver.host", "jupyter-driver")
    # Port for Jupyter driver
    .config("spark.driver.port", 2222)
    # Extending the capabilities of SQL searching with Delta tables
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Use the access/secret key pair configured below rather than another credential source
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    ####### AWS setup and creds #######
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    # The endpoint is reached without TLS
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Fail fast: a single attempt and short timeouts
    # (timeout values are presumably milliseconds -- confirm against the Hadoop S3A docs)
    .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000")
)
# Let the delta helper finish configuring the builder, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("done")

SPARK_ENDPOINT: spark://10.152.183.194:7077
MINIO_ENDPOINT: 10.152.183.94:9000
done


# Read the delta table into a dataframe

To query the data, we first need to load the Delta table into a DataFrame.

In [2]:
# Read the Delta-format table stored at this s3a:// path into a DataFrame.
df = (
    spark.read
    .format("delta")
    .load("s3a://logs-silver/osquery/osquery.delta")
)

# Query delta table

Below we query the Delta table for osquery `process_events` logs whose command line contains the letter `b`.

In [3]:
from pyspark.sql import functions as F

# Print the DataFrame's schema before querying it.
df.printSchema()

# `cmdline` is looked up inside the `columns` map column; name the expression
# once so the projection and the filter below share it.
cmdline = F.col("columns").getItem("cmdline")

# Keep only process_events rows, project hostname + command line, then keep
# the rows whose command line contains "b".
process_cmdlines = (
    df.where(df["table"] == "process_events")
    .select(df["hostname"], cmdline.alias("cmdline"))
    .where(cmdline.contains("b"))
)

# Show up to 10 rows; False disables truncation of long values.
process_cmdlines.show(10, False)

root
 |-- hostname: string (nullable = true)
 |-- action: string (nullable = true)
 |-- table: string (nullable = true)
 |-- unixTime: long (nullable = true)
 |-- columns: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------------+----------------------------------------------------------+
|hostname        |cmdline                                                   |
+----------------+----------------------------------------------------------+
|ip-172-16-55-120|/usr/lib/apt/apt-helper wait-online                       |
|ip-172-16-55-120|/lib/systemd/systemd-networkd-wait-online -q --timeout=30 |
|ip-172-16-55-120|/bin/sh /usr/lib/apt/apt.systemd.daily update             |
|ip-172-16-55-120|/usr/bin/dpkg --print-foreign-architectures               |
|ip-172-16-55-120|/bin/sh /usr/lib/apt/apt.systemd.daily lock_is_held update|
|ip-172-16-55-120|cmp -s apt.extended_states.0 /var/lib/apt/extended_states |
|ip-172-16-55-120|/bin/sh /usr

In [4]:
# Stop the Spark session created in the first cell now that the query is done.
spark.stop()