# Using Spark on Kubernetes

This is a testing notebook and "cheat sheet" for checking that every part of my Kubernetes Spark setup
is running and connected.

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# load spark session templates
from spark_utils import get_k8s_spark

## Objectstore Tests 

We are using MinIO as our object store, so first let's test it independently of Spark.
If the call returns a list of buckets, then all is good.

In [None]:
from minio import Minio
from minio.error import S3Error

In [None]:
# Client for the in-cluster MinIO service.
# Credentials are read from the environment when available
# (MINIO_ACCESS_KEY / MINIO_SECRET_KEY); the fallbacks are the placeholder
# keys that used to be hard-coded here, so behaviour is unchanged when the
# variables are not set.
minio_client = Minio(
    "minio.minio-tenant.svc.cluster.local",
    access_key=os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE'),
    secret_key=os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'),
    secure=False,  # connect without TLS
)

In [None]:
# Smoke test for the object store: fetch the bucket list once ...
buckets = minio_client.list_buckets()

# ... then print one "<name> <creation date>" line per bucket.
for entry in buckets:
    print(entry.name, entry.creation_date)

## Configs

These configs are set to work with the stack at: https://github.com/Data-drone/data_eng_kube.git

Note: unlike Spark 2.x, Spark 3.x doesn't properly load the Maven packages listed in `spark.jars.packages`:
https://issues.apache.org/jira/browse/SPARK-35084

We need at least the hadoop-aws jar to already be present on the drivers and executors for things to work smoothly.

In [None]:
# Extended submit arguments (extra jars + driver class path).
# `package_list` and `classPath` are not defined in any cell shown in this
# notebook, so formatting the template unconditionally raised NameError on a
# fresh kernel. The extended variant is therefore only built when both names
# exist; otherwise SUBMIT_ARGS is left as None.
SUBMIT_ARGS_TEMPLATE = (
    "--packages org.apache.hadoop:hadoop-aws:3.2.0,com.amazonaws:aws-java-sdk:1.12.79 "
    "--jars {0} --driver-class-path {1} pyspark-shell"
)
try:
    SUBMIT_ARGS = SUBMIT_ARGS_TEMPLATE.format(package_list, classPath)
except NameError:
    SUBMIT_ARGS = None

# Minimal variant that is actually exported below: only requests hadoop-aws.
BASIC_SUBMIT_ARGS = "--packages org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell"

# Export the arguments through the PYSPARK_SUBMIT_ARGS environment variable.
os.environ["PYSPARK_SUBMIT_ARGS"] = BASIC_SUBMIT_ARGS

In [None]:
# MinIO credentials: taken from the environment when set; the fallbacks are the
# placeholder keys previously hard-coded here, so behaviour is unchanged otherwise.
access_key = os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE')
secret_key = os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY')

# Build the session on top of the project's Kubernetes builder template.
spark = (get_k8s_spark()
            # Container image used for the Spark pods, re-pulled on every start.
            .config("spark.kubernetes.container.image",
                    "k3d-test-registry:5000/datadrone/k8s-spark-worker:3.1.2-hadoop3.2-rapids-k8s")
            .config("spark.kubernetes.container.image.pullPolicy", "Always")
            # s3a settings pointing at the MinIO endpoint (no SSL, path-style URLs).
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            .config("spark.hadoop.fs.s3a.path.style.access", True)
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            # `spark.packages` is not a Spark property; the documented key is
            # `spark.jars.packages`. The same coordinate is already requested via
            # `--packages` in PYSPARK_SUBMIT_ARGS, so this only keeps the two in agreement.
            .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0")
            .appName("Spark K8s")
            .enableHiveSupport()
            .getOrCreate()
        )

In [None]:
# Check which jars the running SparkContext reports as loaded.
loaded_jars = spark.sparkContext._jsc.sc().listJars()
print(loaded_jars)

In [None]:
# Exercise the session without touching any stored data:
# distribute the integers 0..9 across the cluster.
spark_context = spark.sparkContext
t = spark_context.parallelize(range(10))

# Ask for an approximate sum of the distributed values.
# NOTE(review): 3 is sumApprox's first positional argument (the timeout,
# in milliseconds per the PySpark docs) - confirm that is the intended value.
r = t.sumApprox(3)
print('Approximate sum: %s' % r)

# Generate some test data and run through Spark

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Synthetic test data: 100,000 rows x 20 columns of standard-normal noise.
# A seeded Generator makes the frame identical on every Restart & Run All,
# which the previous unseeded np.random.randn call did not guarantee.
SEED = 42
df = pd.DataFrame(np.random.default_rng(SEED).standard_normal((100000, 20)))

In [None]:
# Preview the first rows of the pandas frame (shown as the cell's output).
df.head()

In [None]:
# Hand the pandas frame to Spark to obtain a Spark DataFrame.
sparkDF = spark.createDataFrame(data=df)

In [None]:
# Print the schema Spark derived for the converted pandas frame.
sparkDF.printSchema()

# Load Data and write it to my object store

In [None]:
# First, create a new bucket

In [None]:
# Create the target bucket; an S3Error raised by MinIO is reported
# rather than aborting the notebook.
new_bucket = 'testing-bucket'
try:
    minio_client.make_bucket(new_bucket)
except S3Error as bucket_error:
    print(bucket_error)

In [None]:
# boto3 is needed to pull the sample data from AWS S3
# %pip install boto3

In [None]:
import boto3
from botocore import UNSIGNED
from botocore.client import Config

In [None]:
# S3 client configured with signature_version=UNSIGNED, i.e. requests are not signed with credentials.
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

In [None]:
# Configs
output_bucket = 'testing-bucket'
testing_file = 'green_tripdata_2015-07.csv'
load_path = 'trip data/' + testing_file
write_path = 'raw_data/' + testing_file

In [None]:
# Download the sample object from the `nyc-tlc` bucket into the working
# directory. The local filename reuses `testing_file` instead of repeating the
# literal, so it always matches the path later handed to `fput_object`.
with open(testing_file, 'wb') as f:
    s3.download_fileobj('nyc-tlc', load_path, f)

In [None]:
# Upload the local file `testing_file` into `output_bucket` under the object key `write_path`.
minio_client.fput_object(output_bucket, write_path, testing_file)

## Reading the loaded Data with Spark

In [None]:
# Set the number of SQL shuffle partitions to the SparkContext's default parallelism.
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)

Test read from minio

In [None]:
# Read back the object that was just uploaded (bucket `output_bucket`, key
# `write_path`) rather than a hard-coded object in a different bucket that this
# notebook never writes. The s3a URI is built with plain string formatting:
# the previous single-argument os.path.join was a no-op and is not meant for URIs.
raw_data = (
    spark.read
    .option("header", True)  # first line of the CSV holds the column names
    .csv('s3a://{}/{}'.format(output_bucket, write_path))
)

In [None]:
# Print the column names and types Spark assigned to the CSV.
raw_data.printSchema()

In [None]:
# Fetch the first 10 rows to the driver as a quick look at the data.
raw_data.take(10)

In [None]:
# Count the rows in the loaded frame.
raw_data.count()

# Close out Session

In [None]:
# Shut down our Spark session (this stops the underlying SparkContext)
# before the next section creates a new session with different settings.
spark.stop()

# Testing Different Class Paths and loading downloaded libs

For larger libs we want them downloaded ahead of time. It would also be good to be able to load libs from s3a paths, so that we don't have to provision the workers and the driver through local files.

It seems that we need to list things in the `jars` section for pyspark to work properly. The jars also need to be available locally, because they are all loaded together: loading gets stuck if the hadoop-aws jars aren't already loaded when Spark tries to load a jar from an s3 path.
Extra jars supplied via `extraClassPath` don't seem to work either, perhaps because Spark doesn't search those classpaths during its initial startup.

In [2]:
# Jars referenced through local:// URIs; the list is joined into one
# comma-separated `--jars` value.
jar_uris = [
    "local:///opt/spark-jars/hadoop-aws-3.2.0.jar",
    "local:///opt/spark-jars/delta-core_2.12-1.0.0.jar",
    "local:///opt/spark-jars/aws-java-sdk-bundle-1.11.375.jar",
    "local:///opt/sparkRapidsPlugin/cudf-21.08.2-cuda11.jar",
    "local:///opt/sparkRapidsPlugin/rapids-4-spark_2.12-21.08.0.jar",
]
BASIC_SUBMIT_ARGS = "--jars {0} pyspark-shell".format(",".join(jar_uris))

# Export the arguments through the PYSPARK_SUBMIT_ARGS environment variable.
os.environ["PYSPARK_SUBMIT_ARGS"] = BASIC_SUBMIT_ARGS

In [3]:
# MinIO credentials: taken from the environment when set; the fallbacks are the
# placeholder keys previously hard-coded here, so behaviour is unchanged otherwise.
access_key = os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE')
secret_key = os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY')

# Author's note: the package setting below cannot be applied from the Python
# code, because the JVM is already started by the time the builder runs, so the
# jars are passed through PYSPARK_SUBMIT_ARGS instead.
# .config("spark.packages", "org.apache.hadoop:hadoop-aws:3.2.0")

# debug
# .config("spark.kubernetes.executor.deleteOnTermination", "false")

# Build the session on top of the project's Kubernetes builder template.
spark = (get_k8s_spark()
            # Container image used for the Spark pods, re-pulled on every start.
            .config("spark.kubernetes.container.image",
                    "k3d-test-registry:5000/datadrone/k8s-spark-worker:3.1.2-hadoop3.2-rapids-k8s")
            .config("spark.kubernetes.container.image.pullPolicy", "Always")
            # s3a settings pointing at the MinIO endpoint (no SSL, path-style URLs).
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            .config("spark.hadoop.fs.s3a.path.style.access", True)
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            # Delta Lake SQL extension and catalog.
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            # GPU resources per executor / per task, plus the RAPIDS SQL plugin.
            .config("spark.executor.resource.gpu.amount", "1")
            .config("spark.task.resource.gpu.amount", "1")
            .config("spark.executor.resource.gpu.discoveryScript", "/opt/sparkRapidsPlugin/getGpusResources.sh")
            .config("spark.executor.resource.gpu.vendor", "nvidia.com")
            .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
            .config("spark.rapids.sql.concurrentGpuTasks", "2")
            # NOTE(review): listed as a debug setting above but enabled here -
            # confirm it should stay on.
            .config("spark.kubernetes.executor.deleteOnTermination", "false")
            .appName("Spark K8s")
            .enableHiveSupport()
            .getOrCreate()
        )

21/10/03 12:22:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/03 12:23:20 WARN SQLExecPlugin: RAPIDS Accelerator 21.08.0 using cudf 21.08.2. To disable GPU support set `spark.rapids.sql.enabled` to false
21/10/03 12:23:20 WARN Plugin: Installing rapids UDF compiler extensions to Spark. The compiler is disabled by default. To enable it, set `spark.rapids.sql.udfCompiler.enabled` to true


In [4]:
# Load the CSV from the `data` bucket, taking column names from the first line.
csv_uri = 's3a://data/raw_data/green_tripdata_2014-09.csv'
raw_data = spark.read.option("header", True).csv(csv_uri)

21/10/03 12:23:39 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [5]:
# Print the column names and types Spark assigned to the CSV.
raw_data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- Lpep_dropoff_datetime: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- Pickup_longitude: string (nullable = true)
 |-- Pickup_latitude: string (nullable = true)
 |-- Dropoff_longitude: string (nullable = true)
 |-- Dropoff_latitude: string (nullable = true)
 |-- Passenger_count: string (nullable = true)
 |-- Trip_distance: string (nullable = true)
 |-- Fare_amount: string (nullable = true)
 |-- Extra: string (nullable = true)
 |-- MTA_tax: string (nullable = true)
 |-- Tip_amount: string (nullable = true)
 |-- Tolls_amount: string (nullable = true)
 |-- Ehail_fee: string (nullable = true)
 |-- Total_amount: string (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- Trip_type : string (nullable = true)



In [6]:
# Location of the Delta table holding the cleaned pre-2015 green taxi data.
clean_warehouse = "s3a://data/warehouse/raw/green_taxi_pre2015"

# Load the table through the Delta reader. The `header` option previously set
# here is a CSV reader option and has no meaning for a Delta source, so it has
# been dropped.
delta_data = spark.read.format("delta").load(clean_warehouse)

21/10/03 12:23:51 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
21/10/03 12:25:49 WARN HeartbeatReceiver: Removing executor 1 with no recent heartbeats: 133229 ms exceeds timeout 120000 ms


In [7]:
# Print the schema of the frame loaded from the Delta table.
delta_data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- Pickup_longitude: string (nullable = true)
 |-- Pickup_latitude: string (nullable = true)
 |-- Dropoff_longitude: string (nullable = true)
 |-- Dropoff_latitude: string (nullable = true)
 |-- Passenger_count: string (nullable = true)
 |-- Trip_distance: string (nullable = true)
 |-- Fare_amount: string (nullable = true)
 |-- Extra: string (nullable = true)
 |-- MTA_tax: string (nullable = true)
 |-- Tip_amount: string (nullable = true)
 |-- Tolls_amount: string (nullable = true)
 |-- Ehail_fee: string (nullable = true)
 |-- Total_amount: string (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- trip_type: string (nullable = true)



In [5]:
# Shut down our Spark session (this stops the underlying SparkContext).
spark.stop()

21/10/03 12:22:22 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
