# Using Spark on Kubernetes

This is a testing notebook and "cheat sheet" for checking that everything in my Kubernetes Spark setup
is running and connecting correctly.

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# load spark session templates
from spark_utils import get_k8s_spark

## Objectstore Tests 

We are using MinIO as our object store, so first let's test it independently of Spark.
If the client returns a list of buckets, then all is good.

In [None]:
from minio import Minio
from minio.error import S3Error

In [None]:
# Connect to the in-cluster MinIO service. Credentials are taken from the
# MINIO_ACCESS_KEY / MINIO_SECRET_KEY environment variables when they are set;
# the example keys remain as the fallback so the cell behaves as before on the
# demo stack.
minio_client = Minio(
    "minio.minio-tenant.svc.cluster.local",
    access_key=os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE'),
    secret_key=os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'),
    secure=False,  # connect without TLS
)

In [None]:
# Smoke test: if this listing comes back, the client can reach the object store.
buckets = minio_client.list_buckets()

for name, created in ((b.name, b.creation_date) for b in buckets):
    print(name, created)

## Configs

These configs are set to work with the stack at: https://github.com/Data-drone/data_eng_kube.git

Note: unlike Spark 2.x, Spark 3.x doesn't properly load the Maven packages listed in `spark.jars.packages`:
https://issues.apache.org/jira/browse/SPARK-35084

We need to have at least the hadoop-aws jar already present on the drivers and executors to make things work more smoothly.

In [None]:
#// https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk
#libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.12.79"

#"com.amazonaws:aws-java-sdk:1.12.79"

In [None]:
# Jar locations used only to build the extended SUBMIT_ARGS below. They are
# defined in this cell (with the same values as in the class-path section
# further down) so that a fresh kernel can run the notebook top to bottom
# without hitting a NameError on package_list / classPath.
s3_jarpath = "s3a://spark-jars/spark-jars/"
local_jarpath = "/opt/spark-jars/"

package_list = "{1}hadoop-aws-3.2.0.jar,{1}delta-core_2.12-1.0.0.jar,\
{0}rapids-4-spark-2.12-21.08.0.jar,{0}cudf-21.08.2-cuda11.jar".format(s3_jarpath, local_jarpath)

classPath = "{1}hadoop-aws-3.2.0.jar:{1}delta-core_2.12-1.0.0.jar:\
{0}rapids-4-spark-2.12-21.08.0.jar:{0}cudf-21.08.2-cuda11.jar".format(s3_jarpath, local_jarpath)

# Extended variant: Maven packages plus explicit jars and driver class path.
# It is built for reference; only BASIC_SUBMIT_ARGS is exported below.
SUBMIT_ARGS = (
    "--packages org.apache.hadoop:hadoop-aws:3.2.0,com.amazonaws:aws-java-sdk:1.12.79 "
    "--jars {0} --driver-class-path {1} pyspark-shell"
).format(package_list, classPath)

# Minimal variant: only resolve hadoop-aws from Maven.
BASIC_SUBMIT_ARGS = "--packages org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell"

# PYSPARK_SUBMIT_ARGS must be set before the Spark session is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = BASIC_SUBMIT_ARGS

In [None]:
# MinIO credentials: prefer the environment, fall back to the example keys so
# the cell still runs unchanged on the demo stack.
access_key = os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE')
secret_key = os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY')

# Start from the project's Kubernetes session template and layer on the worker
# image plus the S3A settings that point Spark at MinIO.
spark = (get_k8s_spark()
            .config("spark.kubernetes.container.image", 
                    "k3d-test-registry:5000/datadrone/k8s-spark-worker:3.1.2-hadoop3.2-rapids-k8s-basic")
            .config("spark.kubernetes.container.image.pullPolicy", "Always")
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            .config("spark.hadoop.fs.s3a.path.style.access", True)
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            # NOTE(review): the documented property name is spark.jars.packages;
            # this key is kept as-is - confirm whether it has any effect.
            .config("spark.packages", "org.apache.hadoop:hadoop-aws:3.2.0")
            .appName("Spark K8s")
            .enableHiveSupport()
            .getOrCreate()
        )

In [None]:
# need to add jars: org.apache.hadoop:hadoop-aws:3.2.0
#sparkConf.set("spark.jars.packages", ["org.apache.hadoop:hadoop-aws:3.2.0"])
#sparkConf.set("spark.jars.ivy", "/opt/")

In [None]:
# Check which jars the running context has loaded.
# listJars() is called on the JVM-side SparkContext, reached through the
# private `_jsc` handle of the PySpark context.
print(spark.sparkContext._jsc.sc().listJars())

In [None]:
# Test Spark without reading any external data:
# create a distributed dataset (RDD) from a small range to exercise the session.
t = spark.sparkContext.parallelize(range(10))

# Calculate the approximate sum of the values in the dataset.
# NOTE(review): 3 is sumApprox's first positional argument, presumably the
# timeout - confirm the units against the PySpark docs.
r = t.sumApprox(3)
print('Approximate sum: %s' % r)

# Generate some test data and run through Spark

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 100,000 rows x 20 columns of standard-normal samples. No seed is set, so the
# values differ from run to run.
df = pd.DataFrame(np.random.randn(100000,20))

In [None]:
# Preview the first rows of the generated pandas frame
df.head()

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame
sparkDF=spark.createDataFrame(df) 

In [None]:
# Show the schema of the converted frame
sparkDF.printSchema()

# Load Data and write it to my object store

In [None]:
# Firstly create a new bucket

In [None]:
# Create the target bucket. Any S3Error raised by the call is printed instead
# of propagated, so the notebook carries on.
try:
    minio_client.make_bucket('testing-bucket')
except S3Error as bucket_error:
    print(bucket_error)

In [None]:
# need boto to pull from AWS
# !pip install boto3

In [None]:
import boto3
from botocore import UNSIGNED
from botocore.client import Config

In [None]:
# S3 client configured with UNSIGNED signatures, i.e. requests are sent
# without AWS credentials.
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

In [None]:
# Bucket name, file name and object keys used by the download and upload cells
output_bucket = 'testing-bucket'
testing_file = 'green_tripdata_2015-07.csv'
load_path = f'trip data/{testing_file}'
write_path = f'raw_data/{testing_file}'

In [None]:
# Download the object at `load_path` from the `nyc-tlc` bucket into a local
# file named after `testing_file`. Using the variable instead of a repeated
# literal keeps this in step with the upload cell, which reads that same path.
with open(testing_file, 'wb') as local_file:
    s3.download_fileobj('nyc-tlc', load_path, local_file)

In [None]:
# Upload the local file `testing_file` to MinIO as object `write_path` in `output_bucket`
minio_client.fput_object(output_bucket, write_path, testing_file)

## Reading the loaded Data with Spark

In [None]:
# Set the number of shuffle partitions to the context's default parallelism
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)

Test read from minio

In [None]:
# Read back the file that was just uploaded to MinIO: same bucket and object
# key as the fput_object call above, so the notebook does not depend on a
# pre-existing `data` bucket holding a different month's file.
raw_data = spark.read.option("header", True).csv(f"s3a://{output_bucket}/{write_path}")

In [None]:
# Show the column names and types Spark picked up from the CSV header
raw_data.printSchema()

In [None]:
# Fetch the first 10 rows to confirm the data is actually readable
raw_data.take(10)

In [None]:
# Count all rows in the DataFrame
raw_data.count()

# Close out Session

In [None]:
# Shut down the Spark session now that the read tests are done
spark.stop()

# Testing Different Class Paths and loading downloaded libs

For larger libraries we want to have them downloaded ahead of time. It would also be good to be able to load libraries from s3a paths, so that we don't have to provision the workers and driver through local files.

In [2]:
# Base locations for pre-downloaded jars: one in the object store, one on the local filesystem
s3_jarpath = "s3a://spark-jars/spark-jars/"
local_jarpath = "/opt/spark-jars/"

# Full location of every jar, in the order they appear in both lists below
_jar_locations = (
    local_jarpath + "hadoop-aws-3.2.0.jar",
    local_jarpath + "delta-core_2.12-1.0.0.jar",
    s3_jarpath + "rapids-4-spark-2.12-21.08.0.jar",
    s3_jarpath + "cudf-21.08.2-cuda11.jar",
)

# Comma-separated form (passed to --jars) and colon-separated form
# (passed to --driver-class-path) of the same jar list
package_list = ",".join(_jar_locations)
classPath = ":".join(_jar_locations)

In [2]:
# Option 1: let spark-submit resolve hadoop-aws from Maven at start-up
AUTOLOAD_SUBMIT_ARGS = "--packages org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell"

# Scratch notes on other ways of supplying the jar:
#   s3a://spark-jars/spark-jars/hadoop-aws-3.2.0.jar
#   --packages org.apache.hadoop:hadoop-aws:3.2.0

# Option 2: reference jars through local:// URIs under /opt/spark-jars
BASIC_SUBMIT_ARGS = "--jars {} pyspark-shell".format(
    ",".join([
        "local:///opt/spark-jars/hadoop-aws-3.2.0.jar",
        "local:///opt/spark-jars/delta-core_2.12-1.0.0.jar",
    ])
)

# Option 1 is the one exported for this run
os.environ["PYSPARK_SUBMIT_ARGS"] = AUTOLOAD_SUBMIT_ARGS

In [3]:
# MinIO credentials: prefer the environment, fall back to the example keys so
# the cell still runs unchanged on the demo stack.
access_key = os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE')
secret_key = os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY')

# Same session setup as earlier in the notebook, except that the
# "spark.packages" entry is left out (the package comes from
# PYSPARK_SUBMIT_ARGS) and executor pods are kept after termination.
# .config("spark.packages", "org.apache.hadoop:hadoop-aws:3.2.0")
spark = (get_k8s_spark()
            .config("spark.kubernetes.container.image", 
                    "k3d-test-registry:5000/datadrone/k8s-spark-worker:3.1.2-hadoop3.2-rapids-k8s-basic")
            .config("spark.kubernetes.container.image.pullPolicy", "Always")
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            .config("spark.hadoop.fs.s3a.path.style.access", True)
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            # Keep executor pods around after they terminate
            .config("spark.kubernetes.executor.deleteOnTermination", "false")
            .appName("Spark K8s")
            .enableHiveSupport()
            .getOrCreate()
        )

:: loading settings :: url = jar:file:/opt/conda/lib/python3.9/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-351883e2-6825-4437-a613-991a64e09c8c;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
:: resolution report :: resolve 96ms :: artifacts dl 2ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.375 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------

In [4]:
# Read the CSV (first line as header) from the `data` bucket on MinIO
csv_reader = spark.read.option("header", True)
raw_data = csv_reader.csv('s3a://data/raw_data/green_tripdata_2014-09.csv')

21/10/03 05:09:38 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [5]:
# Show the column names and types read from the CSV header
raw_data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- Lpep_dropoff_datetime: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- Pickup_longitude: string (nullable = true)
 |-- Pickup_latitude: string (nullable = true)
 |-- Dropoff_longitude: string (nullable = true)
 |-- Dropoff_latitude: string (nullable = true)
 |-- Passenger_count: string (nullable = true)
 |-- Trip_distance: string (nullable = true)
 |-- Fare_amount: string (nullable = true)
 |-- Extra: string (nullable = true)
 |-- MTA_tax: string (nullable = true)
 |-- Tip_amount: string (nullable = true)
 |-- Tolls_amount: string (nullable = true)
 |-- Ehail_fee: string (nullable = true)
 |-- Total_amount: string (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- Trip_type : string (nullable = true)



In [None]:
# Location of the Delta table to read back
clean_warehouse = "s3a://data/warehouse/raw/green_taxi_pre2015"

# NOTE(review): the "header" option looks carried over from the CSV reads and
# is presumably ignored by the delta reader. This cell also assumes the
# delta-core jar is available to the session, which the submit args exported
# above do not request - confirm.
delta_data = spark.read.option("header", True).format("delta").load(clean_warehouse)


In [None]:
# Shut down the Spark session at the end of the class-path tests
spark.stop()