# Using Spark on Kubernetes

This is a testing notebook and "cheat sheet" for checking that everything in my
Kubernetes Spark setup is running and connecting correctly.

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import socket # to get the internal ipaddress for setting the spark driver
import os

## Objectstore Tests 

We are using MinIO as our object store, so first let's test it independently of Spark.
If the client returns a list of buckets, then all is good.

In [2]:
from minio import Minio

In [3]:
# Client for the in-cluster MinIO endpoint, using plain HTTP (secure=False).
# Credentials are read from MINIO_ACCESS_KEY / MINIO_SECRET_KEY when those
# environment variables are set; otherwise the original placeholder keys are
# used so existing setups keep working unchanged.
minio_client = Minio(
    "minio.minio-tenant.svc.cluster.local",
    access_key=os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE'),
    secret_key=os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'),
    secure=False,
)

In [4]:
# Fetch all buckets visible to the client and print one "name creation_date" line per bucket
buckets = minio_client.list_buckets()

for name, created in ((b.name, b.creation_date) for b in buckets):
    print(name, created)

testing-bucket 2021-09-26 06:04:35.273000+00:00
warehouse 2021-09-19 14:11:20.375000+00:00


## Configs

These configs are set to work with the stack at: https://github.com/Data-drone/data_eng_kube.git

Note: unlike Spark 2.x, Spark 3.x does not properly load the Maven packages listed in `spark.jars.packages`:
https://issues.apache.org/jira/browse/SPARK-35084

For things to work smoothly, at least the hadoop-aws jar needs to be present on the driver and the executors already.

In [5]:
# Arguments handed to spark-submit via the PYSPARK_SUBMIT_ARGS environment variable:
# request the hadoop-aws package, then start the pyspark shell.
SUBMIT_ARGS = "--packages org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell"
os.environ.update({"PYSPARK_SUBMIT_ARGS": SUBMIT_ARGS})

In [6]:
# Spark-on-Kubernetes configuration. The driver runs in this notebook process
# (client deploy mode); executors are requested from the cluster's API server
# in the "jhub" namespace.
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")
sparkConf.set("spark.kubernetes.container.image", "k3d-test-registry:5000/datadrone/spark-test2:latest")
sparkConf.set("spark.kubernetes.namespace", "jhub")
sparkConf.set("spark.executor.instances", "2")
sparkConf.set("spark.executor.cores", "4")
sparkConf.set("spark.driver.memory", "512m")
# This cell previously set spark.executor.memory twice ("1g", then "512m").
# Only the last value took effect, so only that one is kept.
sparkConf.set("spark.executor.memory", "512m")
sparkConf.set("spark.pyspark.python", "/opt/conda/bin/python")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")

### Adding minio settings
# need to add jars: org.apache.hadoop:hadoop-aws:3.2.0
# The value must be a comma-separated string of Maven coordinates. SparkConf.set
# stringifies whatever it is given, so a Python list would be stored as "['...']".
sparkConf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0")
#sparkConf.set("spark.jars.ivy", "/opt/")

# Prefer credentials from the environment; fall back to the original
# placeholder keys so the notebook still runs when the variables are unset.
access_key = os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE')
secret_key = os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY')

sparkConf.set("spark.hadoop.fs.s3a.access.key", access_key)
sparkConf.set("spark.hadoop.fs.s3a.secret.key", secret_key)
sparkConf.set("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
sparkConf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
# Configuration values are strings; spell the boolean out instead of relying on str(True).
sparkConf.set("spark.hadoop.fs.s3a.path.style.access", "true")
sparkConf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# we needed to set the ip address for the host for some reason...
# (presumably so executors can connect back to the driver in client mode — TODO confirm)
sparkConf.set("spark.driver.host", socket.gethostbyname(socket.gethostname()))
sparkConf.set("spark.submit.deployMode", "client")

# Fixed driver port; the block manager port is left at its default.
#sparkConf.set("spark.driver.blockManager.port", "7777")
sparkConf.set("spark.driver.port", "7778")



<pyspark.conf.SparkConf at 0x7f7d083823d0>

In [7]:
# Start (or reuse) the Spark session using the configuration held in sparkConf
session_builder = SparkSession.builder.config(conf=sparkConf)
spark = session_builder.getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/envs/spark/lib/python3.8/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1040ce9b-5991-4c54-b0df-5a7e68e4352a;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
:: resolution report :: resolve 107ms :: artifacts dl 2ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.375 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	--------------------------------

In [8]:
# Show which jars the running context has registered
loaded_jars = spark.sparkContext._jsc.sc().listJars()
print(loaded_jars)

Vector(spark://10.42.4.186:7778/jars/org.apache.hadoop_hadoop-aws-3.2.0.jar, spark://10.42.4.186:7778/jars/com.amazonaws_aws-java-sdk-bundle-1.11.375.jar)


In [9]:
# Smoke-test the session without reading any data:
# distribute the integers 0..9 as an RDD
t = spark.sparkContext.parallelize(range(0, 10))

# Request an approximate sum of the RDD, passing 3 as the timeout argument
r = t.sumApprox(timeout=3)
print('Approximate sum: %s' % (r,))

[Stage 0:>                                                          (0 + 8) / 8]

Approximate sum: 45.0


# Generate some test data and run through Spark

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 100000 x 20 frame of standard-normal test data.
# A seeded generator is used so the frame is identical on every run
# (Restart Kernel -> Run All reproduces the same data).
SEED = 42
rng = np.random.default_rng(SEED)
df = pd.DataFrame(rng.standard_normal((100000, 20)))

In [None]:
# Preview the first five rows of the generated frame
df.head(n=5)

In [None]:
# Convert the pandas frame into a Spark DataFrame
sparkDF = spark.createDataFrame(data=df)

In [None]:
# Print the schema of the DataFrame created from the pandas frame
sparkDF.printSchema()

# Load Data and write it to my object store

In [None]:
# Firstly create a new bucket

In [None]:
# Create the target bucket only when it is missing, so this cell can be re-run safely.
# The error class is imported here because the original cell referenced
# ResponseError without importing it (a NameError as soon as make_bucket failed),
# and because the class name depends on the installed minio version:
# S3Error in minio >= 7, ResponseError in older releases.
try:
    from minio.error import S3Error as MinioS3Error
except ImportError:
    from minio.error import ResponseError as MinioS3Error

try:
    if not minio_client.bucket_exists('testing-bucket'):
        minio_client.make_bucket('testing-bucket')
except MinioS3Error as err:
    # Report the failure and carry on, as the original cell intended
    print(err)

In [None]:
# need boto to pull from AWS
!pip install boto3

In [None]:
import boto3
from botocore import UNSIGNED
from botocore.client import Config

In [None]:
# S3 client whose requests are sent unsigned (signature_version=UNSIGNED)
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.client('s3', config=unsigned_config)

In [12]:
# Names and paths shared by the download, upload and read cells
output_bucket = 'testing-bucket'
testing_file = 'green_tripdata_2015-07.csv'
# Key of the file in the source bucket
load_path = f'trip data/{testing_file}'
# Key the file is stored under in the output bucket
write_path = f'raw_data/{testing_file}'

In [None]:
# Download the object at load_path from the 'nyc-tlc' bucket into a local file.
# The local name comes from testing_file (instead of a repeated literal) so it
# always matches the file that the upload cell sends to the object store.
with open(testing_file, 'wb') as f:
    s3.download_fileobj('nyc-tlc', load_path, f)

In [None]:
# Upload the local file to the output bucket under write_path
minio_client.fput_object(
    bucket_name=output_bucket,
    object_name=write_path,
    file_path=testing_file,
)

## Reading the loaded Data with Spark

In [10]:
# Set the number of SQL shuffle partitions to the context's default parallelism
default_parallelism = spark.sparkContext.defaultParallelism
spark.conf.set("spark.sql.shuffle.partitions", default_parallelism)

Test read from minio

In [13]:
# Build the s3a:// URI explicitly: os.path.join is meant for local filesystem
# paths and joins with the platform's separator, which is not guaranteed to be "/".
raw_data_uri = f"s3a://{output_bucket}/{write_path}"
# Read the CSV from the object store, treating the first line as the header
raw_data = spark.read.option("header", True).csv(raw_data_uri)

21/09/28 00:42:43 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [14]:
# Print the schema of the CSV data read from the object store
raw_data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- Lpep_dropoff_datetime: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- Pickup_longitude: string (nullable = true)
 |-- Pickup_latitude: string (nullable = true)
 |-- Dropoff_longitude: string (nullable = true)
 |-- Dropoff_latitude: string (nullable = true)
 |-- Passenger_count: string (nullable = true)
 |-- Trip_distance: string (nullable = true)
 |-- Fare_amount: string (nullable = true)
 |-- Extra: string (nullable = true)
 |-- MTA_tax: string (nullable = true)
 |-- Tip_amount: string (nullable = true)
 |-- Tolls_amount: string (nullable = true)
 |-- Ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- Total_amount: string (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- Trip_type : string (nullable = true)



# Close out Session

In [None]:
# Shut down our Spark session now that the tests are done
spark.stop()