# Using Spark on Kubernetes

This is a testing notebook and "cheat sheet" for checking that everything in my Kubernetes Spark setup
is running and connecting correctly.

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import socket # to get the internal ipaddress for setting the spark driver
import os

## Objectstore Tests 

We are using MinIO as our object store, so first let's test it independently of Spark.
If the call below returns a list of buckets, then all is good.

In [2]:
from minio import Minio
from minio.error import S3Error

In [3]:
# Credentials are read from the environment when available. The fallbacks are
# the example keys this notebook originally hardcoded, so it still runs
# unchanged on the test stack. Never commit real keys here.
minio_client = Minio(
        "minio.minio-tenant.svc.cluster.local",
        access_key=os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE'),
        secret_key=os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'),
        secure=False  # the in-cluster endpoint is reached without TLS
    )

In [4]:
# Listing the buckets is the connectivity check: one line per bucket with its
# name and creation timestamp.
buckets = minio_client.list_buckets()
for entry in buckets:
    print(f"{entry.name} {entry.creation_date}")

testing-bucket 2021-09-26 06:04:35.273000+00:00
warehouse 2021-09-19 14:11:20.375000+00:00


## Configs

These configs are set to work with the stack at: https://github.com/Data-drone/data_eng_kube.git

Note: unlike Spark 2.x, Spark 3.x does not reliably resolve `spark.jars.packages` from Maven:
https://issues.apache.org/jira/browse/SPARK-35084

At a minimum, the hadoop-aws jar needs to already be present on the drivers and executors for things to work smoothly.

In [5]:
# Request the hadoop-aws package through the PYSPARK_SUBMIT_ARGS environment
# variable of this kernel process.
SUBMIT_ARGS = "--packages org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell"
os.environ.update(PYSPARK_SUBMIT_ARGS=SUBMIT_ARGS)

In [6]:
!printenv

SHELL=/bin/bash
NVIDIA_VISIBLE_DEVICES=all
KUBERNETES_SERVICE_PORT_HTTPS=443
JUPYTERHUB_ADMIN_ACCESS=1
KUBERNETES_SERVICE_PORT=443
MINIFORGE_VERSION=4.10.3-3
PROXY_API_SERVICE_HOST=10.43.154.6
HOSTNAME=jupyter-jovyan
LANGUAGE=en_US.UTF-8
JUPYTERHUB_API_TOKEN=df54273901df450cbe85788e7543167a
NVIDIA_REQUIRE_CUDA=cuda>=11.1 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450
PROXY_API_SERVICE_PORT=8001
JUPYTERHUB_BASE_URL=/jupyter/
NB_UID=1000
PROXY_PUBLIC_PORT_80_TCP=tcp://10.43.45.18:80
PROXY_PUBLIC_PORT=tcp://10.43.45.18:80
PROXY_PUBLIC_SERVICE_PORT_HTTP=80
PWD=/home/jovyan/spark_learn/notebooks
NVIDIA_DRIVER_CAPABILITIES=compute,utility
MEM_GUARANTEE=1073741824
JUPYTER_IMAGE=k3d-test-registry:5000/datadrone/spark_notebook_kube
PROXY_API_PORT_8001_TCP_ADDR=10.43.154.6
PYSPARK_SUBMIT_ARGS=--packages org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell
HUB_SERVICE_HOST=10.43.215.180
JUPYTERHUB_SERVER_NAME=
HOME=/home/jovyan
LANG=en_US.UTF-8
KUBERNETES_PORT_443_T

In [23]:
# Build the Spark configuration for running against the in-cluster Kubernetes
# API, with this notebook acting as the driver (client deploy mode).
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")

# Executor image, namespace and sizing.
sparkConf.set("spark.kubernetes.container.image", "k3d-test-registry:5000/datadrone/spark-test-k8s:latest")
sparkConf.set("spark.kubernetes.namespace", "jhub")
sparkConf.set("spark.executor.instances", "7")
sparkConf.set("spark.executor.cores", "2")
sparkConf.set("spark.driver.memory", "512m")
sparkConf.set("spark.executor.memory", "512m")
sparkConf.set("spark.kubernetes.pyspark.pythonVersion", "3")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")

### Adding minio settings
# need to add jars: org.apache.hadoop:hadoop-aws:3.2.0
# The value must be a comma-separated string of Maven coordinates. SparkConf
# stringifies whatever it is given, so a Python list would be stored as
# "['org.apache...']", which is not a valid coordinate.
sparkConf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0")

# Prefer credentials from the environment; fall back to the example keys the
# notebook originally hardcoded so the test stack keeps working.
access_key = os.environ.get('MINIO_ACCESS_KEY', 'AKIAIOSFODNN7EXAMPLE')
secret_key = os.environ.get('MINIO_SECRET_KEY', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY')

sparkConf.set("spark.hadoop.fs.s3a.access.key", access_key)
sparkConf.set("spark.hadoop.fs.s3a.secret.key", secret_key)
sparkConf.set("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
sparkConf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
# Use the lowercase string form, consistent with the other boolean setting above.
sparkConf.set("spark.hadoop.fs.s3a.path.style.access", "true")
sparkConf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# The driver runs inside this notebook pod, so advertise the pod's own IP
# address as the driver host (the setup did not work without this).
sparkConf.set("spark.driver.host", socket.gethostbyname(socket.gethostname()))
sparkConf.set("spark.submit.deployMode", "client")

# Pin the driver port to a fixed value.
sparkConf.set("spark.driver.port", "7778")



<pyspark.conf.SparkConf at 0x7f9263c5ccd0>

In [24]:
# Create the session from sparkConf, or pick up one that already exists.
session_builder = SparkSession.builder.config(conf=sparkConf)
spark = session_builder.getOrCreate()

In [25]:
# Sanity check: list the jars registered with the running SparkContext.
loaded_jars = spark.sparkContext._jsc.sc().listJars()
print(loaded_jars)

Vector(spark://10.42.4.21:7778/jars/org.apache.hadoop_hadoop-aws-3.2.0.jar, spark://10.42.4.21:7778/jars/com.amazonaws_aws-java-sdk-bundle-1.11.375.jar)


# Generate some test data and run through Spark

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Seed the generator so the synthetic 100000 x 20 frame is identical on every
# run (Restart & Run All reproduces the same data).
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((100000, 20)))

In [None]:
# Peek at the first five rows of the generated frame.
df.head(n=5)

In [None]:
# Convert the pandas frame into a Spark DataFrame.
sparkDF = spark.createDataFrame(data=df)

In [None]:
# Print the schema of the Spark DataFrame built from the pandas frame.
sparkDF.printSchema()

# Load Data and write it to my object store

In [None]:
# First, create a new bucket

In [None]:
# Create the bucket only when it is missing, so re-running the notebook does
# not fail on an existing bucket. Errors reported by the object store are
# printed rather than raised, as before.
try:
    if not minio_client.bucket_exists('testing-bucket'):
        minio_client.make_bucket('testing-bucket')
except S3Error as err:
    print(err)

In [None]:
# need boto to pull from AWS
!pip install boto3

In [None]:
import boto3
from botocore import UNSIGNED
from botocore.client import Config

In [None]:
# S3 client whose requests are sent unsigned, i.e. without AWS credentials.
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.client('s3', config=unsigned_config)

In [12]:
# Names and paths for the sample file: the source key to download and the
# destination bucket/key in the object store.
output_bucket = 'testing-bucket'
testing_file = 'green_tripdata_2015-07.csv'
load_path = f'trip data/{testing_file}'
write_path = f'raw_data/{testing_file}'

In [None]:
# Download the source object into a local file. The local name comes from
# testing_file (instead of a second hardcoded copy) so that the upload step,
# which reads testing_file, always sees the file written here.
with open(testing_file, 'wb') as f:
    s3.download_fileobj('nyc-tlc', load_path, f)

In [None]:
# Upload the local file into the output bucket under write_path.
minio_client.fput_object(
    bucket_name=output_bucket,
    object_name=write_path,
    file_path=testing_file,
)

## Reading the loaded Data with Spark

In [26]:
# Set the number of shuffle partitions to the context's default parallelism.
default_parallelism = spark.sparkContext.defaultParallelism
spark.conf.set("spark.sql.shuffle.partitions", default_parallelism)

In [None]:
# Build the s3a URI with plain string formatting. os.path.join is meant for
# local filesystem paths: it uses the OS separator and drops earlier parts when
# a later component is absolute, neither of which is right for a URI.
raw_data_uri = f"s3a://{output_bucket}/{write_path}"
# Read the uploaded CSV, treating its first line as the header row.
raw_data = spark.read.option("header", True).csv(raw_data_uri)

[Stage 0:>                                                          (0 + 1) / 1]

In [None]:
# Print the schema of the DataFrame read from the CSV in the object store.
raw_data.printSchema()

# Close out Session

In [22]:
# Shut down our context: stop the Spark session created earlier in this
# notebook now that the checks are done.
spark.stop()

21/09/27 13:32:19 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
