# Optimize Spark Tables

Explore the table structure in Minio and optimise the file size

In [9]:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import socket

In [10]:
SUBMIT_ARGS = "--packages io.delta:delta-core_2.12:1.0.0,org.apache.hadoop:hadoop-aws:3.2.0 pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

In [11]:
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")
sparkConf.set("spark.kubernetes.container.image", "k3d-test-registry:5000/datadrone/spark-test2:latest")
sparkConf.set("spark.kubernetes.namespace", "jhub")
sparkConf.set("spark.executor.instances", "2")
sparkConf.set("spark.executor.cores", "4")
sparkConf.set("spark.executor.memory", "1g")
sparkConf.set("spark.driver.memory", "512m")
sparkConf.set("spark.executor.memory", "512m")
sparkConf.set("spark.pyspark.python", "/opt/conda/bin/python")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")
sparkConf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
sparkConf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")


### Adding minio settings
# need to add jars: org.apache.hadoop:hadoop-aws:3.2.0
#sparkConf.set("spark.jars.packages", ["org.apache.hadoop:hadoop-aws:3.2.0"])
#sparkConf.set("spark.jars.ivy", "/opt/")

access_key = 'AKIAIOSFODNN7EXAMPLE' # os.environ['MINIO_ACCESS_KEY']
secret_key = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' # os.environ['MINIO_SECRET_KEY']

sparkConf.set("spark.hadoop.fs.s3a.access.key", access_key)
sparkConf.set("spark.hadoop.fs.s3a.secret.key", secret_key)
sparkConf.set("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
sparkConf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
sparkConf.set("spark.hadoop.fs.s3a.path.style.access", True)
sparkConf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            
#sparkConf.set("spark.driver.blockManager.port", "7777")
#sparkConf.set("spark.driver.port", "2222")

# we needed to set the ip address for the host for some reason...
sparkConf.set("spark.driver.host", socket.gethostbyname(socket.gethostname()))
sparkConf.set("spark.submit.deployMode", "client")

sparkConf.set("spark.driver.port", "7778")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
#sparkConf.set("spark.driver.blockManager.port", "7777")

<pyspark.conf.SparkConf at 0x7fdafc2d3f40>

In [12]:
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/envs/spark/lib/python3.8/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b6b0cc11-57db-4a89-8283-87dfdd8cb2b9;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.0.0 in central
	found org.antlr#antlr4;4.7 in central
	found org.antlr#antlr4-runtime;4.7 in central
	found org.antlr#antlr-runtime;3.5.2 in central
	found org.antlr#ST4;4.0.8 in central
	found org.abego.treelayout#org.abego.treelayout.core;1.0.3 in central
	found org.glassfish#javax.json;1.0.4 in central
	found com.ibm.icu#icu4j;58.2 in central
	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.0.0/delta-core_2.12-1.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-core_2.12;1.0.0!delta-core_2.12.jar

# Get the table stats with Minio

In [13]:
from minio import Minio
import pandas as pd

In [16]:
k8s_minio_client = Minio(
        "minio.minio-tenant.svc.cluster.local",
        access_key='AKIAIOSFODNN7EXAMPLE',
        secret_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
        secure=False
    )

## Raw Tables

In [28]:
# quick check on sizes and stuff
contents = k8s_minio_client.list_objects('data', 
                                            recursive=True, 
                                            prefix='warehouse/raw')

obj_names = []
obj_length = []
for thing in contents:
    obj_names.append(thing.object_name)
    obj_length.append(thing.size)
    
data_dict = {'obj_name':obj_names, 'obj_length':obj_length}
raw_minio_df = pd.DataFrame(data_dict)
df_m1 = raw_minio_df['obj_name'].str.split('/', expand=True)
raw_minio_df['root'] = df_m1[0] 
raw_minio_df['zone'] = df_m1[1]
raw_minio_df['table'] = df_m1[2]
df_analysis = raw_minio_df.groupby('table').agg({'obj_length':['sum', 'mean', 'count']})

df_analysis['obj_length', 'avg_file_mb'] = df_analysis['obj_length', 'sum'] / 1.0e8
df_analysis['obj_length', 'num_files'] = df_analysis['obj_length', 'sum'] / 1.28e8
df_analysis['obj_length', 'num_files'] = df_analysis['obj_length', 'num_files'].round(0)
df_analysis

Unnamed: 0_level_0,obj_length,obj_length,obj_length,obj_length,obj_length
Unnamed: 0_level_1,sum,mean,count,avg_file_mb,num_files
table,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
green_merged,7057028575,33765690.0,209,70.570286,55.0
green_taxi_2015_h1,1247604063,27121830.0,46,12.476041,10.0
green_taxi_2015_h2_2016_h1,1528888117,30577760.0,50,15.288881,12.0
green_taxi_pre2015,2051119150,27348260.0,75,20.511191,16.0
yellow_merged,79708051347,118789900.0,671,797.080513,623.0
yellow_taxi_2015_2016_h1,16934079647,22669450.0,747,169.340796,132.0
yellow_taxi_pre2015,30147016905,33608710.0,897,301.470169,236.0
