# Optimize Spark Tables

Explore the table structure in Minio and optimise the file size

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# load spark session templates
from spark_utils import get_k8s_spark

# Initialise Spark Session

In [3]:
BASIC_SUBMIT_ARGS = ("--jars local:///opt/spark-jars/hadoop-aws-3.2.0.jar,"
                     "local:///opt/spark-jars/delta-core_2.12-1.0.0.jar,"
                     "local:///opt/spark-jars/aws-java-sdk-bundle-1.11.375.jar,"
                     "local:///opt/sparkRapidsPlugin/cudf-21.08.2-cuda11.jar,"
                     "local:///opt/sparkRapidsPlugin/rapids-4-spark_2.12-21.08.0.jar"
                     " pyspark-shell")

os.environ["PYSPARK_SUBMIT_ARGS"] = BASIC_SUBMIT_ARGS

In [4]:
access_key = 'AKIAIOSFODNN7EXAMPLE' # os.environ['MINIO_ACCESS_KEY']
secret_key = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' # os.environ['MINIO_SECRET_KEY']

# This cannot be triggered in the python code as the JVM will be activated when it hits the python builder starts
# .config("spark.packages", "org.apache.hadoop:hadoop-aws:3.2.0")

# debug
# .config("spark.kubernetes.executor.deleteOnTermination", "false")

spark = (get_k8s_spark()
            .config("spark.kubernetes.container.image", 
                    "k3d-test-registry:5000/datadrone/k8s-spark-worker:3.1.2-hadoop3.2-rapids-k8s")
            .config("spark.kubernetes.container.image.pullPolicy", "Always")
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.endpoint", "minio.minio-tenant.svc.cluster.local")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            .config("spark.hadoop.fs.s3a.path.style.access", True)
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            .config("spark.executor.resource.gpu.amount", "1")
            .config("spark.task.resource.gpu.amount", "1")
            .config("spark.driver.cores", "4")   
            .config("spark.driver.memory", "8g")
            .config("spark.executor.cores", "4")
            .config("spark.num.executors", 2)
            .config("spark.executor.memory", "10g")
            .config("spark.executor.resource.gpu.discoveryScript", "/opt/sparkRapidsPlugin/getGpusResources.sh")
            .config("spark.executor.resource.gpu.vendor", "nvidia.com")
            .config("spark.rapids.memory.pinnedPool.size", "2G")
            .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
            .config("spark.rapids.sql.concurrentGpuTasks", "2")
            .config("spark.rapids.sql.udfCompiler.enabled", True)
            .appName("Spark K8s")
            .enableHiveSupport()
            .getOrCreate()
        )

21/10/03 13:47:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/03 13:47:40 WARN ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.
21/10/03 13:47:44 WARN SQLExecPlugin: RAPIDS Accelerator 21.08.0 using cudf 21.08.2. To disable GPU support set `spark.rapids.sql.enabled` to false
21/10/03 13:47:44 WARN Plugin: Installing rapids UDF compiler extensions to Spark. The compiler is disabled by default. To enable it, set `spark.rapids.sql.udfCompiler.enabled` to true


# Get the table stats with Minio

In [None]:
from minio import Minio
import pandas as pd

In [None]:
k8s_minio_client = Minio(
        "minio.minio-tenant.svc.cluster.local",
        access_key='AKIAIOSFODNN7EXAMPLE',
        secret_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
        secure=False
    )

# Get Minio zones

In [None]:
# quick check on sizes and stuff
contents = k8s_minio_client.list_objects('data', 
                                            recursive=True, 
                                            prefix='warehouse')

obj_names = []
obj_length = []
for thing in contents:
    obj_names.append(thing.object_name)
    obj_length.append(thing.size)

## Raw Tables

In [None]:
# quick check on sizes and stuff
contents = k8s_minio_client.list_objects('data', 
                                            recursive=True, 
                                            prefix='warehouse/raw')

obj_names = []
obj_length = []
for thing in contents:
    obj_names.append(thing.object_name)
    obj_length.append(thing.size)
    
data_dict = {'obj_name':obj_names, 'obj_length':obj_length}
raw_minio_df = pd.DataFrame(data_dict)
df_m1 = raw_minio_df['obj_name'].str.split('/', expand=True)
raw_minio_df['root'] = df_m1[0] 
raw_minio_df['zone'] = df_m1[1]
raw_minio_df['table'] = df_m1[2]
df_analysis = raw_minio_df.groupby('table').agg({'obj_length':['sum', 'mean', 'count']})

df_analysis['obj_length', 'avg_file_mb'] = df_analysis['obj_length', 'sum'] / 1.0e8
df_analysis['obj_length', 'num_files'] = df_analysis['obj_length', 'sum'] / 1.28e8
df_analysis['obj_length', 'num_files'] = df_analysis['obj_length', 'num_files'].round(0)
df_analysis

# Quick Data Load

In [6]:
processed_data = "s3a://data/warehouse/processed/nyc_taxi_dataset"

delta_data = spark.read.option("header", True).format("delta").load(processed_data)

21/10/03 13:48:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
21/10/03 13:48:12 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [7]:
delta_data.count()

                                                                                

496002404

# Shutdown

In [None]:
spark.stop()