## https://spark.apache.org/docs/3.5.3/sql-performance-tuning.html

In [2]:
# setup spark on linux
from pyspark.sql import *

spark = (SparkSession.builder
         #.master("local") # local- no parallelizm at all, local[2] - 2 cores, local[*] - as many cores as local logical cores
         .appName("SparkSession#1")
         .config("spark.log.level", "ERROR")
         .enableHiveSupport() # enableHiveSupport() needed to make data persistent... 
         .config("spark.executor.memory", "4g")
         .config("spark.driver.memory", "4g")
         #.config("spark.sql.shuffle.partitions", "200")
         #.config("spark.default.parallelism", "200")
         .getOrCreate())

print('spark version:', spark.version)
print('spark.executor.memory: ', spark.sparkContext._conf.get('spark.executor.memory'))
print('spark.driver.memory: ', spark.sparkContext._conf.get('spark.driver.memory'))
print('Done.')

spark version: 3.5.5
spark.executor.memory:  4g
spark.driver.memory:  4g
Done.


In [3]:
# supporting functions

from datetime import datetime, timedelta
import random
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType

def print_dict(d: dict, level: int = 0) -> None:
    """
    print optionally hiearchic dict structure nicely formatted
    """

    sp = "".ljust(level * 3)
    for k in d.keys():
        print(f"{sp}{k}: {d[k]}")
        
        if type(d[k]) is dict:
            print_dict(d[k], level + 1)
            
    if level == 0:
        print()

def show_plan(plan: DataFrame) -> None:
    for row in plan.collect():
        print_dict(row.asDict())

def generateDF(num_rows: int = 100) -> DataFrame:
    """
    generate a dataframe with random values including nulls, of num_rows records
    """

    df_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("from_dt", DateType(), True),
        StructField("to_dt", DateType(), True),
    ])
    
    lst = []
    for c in range(num_rows):
        from_dt = datetime.now().date() + timedelta(days=random.randint(0, 1000))
        to_dt = from_dt + timedelta(days=random.randint(1, 100))
    
        lst.append([
            c,
            None if random.random() < 0.1 else 'name-' + str(c).zfill(8),
            None if random.random() < 0.1 else random.randint(1, 100),
            from_dt,
            to_dt
             ])
    
    return spark.createDataFrame(lst, df_schema)

# list managed DB objects

def show_managed_db(db_name: str = "default") -> None:
    print("Available DBs:")
    spark.sql("Show databases").show(truncate=False)

    orig_db_name = spark.sql("select current_database()").collect()[0][0]

    print(f"Getting objects from {db_name}:")
    spark.sql(f"use {db_name}")
    spark.sql(f"SHOW TABLES IN {db_name}").show(truncate=False)
    spark.sql(f"SHOW VIEWS IN {db_name}").show(truncate=False)

print(f"Done.")

Done.


In [4]:
#spark.sql("CREATE DATABASE mydb")
show_managed_db("mydb")
print(f"Done.")

Available DBs:


25/04/11 20:09:07 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/04/11 20:09:07 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/04/11 20:09:08 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/04/11 20:09:08 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore jovyan@172.17.0.2


+---------+
|namespace|
+---------+
|default  |
|mydb     |
+---------+

Getting objects from mydb:


25/04/11 20:09:10 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|mydb     |tab_df2  |false      |
+---------+---------+-----------+

+---------+--------+-----------+
|namespace|viewName|isTemporary|
+---------+--------+-----------+
+---------+--------+-----------+

Done.


In [8]:
# run a join test with / without memory caching

print(f"create some test data for join...")
ct = datetime.now()
df1 = generateDF(num_rows = 10000000)
df2 = generateDF(num_rows = 1000000)
print(f"Done in {datetime.now() - ct}")

print(f"Saving the test DFs as parquet tables...")
df1.write.format("parquet").mode("overwrite").saveAsTable("tab_df1")
df2.write.format("parquet").mode("overwrite").saveAsTable("tab_df2")

#df1.write.option("maxRecordsPerFile", 100000).format("parquet").mode("overwrite").saveAsTable("tab_df1")
#df2.write.option("maxRecordsPerFile", 100000).format("parquet").mode("overwrite").saveAsTable("tab_df2")
print(f"Done.")

create some test data for join...
Done in 0:01:35.218784
Saving the test DFs as parquet tables...


25/04/11 20:17:20 WARN TaskSetManager: Stage 26 contains a task of very large size (297010 KiB). The maximum recommended task size is 1000 KiB.
25/04/11 20:17:32 WARN TaskSetManager: Stage 27 contains a task of very large size (29597 KiB). The maximum recommended task size is 1000 KiB.
[Stage 27:>                                                         (0 + 1) / 1]

Done.


                                                                                

In [12]:
# memory cache on single node gets much slower than without caching..???
print(f"Running join on test tables...")
ct = datetime.now()
plan = spark.sql("""
--EXPLAIN
SELECT COUNT(1) cnt 
FROM tab_df1 t1
    inner join
    tab_df2 t2
    on (t1.id = t2.id)
WHERE t1.age >= t2.age
""")
plan.show()
show_plan(plan)
print(f"Done in {datetime.now() - ct}")

## cache the 2 tables
spark.catalog.cacheTable("tab_df1")
spark.catalog.cacheTable("tab_df2")

print(f"Running join on cache (InMemoryRelation)...")
ct = datetime.now()
plan = spark.sql("""
--EXPLAIN
SELECT COUNT(1) cnt 
FROM tab_df1 t1
    inner join
    tab_df2 t2
    on (t1.id = t2.id)
WHERE t1.age >= t2.age
""")
plan.show()
show_plan(plan)
print(f"Done in {datetime.now() - ct}")

spark.catalog.uncacheTable("tab_df1", )
spark.catalog.uncacheTable("tab_df2")
print("Done.")

Running join on test tables...
+------+
|   cnt|
+------+
|408556|
+------+

cnt: 408556

Done in 0:00:01.718398
Running join on cache (InMemoryRelation)...


                                                                                

+------+
|   cnt|
+------+
|408556|
+------+





cnt: 408556

Done in 0:00:30.645252
Done.


                                                                                

In [31]:
# run a join test with / without partitioning
# processing partitioned data on single node is slow...???
from pyspark.sql.functions import col

print(f"Running join on test tables without partitioning...")

print("reading data")
df1 = spark.read.table("tab_df1")
df2 = spark.read.table("tab_df2")

ct = datetime.now()
df_res = (
    df1.alias("t1")
    .join(df2.alias("t2"), 
          (col("t1.id") == col("t2.id")),
         "inner").filter(df1.age >= df2.age)
    .select(col("t1.id"), col("t1.age"), col("t2.age"))
)
df_res.show()
print(f"Done in {datetime.now() - ct}")


print(f"Repartitioning data...")
df1 = df1.repartition(100)


print(f"df1 partition num: {df1.rdd.getNumPartitions()}")
print(f"df2 partition num: {df2.rdd.getNumPartitions()}")

print(f"Running join on test tables with partitioning...")
ct = datetime.now()
df_res = (
    df1.alias("t1")
    .join(df2.alias("t2"), 
          (col("t1.id") == col("t2.id")),
         "inner").filter(df1.age >= df2.age)
    .select(col("t1.id"), col("t1.age"), col("t2.age"))
)
df_res.show()
print(f"Done in {datetime.now() - ct}")

print("Done.")

Running join on test tables without partitioning...
reading data
+---+---+---+
| id|age|age|
+---+---+---+
|  1| 49| 17|
|  3| 89|  2|
|  4| 98| 15|
|  8| 85| 67|
| 10| 94| 84|
| 11| 80| 19|
| 13| 83| 11|
| 14| 68| 48|
| 15| 81| 75|
| 17| 48| 37|
| 19| 63| 29|
| 21| 68| 61|
| 22| 98| 81|
| 24| 34|  7|
| 26| 77| 31|
| 28| 25| 13|
| 30| 61|  9|
| 31| 99| 59|
| 32| 58| 55|
| 36| 75| 61|
+---+---+---+
only showing top 20 rows

Done in 0:00:00.613147
Repartitioning data...


[Stage 234:>                                                        (0 + 1) / 1]

df1 partition num: 100
df2 partition num: 1
Running join on test tables with partitioning...


                                                                                

+------+---+---+
|    id|age|age|
+------+---+---+
|662186| 80| 78|
|583252| 46| 36|
|862708| 67| 49|
|577002| 73| 59|
|491572| 79| 46|
|280573| 49| 18|
|258952| 39| 27|
|257231| 94| 73|
|319939| 92| 69|
|954189| 46| 16|
|619241| 75| 20|
|527500| 61| 45|
|930785| 19|  6|
|900271| 94| 24|
|935202| 25|  6|
|461122| 83| 38|
|636880| 62| 35|
|658642|100|  3|
|480626| 58| 12|
|797179|100| 11|
+------+---+---+
only showing top 20 rows

Done in 0:00:07.981605
Done.


In [None]:
# hints
