## spark partitioning: 
    partitionBy(col) - very much dependent on the selectivity of the values (small vs large files). partition pruning possible!
        e.g. by load_id, or by date (note: partitioning a date by year/month is only supported from pyspark 4.0.0)
    repartition(partnum, col) - split the records into  <partnum> files using HASH algorithm. parallel processing ok, but no partition pruning
    repartitionByRange(partnum, col) - split the records into <partnum> files using a RANGE algorithm, based on data sampling. parallel processing ok, but no partition pruning
    write.option("maxRecordsPerFile", recnum) - split the records into recnum per file roughly(!).  parallel processing ok, but no partition pruning
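    reduceByKey() - aggregate the values per key: values are first combined inside each partition, then shuffled so each key is merged across partitions (wide transformation)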
    
    
## partition to_date datetype historical column into year/months by using spark 4.0.0 function?
    partitionedBy(pyspark.sql.functions.partitioning.months(col)) - the only reasonable partitioning would be by date (e.g. month or year) which will only be supported in pyspark 4.0.0
    

In [1]:
# setup spark on linux
from pyspark.sql import *

# Report whether this kernel already has an active Spark session.
# `is None` is the idiomatic identity test; `== None` goes through __eq__.
if SparkSession.getActiveSession() is None:
    print("Active spark session not found")
else:
    print("Active spark session found")

# Build the session (getOrCreate() reuses an existing one when present).
spark = (SparkSession.builder
         #.master("local") # local - no parallelism at all, local[2] - 2 cores, local[*] - as many cores as local logical cores
         .appName("SparkSession#1")
         .enableHiveSupport() # enableHiveSupport() needed to make data persistent...
         .config("spark.log.level", "ERROR")
         # Optional settings, left disabled:
         #.config("spark.driver.allowMultipleContexts", True)
         #.config("spark.sql.cbo.enabled", True)
         #.config("spark.sql.cbo.optimizer", True)
         #.config("spark.executor.memory", "4g")
         #.config("spark.driver.memory", "4g")
         .getOrCreate())

print('spark version:', spark.version)
print('Done.')

Active spark session not found


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/13 15:29:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


spark version: 3.5.5
Done.


In [6]:
# change some configs
from pyspark import SparkConf, SparkContext

# Best-effort stop of the previous session so the builder below starts a new one.
try:
    spark.stop()
except NameError:
    # `spark` has not been created in this kernel yet - nothing to stop.
    pass
except Exception as exc:
    # Stay best-effort, but say what went wrong instead of hiding it.
    print(f"Ignoring error while stopping the previous Spark session: {exc!r}")

# Re-create the session with explicit resource settings.
# NOTE(review): spark.driver.allowMultipleContexts looks like a legacy flag and
# spark.driver.memory may not take effect once the driver JVM is running - confirm.
spark = (SparkSession.builder
        #.master("local") # local - no parallelism at all, local[2] - 2 cores, local[*] - as many cores as local logical cores
        .appName("SparkSession#1")
        .enableHiveSupport() # enableHiveSupport() needed to make data persistent...
        .config("spark.log.level", "ERROR")
        .config("spark.driver.allowMultipleContexts", True)
        .config('spark.cores.max', '4')
        .config('spark.executor.cores', '2')
        .config('spark.driver.memory','2g')
        .config('spark.executor.memory', '4g')
        .getOrCreate())

# Print the effective configuration as a list of (key, value) pairs.
sc = spark.sparkContext
config = sc.getConf().getAll()
print(config)

[('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'), ('spark.app.startTime', '1747143032136'), ('spark.executor.memory', '4g'), ('spark.app.submitTime', '1747

Setting Spark log level to "ERROR".


In [5]:
# Stop the Spark session. The Spark UI on localhost:4040 is reset and stays
# inaccessible until a new session starts.
# NOTE(review): the cells below use `spark`; re-create the session before running them.
spark.stop()
print('Done.')

Done.


In [8]:
# test without any narrow (operates within a partition, no shuffling) or wide (operates on multiple partitions, shuffling is required) function

import random

# Build 100,000 synthetic sales records: (record id, store, product, price).
lst = []
for c in range(100000):
    store_name = f'Store_{random.randint(1, 100)}'
    product_name = f'Product_{random.randint(1, 1000)}'
    price = random.randint(5, 100)
    lst.append((c, store_name, product_name, price))

# Distribute the records over 20 partitions.
rdd = spark.sparkContext.parallelize(lst, 20)

print(f"Number of partitions: {rdd.getNumPartitions()}")

# Preview the first ten records.
for record in rdd.take(10):
    print(f"{record[0]}, {record[1]}, {record[2]}, {record[3]}")

print("Done.")

Number of partitions: 20
0, Store_75, Product_422, 23
1, Store_38, Product_71, 71
2, Store_41, Product_123, 10
3, Store_69, Product_411, 61
4, Store_15, Product_881, 71
5, Store_24, Product_621, 23
6, Store_53, Product_416, 55
7, Store_15, Product_269, 34
8, Store_70, Product_497, 91
9, Store_82, Product_11, 97
Done.


In [11]:
# Narrow transformation example: map() works record by record inside each partition.
def _discount_sale(sale):
    """Return the sale tuple with its price cut by 10 % (0 when the price is None)."""
    price = sale[3]
    new_price = round(price*0.9, 2) if price is not None else 0
    return (sale[0], sale[1], sale[2], new_price)

discounted_rdd = rdd.map(_discount_sale)
print(f"Number of partitions by default: {discounted_rdd.getNumPartitions()}")

# take() returns a plain Python list, so it can be iterated directly.
first_discounted = discounted_rdd.take(10)
for sale in first_discounted:
    print(f"{sale[0]}, {sale[1]}, {sale[2]}, {sale[3]}")

print("Done.")

Number of partitions by default: 20
0, Store_75, Product_422, 20.7
1, Store_38, Product_71, 63.9
2, Store_41, Product_123, 9.0
3, Store_69, Product_411, 54.9
4, Store_15, Product_881, 63.9
5, Store_24, Product_621, 20.7
6, Store_53, Product_416, 49.5
7, Store_15, Product_269, 30.6
8, Store_70, Product_497, 81.9
9, Store_82, Product_11, 87.3
Done.


In [5]:
# Wide transformation example: reduceByKey needs a shuffle because the same
# store can appear in several partitions.
store_sales_rdd = discounted_rdd.map(lambda sale: (sale[1], sale[3]))  # (store, sold price) pairs

print("store_sales_rdd:")
for store, amount in store_sales_rdd.take(10):
    print(f"{store}, {amount}")

# Sum the sold prices per store.
total_sales_rdd = store_sales_rdd.reduceByKey(lambda left, right: left + right)

print("total_sales_rdd (sorted - all):")
sorted_totals = total_sales_rdd.sortBy(lambda pair: pair[0]).collect()  # order by store name
for store, total in sorted_totals:
    print(f"{store}, {total}")

print("Done.")

store_sales_rdd:
Store_79, 66.6
Store_50, 49.5
Store_70, 52.2
Store_3, 48.6
Store_77, 89.1
Store_61, 58.5
Store_10, 14.4
Store_71, 5.4
Store_85, 51.3
Store_21, 9.0
total_sales_rdd (sorted - all):




Store_1, 47258.1
Store_10, 47618.1
Store_100, 44203.5
Store_11, 46812.6
Store_12, 47889.9
Store_13, 46645.2
Store_14, 48726.9
Store_15, 47983.50000000001
Store_16, 46007.999999999985
Store_17, 49195.799999999996
Store_18, 49141.799999999996
Store_19, 45961.2
Store_2, 48431.70000000001
Store_20, 49540.5
Store_21, 44891.99999999999
Store_22, 45115.20000000001
Store_23, 49547.7
Store_24, 48068.10000000001
Store_25, 49866.29999999999
Store_26, 44555.399999999994
Store_27, 50857.200000000004
Store_28, 45452.700000000004
Store_29, 47119.5
Store_3, 45395.100000000006
Store_30, 46899.9
Store_31, 46566.899999999994
Store_32, 49765.5
Store_33, 47819.7
Store_34, 46121.399999999994
Store_35, 46304.100000000006
Store_36, 48351.59999999999
Store_37, 47301.299999999996
Store_38, 48435.29999999999
Store_39, 48004.19999999999
Store_4, 44557.2
Store_40, 48171.60000000001
Store_41, 48733.200000000004
Store_42, 44543.700000000004
Store_43, 47921.4
Store_44, 47403.00000000001
Store_45, 47803.5
Store_46, 46

                                                                                

In [21]:
# performance of count() with different parititioning numbers
from datetime import datetime, timedelta

# Partition counts to compare. Each run repeats the same pipeline
# (count + discount map + per-store reduceByKey + take(2)) and reports the
# wall-clock time of the whole pipeline, not only of count().
partition_counts = (1, 2, 4, 20, 50)

for num_partitions in partition_counts:
    rdd = spark.sparkContext.parallelize(lst, num_partitions)
    ct = datetime.now()
    print(f"Number of partitions: {rdd.getNumPartitions()}, count: {rdd.count()}")
    discounted_rdd = rdd.map(lambda x: (x[0], x[1], x[2], 0 if x[3] is None else round(x[3]*0.9, 2)))
    store_sales_rdd = discounted_rdd.map(lambda x: (x[1], x[3]))  # key value pairs for store / sold-price
    total_sales_rdd = store_sales_rdd.reduceByKey(lambda x, y: x + y)
    for row in total_sales_rdd.take(2):
        print(f"{row[0]}, {row[1]}")
    print(f"Done in {datetime.now() - ct}")

# After the loop `rdd` (and the derived RDDs) refer to the last run (50 partitions),
# exactly as they did with the unrolled version.
print("Done.")

Number of partitions: 1, count: 100000
Store_75, 47582.09999999999
Store_38, 46957.499999999985
Done in 0:00:00.648159
Number of partitions: 2, count: 100000
Store_38, 46957.50000000001
Store_41, 46283.40000000004
Done in 0:00:00.554230
Number of partitions: 4, count: 100000
Store_38, 46957.5
Store_53, 46169.09999999999
Done in 0:00:00.574467


                                                                                

Number of partitions: 20, count: 100000


                                                                                

Store_91, 46575.0
Store_5, 46870.200000000004
Done in 0:00:02.060508


                                                                                

Number of partitions: 50, count: 100000




Store_5, 46870.19999999999
Store_43, 49484.69999999998
Done in 0:00:05.178291
Done.


                                                                                

In [22]:
# cache and persistence
from pyspark import StorageLevel

discounted_rdd = rdd.map(lambda x: (x[0], x[1], x[2], 0 if x[3] is None else round(x[3]*0.9, 2)))
store_sales_rdd = discounted_rdd.map(lambda x: (x[1], x[3]))  # key value pairs for store / sold-price


def timed_collect(label, target_rdd):
    """Print `label`, collect `target_rdd` to the driver, print the rows and the elapsed time.

    Parameters:
        label: text printed (followed by "...") before the measurement starts.
        target_rdd: RDD whose collect() call is timed.
    """
    print(f"{label}...")
    started = datetime.now()
    print(target_rdd.collect())
    print(f"Done in {datetime.now() - started}")


# Baseline: no cache()/persist() requested; collected twice for comparison.
total_sales_rdd = store_sales_rdd.reduceByKey(lambda x, y: x + y)
timed_collect("Calculating without cache (1)", total_sales_rdd)
timed_collect("Calculating without cache (2)", total_sales_rdd)

# cache(): the first collect computes the RDD, the second one can reuse it.
total_sales_rdd_cached = store_sales_rdd.reduceByKey(lambda x, y: x + y).cache()
timed_collect("Calculating with cache (1)", total_sales_rdd_cached)
timed_collect("Calculating with cache (2)", total_sales_rdd_cached)

# persist(MEMORY_AND_DISK): store in memory, spill to disk if needed.
# (The variable name keeps its original spelling so existing references still work.)
total_sales_rdd_peristed = store_sales_rdd.reduceByKey(lambda x, y: x + y).persist(StorageLevel.MEMORY_AND_DISK)
timed_collect("Calculating with persist (1)", total_sales_rdd_peristed)
timed_collect("Calculating with persist (2)", total_sales_rdd_peristed)

# remove from cache & storage
total_sales_rdd_cached.unpersist()
total_sales_rdd_peristed.unpersist()

Calculating without cache (1)...


                                                                                

[('Store_5', 46870.19999999999), ('Store_43', 49484.69999999998), ('Store_45', 45533.7), ('Store_92', 48852.00000000001), ('Store_35', 43554.6), ('Store_32', 47685.6), ('Store_83', 48815.10000000001), ('Store_86', 48418.200000000004), ('Store_99', 49761.00000000001), ('Store_44', 46052.1), ('Store_42', 54375.299999999996), ('Store_26', 46850.4), ('Store_6', 47938.5), ('Store_78', 48230.1), ('Store_53', 46169.1), ('Store_60', 48076.200000000004), ('Store_11', 48035.70000000002), ('Store_76', 50866.20000000001), ('Store_91', 46575.0), ('Store_77', 46448.09999999999), ('Store_2', 48123.0), ('Store_97', 44341.200000000004), ('Store_34', 43551.89999999999), ('Store_82', 46419.3), ('Store_90', 47033.10000000002), ('Store_13', 49746.600000000006), ('Store_72', 48070.799999999996), ('Store_79', 47737.80000000001), ('Store_71', 46295.09999999999), ('Store_14', 44536.49999999999), ('Store_57', 44807.4), ('Store_63', 46746.90000000001), ('Store_41', 46283.4), ('Store_47', 46030.50000000001), ('St

                                                                                

[('Store_5', 46870.19999999999), ('Store_43', 49484.69999999998), ('Store_45', 45533.7), ('Store_92', 48852.00000000001), ('Store_35', 43554.6), ('Store_32', 47685.6), ('Store_83', 48815.10000000001), ('Store_86', 48418.200000000004), ('Store_99', 49761.00000000001), ('Store_44', 46052.1), ('Store_42', 54375.299999999996), ('Store_26', 46850.4), ('Store_6', 47938.5), ('Store_78', 48230.1), ('Store_53', 46169.1), ('Store_60', 48076.200000000004), ('Store_11', 48035.70000000002), ('Store_76', 50866.20000000001), ('Store_91', 46575.0), ('Store_77', 46448.09999999999), ('Store_2', 48123.0), ('Store_97', 44341.200000000004), ('Store_34', 43551.89999999999), ('Store_82', 46419.3), ('Store_90', 47033.10000000002), ('Store_13', 49746.600000000006), ('Store_72', 48070.799999999996), ('Store_79', 47737.80000000001), ('Store_71', 46295.09999999999), ('Store_14', 44536.49999999999), ('Store_57', 44807.4), ('Store_63', 46746.90000000001), ('Store_41', 46283.4), ('Store_47', 46030.50000000001), ('St

                                                                                

[('Store_5', 46870.19999999999), ('Store_43', 49484.69999999998), ('Store_45', 45533.7), ('Store_92', 48852.00000000001), ('Store_35', 43554.6), ('Store_32', 47685.6), ('Store_83', 48815.10000000001), ('Store_86', 48418.200000000004), ('Store_99', 49761.00000000001), ('Store_44', 46052.1), ('Store_42', 54375.299999999996), ('Store_26', 46850.4), ('Store_6', 47938.5), ('Store_78', 48230.1), ('Store_53', 46169.1), ('Store_60', 48076.200000000004), ('Store_11', 48035.70000000002), ('Store_76', 50866.20000000001), ('Store_91', 46575.0), ('Store_77', 46448.09999999999), ('Store_2', 48123.0), ('Store_97', 44341.200000000004), ('Store_34', 43551.89999999999), ('Store_82', 46419.3), ('Store_90', 47033.10000000002), ('Store_13', 49746.600000000006), ('Store_72', 48070.799999999996), ('Store_79', 47737.80000000001), ('Store_71', 46295.09999999999), ('Store_14', 44536.49999999999), ('Store_57', 44807.4), ('Store_63', 46746.90000000001), ('Store_41', 46283.4), ('Store_47', 46030.50000000001), ('St



[('Store_5', 46870.19999999999), ('Store_43', 49484.69999999998), ('Store_45', 45533.7), ('Store_92', 48852.00000000001), ('Store_35', 43554.6), ('Store_32', 47685.6), ('Store_83', 48815.10000000001), ('Store_86', 48418.200000000004), ('Store_99', 49761.00000000001), ('Store_44', 46052.1), ('Store_42', 54375.299999999996), ('Store_26', 46850.4), ('Store_6', 47938.5), ('Store_78', 48230.1), ('Store_53', 46169.1), ('Store_60', 48076.200000000004), ('Store_11', 48035.70000000002), ('Store_76', 50866.20000000001), ('Store_91', 46575.0), ('Store_77', 46448.09999999999), ('Store_2', 48123.0), ('Store_97', 44341.200000000004), ('Store_34', 43551.89999999999), ('Store_82', 46419.3), ('Store_90', 47033.10000000002), ('Store_13', 49746.600000000006), ('Store_72', 48070.799999999996), ('Store_79', 47737.80000000001), ('Store_71', 46295.09999999999), ('Store_14', 44536.49999999999), ('Store_57', 44807.4), ('Store_63', 46746.90000000001), ('Store_41', 46283.4), ('Store_47', 46030.50000000001), ('St

                                                                                

PythonRDD[155] at RDD at PythonRDD.scala:53

In [38]:
# define data generation utility

from datetime import datetime, timedelta
import random
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType
import shutil

def generateDF(rows: int)->DataFrame:
    """Build a Spark DataFrame with `rows` randomly generated records.

    Columns:
        id                  - running number 0..rows-1
        name                - "name-<id>"
        age                 - random 1..100, None for roughly 10 % of the rows
        district            - random 1000..1004, None for roughly 10 % of the rows
        from_date           - today plus 0..1000 days
        to_date             - from_date plus 1..100 days
        to_date_year_month  - technical column: to_date as integer yyyymm

    Uses the notebook-level `spark` session and prints the elapsed creation time.

    Parameters:
        rows: number of records to generate.

    Returns:
        The generated DataFrame.
    """
    # defining DF structure
    df_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("district", IntegerType(), True),
        StructField("from_date", DateType(), True),
        StructField("to_date", DateType(), True),
        # tech columns
        StructField("to_date_year_month", IntegerType(), True),
    ])

    started = datetime.now()
    print(f"Creating a dataset ({rows} rows)...")

    # Take "today" once: it is loop-invariant, and the schema stores plain dates.
    today = started.date()

    records = []
    for c in range(rows):
        from_date = today + timedelta(days = random.randint(0, 1000))
        to_date = from_date + timedelta(days = random.randint(1, 100))

        records.append([c,
                        ('name-' + str(c)),
                        None if random.random() < 0.1 else random.randint(1, 100),
                        None if random.random() < 0.1 else random.randint(1000, 1004),
                        from_date,
                        to_date,
                        int(to_date.strftime("%Y%m")),
                       ])

    df = spark.createDataFrame(records, df_schema)
    print(f"Done in {datetime.now() - started}")
    return df

print("Done.")

Done.


In [40]:
# partitioning (hash, range, round-robin) test
# CANNOT FIND A WAY to PARTITION a date (to_date) into date ranges!!!

# Generate a small sample and inspect it.
df = generateDF(100)
df.printSchema()
df.show(5)

# Make sure the test database exists and switch to it.
spark.sql("create database if not exists mytestdb")
databases_df = spark.sql("Show databases")
databases_df.show()
spark.sql("use mytestdb")
current_db_df = spark.sql("select current_database()")
current_db_df.show()
tables_df = spark.sql("Show tables")
tables_df.show(truncate=False)

print("Dropping table...")
spark.sql('drop table if exists t_my_date_partitioned')

# Save as a managed table, partitioned by the technical year-month column.
print("Saving table t_my_date_partitioned...")
table_writer = df.write.mode("overwrite").partitionBy("to_date_year_month")
table_writer = table_writer.format("parquet")  # parquet, csv for testing / readability
table_writer.saveAsTable("t_my_date_partitioned")

# View that ties the technical column to to_date (the message below records that it did not help).
print("creating view v_my_date_partitioned to manage technical column DOES NOT HELP!!!...")
view_ddl = """
create or replace view v_my_date_partitioned as
SELECT *
FROM t_my_date_partitioned t
WHERE 1 = 1
AND t.to_date_year_month = to_number(date_format(to_date, 'yyyyMM'),'000000')
--AND t.to_date_year_month = floor(t.to_date / 100)
"""
spark.sql(view_ddl)

print("Done.")

Creating a dataset (100 rows)...
Done in 0:00:00.028466
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- district: integer (nullable = true)
 |-- from_date: date (nullable = true)
 |-- to_date: date (nullable = true)
 |-- to_date_year_month: integer (nullable = true)

+---+------+---+--------+----------+----------+------------------+
| id|  name|age|district| from_date|   to_date|to_date_year_month|
+---+------+---+--------+----------+----------+------------------+
|  0|name-0| 72|    1004|2025-06-04|2025-09-03|            202509|
|  1|name-1| 39|    NULL|2025-10-28|2026-01-14|            202601|
|  2|name-2|  7|    1003|2027-01-11|2027-03-14|            202703|
|  3|name-3| 93|    1002|2027-05-22|2027-06-09|            202706|
|  4|name-4| 44|    1003|2027-12-19|2028-03-23|            202803|
+---+------+---+--------+----------+----------+------------------+
only showing top 5 rows

+---------+
|namespace|
+---------+

                                                                                

creating view v_my_date_partitioned to manage technical column DOES NOT HELP!!!...
Done.


In [41]:
# Count rows per (from_date, to_date) pair with plain SQL and show the first five.
print("Querying the table...")
date_counts_sql = """
select from_date, to_date, count(1) cnt
from t_my_date_partitioned
group by from_date, to_date
order by from_date, to_date nulls first
"""
date_counts_df = spark.sql(date_counts_sql)
date_counts_df.show(5, truncate=False)

# The table is kept on purpose; the following cells query it.
# To clean up: spark.sql('drop table if exists t_my_date_partitioned')

tables_df = spark.sql("Show tables")
tables_df.show(truncate=False)

Querying the table...
+----------+----------+---+
|from_date |to_date   |cnt|
+----------+----------+---+
|2025-06-04|2025-09-03|1  |
|2025-06-07|2025-06-26|1  |
|2025-06-15|2025-09-04|1  |
|2025-08-16|2025-11-05|1  |
|2025-08-24|2025-11-11|1  |
+----------+----------+---+
only showing top 5 rows

+---------+---------------------+-----------+
|namespace|tableName            |isTemporary|
+---------+---------------------+-----------+
|mytestdb |t_my_date_partitioned|false      |
|mytestdb |v_my_date_partitioned|false      |
+---------+---------------------+-----------+



In [44]:
# Rows whose partition is 202710 or later and whose [from_date, to_date) range
# covers 2027-10-15. Note: `plan` holds the query result here, not an explain plan.
valid_on_date_sql = """
select t.id, t.age
FROM t_my_date_partitioned t
WHERE 1 = 1
AND to_date_year_month >= to_number('202710','000000')
AND t.from_date <= to_date('20271015','yyyyMMdd')
AND t.to_date > to_date('20271015','yyyyMMdd')
"""
plan = spark.sql(valid_on_date_sql)
plan.show()


+---+---+
| id|age|
+---+---+
| 86| 90|
| 97| 29|
| 27| 23|
| 42| 20|
| 72| 11|
| 31| 43|
| 96| 83|
+---+---+



In [45]:
# show SQL explain plan

def print_dict(d: dict, level: int = 0) -> None:
    """Print a dict as ``key: value`` lines, indenting nested dicts.

    Every nesting level is indented by three spaces. When a value is itself a
    plain dict, its ``key: value`` line is printed first and its entries follow
    one level deeper. The top-level call finishes with one empty line.
    """
    indent = " " * (level * 3)
    for key, value in d.items():
        print(f"{indent}{key}: {value}")

        # Only exact dict instances are expanded (dict subclasses are not).
        if type(value) is dict:
            print_dict(value, level + 1)

    if level != 0:
        return
    print()

# 1) Partition filter only.
print("exec plan for table select with partition filter:")
plan = spark.sql("""
explain
select *
FROM t_my_date_partitioned t
WHERE 1 = 1
AND t.to_date_year_month >= to_number('202710','000000')
""")

for row in plan.collect():
    print_dict(row.asDict())

#####################################################################
# 2) Partition filter plus date-range filter on the table.
print("exec plan for table select with partition filter and extra filter:")
plan = spark.sql("""
explain
select t.id, t.age
FROM t_my_date_partitioned t
WHERE 1 = 1
AND to_date_year_month >= to_number('202710','000000')
AND t.from_date <= to_date('20271015','yyyyMMdd')
AND t.to_date > to_date('20271015','yyyyMMdd')
""")

for row in plan.collect():
    print_dict(row.asDict())

#####################################################################
# 3) Same date-range filter through the view, date type for to_date.
print("exec plan for VIEW select with partition filter and extra filter using view.  DOES NOT HELP!!!...:")
plan = spark.sql("""
explain
select *
FROM v_my_date_partitioned t
WHERE 1 = 1
--AND t.to_date_year_month >= to_number('202710','000000')
AND t.from_date <= to_date('20271015','yyyyMMdd')
AND t.to_date > to_date('20271015','yyyyMMdd')
""")

# This plan used to be overwritten by the next query before it was printed.
for row in plan.collect():
    print_dict(row.asDict())

# 4) View variant with integer literals (int type for to_date).
# NOTE(review): from_date/to_date are DATE columns in the table created above;
# comparing them with 20271015 may not resolve as intended - confirm.
plan = spark.sql("""
explain
select *
FROM v_my_date_partitioned t
WHERE 1 = 1
--AND t.to_date_year_month >= 202710
AND t.from_date <= 20271015
AND t.to_date > 20271015
""")

for row in plan.collect():
    print_dict(row.asDict())

exec plan for table select with partition filter:
plan: == Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet spark_catalog.mytestdb.t_my_date_partitioned[id#577,name#578,age#579,district#580,from_date#581,to_date#582,to_date_year_month#583] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(7 paths)[file:/home/jovyan/work/various_tests/spark/Performance_tuning/spark-wa..., PartitionFilters: [isnotnull(to_date_year_month#583), (to_date_year_month#583 >= 202710)], PushedFilters: [], ReadSchema: struct<id:int,name:string,age:int,district:int,from_date:date,to_date:date>



exec plan for table select with partition filter and extra filter:
plan: == Physical Plan ==
*(1) Project [id#577, age#579]
+- *(1) Filter (((isnotnull(from_date#581) AND isnotnull(to_date#582)) AND (from_date#581 <= 2027-10-15)) AND (to_date#582 > 2027-10-15))
   +- *(1) ColumnarToRow
      +- FileScan parquet spark_catalog.mytestdb.t_my_date_partitioned[id#577,age#579,from_date#581,to_d

In [46]:
# show python DF explain plan

def print_dict(d: dict, level: int = 0) -> None:
    """Print a dict as ``key: value`` lines, indenting nested dicts.

    Every nesting level is indented by three spaces. When a value is itself a
    plain dict, its ``key: value`` line is printed first and its entries follow
    one level deeper. The top-level call finishes with one empty line.
    """
    indent = " " * (level * 3)
    for key, value in d.items():
        print(f"{indent}{key}: {value}")

        # Only exact dict instances are expanded (dict subclasses are not).
        if type(value) is dict:
            print_dict(value, level + 1)

    if level != 0:
        return
    print()

# Load the partitioned table as a DataFrame.
print("Reading DF from table...")
spark.sql("use mytestdb")
df = (
    spark
    .read
    .table("t_my_date_partitioned")
)
df.show(4, truncate=False)

# 1) Partition filter only.
print("plan with partition filter (PartitionFilters[...]):")
df1 = (
    df
    .filter("to_date_year_month >= 202710")
    .select(df.id, df.age, df.from_date, df.to_date, df.to_date_year_month)
    .orderBy(df.from_date, df.to_date, df.id)
    )

df1.explain()

# show() already runs the query; the former no-op collect() loop was removed.
df1.show(100, truncate=False)

###################################################################
# 2) Partition filter plus date-range filter.
print("plan with partition filter (PartitionFilters[...]) and extra filter:")
df1 = (
    df
    .filter("to_date_year_month >= 202710")
    .filter("from_date <= to_date('20271015','yyyyMMdd') AND to_date > to_date('20271015','yyyyMMdd')")
    .select(df.id, df.age, df.from_date, df.to_date, df.to_date_year_month)
    .orderBy(df.from_date, df.to_date, df.id)
    )

df1.explain()

df1.show(100, truncate=False)

Reading DF from table...
+---+-------+---+--------+----------+----------+------------------+
|id |name   |age|district|from_date |to_date   |to_date_year_month|
+---+-------+---+--------+----------+----------+------------------+
|52 |name-52|3  |1003    |2026-01-10|2026-03-14|202603            |
|63 |name-63|82 |1000    |2025-12-04|2026-03-14|202603            |
|70 |name-70|92 |1003    |2026-02-01|2026-03-15|202603            |
|26 |name-26|25 |1000    |2026-02-14|2026-03-02|202603            |
+---+-------+---+--------+----------+----------+------------------+
only showing top 4 rows

plan with partition filter (PartitionFilters[...]):
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [from_date#581 ASC NULLS FIRST, to_date#582 ASC NULLS FIRST, id#577 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(from_date#581 ASC NULLS FIRST, to_date#582 ASC NULLS FIRST, id#577 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=550]
      +- FileScan parquet spark_catalo

In [59]:
# Explain plan for the partition filter combined with the date-range filter.
print("plan with partition filter (PartitionFilters[...]):")

selected_columns = [df[name] for name in ("id", "age", "from_date", "to_date", "to_date_year_month")]
sort_columns = [df[name] for name in ("from_date", "to_date", "id")]

df1 = df.filter("to_date_year_month >= 202710")
df1 = df1.filter("from_date <= to_date('20271015','yyyyMMdd') AND to_date > to_date('20271015','yyyyMMdd')")
df1 = df1.select(*selected_columns).orderBy(*sort_columns)

df1.explain()

# Rows are not displayed here; call df1.show(100, truncate=False) to see them.

plan with partition filter (PartitionFilters[...]):
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [from_date#513 ASC NULLS FIRST, to_date#514 ASC NULLS FIRST, id#509 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(from_date#513 ASC NULLS FIRST, to_date#514 ASC NULLS FIRST, id#509 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=2175]
      +- Filter (((isnotnull(from_date#513) AND isnotnull(to_date#514)) AND (from_date#513 <= 2027-10-15)) AND (to_date#514 > 2027-10-15))
         +- FileScan parquet spark_catalog.mytestdb.t_my_date_partitioned[id#509,age#511,from_date#513,to_date#514,to_date_year_month#515] Batched: true, DataFilters: [isnotnull(from_date#513), isnotnull(to_date#514), (from_date#513 <= 2027-10-15), (to_date#514 > ..., Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/work/various_tests/spark/spark-warehouse/mytestdb.db..., PartitionFilters: [isnotnull(to_date_year_month#515), (to_date_year_month#515 = 202710)], Push

In [7]:
# partition by various methods
from datetime import datetime, timedelta
import random
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType

# Schema: every column is nullable.
df_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("from_dt", DateType()),
        ("to_dt", DateType()),
    )
])

rows = 100


def _make_row(c):
    """Build one record: id, zero-padded name, age (None ~10 % of the time), from_dt, to_dt."""
    from_dt = datetime.now().date() + timedelta(days=random.randint(0, 1000))
    to_dt = from_dt + timedelta(days=random.randint(1, 100))
    age = None if random.random() < 0.1 else random.randint(1, 100)
    return [c, f"name-{c:08d}", age, from_dt, to_dt]


lst = [_make_row(c) for c in range(rows)]

print(f"lst size: {len(lst)}")

df = spark.createDataFrame(lst, df_schema)

# Switch to the test database and list what is there.
spark.sql("create database if not exists testdb")
databases_df = spark.sql("Show databases")
databases_df.show()
spark.sql("use testdb")
tables_df = spark.sql("show tables")
tables_df.show()

df.show(truncate = False)

# 1) partitionBy: one directory per distinct to_dt value (spark 3.5).
#    Disabled spark 4.0.0 alternative, noted as buggy:
#    df.writeTo("my_partitioned_table").partitionedBy(partitioning.months("to_dt")).createOrReplace()
(
    df.write
    .partitionBy("to_dt")
    .mode("overwrite")
    .parquet('mydf_1.parquet')
)

# 2) repartition: a given number of partitions using HASH.
#    The data must still be processed as a whole (no pruning); it is only split into chunks.
df1 = df.repartition(4, "to_dt")
(
    df1.write
    .mode("overwrite")
    .parquet('mydf_2.parquet')
)

# 3) repartitionByRange: a given number of partitions using RANGE, with the ranges
#    estimated by sampling. Again no pruning, only splitting into chunks.
df1 = df.repartitionByRange(4, "to_dt")
(
    df1.write
    .mode("overwrite")
    .parquet('mydf_3.parquet')
)

# 4) maxRecordsPerFile: cap the records per file, which splits the data roughly(!) equally.
#    Full data processing is still necessary.
(
    df.write
    .mode("overwrite")
    .option("maxRecordsPerFile", 20)
    .csv("mydf_4.csv")
)

finished_at = datetime.now().strftime("%Y%m%d %H:%M:%S")
print(finished_at, ": Done.")

lst size: 100
+---------+
|namespace|
+---------+
|  default|
| mytestdb|
|   testdb|
+---------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+

+---+-------------+----+----------+----------+
|id |name         |age |from_dt   |to_dt     |
+---+-------------+----+----------+----------+
|0  |name-00000000|72  |2026-06-22|2026-07-20|
|1  |name-00000001|19  |2026-06-27|2026-08-19|
|2  |name-00000002|98  |2026-11-16|2026-12-11|
|3  |name-00000003|52  |2025-09-12|2025-09-13|
|4  |name-00000004|65  |2027-11-30|2028-02-09|
|5  |name-00000005|18  |2027-02-20|2027-03-01|
|6  |name-00000006|59  |2025-11-16|2025-12-15|
|7  |name-00000007|67  |2025-05-05|2025-08-13|
|8  |name-00000008|72  |2025-04-10|2025-04-11|
|9  |name-00000009|29  |2025-04-23|2025-07-26|
|10 |name-00000010|72  |2026-05-31|2026-07-30|
|11 |name-00000011|52  |2026-01-03|2026-02-22|
|12 |name-00000012|46  |2027-12-13|2027-12-17|
|13 |name-00

                                                                                

20250408 15:00:11 : Done.
