In [0]:
from pyspark.sql.functions import date_format

# Load the retail orders dataset from its JSON source into a DataFrame.
orders = spark.read.json('/public/retail_db_json/orders')

In [0]:
# PARTITIONING
#
# DataFrameWriter.partitionBy(*cols) lets us partition the written output by
# a set of columns:
#   - a folder is created for every unique value of the partition column;
#   - every partition folder holds its own files.
#
# Partition pruning: filtering on a column that was used as a partitioning key
# automatically reduces the amount of data that needs to be read.

# order_date is rewritten as a yyyyMMdd string before it is used as the
# partitioning key of the Parquet output.
(
    orders
    .withColumn("order_date", date_format("order_date", "yyyyMMdd"))
    .coalesce(1)
    .write
    .partitionBy("order_date")
    .parquet('/user/root/retail_db/orders_by_date', mode='overwrite')
)

In [0]:
# DataFrameWriter.partitionBy(*cols) lets us partition the output by a set of
# columns; here the only partitioning key is order_date (as yyyyMMdd).
# mode='overwrite' replaces whatever is already stored at the target path.
(
    orders
    .withColumn("order_date", date_format("order_date", "yyyyMMdd"))
    .coalesce(1)
    .write
    .partitionBy("order_date")
    .parquet('/user/root/retail_db/orders_by_date', mode='overwrite')
)

In [0]:
# Every partition folder consists of several files: the Parquet part file(s)
# with the data, plus commit marker files written alongside them.
dbutils.fs.ls(
    '/user/root/retail_db/orders_by_date/order_date=20130725/'
)

[FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130725/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1744450598000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130725/_committed_5012110333595032784', name='_committed_5012110333595032784', size=122, modificationTime=1744450598000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130725/_started_5012110333595032784', name='_started_5012110333595032784', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130725/part-00000-tid-5012110333595032784-210e978a-5404-465e-acc0-6b29b168121d-8-1.c000.snappy.parquet', name='part-00000-tid-5012110333595032784-210e978a-5404-465e-acc0-6b29b168121d-8-1.c000.snappy.parquet', size=2975, modificationTime=1744450544000)]

In [0]:
# One folder is created for every unique value of the partition column,
# named <column>=<value>.
dbutils.fs.ls(
    '/user/root/retail_db/orders_by_date/'
)

[FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1744450637000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130725/', name='order_date=20130725/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130726/', name='order_date=20130726/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130727/', name='order_date=20130727/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130728/', name='order_date=20130728/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130729/', name='order_date=20130729/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130730/', name='order_date=20130730/', size=0, modific

In [0]:
# Partitioning by multiple columns: folders are nested in the order the
# columns are passed to partitionBy (order_date first, then order_status).
# Note that this overwrites the same output path used by the single-column
# write.
(
    orders
    .withColumn("order_date", date_format("order_date", "yyyyMMdd"))
    .coalesce(1)
    .write
    .partitionBy("order_date", "order_status")
    .parquet('/user/root/retail_db/orders_by_date', mode='overwrite')
)

[FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1744457347000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/_committed_1627945022112623619', name='_committed_1627945022112623619', size=35, modificationTime=1744452589000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130725/', name='order_date=20130725/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130726/', name='order_date=20130726/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130727/', name='order_date=20130727/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130728/', name='order_date=20130728/', size=0, modificationTime=1744450544000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20130729/', name='order_date=201307

In [0]:
# Inside each order_date folder there is one subfolder for every value of the
# next partition column (order_status).
dbutils.fs.ls(
    '/user/root/retail_db/orders_by_date/order_date=20140707/'
)

[FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20140707/_committed_1627945022112623619', name='_committed_1627945022112623619', size=234, modificationTime=1744452586000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20140707/_committed_7645702687718374015', name='_committed_7645702687718374015', size=124, modificationTime=1744457306000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20140707/order_status=CANCELED/', name='order_status=CANCELED/', size=0, modificationTime=1744457267000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20140707/order_status=CLOSED/', name='order_status=CLOSED/', size=0, modificationTime=1744457267000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20140707/order_status=COMPLETE/', name='order_status=COMPLETE/', size=0, modificationTime=1744457267000),
 FileInfo(path='dbfs:/user/root/retail_db/orders_by_date/order_date=20140707/order_status=ON_HOLD/

In [0]:
# Partition pruning
# When the filter is on a column the dataset was partitioned by, Spark can
# skip the partition folders that cannot match, which automatically reduces
# the amount of data that needs to be read.
# NOTE(review): order_date is rebuilt from the folder names on read and may be
# inferred as an integer rather than a string - confirm against the schema.

(
    spark.read
    .parquet('/user/root/retail_db/orders_by_date')
    .filter('order_date = "20140707"')
    .count()
)