In [0]:
# Account key used to authenticate against the `kaninipro` ADLS Gen2 account.
# It is blank here and has to be filled in before the storage paths can be used.
# NOTE(review): prefer reading this from a secret store instead of pasting the
# key into the notebook.
your_storage_account_access_key = ""
spark.conf.set(
    "fs.azure.account.key.kaninipro.dfs.core.windows.net",
    your_storage_account_access_key,
)

# Session-level optimizer switches: a threshold of -1 turns automatic broadcast
# joins off, and the second flag disables Databricks adaptive query execution.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.databricks.optimizer.adaptive.enabled", "false")

In [0]:
from pyspark.sql.functions import col, collect_list, struct,  max, min
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Without Z-order

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DoubleType
from datetime import date


# Schema of the sample orders table; every column is declared non-nullable.
schema = (
    StructType()
    .add("order_id", IntegerType(), nullable=False)
    .add("user_id", IntegerType(), nullable=False)
    .add("date", DateType(), nullable=False)
    .add("order_value", DoubleType(), nullable=False)
    .add("product_id", IntegerType(), nullable=False)
    .add("qty", IntegerType(), nullable=False)
)

# Sample data: 25 rows, one per (order_id, user_id) pair for order_id 1-5 and
# user_id 101-105. Each tuple follows the column order of `schema`:
#   (order_id, user_id, date, order_value, product_id, qty)
data = [
    (1, 101, date(2025, 1, 5), 1200.50, 501, 2),
    (1, 102, date(2025, 1, 6), 800.00, 502, 1),
    (1, 103, date(2025, 1, 8), 1500.75, 503, 3),
    (1, 104, date(2025, 1, 10), 450.25, 504, 2),
    (1, 105, date(2025, 1, 12), 2200.00, 505, 5),
    (2, 101, date(2025, 2, 2), 975.50, 506, 1),
    (2, 102, date(2025, 2, 4), 600.00, 507, 2),
    (2, 103, date(2025, 2, 10), 780.25, 508, 3),
    (2, 104, date(2025, 2, 15), 3000.00, 509, 4),
    (2, 105, date(2025, 3, 1), 1300.50, 510, 2),
    (3, 101, date(2025, 3, 3), 1100.75, 501, 1),
    (3, 102, date(2025, 3, 5), 950.00, 502, 2),
    (3, 103, date(2025, 3, 7), 2500.00, 503, 5),
    (3, 104, date(2025, 3, 10), 700.25, 504, 2),
    (3, 105, date(2025, 3, 15), 1700.50, 505, 3),
    (4, 101, date(2025, 4, 2), 800.00, 506, 2),
    (4, 102, date(2025, 4, 5), 900.75, 507, 1),
    (4, 103, date(2025, 4, 8), 2600.00, 508, 4),
    (4, 104, date(2025, 4, 12), 1850.25, 509, 2),
    (4, 105, date(2025, 4, 20), 3000.00, 510, 5),
    (5, 101, date(2025, 5, 1), 950.75, 501, 3),
    (5, 102, date(2025, 5, 4), 1200.00, 502, 2),
    (5, 103, date(2025, 5, 10), 1350.25, 503, 3),
    # NOTE(review): the two rows below repeat the date, order_value, product_id
    # and qty of the user 102 / 103 rows just above; only user_id differs.
    # Possibly a copy-paste leftover - confirm it is intended.
    (5, 104, date(2025, 5, 4), 1200.00, 502, 2),
    (5, 105, date(2025, 5, 10), 1350.25, 503, 3)
]

# Build the sample orders DataFrame from the in-memory rows and explicit schema.
orders_df = spark.createDataFrame(data, schema=schema)

# Drop any earlier registration of the table this cell is about to recreate.
# The statement previously targeted `order_table`, a name this notebook never
# creates, so it never affected the table written below.
spark.sql("drop table if exists test_order_table")

# Write the rows as a Delta table at an explicit storage path and register it
# as `test_order_table`; "overwrite" replaces whatever the path held before.
(
    orders_df.write.mode("overwrite")
    .format("delta")
    .option("path", "abfss://data@kaninipro.dfs.core.windows.net/test_order_table")
    .saveAsTable("test_order_table")
)


In [0]:
# Hidden `_metadata` fields exposed as ordinary columns so that every row can be
# traced back to the data file it is stored in.
file_metadata_columns = [
    col("_metadata.file_path").alias("file_path"),
    col("_metadata.file_name").alias("file_name"),
    col("_metadata.file_size").alias("file_size_bytes"),
    col("_metadata.file_modification_time").alias("file_modified_time"),
]

# Read the orders Delta table directly from its storage path.
df = (
    spark.read.format("delta")
    .load("abfss://data@kaninipro.dfs.core.windows.net/test_order_table")
    .select("*", *file_metadata_columns)
)

# One output row per data file, listing the (order_id, user_id) pairs it holds.
agg_df = df.groupBy("file_name").agg(collect_list(struct("order_id", "user_id")))

display(agg_df)

In [0]:
# Number of rows stored in each underlying data file.
rows_per_file = df.groupBy("file_name").count()
display(rows_per_file)

In [0]:
# query1: filter on both order_id and user_id
df = spark.table("test_order_table")

matches_order = col("order_id") == 1
matches_user = col("user_id") == 103
display(df.where(matches_order & matches_user))
# number of files read 1 (as recorded by the author)

In [0]:
# query2: filter on order_id only
df = spark.table("test_order_table")

matches_order = col("order_id") == 1
display(df.where(matches_order))
# number of files read 1 (as recorded by the author)

In [0]:
# query3: filter on user_id only
df = spark.table("test_order_table")

matches_user = col("user_id") == 103
display(df.where(matches_user))
# number of files read 4 (as recorded by the author)

# City-weather data preparation

In [0]:
# Source dataset for the layout experiments below, read from Parquet files.
source_df = spark.read.parquet(
    "abfss://data@kaninipro.dfs.core.windows.net/city_weather_parquet"
)

In [0]:
# Baseline copy: write the source data as Delta with no sorting and no Z-ordering.
(
    source_df.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta")
)

In [0]:
# Sorted copy: order the rows by record_id, then city_id, before writing as Delta.
(
    source_df.orderBy("record_id", "city_id")
    .write
    .format("delta")
    .mode("overwrite")
    .save("abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted")
)

In [0]:
# Z-ordered copy, step 1: write the source data as a plain Delta table.
(
    source_df.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder")
)

# Step 2: run OPTIMIZE on that path, Z-ordering by record_id and city_id.
spark.sql("""
          optimize delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder` 
          ZORDER BY (record_id, city_id)
          """)


# Aggregation analysis

In [0]:
path = "abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder"

# Hidden `_metadata` fields exposed as ordinary columns so that every row can be
# traced back to the Parquet file it is stored in.
parquet_file_metadata = [
    col("_metadata.file_path").alias("parquet_file_path"),
    col("_metadata.file_name").alias("parquet_file_name"),
    col("_metadata.file_size").alias("file_size_bytes"),
    col("_metadata.file_modification_time").alias("file_modified_time"),
]

df = spark.read.format("delta").load(path).select("*", *parquet_file_metadata)

# Value range covered by each file for record_id, city_id and date.
# `max` / `min` are the pyspark.sql.functions aggregates imported earlier in the
# notebook, not the Python builtins.
per_file_ranges = [
    max("record_id").alias("max_record_id"),
    min("record_id").alias("min_record_id"),
    max("city_id").alias("max_city_id"),
    min("city_id").alias("min_city_id"),
    min("date").alias("min_date"),
    max("date").alias("max_date"),
]

agg_df = df.groupBy("parquet_file_name").agg(*per_file_ranges)

display(agg_df)

In [0]:
# Values looked up by the comparison queries further down.
target_record_id = 7687991955583
target_city_id = 4769

# Keep only the files whose [min, max] ranges contain both target values,
# i.e. the files that could hold the requested row.
candidate_files = agg_df.where(
    (col("max_record_id") >= target_record_id)
    & (col("min_record_id") <= target_record_id)
    & (col("max_city_id") >= target_city_id)
    & (col("min_city_id") <= target_city_id)
)
display(candidate_files)


# Result comparison

## Standard

In [0]:
%sql
-- Query 1, standard table: filter on both record_id and city_id.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta`
WHERE record_id = 7687991955583
  AND city_id = 4769

In [0]:
%sql
-- Query 2, standard table: filter on record_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta`
WHERE record_id = 7687991955583

In [0]:
%sql
-- Query 3, standard table: filter on city_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta`
WHERE city_id = 4769

## Sorted

In [0]:
%sql
-- Query 1, sorted table: filter on both record_id and city_id.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted`
WHERE record_id = 7687991955583
  AND city_id = 4769

In [0]:
%sql
-- Query 2, sorted table: filter on record_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted`
WHERE record_id = 7687991955583

In [0]:
%sql
-- Query 3, sorted table: filter on city_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted`
WHERE city_id = 4769

## Z-ordered

In [0]:
%sql
-- Query 1, Z-ordered table: filter on both record_id and city_id.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder`
WHERE record_id = 7687991955583
  AND city_id = 4769

In [0]:
%sql
-- Query 2, Z-ordered table: filter on record_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder`
WHERE record_id = 7687991955583

In [0]:
%sql
-- Query 3, Z-ordered table: filter on city_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder`
WHERE city_id = 4769