In [0]:
# Access key for the `kaninipro` ADLS Gen2 storage account (left blank here).
# NOTE(review): do not commit a real key in the notebook; consider reading it
# from a secret store instead of pasting it inline.
your_storage_account_access_key = ""

# Session-level Spark settings, applied in the order listed:
#   1. storage-account key used for the abfss:// paths in this notebook
#   2. autoBroadcastJoinThreshold = -1 turns off automatic broadcast joins
#   3. Databricks adaptive query execution switched off
# (2) and (3) presumably keep the physical plans stable across runs - TODO confirm.
session_settings = (
    ("fs.azure.account.key.kaninipro.dfs.core.windows.net", your_storage_account_access_key),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
    ("spark.databricks.optimizer.adaptive.enabled", "false"),
)
for setting_name, setting_value in session_settings:
    spark.conf.set(setting_name, setting_value)

In [0]:
from pyspark.sql.functions import col, collect_list, struct,  max, min
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,DateType, DoubleType
from datetime import date


# City-weather data preparation

In [0]:
# Load the raw city-weather dataset (Parquet) that every Delta copy below is written from.
source_df = (
    spark.read
    .format("parquet")
    .load("abfss://data@kaninipro.dfs.core.windows.net/city_weather_parquet")
)

In [0]:
# Baseline copy: write the data to Delta as-is, with no sorting and no Z-ordering.
# "overwrite" mode replaces whatever is already stored at the target path.
(
    source_df.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta")
)

In [0]:
# Sorted copy: order the rows by record_id, then city_id, before writing to Delta.
sorted_source_df = source_df.sort("record_id", "city_id")

(
    sorted_source_df.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted")
)

In [0]:
# Z-ordered copy, step 1: write the unsorted data to its own Delta path.
(
    source_df.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder")
)

# Step 2: run OPTIMIZE on that path, Z-ordering by record_id and city_id.
spark.sql("""
          optimize delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder` 
          ZORDER BY (record_id, city_id)
          """)


# Aggregation analysis

In [0]:
# Inspect the file layout of the Z-ordered table: for every underlying data
# file, compute the min/max of record_id, city_id and date.
path = "abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder"

# Per-file fields taken from the `_metadata` column, exposed under plain names.
file_metadata_columns = [
    col("_metadata.file_path").alias("parquet_file_path"),
    col("_metadata.file_name").alias("parquet_file_name"),
    col("_metadata.file_size").alias("file_size_bytes"),
    col("_metadata.file_modification_time").alias("file_modified_time"),
]

delta_reader = spark.read.format("delta")
df = delta_reader.load(path).select("*", *file_metadata_columns)

# `max` / `min` are the pyspark.sql.functions aggregates imported at the top of
# the notebook (they shadow the Python builtins of the same name).
per_file_ranges = [
    max("record_id").alias("max_record_id"),
    min("record_id").alias("min_record_id"),
    max("city_id").alias("max_city_id"),
    min("city_id").alias("min_city_id"),
    min("date").alias("min_date"),
    max("date").alias("max_date"),
]

# One output row per data file, keyed by its file name.
agg_df = df.groupBy("parquet_file_name").agg(*per_file_ranges)

display(agg_df)

In [0]:
# Keep only the files whose [min, max] ranges contain both lookup values,
# i.e. the files that could hold the row with this record_id and city_id.
lookup_record_id = 7687991955583
lookup_city_id = 4769

files_covering_lookup = agg_df.filter(
    (col("max_record_id") >= lookup_record_id)
    & (col("min_record_id") <= lookup_record_id)
    & (col("max_city_id") >= lookup_city_id)
    & (col("min_city_id") <= lookup_city_id)
)

display(files_covering_lookup)


# Result comparison

## Standard

In [0]:
%sql
-- Query 1 (standard table, no sort / no Z-order): filter on record_id AND city_id.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta`
WHERE record_id = 7687991955583
  AND city_id = 4769

In [0]:
%sql
-- Query 2 (standard table, no sort / no Z-order): filter on record_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta`
WHERE record_id = 7687991955583

In [0]:
%sql
-- Query 3 (standard table, no sort / no Z-order): filter on city_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta`
WHERE city_id = 4769

## Sorted

In [0]:
%sql
-- Query 1 (table sorted by record_id, city_id): filter on record_id AND city_id.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted`
WHERE record_id = 7687991955583
  AND city_id = 4769

In [0]:
%sql
-- Query 2 (table sorted by record_id, city_id): filter on record_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted`
WHERE record_id = 7687991955583

In [0]:
%sql
-- Query 3 (table sorted by record_id, city_id): filter on city_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_sorted`
WHERE city_id = 4769

## Z-ordered

In [0]:
%sql
-- Query 1 (table Z-ordered by record_id, city_id): filter on record_id AND city_id.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder`
WHERE record_id = 7687991955583
  AND city_id = 4769

In [0]:
%sql
-- Query 2 (table Z-ordered by record_id, city_id): filter on record_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder`
WHERE record_id = 7687991955583

In [0]:
%sql
-- Query 3 (table Z-ordered by record_id, city_id): filter on city_id only.
SELECT *
FROM delta.`abfss://data@kaninipro.dfs.core.windows.net/city_weather_delta_zorder`
WHERE city_id = 4769