In [0]:
adls_account = "insurancedatalake01"
storage_key = os.environ.get("AZURE_STORAGE_KEY")

spark.conf.set(
    f"fs.azure.account.key.{adls_account}.dfs.core.windows.net",
    storage_key
)

In [0]:
# ============================================================
# 05_optimize_and_benchmark_factclaim
# ============================================================

import time

# -------- CONFIG --------
adls_account = "insurancedatalake01"   
container = "datalake"
table_name = "fact_claims"

gold_path = f"abfss://{container}@{adls_account}.dfs.core.windows.net/gold/{table_name}"
full_table_name = f"gold.{table_name}"

print("Gold Table Location:", gold_path)
print("Table Name:", full_table_name)


# ======== HELPER FUNCTION =========
def measure_performance(df):
    metrics = {}

    start = time.time()
    df.count()
    metrics["read_sec"] = time.time()-start

    start = time.time()
    df.filter("ClaimAmount > 10000").count()
    metrics["filter_sec"] = time.time() - start

    start = time.time()
    df.groupBy("IncidentSeverity").count().collect()
    metrics["groupby_sec"] = time.time() - start

    return metrics


#----BEFORE OPTIMIZATION----

df_before = spark.read.format("delta").load(gold_path).cache()
df_before.count()
before = measure_performance(df_before)

#----OPTIMIZE + ZORDER----

spark.sql(f"""
          OPTIMIZE {full_table_name}
          ZORDER BY (CustomerID, PolicyNumber)
          """)
spark.sql(f"VACUUM {full_table_name} RETAIN 168 HOURS")

#----AFTER OPTIMIZATION----
df_after = spark.read.format("delta").load(gold_path).cache()
df_after.count()
after = measure_performance(df_after)

#----COMPARISON----
print("\n=== Comparison (Before vs After) ===")

comparison = {
    "read_improvement_pct": round((before["read_sec"] - after["read_sec"]) / before["read_sec"] * 100, 2),
    "filter_improvement_pct": round((before["filter_sec"] - after["filter_sec"]) / before["filter_sec"] * 100, 2),
    "groupby_improvement_pct": round((before["groupby_sec"] - after["groupby_sec"]) / before["groupby_sec"] * 100, 2),
}

comparison

Gold Table Location: abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/fact_claims
Table Name: gold.fact_claims

=== Comparison (Before vs After) ===
Out[3]: {'read_improvement_pct': 61.33,
 'filter_improvement_pct': 74.1,
 'groupby_improvement_pct': 64.09}