In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, max

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# ---------------------------------------------------
# 1. Silver input
# ---------------------------------------------------
# Read the cleaned Silver table (Delta format) directly from its storage path.
df_silver = (
    spark.read
    .format("delta")
    .load("/mnt/delta/silver/inventory_shipments_joined_clean")
)

# ---------------------------------------------------
# 2-4. Vendor Summary (inventory + shipment metrics)
# ---------------------------------------------------
# All three metrics are computed in ONE aggregation over df_silver.
#
# Why not two aggregations joined back together: both summaries group the same
# DataFrame by the same keys, so they always contain the same set of groups and
# the full outer join adds nothing but an extra shuffle. Worse, a join's
# equality test never matches NULL to NULL, so any group whose vendor_id or
# vendor_name is NULL came out of the join as two half-empty rows. groupBy
# keeps NULL keys together in a single group, so each vendor key yields
# exactly one output row.
#
# `count` and `max` are the Spark column functions imported at the top of this
# cell (that import shadows Python's built-in `max`).
#
# NOTE(review): count(col) counts rows where col is non-null, so these are row
# counts over the Silver table, not distinct ids. If one item_id or shipment_id
# can appear on several rows there, countDistinct may be what is intended -
# confirm against the Silver table's grain.
vendor_keys = ["vendor_id", "vendor_name"]

vendor_summary = (
    df_silver
    .groupBy(*vendor_keys)
    .agg(
        count("item_id").alias("total_inventory_items"),
        count("shipment_id").alias("total_shipments"),
        max("shipment_date").alias("last_shipment_date"),
    )
    # count() never yields NULL, so this is purely defensive. It is limited to
    # the two count columns so that a NULL numeric key or date is not silently
    # rewritten to 0, which a frame-wide fillna(0) would do.
    .fillna(0, subset=["total_inventory_items", "total_shipments"])
)

# Per-metric projections, kept under their original names for any later cell
# that still refers to them.
inventory_summary = vendor_summary.select(*vendor_keys, "total_inventory_items")
shipment_summary = vendor_summary.select(
    *vendor_keys, "total_shipments", "last_shipment_date"
)

# ---------------------------------------------------
# 5. Persist Gold Output
# ---------------------------------------------------
# Overwrite the Delta data at the Gold path with the vendor summary.
# The mergeSchema option is enabled on this write.
(
    vendor_summary.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("/mnt/delta/gold/vendor_summary_clean")
)

# Optional: register the table name against the Gold location.
# Any existing definition of the name is dropped first, then recreated.
spark.sql("DROP TABLE IF EXISTS vendor_summary_clean")
spark.sql("""
    CREATE TABLE vendor_summary_clean
    USING DELTA
    LOCATION '/mnt/delta/gold/vendor_summary_clean'
""")

In [0]:
# Read the Gold table back from its Delta path to check what was written.
gold_df = (
    spark.read
    .format("delta")
    .load("/mnt/delta/gold/vendor_summary_clean")
)

# Show rows (up to show()'s default limit) without truncating column values.
gold_df.show(truncate=False)

# Report the row count of the Gold table.
print(f"Total records in Gold layer: {gold_df.count()}")

In [0]:
%sql
-- Inspect the registered Gold table, sorted by vendor_id.
SELECT *
FROM vendor_summary_clean
ORDER BY vendor_id