In [0]:
"""
gold_summary.py

Aggregates cleaned Silver layer data to generate a vendor-level summary for reporting.

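- Counts distinct invoices per vendor
- Tracks the latest due date and latest invoice date per vendor
- Carries the per-vendor maximum audit date, compliance score, and status from the compliance table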
- Outputs to: /mnt/delta/gold/vendor_summary_clean

Table registered as: vendor_summary_clean
"""

In [0]:
from pyspark.sql.functions import col, max, countDistinct

# Build the Gold vendor summary: join the Silver prep data with the Silver
# compliance data, aggregate to one row per (vendor_id, vendor_name), persist
# the result as a Delta table, and register it in the metastore by name.

# The Gold location is used both for the write and for the table registration.
GOLD_VENDOR_SUMMARY_PATH = "/mnt/delta/gold/vendor_summary_clean"
GOLD_VENDOR_SUMMARY_TABLE = "vendor_summary_clean"

# Load cleaned Silver
df_prep = spark.read.format("delta").load("/mnt/delta/silver/final_vendor_summary_prep")

# Load compliance and alias it so its columns can be addressed as
# "compliance.<column>" in the aggregation below.
df_compliance = (
    spark.read.format("delta")
    .load("/mnt/delta/silver/vendor_compliance_clean")
    .alias("compliance")
)

# Join and aggregate.
# The left join keeps vendors that have no compliance row; their compliance
# columns come out as null.
# `max` here is pyspark.sql.functions.max (imported at the top of this cell),
# not the Python builtin.
# NOTE(review): the three compliance columns are max()-ed independently, so if
# the compliance table holds more than one row per vendor the resulting
# audit date / score / status may originate from different rows — confirm the
# compliance table is one row per vendor.
df_gold = (
    df_prep.join(df_compliance, on="vendor_id", how="left")
    .groupBy("vendor_id", "vendor_name")
    .agg(
        countDistinct("invoice_id").alias("total_invoices"),
        max("due_date").alias("latest_due_date"),
        max("invoice_date").alias("latest_invoice_date"),
        max(col("compliance.last_audit_date")).alias("last_audit_date"),
        max(col("compliance.compliance_score")).alias("compliance_score"),
        max(col("compliance.status")).alias("compliance_status"),
    )
)

# Write Gold
# NOTE(review): mergeSchema is kept as in the original; if a column is ever
# dropped or retyped, overwriteSchema is presumably what is needed — confirm.
(
    df_gold.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(GOLD_VENDOR_SUMMARY_PATH)
)

# Register the Delta location under the table name that the SQL cell at the end
# of this notebook queries. IF NOT EXISTS makes this a no-op on re-runs and
# when the table has already been registered elsewhere.
spark.sql(
    f"CREATE TABLE IF NOT EXISTS {GOLD_VENDOR_SUMMARY_TABLE} "
    f"USING DELTA LOCATION '{GOLD_VENDOR_SUMMARY_PATH}'"
)

display(df_gold)

In [0]:
# Debug aid: list the column names of both Silver inputs so that name
# collisions between the two frames are easy to spot.
prep_columns = df_prep.columns
compliance_columns = df_compliance.columns

print("df_prep columns:", prep_columns)
print("df_compliance columns:", compliance_columns)

In [0]:
# Re-read the Silver prep table and render it for visual inspection.
delta_reader = spark.read.format("delta")
df_prep = delta_reader.load("/mnt/delta/silver/final_vendor_summary_prep")

display(df_prep)

In [0]:
# Re-read the Silver compliance table and render it for visual inspection.
# Note: this rebinds df_compliance WITHOUT the "compliance" alias that the
# aggregation cell applies when it loads the same table.
delta_reader = spark.read.format("delta")
df_compliance = delta_reader.load("/mnt/delta/silver/vendor_compliance_clean")

display(df_compliance)

In [0]:
# Read the persisted Gold table back from storage, print every row without
# truncating cell values, then report how many rows it holds.
gold_df = (
    spark.read
    .format("delta")
    .load("/mnt/delta/gold/vendor_summary_clean")
)
gold_df.show(truncate=False)

gold_row_count = gold_df.count()
print(f"Total records in Gold layer: {gold_row_count}")

In [0]:
%sql
-- Show every row of the registered Gold table, ordered by vendor_id.
SELECT
  *
FROM
  vendor_summary_clean
ORDER BY
  vendor_id