In [0]:
from pyspark.sql.functions import col, to_date, upper, trim, when
from pyspark.sql.types import IntegerType

# Step 1: Load Bronze
bronze_path = "/mnt/delta/bronze/finance_invoices"
df = spark.read.load(bronze_path, format="delta")


# Step 2: Clean columns
# Column expressions are named up front so the chain below reads as a list of outputs.
vendor_normalized = upper(trim(col("vendor")))  # trimmed + upper-cased vendor name

# 1 only when `paid` equals "Yes"; every other value (including NULL) becomes 0.
# NOTE(review): the comparison is an exact match — values such as "yes" or " Yes "
# would be flagged 0. Confirm the Bronze data never contains such variants.
paid_as_int = when(col("paid") == "Yes", 1).otherwise(0).cast(IntegerType())

# ❌ unreliable fields (plus the raw `paid` column, superseded by paid_flag)
dropped_columns = ("paid", "vendor_name", "rating", "location")

df_clean = (
    df
    .withColumn("vendor", vendor_normalized)
    .withColumn("invoice_date", to_date(col("invoice_date")))
    .withColumn("due_date", to_date(col("due_date")))
    .withColumn("paid_flag", paid_as_int)
    .drop(*dropped_columns)
)

# Step 3: Create vendor mapping DataFrame
# Each entry is (normalized vendor name, vendor_id); names are upper-case so they
# line up with the trimmed/upper-cased `vendor` column of df_clean.
vendor_map = [
    ("WOLFE LLC", "V010"),
    ("MOORE-BERNARD", "V008"),
    ("GARCIA-JAMES", "V006"),
    ("ABBOTT-MUNOZ", "V001"),
    ("BLAIR PLC", "V003"),
    ("DUDLEY GROUP", "V004"),
    ("ARNOLD LTD", "V002"),
    ("MCCLURE, WARD AND LEE", "V007"),
    ("WILLIAMS AND SONS", "V009"),
    ("GALLOWAY-WYATT", "V005"),
]

df_map = spark.createDataFrame(vendor_map, ["vendor", "vendor_id"])

# `vendor` is already trimmed and upper-cased when df_clean is built, so a second
# upper(trim(...)) pass would be a no-op. The df_enriched name is kept in case
# later cells refer to it.
df_enriched = df_clean

# Step 4: Surface vendors that have no mapping entry
# The inner join below silently discards every invoice whose vendor is not in
# vendor_map, so list the offending names first. A NULL vendor never matches and
# shows up here as None. Capped at 20 names to keep the output readable.
unmapped_vendors = [
    row["vendor"]
    for row in (
        df_enriched.join(df_map, on="vendor", how="left_anti")
        .select("vendor")
        .distinct()
        .limit(20)
        .collect()
    )
]
if unmapped_vendors:
    print(
        "⚠️ Invoices dropped by the vendor join; unmapped vendors (first 20):",
        unmapped_vendors,
    )

# Step 5: Join on cleaned vendor name (rows without a mapping are excluded)
df_final = df_enriched.join(df_map, on="vendor", how="inner")

# Step 6: Reorder columns
df_final = df_final.select(
    "vendor_id", "invoice_id", "amount_usd", "invoice_date", "due_date",
    "paid_flag", "source_file", "ingestion_type"
)
display(df_final)

# Step 7: Overwrite Silver path
output_path = "dbfs:/mnt/delta/silver/finance_invoices_v2"

# The target directory is intentionally NOT deleted before writing:
# - mode("overwrite") together with overwriteSchema=true already replaces the
#   table's data and schema as a single Delta commit;
# - removing the directory first would delete the Delta transaction log (table
#   history) and, if the write then failed, leave no Silver table at all.
# NOTE(review): if output_path ever holds non-Delta files, the write will fail and
# the directory must be cleaned up manually.
# NOTE(review): partitioning by invoice_date creates one partition per distinct
# date — confirm this granularity is intended for the table's size.
try:
    (
        df_final.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("invoice_date")
        .save(output_path)
    )
except Exception as e:
    print("❌ Write failed:", e)
    # Re-raise so the cell (and any job running this notebook) is marked as failed
    # instead of reporting success after a failed write.
    raise
else:
    print("✅ Write successful.")


In [0]:
# Render df_final in its own cell; the same DataFrame is also displayed at the end
# of the previous cell, right after the column reorder.
display(df_final)

In [0]:
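# Debugging aid (left disabled): reload the Bronze table, parse invoice_date with
# to_date, and show up to 5 rows whose invoice_date is NULL after parsing.
# df = spark.read.format("delta").load("/mnt/delta/bronze/finance_invoices")

# df = df.withColumn("invoice_date", to_date(col("invoice_date")))

# df.filter("invoice_date IS NULL").show(5)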