In [0]:
from pyspark.sql.functions import col, to_date, upper, trim, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, IntegerType
from pyspark.sql.types import BooleanType
# Read the Bronze invoices Delta table.
# NOTE: `df` and `df_clean` are both rebuilt further down in this cell (with an
# integer paid_flag), so this first pass only matters for the steps that
# immediately follow it.
df = spark.read.load("/mnt/delta/bronze/finance_invoices", format="delta")

# Normalise vendor names: strip surrounding whitespace, then upper-case.
df_clean = df.withColumn("vendor", upper(trim(col("vendor"))))

# Convert both date columns with to_date (default format handling).
df_clean = df_clean.withColumn("invoice_date", to_date(col("invoice_date")))
df_clean = df_clean.withColumn("due_date", to_date(col("due_date")))

# Boolean flag: True only when `paid` equals "Yes" exactly; anything else,
# including NULL, becomes False. The source column is dropped afterwards.
df_clean = df_clean.withColumn(
    "paid_flag",
    when(col("paid") == "Yes", True).otherwise(False),
)
df_clean = df_clean.drop("paid")

# Discard rows missing any of the key fields.
df_clean = df_clean.na.drop(subset=["invoice_id", "vendor", "amount_usd"])

# Enforce schema explicitly (optional but good for pipeline consistency)
schema = StructType([
    StructField("invoice_id", StringType(), False),
    StructField("vendor", StringType(), False),
    StructField("amount_usd", DoubleType(), False),
    StructField("invoice_date", DateType(), True),
    StructField("due_date", DateType(), True),
    StructField("paid_flag", BooleanType(), True),
    StructField("source_file", StringType(), True),
    StructField("ingestion_type", StringType(), True),
])

# Apply schema explicitly.
# createDataFrame(rdd, schema) pairs each Row value with a schema field by
# POSITION, not by name. withColumn("paid_flag", ...) appends paid_flag as the
# last column, whereas the schema lists it before source_file/ingestion_type,
# so passing df_clean.rdd directly would misalign the values. Select the
# columns by name in schema order (casting to the declared types) first so
# every value lands in the field it belongs to; any extra Bronze columns are
# dropped and a missing one fails immediately with a clear analysis error.
df_clean = spark.createDataFrame(
    df_clean.select(
        *[col(field.name).cast(field.dataType) for field in schema.fields]
    ).rdd,
    schema=schema,
)

# Clean up existing location (if needed)
# NOTE(review): this deletes silver/finance_invoices, but the write later in
# this cell targets silver/finance_invoices_v2 -- nothing visible here
# recreates the path removed below. Confirm the deletion is still intended.
import shutil  # not used by the visible code; kept in case later cells rely on it
dbutils.fs.rm("dbfs:/mnt/delta/silver/finance_invoices", recurse=True)
from pyspark.sql.functions import col, to_date, upper, trim, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, IntegerType

# Load Bronze
df = spark.read.format("delta").load("/mnt/delta/bronze/finance_invoices")

# Clean and enrich: normalise vendor names (trim + upper-case), convert the two
# date columns with to_date, and replace the textual `paid` column with an
# integer flag (1 when paid == "Yes", otherwise 0 -- including NULL).
df_clean = (
    df.withColumn("vendor", upper(trim(col("vendor"))))
      .withColumn("invoice_date", to_date(col("invoice_date")))
      .withColumn("due_date", to_date(col("due_date")))
      .withColumn("paid_flag", when(col("paid") == "Yes", 1).otherwise(0))
      .drop("paid")
)

# Drop nulls from key fields
df_clean = df_clean.dropna(subset=["invoice_id", "vendor", "amount_usd"])

# NOTE: no paid_flag filter is needed. when(...).otherwise(0) above can only
# produce 1 or 0 (never NULL), so an isin(0, 1) filter would keep every row.
# Unexpected `paid` values are mapped to 0 rather than removed; validate the
# raw `paid` column before it is dropped if such rows must be rejected.

# Report rows whose invoice_date is NULL after to_date. The count used to be
# computed mid-cell and discarded, which ran a Spark job without showing
# anything; printing it makes the check visible.
null_invoice_dates = df_clean.filter(col("invoice_date").isNull()).count()
print(f"Rows with NULL invoice_date: {null_invoice_dates}")
# Define schema: the target Silver layout, with paid_flag stored as an integer.
schema = StructType([
    StructField("invoice_id", StringType(), False),
    StructField("vendor", StringType(), False),
    StructField("amount_usd", DoubleType(), False),
    StructField("invoice_date", DateType(), True),
    StructField("due_date", DateType(), True),
    StructField("paid_flag", IntegerType(), True),
    StructField("source_file", StringType(), True),
    StructField("ingestion_type", StringType(), True),
])

# Apply schema.
# createDataFrame(rdd, schema) pairs each Row value with a schema field by
# POSITION, not by name. paid_flag was appended by withColumn and is therefore
# the last column of df_clean, while the schema lists it before
# source_file/ingestion_type. Select the columns by name in schema order
# (casting to the declared types) before the conversion so values cannot be
# misaligned; extra Bronze columns are dropped and a missing column fails
# immediately with a clear analysis error.
df_clean = spark.createDataFrame(
    df_clean.select(
        *[col(field.name).cast(field.dataType) for field in schema.fields]
    ).rdd,
    schema=schema,
)


# Optional: Clean up the destination to ensure no conflict
# NOTE(review): removing the directory also removes any existing Delta
# transaction log at this path; mode("overwrite") with overwriteSchema already
# replaces data and schema. Confirm the explicit delete is still required.
dbutils.fs.rm("dbfs:/mnt/delta/silver/finance_invoices_v2", recurse=True)

# Write cleanly: overwrite the Silver table, partitioned by invoice_date.
try:
    df_clean.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .partitionBy("invoice_date") \
        .save("dbfs:/mnt/delta/silver/finance_invoices_v2")
except Exception as e:
    # The destination was deleted above, so a failed write leaves no table at
    # this path. Log the error, then re-raise so the cell/job is marked as
    # failed instead of silently continuing.
    print("❌ Write failed:", e)
    raise
else:
    print("✅ Write successful.")



In [0]:
# Optional: Clean up the destination to ensure no conflict
dbutils.fs.rm("dbfs:/mnt/delta/silver/finance_invoices_v2", recurse=True)

# Write cleanly.
# Write the cleaned, schema-enforced `df_clean`, not `df`: `df` is the raw
# Bronze DataFrame (still carrying the `paid` column, with no vendor/date
# normalisation or key-field null handling), and writing it here would replace
# the Silver table with uncleaned data.
df_clean.write.format("delta") \
  .mode("overwrite") \
  .option("overwriteSchema", "true") \
  .partitionBy("invoice_date") \
  .save("dbfs:/mnt/delta/silver/finance_invoices_v2")



In [0]:
# Diagnostic: reload Bronze, convert invoice_date with to_date, and display up
# to five rows whose converted invoice_date is NULL.
df = (
    spark.read.load("/mnt/delta/bronze/finance_invoices", format="delta")
    .withColumn("invoice_date", to_date(col("invoice_date")))
)

df.filter(col("invoice_date").isNull()).show(5)