In [0]:
from pyspark.sql.functions import input_file_name, lit

# --- Ingest: raw finance invoice CSV -> bronze Delta ---------------------
# Location of the raw CSV drop.
path = "/mnt/raw-ingest/finance_invoice_data.csv"

# Reader settings: the first line of the file is a header row, and column
# types are inferred from the data instead of being declared up front.
csv_read_options = {"header": "true", "inferSchema": "true"}

# Read the file and tag every row with lineage metadata in one expression:
#   source_file    - value of input_file_name() for the row
#   ingestion_type - constant label identifying this feed
df_finance = (
    spark.read
    .options(**csv_read_options)
    .csv(path)
    .withColumn("source_file", input_file_name())
    .withColumn("ingestion_type", lit("finance_invoices"))
)

# Persist to the bronze location as Delta; "overwrite" mode replaces whatever
# was previously stored at this path.
(
    df_finance.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/delta/bronze/finance_invoices")
)

# Optional sanity check: inferred schema plus the first 10 rows, untruncated.
df_finance.printSchema()
df_finance.show(10, truncate=False)


In [0]:
from pyspark.sql import Row

# --- Join bronze invoices against a small in-memory vendor lookup --------
# Load the bronze Delta data from its storage path.
df_finance = (
    spark.read
    .format("delta")
    .load("/mnt/delta/bronze/finance_invoices")
)

# Tiny hand-built reference table; this is the right-hand side of the join.
small_reference_data = spark.createDataFrame(
    [
        Row(vendor_id=vid, vendor_name=name)
        for vid, name in [
            ("V001", "Vendor A"),
            ("V002", "Vendor B"),
            ("V003", "Vendor C"),
        ]
    ]
)

# Inner join on the shared key, so only invoices whose vendor_id appears in
# the lookup are kept.
# NOTE: this assumes df_finance actually has a "vendor_id" column - swap in
# the real join key if your schema names it differently.
df_joined = df_finance.join(small_reference_data, "vendor_id", "inner")

# Run an action so the join is actually executed (which also lets AQE kick
# in); the row count is the cell's displayed result.
df_joined.count()