In [0]:
"""
mock_finance_invoices.py

Loads mock finance invoice data from a raw CSV file and writes it to the Bronze Delta Lake layer.

- Reads data from: `/mnt/raw-ingest/finance_invoice_data.csv`
- Adds metadata columns: `source_file`, `ingestion_type`
- Writes Delta table to: `/mnt/delta/bronze/finance_invoices`

Uses:
- `input_file_name()` to track original file path
- `lit("finance_invoices")` to label ingestion source

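Prerequisite (run in its own cell before the ingest cell):
    The ingest cell calls `write_df_to_delta`, which is expected to be supplied by
    the shared write utilities notebook, e.g.
    %run /Repos/username@databricks.com/databricks-pipelines/.../write_utils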
"""


In [0]:
%run /Repos/brucejenks@live.com/databricks-pipelines/pipeline1_batch_delta/utils/write_utils

In [0]:

from pyspark.sql.functions import input_file_name, lit

# Location of the raw finance invoice CSV in the raw-ingest mount.
path = "/mnt/raw-ingest/finance_invoice_data.csv"

# Configure the reader: treat the first row as a header and infer column types.
csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
invoices_raw = csv_reader.csv(path)

# Tag every row with the file it was read from and a constant ingestion label.
df_finance = invoices_raw.withColumn("source_file", input_file_name())
df_finance = df_finance.withColumn("ingestion_type", lit("finance_invoices"))

# Persist to the Bronze layer. write_df_to_delta is not defined in this cell;
# presumably it comes from the write_utils notebook pulled in via %run (verify).
write_df_to_delta(df_finance, "/mnt/delta/bronze/finance_invoices")


In [0]:
# # Step 1: Read from your bronze Delta table
# df_finance = spark.read.format("delta").load("/mnt/delta/bronze/finance_invoices")

# # Step 2: Create a small DataFrame to act as the right side of the join
# from pyspark.sql import Row

# small_reference_data = spark.createDataFrame([
#     Row(vendor_id="V001", vendor_name="Vendor A"),
#     Row(vendor_id="V002", vendor_name="Vendor B"),
#     Row(vendor_id="V003", vendor_name="Vendor C")
# ])

# # Step 3: Join on a likely common key (adjust if needed based on your schema)
# # Replace "vendor_id" with the actual join key in your df_finance if different
# df_joined = df_finance.join(small_reference_data, on="vendor_id", how="inner")

# # Step 4: Trigger an action so Spark actually executes the join
# # (Adaptive Query Execution only takes effect when the plan is run)
# df_joined.count()