# In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import upper, trim, col

# Enrich silver finance invoices with vendor attributes and persist the
# result as a Delta table. Runs top to bottom as a notebook cell; the names
# spark, df_finance, df_vendors and df_joined remain defined afterwards.
spark = SparkSession.builder.getOrCreate()

# -------------------------------------
# Step 1: Load Clean Silver Data
# -------------------------------------
df_finance = spark.read.format("delta").load("dbfs:/mnt/delta/silver/finance_invoices_v2")
df_vendors = spark.read.format("delta").load("/mnt/delta/silver/vendors_clean")

# -------------------------------------
# Step 2: Standardize join keys
# -------------------------------------
# Trim and upper-case both sides so keys that differ only by surrounding
# whitespace or letter case still match in the join.
df_finance = df_finance.withColumn("vendor_id", upper(trim(col("vendor_id"))))
df_vendors = df_vendors.withColumn("vendor_id", upper(trim(col("vendor_id"))))

# -------------------------------------
# Step 3: Join on vendor_id
# -------------------------------------
# Invoice-side columns carried into the output, in output order.
invoice_columns = [
    "invoice_id", "vendor_id", "amount_usd", "invoice_date", "due_date",
    "paid_flag", "source_file", "ingestion_type",
]

# Project the vendor side down to the key plus the attributes we need BEFORE
# joining, so a column name that exists in both tables cannot become an
# ambiguous reference after the join.
# The key normalization above can collapse distinct raw ids into the same
# value; keeping one vendor row per key guarantees the left join cannot
# multiply invoice rows.
# NOTE(review): if two vendor rows share a normalized id but carry different
# attributes, dropDuplicates keeps an arbitrary one - confirm that is acceptable.
vendor_lookup = (
    df_vendors
    .select("vendor_id", col("name").alias("vendor_name"), "location", "rating")
    .dropDuplicates(["vendor_id"])
)

# Left join keeps every invoice; invoices without a matching vendor get
# null vendor_name / location / rating. The final select restores the
# intended column order (the join puts the key column first).
df_joined = (
    df_finance
    .select(*invoice_columns)
    .join(vendor_lookup, on="vendor_id", how="left")
    .select(*invoice_columns, "vendor_name", "location", "rating")
)

# -------------------------------------
# Step 4: Write Silver Join Output
# -------------------------------------
# Replaces the table contents on every run.
# NOTE(review): mergeSchema allows new columns to be added to an existing
# table schema; if a column type ever changes, overwriteSchema is likely the
# option needed instead - confirm against the Delta Lake docs.
df_joined.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save("/mnt/delta/silver/finance_with_vendor_info")

print("✅ Silver join completed.")

# Notebook preview of the joined result (display is a Databricks built-in).
display(df_joined)