In [0]:
# Copy the silver card transactions into a gold Delta table, replacing any
# previous contents of the target.
df = spark.read.table("fraud_lakehouse.silver.silver_card_transactions")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("fraud_lakehouse.gold.gold_card_transactions")
)

# Read the table back and render it in the notebook.
spark.read.table("fraud_lakehouse.gold.gold_card_transactions").display()

In [0]:
from pyspark.sql.functions import (
    col, sha2, concat_ws, lit, current_timestamp, 
    date_format, broadcast, when, to_timestamp
)
from delta.tables import *




# Source tables in the silver layer. These names start out as table-name
# strings; later cells replace them with the DataFrames loaded from them.
df_merch_master = "fraud_lakehouse.silver.dim_merchant_master"
df_cust_master = "fraud_lakehouse.silver.dim_customer_master"
df_silver_txn = "fraud_lakehouse.silver.silver_card_transactions"

# Target tables in the gold layer.
gold_dim_merch = "fraud_lakehouse.gold.dim_merchants"
# Singular "dim_customer": the name the customer-dimension cell writes to and
# the fact cell reads from.
gold_dim_cust = "fraud_lakehouse.gold.dim_customer"
gold_fact_txn = "fraud_lakehouse.gold.fact_transactions"

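Merchant dimension
1. Generate the surrogate key (`merchant_sk`, a SHA-256 hash of `merchant_id`).
2. Write / merge logic: create the gold table on the first run; on later runs apply an SCD Type 1 merge (update existing rows, insert new ones).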

In [0]:
# Build the merchant dimension and upsert it into the gold layer (SCD Type 1).

# Load the source only while `df_merch_master` still holds the table name, so
# re-running this cell does not hand a DataFrame to spark.read.table().
if isinstance(df_merch_master, str):
    df_merch_master = spark.read.table(df_merch_master)

dim_merch_df = (
    df_merch_master
    .select(
        col("merchant_id"),
        col("merchant_name"),
        col("merchant_category"),
        col("merchant_country"),
        col("is_online_only")
    )
    .distinct()
    # Surrogate key: SHA-256 of the merchant_id.
    .withColumn("merchant_sk", sha2(concat_ws("||", col("merchant_id")), 256))
    .withColumn("updated_at", current_timestamp())
)


if not spark.catalog.tableExists(gold_dim_merch):
    # First run: create the gold table from the full source.
    print(f"Creating {gold_dim_merch}...")
    dim_merch_df.write.format("delta").mode("overwrite").saveAsTable(gold_dim_merch)
else:
    print(f"Updating {gold_dim_merch}...")
    target_merch = DeltaTable.forName(spark, gold_dim_merch)

    # Matched merchants are overwritten with the source values (including a
    # fresh updated_at); unmatched merchants are inserted.
    (target_merch.alias("t")
        .merge(
            dim_merch_df.alias("s"),
            "t.merchant_id = s.merchant_id"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )


In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable


# Build the customer dimension as SCD Type 2: a change in the tracked
# attributes closes the current row and appends a new current row.
gold_dim_cust = "fraud_lakehouse.gold.dim_customer"


# Load the source only while `df_cust_master` still holds the table name, so
# the cell can be re-run.
if isinstance(df_cust_master, str):
    df_cust_master = spark.table(df_cust_master)


# row_hash fingerprints the tracked attributes (risk_segment, home_country);
# a different hash for the same customer means a new version is needed.
source_cust = (
    df_cust_master
    .select("customer_id", "home_country", "risk_segment", "signup_date")
    .distinct()
    .withColumn(
        "row_hash",
        sha2(concat_ws("||", col("risk_segment"), col("home_country")), 256)
    )
)


if not spark.catalog.tableExists(gold_dim_cust):

    # First run: every source row becomes the current version.
    (
        source_cust
        .withColumn(
            "customer_sk",
            sha2(concat_ws("||", col("customer_id"), col("row_hash")), 256)
        )
        .withColumn("effective_from", current_timestamp())
        .withColumn("effective_to", lit(None).cast("timestamp"))
        .withColumn("is_current", lit(True))
        .write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(gold_dim_cust)
    )

else:

    existing_cols = spark.table(gold_dim_cust).columns

    # Tables created without row_hash get the column added here; rows that
    # already exist then hold NULL in it.
    if "row_hash" not in existing_cols:
        spark.sql(f"""
            ALTER TABLE {gold_dim_cust}
            ADD COLUMNS (row_hash STRING)
        """)

    target_table = DeltaTable.forName(spark, gold_dim_cust)

    # Rows that need a new version: customers with no current row, or whose
    # hash differs from the current row. The comparison is null-safe so that
    # current rows with a NULL row_hash (see the ALTER TABLE above) count as
    # changed instead of being silently dropped by a NULL comparison.
    updates_df = (
        source_cust.alias("s")
        .join(
            target_table.toDF().alias("t"),
            (col("s.customer_id") == col("t.customer_id")) &
            (col("t.is_current") == True),
            "left"
        )
        .where(
            col("t.customer_id").isNull() |
            (~col("s.row_hash").eqNullSafe(col("t.row_hash")))
        )
        .select(
            col("s.customer_id"),
            col("s.home_country"),
            col("s.risk_segment"),
            col("s.signup_date"),
            col("s.row_hash")
        )
    )

    if not updates_df.isEmpty():

        # Step 1: close the current row of every changed customer.
        # `<=>` is Spark SQL's null-safe equality, matching the filter above.
        (
            target_table.alias("t")
            .merge(
                updates_df.alias("s"),
                "t.customer_id = s.customer_id AND t.is_current = true"
            )
            .whenMatchedUpdate(
                condition="NOT (t.row_hash <=> s.row_hash)",
                set={
                    "is_current": lit(False),
                    "effective_to": current_timestamp()
                }
            )
            .execute()
        )

        # Step 2: append the new current rows (new and changed customers).
        (
            updates_df
            .withColumn(
                "customer_sk",
                sha2(concat_ws("||", col("customer_id"), col("row_hash")), 256)
            )
            .withColumn("effective_from", current_timestamp())
            .withColumn("effective_to", lit(None).cast("timestamp"))
            .withColumn("is_current", lit(True))
            .write
            .format("delta")
            .mode("append")
            .saveAsTable(gold_dim_cust)
        )


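CDC (change data capture): build the transaction fact table and merge new or changed transactions into gold incrementally.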

In [0]:
%python
from pyspark.sql.functions import col, lit, when, broadcast, date_format, to_timestamp, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable


# Build the transaction fact table from silver and upsert it into the gold layer.
df_silver_txn = spark.read.table("fraud_lakehouse.silver.silver_card_transactions")

# Only the current version of each customer is used to resolve customer_fk.
dim_cust_active = (
    spark.read.table("fraud_lakehouse.gold.dim_customer")
    .filter(col("is_current") == True)
)

dim_merch_all = spark.read.table("fraud_lakehouse.gold.dim_merchants")

fx_df = spark.read.table("fraud_lakehouse.silver.dim_fx_rates").select("country", "usd_rate")


# Drop any usd_rate / transaction_amount_usd carried in silver and recompute
# the USD amount from the FX table. With the left join, a transaction whose
# country has no rate gets a NULL USD amount.
df_txn_clean = (
    df_silver_txn
    .drop("usd_rate", "transaction_amount_usd")
    .withColumn("transaction_ts", to_timestamp(col("transaction_ts")))
    .join(fx_df, "country", "left")
    .withColumn(
        "transaction_amount_usd",
        col("transaction_amount") * col("usd_rate")
    )
)


# Resolve the dimension keys. Left joins keep every transaction; a missing
# dimension row is mapped to the placeholder key "-1".
staged_fact = (
    df_txn_clean.alias("txn")
    .join(
        broadcast(dim_cust_active).alias("c"),
        col("txn.customer_id") == col("c.customer_id"),
        "left"
    )
    .join(
        broadcast(dim_merch_all).alias("m"),
        col("txn.merchant_id") == col("m.merchant_id"),
        "left"
    )
    .select(
        col("txn.transaction_id"),

        when(col("c.customer_sk").isNull(), lit("-1"))
            .otherwise(col("c.customer_sk")).alias("customer_fk"),

        when(col("m.merchant_sk").isNull(), lit("-1"))
            .otherwise(col("m.merchant_sk")).alias("merchant_fk"),

        col("txn.transaction_amount").cast("double").alias("amount"),
        col("txn.transaction_amount_usd").cast("double").alias("amount_usd"),

        # Explicit alias so the output column name does not depend on how the
        # cast expression is auto-named.
        col("txn.fraud_flag").cast("int").alias("fraud_flag"),
        col("txn.card_type"),
        col("txn.device_type"),
        col("txn.transaction_ts"),

        date_format(col("txn.transaction_ts"), "yyyy-MM-dd").alias("date_key")
    )
)

# Deduplicate staged_fact - keep the most recent record per (transaction_id, date_key)
window_spec = Window.partitionBy("transaction_id", "date_key").orderBy(col("transaction_ts").desc())
staged_fact_deduped = (
    staged_fact
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

gold_fact_txn = "fraud_lakehouse.gold.fact_transactions"

if not spark.catalog.tableExists(gold_fact_txn):
    # First run: create the fact table, partitioned by date_key.
    (
        staged_fact_deduped.write
        .format("delta")
        .partitionBy("date_key")
        .mode("overwrite")
        .saveAsTable(gold_fact_txn)
    )
else:
    target_fact = DeltaTable.forName(spark, gold_fact_txn)

    # Existing transactions only have fraud_flag refreshed; new transactions
    # are inserted. `<=>` is null-safe equality, so a flag moving between
    # NULL and a value is treated as a change (plain `<>` evaluates to NULL
    # there and would skip the update).
    (
        target_fact.alias("t")
        .merge(
            staged_fact_deduped.alias("s"),
            "t.transaction_id = s.transaction_id AND t.date_key = s.date_key"
        )
        .whenMatchedUpdate(
            condition="NOT (t.fraud_flag <=> s.fraud_flag)",
            set={"fraud_flag": col("s.fraud_flag")}
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

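Table maintenance
- OPTIMIZE with Z-Order co-locates rows with similar `customer_fk` / `merchant_fk` values in the same files, so filters on those columns read less data.
- VACUUM cleans up old data files that the table no longer references (retention: 7 days = 168 hours).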


In [0]:

# Maintenance for the gold fact table: OPTIMIZE with Z-ordering on the two
# foreign-key columns, then VACUUM.
print("Optimizing Fact Table Layout...")
optimize_stmt = """
    OPTIMIZE fraud_lakehouse.gold.fact_transactions
    ZORDER BY (customer_fk, merchant_fk)
"""
spark.sql(optimize_stmt)

# 168 hours = 7 days of retention.
vacuum_stmt = "VACUUM fraud_lakehouse.gold.fact_transactions RETAIN 168 HOURS"
spark.sql(vacuum_stmt)

print("Gold Layer Star Schema built successfully!")

In [0]:
%sql
-- Set the delta.enableChangeDataFeed table property on the gold fact table.
-- NOTE(review): this runs after the fact table has been written, so presumably
-- only later changes are recorded in the feed; confirm that is intended.
ALTER TABLE fraud_lakehouse.gold.fact_transactions
SET TBLPROPERTIES (
  delta.enableChangeDataFeed = true
)