In [0]:
%sql
-- Make sure the schema that holds the gold-layer tables exists; IF NOT EXISTS
-- keeps this cell safe to re-run.
CREATE DATABASE IF NOT EXISTS hive_metastore.bankdb;

In [0]:
%sql
-- Register the gold-layer accounts table in hive_metastore.bankdb as a Delta
-- table stored at /mnt/project2/gold/accounts. IF NOT EXISTS makes the cell
-- a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS hive_metastore.bankdb.accounts (
    -- account_id is the key the load cell later in this notebook merges on.
    account_id INT,
    customer_id INT,
    account_type STRING,
    -- NOTE(review): FLOAT is an approximate type; DECIMAL may be a better fit
    -- for a balance. Confirm before changing: data already written at LOCATION
    -- would need to be migrated.
    balance FLOAT,
    -- Audit columns and HashKey are populated by the load cell later in this
    -- notebook (timestamps, a fixed user label, and a CRC32 of the data columns).
    CreatedDate TIMESTAMP,
    UpdatedDate TIMESTAMP,
    CreatedBy STRING,
    UpdatedBy STRING,
    HashKey BIGINT
)
USING DELTA
LOCATION '/mnt/project2/gold/accounts';

In [0]:
%sql
-- Register the gold-layer customers table in hive_metastore.bankdb as a Delta
-- table stored at /mnt/project2/gold/customers. IF NOT EXISTS makes the cell
-- a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS hive_metastore.bankdb.customers (
    -- customer_id is the key the load cell later in this notebook merges on.
    customer_id INT,
    first_name STRING,
    last_name STRING,
    address STRING,
    city STRING,
    state STRING,
    -- Kept as STRING: the sample output below shows alphanumeric postal codes.
    zip STRING,
    -- Audit columns and HashKey are populated by the load cell later in this
    -- notebook (timestamps, a fixed user label, and a CRC32 of the data columns).
    CreatedDate TIMESTAMP,
    UpdatedDate TIMESTAMP,
    CreatedBy STRING,
    UpdatedBy STRING,
    HashKey BIGINT
)
USING DELTA
LOCATION '/mnt/project2/gold/customers';

In [0]:
%sql
-- Register the gold-layer loans table in hive_metastore.bankdb as a Delta
-- table stored at /mnt/project2/gold/loans. IF NOT EXISTS makes the cell
-- a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS hive_metastore.bankdb.loans (
    -- loan_id is the key the load cell later in this notebook merges on.
    loan_id INT,
    customer_id INT,
    -- NOTE(review): FLOAT is an approximate type; DECIMAL may be a better fit
    -- for amounts and rates. Confirm before changing: data already written at
    -- LOCATION would need to be migrated.
    loan_amount FLOAT,
    interest_rate FLOAT,
    -- NOTE(review): unit of loan_term is not stated here (sample values are
    -- 24/36/48/60, presumably months) — confirm with the data owner.
    loan_term INT,
    -- Audit columns and HashKey are populated by the load cell later in this
    -- notebook (timestamps, a fixed user label, and a CRC32 of the data columns).
    CreatedDate TIMESTAMP,
    UpdatedDate TIMESTAMP,
    CreatedBy STRING,
    UpdatedBy STRING,
    HashKey BIGINT
)
USING DELTA
LOCATION '/mnt/project2/gold/loans'

In [0]:
%sql
-- Register the gold-layer loan_payments table in hive_metastore.bankdb as a
-- Delta table stored at /mnt/project2/gold/loan_payments. IF NOT EXISTS makes
-- the cell a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS hive_metastore.bankdb.loan_payments (
    -- payment_id is the key the load cell later in this notebook merges on.
    payment_id INT,
    loan_id INT,
    payment_date DATE,
    -- NOTE(review): FLOAT is an approximate type; DECIMAL may be a better fit
    -- for a payment amount. Confirm before changing: data already written at
    -- LOCATION would need to be migrated.
    payment_amount FLOAT,
    -- Audit columns and HashKey are populated by the load cell later in this
    -- notebook (timestamps, a fixed user label, and a CRC32 of the data columns).
    CreatedDate TIMESTAMP,
    UpdatedDate TIMESTAMP,
    CreatedBy STRING,
    UpdatedBy STRING,
    HashKey BIGINT
)
USING DELTA
LOCATION '/mnt/project2/gold/loan_payments';


In [0]:
%sql
-- Register the gold-layer transactions table in hive_metastore.bankdb as a
-- Delta table stored at /mnt/project2/gold/transactions. IF NOT EXISTS makes
-- the cell a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS hive_metastore.bankdb.transactions (
    -- transaction_id is the key the load cell later in this notebook merges on.
    transaction_id INT,
    account_id INT,
    transaction_date DATE,
    -- NOTE(review): FLOAT is an approximate type; DECIMAL may be a better fit
    -- for a transaction amount. Confirm before changing: data already written
    -- at LOCATION would need to be migrated.
    transaction_amount FLOAT,
    transaction_type STRING,
    -- Audit columns and HashKey are populated by the load cell later in this
    -- notebook (timestamps, a fixed user label, and a CRC32 of the data columns).
    CreatedDate TIMESTAMP,
    UpdatedDate TIMESTAMP,
    CreatedBy STRING,
    UpdatedBy STRING,
    HashKey BIGINT
)
USING DELTA
LOCATION '/mnt/project2/gold/transactions';

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_date, current_timestamp, lit , col , crc32, concat_ws


In [0]:
# Load configuration: one entry per silver -> gold table pair.
# Each entry gives the cleaned silver Delta path to read from, the gold Delta
# path to merge into, and the column used as the merge key.
tables = [
    {
        "source_path": f"/mnt/project2/silver/{entity}_cleaned",
        "target_path": f"/mnt/project2/gold/{entity}",
        "primary_key": key_column,
    }
    for entity, key_column in (
        ("accounts", "account_id"),
        ("customers", "customer_id"),
        ("loans", "loan_id"),
        ("loan_payments", "payment_id"),
        ("transactions", "transaction_id"),
    )
]


In [0]:

# Incrementally load every configured silver table into its gold Delta table.
# For each entry in `tables`: read the cleaned silver data, stamp audit columns
# and a CRC32 HashKey, then MERGE into the gold table on the primary key
# (matched rows are updated only when a data column changed; unmatched rows
# are inserted).

# Columns maintained by this job rather than supplied by the source data. They
# are excluded from both the hash input and the change-detection condition.
audit_columns = ["CreatedDate", "CreatedBy", "UpdatedDate", "UpdatedBy", "HashKey"]

# Loop through each table
for table in tables:
    primary_key = table["primary_key"]

    # Step 1: Read Source Data
    source_df = spark.read.format("delta").load(table["source_path"])

    # Step 2: Drop rows where Primary Key is NULL (they can never match the merge key)
    source_df = source_df.dropna(subset=[primary_key])

    # Data columns as delivered by the source, in their original order.
    business_columns = [
        col_name for col_name in source_df.columns
        if col_name not in audit_columns
    ]

    # Step 3: Add Metadata Columns for Insert Operation
    source_df = source_df.withColumn("CreatedDate", current_timestamp()) \
                         .withColumn("UpdatedDate", current_timestamp()) \
                         .withColumn("CreatedBy", lit("databricks")) \
                         .withColumn("UpdatedBy", lit("databricks"))

    # Step 4: Add HashKey Column using CRC32 (ignoring metadata fields)
    # NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
    # which column is NULL can share a HashKey. The hash is stored for reference
    # only; change detection in Step 7 compares the columns directly. The formula
    # is left unchanged so hashes already stored in gold stay comparable.
    source_df = source_df.withColumn(
        "HashKey",
        crc32(concat_ws("||", *business_columns))
    )

    # Step 5: Read Target Gold Table
    target_delta = DeltaTable.forPath(spark, table["target_path"])

    # Step 6: Prepare update and insert mappings
    # On update the key and the creation audit columns are preserved; the
    # "updated" audit columns are re-stamped and everything else comes from source.
    update_set = {}
    for col_name in source_df.columns:
        if col_name in (primary_key, "CreatedDate", "CreatedBy"):
            continue
        if col_name == "UpdatedDate":
            update_set[col_name] = "current_timestamp()"
        elif col_name == "UpdatedBy":
            update_set[col_name] = "'databricks-updated'"
        else:
            update_set[col_name] = f"source.{col_name}"

    insert_values = {col_name: f"source.{col_name}" for col_name in source_df.columns}

    # Step 7: Create dynamic comparison condition (only update if any value changed)
    # Uses the null-safe equality operator (<=>): a plain `<>` evaluates to NULL
    # when either side is NULL, so a change from NULL to a value (or back) would
    # never satisfy the condition and the row would silently not be updated.
    compare_condition = " OR ".join(
        f"NOT (target.{col_name} <=> source.{col_name})"
        for col_name in business_columns
    )

    # Step 8: Perform Merge Operation
    (
        target_delta.alias("target")
        .merge(
            source_df.alias("source"),
            f"target.{primary_key} = source.{primary_key}"
        )
        .whenMatchedUpdate(
            condition=compare_condition,
            set=update_set
        )
        .whenNotMatchedInsert(
            values=insert_values
        )
        .execute()
    )

# Step 9: Show the contents of every gold Delta table as a visual check of the load
for table in tables:
    gold_path = table["target_path"]
    print(f"Showing Data from: {gold_path}")
    gold_df = spark.read.format("delta").load(gold_path)
    display(gold_df)
    print("\n--------------------------\n")


Showing Data from: /mnt/project2/gold/accounts


account_id,customer_id,account_type,balance,CreatedDate,UpdatedDate,CreatedBy,UpdatedBy,HashKey
31,71,Savings,125.75,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,1980638888
85,65,Savings,800.25,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,2548014642
65,69,Savings,550.25,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,304827082
53,86,Savings,400.25,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,2004993884
78,4,Checking,7900.5,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,1018885386
34,41,Checking,3500.5,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,482012914
81,70,Savings,750.25,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,3251687720
28,7,Checking,2900.0,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,610557400
76,22,Checking,7700.0,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,778329537
26,25,Checking,2800.5,2025-04-27T18:34:43.196Z,2025-04-27T18:34:43.196Z,databricks,databricks,781115167



--------------------------

Showing Data from: /mnt/project2/gold/customers


customer_id,first_name,last_name,address,city,state,zip,CreatedDate,UpdatedDate,CreatedBy,UpdatedBy,HashKey
31,David,Sanchez,3030 Maple Ave,North Bay,ON,P1B0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,3042266065
65,Daniel,Bryant,6464 Redwood Dr,Elmvale,ON,L0L0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,194000654
53,James,Jenkins,5252 Willow Rd,Queensville,ON,L0G0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,3582412600
78,Abigail,Cole,7777 Fir St,Sundridge,ON,P0A0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,481330017
34,Olivia,Reed,3333 Birch Blvd,Orillia,ON,L3V0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,3830528236
81,Michael,Owens,8080 Willow Rd,Mattawa,ON,P0H0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,3690495345
28,Emily,Edwards,2727 Beech Dr,Brantford,ON,N3T0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,279532449
76,Evelyn,Wallace,7575 Birch Blvd,Huntsville,ON,P1H0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,3949251898
26,Abigail,Parker,2525 Poplar St,Barrie,ON,L4M0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,3632072372
27,James,Evans,2626 Ash Blvd,Guelph,ON,N1H0A1,2025-04-27T18:34:45.537Z,2025-04-27T18:34:45.537Z,databricks,databricks,22743571



--------------------------

Showing Data from: /mnt/project2/gold/loans


loan_id,customer_id,loan_amount,interest_rate,loan_term,CreatedDate,UpdatedDate,CreatedBy,UpdatedBy,HashKey
31,71,10000.75,6.5,60,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,2873838954
85,65,25000.25,5.0,36,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,1058304091
65,69,25000.25,5.5,36,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,1662139857
53,86,15000.25,5.0,36,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,3538448823
78,4,27500.5,4.0,48,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,3789958153
34,41,30000.5,4.5,48,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,3870156656
81,70,10000.25,5.5,36,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,56734823
28,7,27500.0,3.5,24,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,1830646000
76,22,17500.0,3.5,24,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,2703813577
26,25,17500.5,4.5,48,2025-04-27T18:34:47.702Z,2025-04-27T18:34:47.702Z,databricks,databricks,2136442269



--------------------------

Showing Data from: /mnt/project2/gold/loan_payments


payment_id,loan_id,payment_date,payment_amount,CreatedDate,UpdatedDate,CreatedBy,UpdatedBy,HashKey
31,42,2024-01-31,1600.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,1546514440
85,36,2024-03-25,4300.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,2985753122
65,16,2024-03-05,3300.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,3969983274
53,84,2024-02-22,2700.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,4201604407
78,59,2024-03-18,3950.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,181842987
34,75,2024-02-03,1750.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,3932526866
81,92,2024-03-21,4100.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,3901577614
28,9,2024-01-28,1450.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,2490852440
76,37,2024-03-16,3850.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,429584168
26,87,2024-01-26,1350.0,2025-04-27T18:34:49.528Z,2025-04-27T18:34:49.528Z,databricks,databricks,1765597607



--------------------------

Showing Data from: /mnt/project2/gold/transactions


transaction_id,account_id,transaction_date,transaction_amount,transaction_type,CreatedDate,UpdatedDate,CreatedBy,UpdatedBy,HashKey
31,71,2024-01-31,100.5,Deposit,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,4072259915
85,65,2024-03-25,250.0,Deposit,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,23840219
65,69,2024-03-05,250.0,Deposit,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,3647168622
53,86,2024-02-22,150.0,Deposit,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,3540279397
78,4,2024-03-18,275.75,Withdrawal,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,2084232636
34,41,2024-02-03,300.25,Withdrawal,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,2765226270
81,70,2024-03-21,100.5,Deposit,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,650618025
28,7,2024-01-28,275.75,Withdrawal,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,701320262
76,22,2024-03-16,175.0,Withdrawal,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,2724838028
26,25,2024-01-26,175.0,Withdrawal,2025-04-27T18:34:51.348Z,2025-04-27T18:34:51.348Z,databricks,databricks,1658304691



--------------------------

