### Importing the modules

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable

### Creating the schema for the gold layer

In [0]:
%sql
-- Create the `gold` schema in the `retail_analytics` catalog.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS retail_analytics.gold;

### Reading data from the silver layer and selecting the needed columns

In [0]:
# Project the silver customer attributes onto the column names used by the
# gold dimension table (`name`, `telephone`, ...), so that the merge
# expressions `src.<column>` resolve against explicitly named columns.
# One row per business key is kept, because the merge matches on customer_id.
dim_cust_df = (
    spark.read.table("retail_analytics.silver.customers")
    .select(
        col("Customer_ID").alias("customer_id"),
        col("Name").alias("name"),
        col("Email").alias("email"),
        # NOTE(review): silver column assumed to be spelled `Telephone` like its
        # siblings; the merge already reads `src.telephone` from this table.
        col("Telephone").alias("telephone"),
        col("City").alias("city"),
        col("Country").alias("country"),
        col("Gender").alias("gender"),
        col("Date_Of_Birth").alias("date_of_birth"),
        col("Job_Title").alias("job_title"),
    )
    .dropDuplicates(["customer_id"])
)

### Adding metadata

In [0]:
cust_silver_df = spark.read.table("retail_analytics.silver.customers")

# Build the merge source in one idempotent step: project the silver columns
# onto the gold column names, keep a single row per customer_id, then stamp
# the audit columns. Previously this cell replaced dim_cust_df with the raw
# silver table, which silently dropped the column selection and the
# de-duplication on the merge key.
dim_cust_df = (
    cust_silver_df
    .select(
        col("Customer_ID").alias("customer_id"),
        col("Name").alias("name"),
        col("Email").alias("email"),
        # NOTE(review): silver column assumed to be spelled `Telephone` like its
        # siblings; the merge already reads `src.telephone` from this table.
        col("Telephone").alias("telephone"),
        col("City").alias("city"),
        col("Country").alias("country"),
        col("Gender").alias("gender"),
        col("Date_Of_Birth").alias("date_of_birth"),
        col("Job_Title").alias("job_title"),
    )
    # Duplicate keys in the source would make the merge ambiguous.
    .dropDuplicates(["customer_id"])
    .withColumn("_created_at", current_timestamp())
    .withColumn("_updated_at", current_timestamp())
)

### Creating the gold table

In [0]:
# Fully qualified (catalog.schema.table) name of the gold customer dimension.
GOLD_CUSTOMERS_TABLE = "retail_analytics.gold.dim_customers"

# DDL for the dimension: an identity-generated surrogate key, the business key
# and customer attributes, plus two audit timestamps. IF NOT EXISTS makes the
# statement a no-op once the table is in place.
create_dim_customers_sql = f"""
CREATE TABLE IF NOT EXISTS {GOLD_CUSTOMERS_TABLE} (
    customer_sk BIGINT GENERATED ALWAYS AS IDENTITY,
    customer_id INT,
    name STRING,
    email STRING,
    telephone STRING,
    city STRING,
    country STRING,
    gender STRING,
    date_of_birth DATE,
    job_title STRING,
    _created_at TIMESTAMP,
    _updated_at TIMESTAMP
)
USING DELTA
"""

spark.sql(create_dim_customers_sql)

DataFrame[]

### Merge process (SCD Type 1)

In [0]:
dim_cust_tbl = DeltaTable.forName(spark, GOLD_CUSTOMERS_TABLE)

# Attributes that are overwritten in place (SCD Type 1) when they change.
scd1_columns = [
    "name",
    "email",
    "telephone",
    "city",
    "country",
    "gender",
    "date_of_birth",
    "job_title",
]

# target column -> source expression, shared by the update and insert clauses.
attribute_values = {column: f"src.{column}" for column in scd1_columns}

# Null-safe change detection: `<=>` treats NULL <=> NULL as true, so a row only
# counts as changed when at least one tracked attribute really differs.
# Without this condition every matched row is rewritten and its `_updated_at`
# is bumped on each run, even when nothing changed.
row_changed = " OR ".join(
    f"NOT (tgt.{column} <=> src.{column})" for column in scd1_columns
)

(
    dim_cust_tbl.alias("tgt")
    .merge(
        dim_cust_df.alias("src"),
        "tgt.customer_id = src.customer_id"
    )
    # Existing customer with changed attributes: overwrite and refresh the
    # update timestamp; `_created_at` is left untouched.
    .whenMatchedUpdate(
        condition=row_changed,
        set={
            **attribute_values,
            "_updated_at": "current_timestamp()",
        },
    )
    # New customer: insert with both audit timestamps set to now.
    # `customer_sk` is omitted because it is GENERATED ALWAYS AS IDENTITY.
    .whenNotMatchedInsert(values={
        "customer_id": "src.customer_id",
        **attribute_values,
        "_created_at": "current_timestamp()",
        "_updated_at": "current_timestamp()",
    })
    .execute()
)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Preview a few rows of the gold dimension; reuse the table-name constant so
# this check always targets the table that was just merged into.
spark.read.table(GOLD_CUSTOMERS_TABLE).limit(5).display()

customer_sk,customer_id,name,email,telephone,city,country,gender,date_of_birth,job_title,_created_at,_updated_at
1,1021,Frank Abbott,frank.abbott@fake_yahoo.com,001-838-768-0484x92375,New York,United States,Male,1982-12-02,International aid/development worker,2026-01-20T05:58:31.756Z,2026-01-20T06:05:26.732Z
2,2737,Daniel Mcgrath,daniel.mcgrath@fake_hotmail.com,+1-510-591-0885x50078,New York,United States,Male,2001-07-01,Not available,2026-01-20T05:58:31.756Z,2026-01-20T06:05:26.732Z
3,2936,Lisa Bradford,lisa.bradford@fake_hotmail.com,(579)444-9109,New York,United States,Female,2001-03-05,Media buyer,2026-01-20T05:58:31.756Z,2026-01-20T06:05:26.732Z
4,3139,Jay Williams,jay.williams@fake_gmail.com,(268)754-7512x32370,New York,United States,Male,1997-08-09,Museum/gallery conservator,2026-01-20T05:58:31.756Z,2026-01-20T06:05:26.732Z
5,3683,Danielle Thomas,danielle.thomas@fake_hotmail.com,892-793-6555x2118,New York,United States,Female,1999-10-21,Furniture conservator/restorer,2026-01-20T05:58:31.756Z,2026-01-20T06:05:26.732Z


In [0]:
# Row count of the gold dimension after the merge; reuse the table-name
# constant instead of repeating the literal.
spark.read.table(GOLD_CUSTOMERS_TABLE).count()

1643306