# Read Source Tables

In [0]:
# Silver-layer `crm_customers` table; preview two rows as a quick smoke check.
customers = spark.table("workspace.silver_pyspark.crm_customers")
customers.limit(2).display()

In [0]:
# Silver-layer `erp_cust_az` table; preview two rows as a quick smoke check.
cust_az = spark.table("workspace.silver_pyspark.erp_cust_az")
cust_az.limit(2).display()

In [0]:
# Silver-layer `erp_loc_a101` table; preview two rows as a quick smoke check.
cust_loc = spark.table("workspace.silver_pyspark.erp_loc_a101")
cust_loc.limit(2).display()

In [0]:
from pyspark.sql.functions import col, when, coalesce, lit

# Data-quality probe: list the gender pairs on which the CRM and ERP sources disagree.
# The joins mirror the ones used for the dimension build, so the same row set is inspected.
(
    customers.alias("c")
    .join(
        cust_az.alias("ca"),
        col("c.customer_number") == col("ca.customer_number"),
        "left",
    )
    .join(
        cust_loc.alias("cl"),
        col("c.customer_number") == col("cl.customer_number"),
        "left",
    )
    # Filter while the table aliases are still unambiguous. `!=` evaluates to NULL
    # when either side is NULL, so rows missing a gender on either side are excluded.
    .where(col("c.gender") != col("ca.gender"))
    # Give the two columns distinct output names: projecting both as plain `gender`
    # leaves two identically named columns in the result.
    .select(
        col("c.gender").alias("crm_gender"),
        col("ca.gender").alias("erp_gender"),
    )
).display()

In [0]:
from pyspark.sql.functions import col,when,coalesce,lit
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
# Surrogate key window: one global sequence ordered by the CRM customer_id.
window_spec = Window.orderBy(col("c.customer_id"))

# Gender rule: keep the CRM value unless it is the 'n/a' placeholder; otherwise use
# the ERP value, defaulting to 'n/a' when that is NULL. A NULL CRM value also falls
# through to the ERP branch, because the `!=` comparison is then not true.
resolved_gender = when(col("c.gender") != "n/a", col("c.gender")).otherwise(
    coalesce(col("ca.gender"), lit("n/a"))
)

# Left joins keep every CRM customer even when the ERP tables have no match.
crm = customers.alias("c")
erp_az = cust_az.alias("ca")
erp_loc = cust_loc.alias("cl")

joined = (
    crm
    .join(erp_az, col("c.customer_number") == col("ca.customer_number"), "left")
    .join(erp_loc, col("c.customer_number") == col("cl.customer_number"), "left")
)

# Final projection of the customer dimension.
dim_cust = joined.select(
    row_number().over(window_spec).alias("customer_key"),
    col("c.customer_id"),
    col("c.customer_number"),
    col("c.customer_name"),
    col("c.marital_status"),
    resolved_gender.alias("gender"),
    col("ca.birthdate"),
    col("cl.country"),
)

In [0]:
# Render the assembled dimension for a visual check before it is persisted.
display(dim_cust)

# Drop the Target Table if It Already Exists

In [0]:
# Remove any previous copy of the gold table; IF EXISTS makes this a no-op on a first run.
# NOTE(review): the write cell already uses mode('overwrite'); dropping first presumably
# also discards the table's earlier Delta history — confirm that is intended.
spark.sql("DROP TABLE IF EXISTS workspace.gold_pyspark.dim_customers")

# Create the Target Table and Load the Joined Output into It

In [0]:
# Persist the dimension as a Delta table, replacing any existing contents.
(
    dim_cust.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.gold_pyspark.dim_customers")
)

# Sanity Checks on the Target Table

In [0]:
# Read the persisted table back under its own name. Rebinding `dim_cust` here would
# replace the transformation DataFrame with a reader over the gold table, so re-running
# the drop/write cells afterwards would read from the table that was just dropped.
dim_cust_gold = spark.table('workspace.gold_pyspark.dim_customers')
dim_cust_gold.display()

# View the Table's Change History

In [0]:
# Show the Delta transaction log entries recorded for the gold table.
display(spark.sql("DESCRIBE HISTORY workspace.gold_pyspark.dim_customers"))