In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Notebook parameter controlling the load mode:
#   1 -> initial load (the gold dimension is not read, keys start from 1)
#   0 -> incremental load (existing gold rows and the current max key are read)
dbutils.widgets.text("init_load_flag", "0")
init_load_flag = int(dbutils.widgets.get("init_load_flag"))

# Fail fast on anything other than 0/1. Later cells test "== 0" in one place and
# "== 1" in another, so any other value would take the initial-load path in one
# cell and the incremental path in the other.
if init_load_flag not in (0, 1):
    raise ValueError(f"init_load_flag must be 0 or 1, got {init_load_flag}")

# **Data Consumption**

In [0]:
# Read every column of the silver customers table as the input for this run.
df = spark.table("azuredb_catalog.silver.customers_silver")
display(df)

# **Deduplication**

In [0]:
# Keep a single row per customer_id. No ordering is applied beforehand, so which
# of several duplicate rows is kept is left to Spark.
df = df.dropDuplicates(["customer_id"])


## New vs Old Records filter

In [0]:
# Build df_old: the key/audit columns of customers already present in the gold dimension.
if init_load_flag != 0:

    # Not an incremental run: produce a frame with the same four column names but
    # no rows ("where 1=0"), so the join below treats every customer as new.
    df_old = spark.sql('''select 0 DimCustomerKey, 0 customer_id, 0 create_date, 0 update_date 
                       from azuredb_catalog.silver.customers_silver 
                       where 1=0''')

else:

    # Incremental run: read the existing keys and audit dates from the gold table.
    df_old = spark.sql('''select DimCustomerKey, customer_id, create_date, update_date 
                       from azuredb_catalog.gold.DimCustomers''')


In [0]:
# Inspect the existing dimension rows (empty when the placeholder frame was built).
display(df_old)

## **Renaming df_old Columns**

In [0]:
# Prefix the gold-side columns with "old_" so they stay distinguishable from the
# incoming silver columns after the join.
old_column_names = {
    "DimCustomerKey": "old_DimCustomerKey",
    "customer_id": "old_customer_id",
    "create_date": "old_create_date",
    "update_date": "old_update_date",
}
for current_name, prefixed_name in old_column_names.items():
    df_old = df_old.withColumnRenamed(current_name, prefixed_name)

## **Historical Record Join for Change Detection**

In [0]:
# Left join keeps every incoming customer; the old_* columns are NULL for
# customers that have no matching row on the gold side.
join_condition = df["customer_id"] == df_old["old_customer_id"]
df_join = df.join(df_old, on=join_condition, how="left")

In [0]:
# Inspect the joined frame (incoming columns plus the old_* columns).
display(df_join)

## **New vs Old Records Segregation**

In [0]:
# A NULL old_DimCustomerKey means the left join found no existing row for that
# customer. Note that df_old is rebound here to the matched rows of df_join.
existing_key = df_join["old_DimCustomerKey"]
df_new = df_join.where(existing_key.isNull())
df_old = df_join.where(existing_key.isNotNull())

## Preparing df_old

In [0]:
# Shape the already-known customers for the upsert, in one chain:
#   1. drop the join helper columns that are no longer needed
#   2. restore the original surrogate key name
#   3. carry over the original creation date, cast to a timestamp
#   4. stamp update_date with the current timestamp
df_old = (
    df_old
    .drop("old_customer_id", "old_update_date")
    .withColumnRenamed("old_DimCustomerKey", "DimCustomerKey")
    .withColumnRenamed("old_create_date", "create_date")
    .withColumn("create_date", to_timestamp(col("create_date")))
    .withColumn("update_date", current_timestamp())
)


In [0]:
# Inspect the prepared rows for customers that already exist in the dimension.
display(df_old)

## Preparing df_new

In [0]:
# Shape the new customers for insertion:
#   - drop every old_* helper column, plus 'current_date' if the frame has one
#   - stamp both audit columns with the current timestamp (update_date first,
#     then create_date, which fixes their column order)
df_new = (
    df_new
    .drop("old_DimCustomerKey", "old_customer_id", "old_update_date", "old_create_date", "current_date")
    .withColumn("update_date", current_timestamp())
    .withColumn("create_date", current_timestamp())
)


In [0]:
# Inspect the prepared rows for new customers (no surrogate key assigned yet).
display(df_new)

## Surrogate Key - From 1

In [0]:
# Assign surrogate keys 1..N to the new customers.
#
# row_number() over an ordered window is used instead of monotonically_increasing_id():
# the latter only guarantees unique, increasing values (not consecutive ones), so keys
# jump by large amounts between partitions, and it is non-deterministic, so the keys
# can differ each time this lazily evaluated frame is recomputed (e.g. display vs. merge).
# customer_id is unique after the deduplication step, so ordering by it gives a stable
# numbering. The window has no partitionBy, so all new rows are sorted in one partition.
key_window = Window.orderBy("customer_id")

# Cast to long so the key column keeps the type monotonically_increasing_id() produced.
df_new = df_new.withColumn("DimCustomerKey", row_number().over(key_window).cast("long"))

# **Integrating Max Surrogate Key**

In [0]:
# Determine the offset to add to the new rows' surrogate keys.
if init_load_flag == 1:

    # Initial load: no existing keys, so numbering starts right after 0.
    max_surrogate_key = 0

else:

    df_maxsur = spark.sql("select max(DimCustomerKey) as max_surrogate_key from azuredb_catalog.gold.DimCustomers")

    # Pull the single aggregated value out of the DataFrame into a Python variable.
    # max() over an empty table yields NULL (None here); fall back to 0 so that adding
    # the offset does not turn every new surrogate key into NULL.
    max_surrogate_key = df_maxsur.collect()[0]['max_surrogate_key'] or 0



In [0]:
# Shift the freshly assigned keys past the highest key already in the dimension.
# df_new is rebuilt from itself, so running this cell twice applies the offset twice.
key_offset = lit(max_surrogate_key)
df_new = df_new.withColumn("DimCustomerKey", col("DimCustomerKey") + key_offset)

# **Union of df_old and df_new**

In [0]:
# Stack new and existing customers into one frame; columns are matched by name,
# so the two frames may list them in a different order.
df_final = df_new.unionByName(other=df_old)

In [0]:
# Show the combined frame that is about to be upserted (this cell previously
# displayed df_new, so the union result itself was never inspected).
df_final.display()

# **SCD Type 1 (Upsert)**

In [0]:
from delta.tables import DeltaTable

In [0]:
# SCD Type 1 upsert of df_final into the gold customer dimension.
# The table name and storage location are defined once so the existence check,
# the merge target and the initial write cannot drift apart.
gold_table_name = "azuredb_catalog.gold.DimCustomers"
gold_table_path = "abfss://gold@dbxdev.dfs.core.windows.net/DimCustomers"

if spark.catalog.tableExists(gold_table_name):

    # Resolve the Delta table by the same catalog name that was just checked,
    # rather than by a separately hard-coded storage path.
    dlt_obj = DeltaTable.forName(spark, gold_table_name)

    # Matching surrogate key -> overwrite all columns; no match -> insert the row.
    (
        dlt_obj.alias("target")
        .merge(df_final.alias("source"), "source.DimCustomerKey=target.DimCustomerKey")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

else:

    # First run: create the table at the gold storage location and register it
    # in the catalog under gold_table_name.
    (
        df_final.write
        .mode("overwrite")
        .format("delta")
        .option("path", gold_table_path)
        .saveAsTable(gold_table_name)
    )

In [0]:
%sql
-- Inspect the gold customer dimension after the upsert
SELECT
  *
FROM
  azuredb_catalog.gold.dimcustomers