In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [2]:
# Notebook configuration: where to read from, where to write to, and which
# columns identify a sales person in the dimension.
source_schema = "silver_sales"        # schema holding the cleaned source tables
target_schema = "gold_sales"          # schema holding the DimSalesPerson dimension
surrogate_key = "DimSalesPersonKey"   # generated key column of the dimension
key_col = "SalesPersonID"             # business key used to match source to target

In [3]:
# Build the source snapshot for the dimension: every row of SalesPerson,
# LEFT JOINed to Person, PersonPhone, EmailAddress, PhoneNumberType and
# SalesTerritory in `source_schema`, so sales people without a matching
# lookup row are kept with NULLs in those columns.
#
# Three housekeeping columns are added to every row:
#   StartDate = CURRENT_TIMESTAMP(), EndDate = 3000-12-31, IsCurrent = 'Y'.
#
# NOTE(review): if a person can have more than one PersonPhone or
# EmailAddress row, these joins return several rows per SalesPersonID --
# confirm the source tables are unique per BusinessEntityID.
df_src = spark.sql(f"""
                SELECT
                    sp.BusinessEntityID AS SalesPersonID,
                    p.PersonType, p.Title, p.FirstName, p.MiddleName, p.LastName,
                    p.EmailPromotion, pp.PhoneNumber, pnt.`Name` AS PhoneNumberType,
                    ed.EmailAddress,
                    sp.SalesQuota,
                    st.`Name` AS TerritoryName,
                    CURRENT_TIMESTAMP() AS StartDate,
                    CAST('3000-12-31 00:00:00' AS TIMESTAMP) AS EndDate,
                    'Y' AS IsCurrent
                FROM {source_schema}.SalesPerson sp
                LEFT JOIN {source_schema}.Person p
                On sp.BusinessEntityID = p.BusinessEntityID
                LEFT JOIN {source_schema}.PersonPhone pp
                ON pp.BusinessEntityID = p.BusinessEntityID
                LEFT JOIN {source_schema}.EmailAddress ed
                ON ed.BusinessEntityID = p.BusinessEntityID
                LEFT JOIN {source_schema}.PhoneNumberType pnt
                ON pp.PhoneNumberTypeID = pnt.PhoneNumberTypeID
                LEFT JOIN {source_schema}.SalesTerritory st
                ON sp.TerritoryID = st.TerritoryID
                """)

In [4]:
# Load the key lookup from the existing dimension, or an empty frame with the
# same four columns when the dimension has not been created yet.
if spark.catalog.tableExists(f"{target_schema}.DimSalesPerson"):
    # Only the current version of each member is read. Expired versions
    # (IsCurrent = 'N') share the business key and surrogate key of their
    # successor, so including them would duplicate source rows in the
    # downstream LEFT JOIN and hand MERGE several source rows for the same
    # target row.
    df_trg = spark.sql(f"""
                    SELECT {key_col}, {surrogate_key}, CreatedAt, UpdatedAt
                    FROM {target_schema}.DimSalesPerson
                    WHERE IsCurrent = 'Y'
                    """)
else:
    # First load: `WHERE 1=0` yields zero rows but keeps the column names and
    # types, so the downstream join works unchanged.
    df_trg = spark.sql(f"""
                    SELECT
                    CAST('0' AS INT) AS {key_col},
                    CAST('0' AS INT) AS {surrogate_key},
                    CAST('1900-01-01 00:00:00' AS TIMESTAMP) AS CreatedAt,
                    CAST('1900-01-01 00:00:00' AS TIMESTAMP) AS UpdatedAt
                    WHERE 1=0
                    """)

In [5]:
# Expose both frames to Spark SQL under the view names used in the query below.
df_src.createOrReplaceTempView("src")

df_trg.createOrReplaceTempView("trg")

# Attach the target's surrogate key and audit timestamps to every source row.
# The LEFT JOIN keeps source rows that have no match in the target; for those
# rows the surrogate key, CreatedAt and UpdatedAt come back as NULL.
df_join = spark.sql(f"""
                SELECT 
                    src.*,
                    trg.{surrogate_key}, trg.CreatedAt, trg.UpdatedAt
                FROM
                src
                LEFT JOIN
                trg
                ON src.{key_col} = trg.{key_col}
                """)

In [6]:
# Partition the joined rows by whether the target lookup supplied a surrogate
# key: rows with a key are already in the dimension, rows without are new.
df_old = df_join.where(df_join[surrogate_key].isNotNull())
df_new = df_join.where(df_join[surrogate_key].isNull())

In [7]:
# Existing members keep their surrogate key and CreatedAt; only the
# UpdatedAt stamp is replaced with the current timestamp.
df_old_enr = (
    df_old
    .withColumn("UpdatedAt", current_timestamp())
)

In [8]:
# Assign surrogate keys to brand-new members, continuing after the highest key
# already stored in the dimension (or from 1 on the very first load).
if spark.catalog.tableExists(f"{target_schema}.DimSalesPerson"):
    # MAX() returns NULL when the table exists but holds no rows; fall back to
    # 0 so `row_number() + max` does not turn every new key into NULL.
    max_surrogate_key = spark.sql(f"""
                        SELECT MAX({surrogate_key}) FROM {target_schema}.DimSalesPerson
                        """).collect()[0][0] or 0
else:
    max_surrogate_key = 0

# Number rows by business key so the assignment is reproducible when this lazy
# frame is re-evaluated by later actions; monotonically_increasing_id() only
# breaks ties between rows that share a business key.
w = Window.orderBy(col(key_col), monotonically_increasing_id())
df_new_enr = (
    df_new
    .withColumn(surrogate_key, row_number().over(w) + lit(max_surrogate_key))
    .withColumn("CreatedAt", current_timestamp())
    .withColumn("UpdatedAt", current_timestamp())
)
max_surrogate_key

In [9]:
# Recombine existing and new members into one frame; unionByName matches
# columns by name rather than by position.
df_union = df_old_enr.unionByName(df_new_enr)

# Register the combined frame as the view "df_final" so SQL statements can
# reference it.
df_union.createOrReplaceTempView("df_final")

In [10]:
# Apply the staged rows (view "df_final") to the dimension as a type-2 load.
if spark.catalog.tableExists(f"{target_schema}.DimSalesPerson"):
    # Attributes whose change closes the current version and opens a new one.
    # NOTE(review): Title and EmailPromotion are loaded but not tracked here --
    # confirm that is intentional.
    tracked_cols = [
        "PersonType", "FirstName", "MiddleName", "LastName",
        "PhoneNumber", "PhoneNumberType", "EmailAddress",
        "TerritoryName", "SalesQuota",
    ]
    # `<=>` is null-safe equality. A plain `<>` evaluates to NULL when either
    # side is NULL, so a value changing to or from NULL (e.g. MiddleName)
    # would never be detected as a change.
    change_predicate = " OR ".join(
        f"NOT (trg.{c} <=> src.{c})" for c in tracked_cols
    )

    # Step 1: expire the current version of every member whose tracked
    # attributes differ from the incoming row.
    spark.sql(f"""
            MERGE INTO {target_schema}.DimSalesPerson AS trg
            USING df_final AS src
            ON trg.{surrogate_key} = src.{surrogate_key}
            AND trg.IsCurrent = 'Y'

            WHEN MATCHED AND ({change_predicate})
            THEN UPDATE SET
                trg.IsCurrent = 'N',
                trg.EndDate = Current_Timestamp()
    """)

    # Step 2: insert every incoming row that no longer has a current version:
    # members expired in step 1 (their new version) and brand-new members.
    # Unchanged members still match their current row and are left untouched.
    spark.sql(f"""
            MERGE INTO {target_schema}.DimSalesPerson AS trg
            USING df_final AS src
            ON trg.{surrogate_key} = src.{surrogate_key}
            AND trg.IsCurrent = 'Y'
            WHEN NOT MATCHED THEN INSERT *

    """)

else:
    # First load: create the Delta table at the given storage path from the
    # full staged frame.
    df_union.write.format("delta")\
                    .mode("append")\
                    .option("path", "abfss://gold@dlcontoso.dfs.core.windows.net/sales/DimSalesPerson")\
                    .saveAsTable(f"{target_schema}.DimSalesPerson")