In [7]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.window import Window

#### *Incremental Load Date*

In [8]:
# Schema the incremental source rows (salesorderheader) are read from.
source_schema = "silver_sales"
# Schema whose FactOrder table is checked, queried for the watermark and merged into.
target_schema = "gold_sales"

# Change-tracking column: its MAX in the target is the watermark, and source
# rows are selected when their value is greater than that watermark.
cdc_col = "ModifiedDate"

# Optional manual watermark override; an empty string means
# "use MAX(cdc_col) from the target table".
backdate = ""

In [9]:
# Work out the watermark: only source rows whose CDC column is greater than
# this value are picked up by the incremental load.
#
#   * target table missing            -> initial load from the default date
#   * target exists, backdate given   -> reload from the manual backdate
#   * target exists, no backdate      -> continue from MAX(cdc_col) in the target
default_load_date = '1900-01-01 00:00:00'

if spark.catalog.tableExists(f"{target_schema}.FactOrder"):
    if backdate:
        last_load_date = backdate
    else:
        max_loaded = spark.sql(f"""
                                SELECT MAX({cdc_col}) FROM {target_schema}.FactOrder
                                """).collect()[0][0]
        # MAX() over an empty table (or an all-NULL column) yields NULL/None.
        # Interpolating None into the source filter would compare against the
        # literal 'None' and silently load nothing, so fall back to the
        # initial-load date instead.
        last_load_date = max_loaded if max_loaded is not None else default_load_date
else:
    last_load_date = default_load_date

last_load_date

In [10]:
# Select only the order headers changed after the watermark computed above.
incremental_query = f"""
                    SELECT * FROM {source_schema}.salesorderheader 
                    WHERE {cdc_col} > '{last_load_date}'
                    """
df_src = spark.sql(incremental_query)

# Expose the increment to Spark SQL under the name "src" for the fact query.
df_src.createOrReplaceTempView("src")

# Display how many rows this run will process.
df_src.count()

In [11]:
# Build the fact rows: every incremental order header from the "src" view,
# enriched with the dimension keys from the reseller, retail-customer and
# sales-person dimensions.
#
# The dimensions are read from `target_schema` (same schema as FactOrder)
# instead of a hard-coded schema name, so the notebook stays consistent when
# the configuration cell is changed.
#
# Each dimension is joined on its business key plus a date-range predicate
# (StartDate <= OrderDate < EndDate). LEFT JOINs keep orders that have no
# matching dimension row; their key columns are NULL.
# NOTE(review): the range predicate only matches rows with a non-NULL EndDate -
# presumably current rows carry a far-future EndDate; confirm against the
# dimension loads.
# NOTE(review): overlapping date ranges in a dimension would duplicate
# SalesOrderID rows here - verify the dimensions guarantee non-overlap.
df_fact_src = spark.sql(f"""
                Select 
                    sh.SalesOrderID,
                    dr.DimResellerKey,
                    drc.DimCustomerKey,
                    dsp.DimSalesPersonKey,
                    sh.RevisionNumber, sh.OrderDate, sh.DueDate, sh.ShipDate, sh.Status, sh.OnlineOrderFlag, sh.SalesOrderNumber,
                    sh.PurchaseOrderNumber, sh.AccountNumber, sh.TerritoryID, sh.BillToAddressID, sh.ShipToAddressID, sh.ShipMethodID,
                    sh.CreditCardID, sh.Subtotal, sh.TaxAmt, sh.Freight, sh.TotalDue, sh.ModifiedDate
                FROM
                src sh
                LEFT JOIN {target_schema}.dimreseller dr
                    ON sh.CustomerID = dr.CustomerID
                    AND sh.OrderDate >= dr.StartDate
                    AND sh.OrderDate < dr.EndDate
                LEFT JOIN {target_schema}.dimretailcustomer drc
                    ON sh.CustomerID = drc.CustomerID
                    AND sh.OrderDate >= drc.StartDate
                    AND sh.OrderDate < drc.EndDate
                LEFT JOIN {target_schema}.dimsalesperson dsp
                    ON sh.SalesPersonID = dsp.SalesPersonID
                    AND sh.OrderDate >= dsp.StartDate
                    AND sh.OrderDate < dsp.EndDate
                """)

In [12]:
# Upsert the prepared fact rows into the FactOrder Delta table.
#
# A single table name is used for the existence check, the merge and the
# initial create, so all three always refer to the same table even when
# `target_schema` is changed in the configuration cell.
fact_table = f"{target_schema}.FactOrder"

if spark.catalog.tableExists(fact_table):
    # Incremental run: match on the order id, overwrite a matched row only
    # when the incoming row has a greater CDC value, insert unmatched rows.
    (
        DeltaTable.forName(spark, fact_table)
        .alias("trg")
        .merge(df_fact_src.alias("src"), "trg.SalesOrderID = src.SalesOrderID")
        .whenMatchedUpdateAll(condition=f"src.{cdc_col} > trg.{cdc_col}")
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First run: create the table at the given storage path.
    # NOTE(review): the storage path is hard-coded; confirm it should not be
    # driven by configuration as well.
    (
        df_fact_src.write.format('delta')
        .mode('append')
        .option('path', 'abfss://gold@dlcontoso.dfs.core.windows.net/sales/FactOrder')
        .saveAsTable(fact_table)
    )