# Config stuff

In [None]:

import ConnectionConfig as cc
from delta import DeltaTable
from datetime import datetime

In [None]:
# Prepare the local environment through the project's ConnectionConfig helper
# (presumably sets environment variables needed by Spark - TODO confirm in ConnectionConfig).
cc.setupEnvironment()
# Start the Spark session used by every cell below; the string argument is
# presumably the application name - verify against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("dimSalesIncrementalLoad")
# Last expression of the cell, so the notebook displays the returned session.
spark.getActiveSession()

## Incremental load

After the sales Rep dimension is filled for the first time, the logic to update the dimension has to be handled differently. A change of a record in the source system has to be handled as a change in the dimension. The SCD2 logic is used to handle this.

The SCD2 implementation requires a more complex transformation to correctly handle changes in the source files. For detailed information consult the comments in the code.
### Setting the parameters
The timestamp of the job is used to set the scd_end date of the previous record and the scd_start date of the new record.

In [None]:
# Capture the job run time once, so the same instant is used to close the
# previous record (scd_end) and to open the new record (scd_start).
run_timestamp = datetime.now()

### Read source table

##### 1. READ EXISTING DIMENSION
Read the existing deltaTable (as a deltaTable object, not a Dataframe). Make the table available as a View.

In [None]:
# Open the existing dimension as a DeltaTable object (not a DataFrame) so it
# can be used as the target of the merge later in the notebook.
# The path is a raw string: "\s" and "\d" are not valid Python escape sequences
# and trigger a warning in newer Python versions. The resulting path value is unchanged.
dt_dimSalesRep = DeltaTable.forPath(spark, r".\spark-warehouse\dimsalesrep")

# Expose only the currently valid rows (current = true) as a temp view for the
# change-detection query.
dt_dimSalesRep.toDF().filter('current = true').createOrReplaceTempView("dimSalesRep_current")

#DEBUG CODE TO SHOW CONTENT OF DIMENSION
#spark.sql("select salesRepID, name, office, salesRepSK, md5  from dimSalesRep_current ").show()


##### 2 READ SOURCE TABLE
Creating dataframe with source table (from operational system). Transformed to the dimension format.
The surrogate key is a uuid to be sure it's unique.
md5 hash is used to identify changes in the source table.
A view is created of the resulting dataframe to make it available for the next step.

In [None]:
# Select the connection profile whose properties are read below.
cc.set_connection("tutorial_op")

# a. Read dbo.salesrep over JDBC, split into 4 partitions on salesRepID.
df_operational_sales_rep = (
    spark.read
    .format("jdbc")
    .option("driver" , "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .option("url", cc.create_jdbc())
    .option("dbtable", "dbo.salesrep")
    .option("user", cc.get_Property("username"))
    .option("password", cc.get_Property("password"))
    .option("partitionColumn", "salesRepID")
    .option("numPartitions", 4)
    .option("lowerBound", 0)
    .option("upperBound", 20)
    .load()
)

# Make the raw source rows queryable from SQL.
df_operational_sales_rep.createOrReplaceTempView("operational_sales_rep")

# b. Reshape the source rows into the dimension layout. Every column gets a
#    "source_" prefix, a fresh uuid() serves as surrogate key and an md5 hash
#    over name + office is the change fingerprint.
# NOTE(review): concat() has no separator and presumably yields NULL when name
# or office is NULL - confirm the source columns are NOT NULL.
df_dim_sales_rep_new = spark.sql("select uuid() as source_salesRepSK, \
                                        salesRepId as source_salesRepId, \
                                        name as source_name, \
                                        office as source_office, \
                                        md5(concat( name, office)) as source_md5 \
                                    from operational_sales_rep")

# Publish the reshaped rows for the change-detection step.
df_dim_sales_rep_new.createOrReplaceTempView("dimSalesRep_new")

#DEBUG CODE TO SHOW CONTENT OF SOURCE
#df_dim_sales_rep_new.printSchema()
#df_dim_sales_rep_new.show()
#spark.sql("select * from dimSalesRep_new").show()



##### 3 DETECT CHANGES
Dataframe to identify SCD2 changed rows.
First a join between SOURCE (operational system) and DIMENSION (dwh) is performed
   The md5 hash is used to identify differences.
   The list contains:
       - updated source-rows (the join finds a row and the md5 is different) and
       - new source-rows (the left outer join does not find a row in the dimension: dwh.salesRepId is null)

In [None]:

# Join every source row to the current dimension row with the same salesRepId
# and keep only the rows that need SCD2 handling:
#   - new rows:     no dimension row was found (dwh.salesRepId is null)
#   - changed rows: the stored md5 differs from the freshly computed source md5
# The hashes are compared with the null-safe operator (<=>): a plain "<>"
# evaluates to NULL when either hash is NULL, which would silently drop the
# row from the result instead of reporting it as a change.
detectedChanges = spark.sql("""
    select *
    from dimSalesRep_new source
    left outer join dimSalesRep_current dwh
        on dwh.salesRepID = source.source_salesRepId and dwh.current = true
    where dwh.salesRepId is null
       or not (dwh.md5 <=> source.source_md5)
""")

# Expose the detected changes to the upsert-building query.
detectedChanges.createOrReplaceTempView("detectedChanges")

#DEBUG CODE TO SHOW CONTENT OF DETECTED CHANGES
#detectedChanges.show()



##### 4 TRANSFORM TO UPSERTS
Every updated and new source-row requires the insertion of a new record in the SCD2 dimension.
Updated source-rows also require an update of the existing scd-fields.

Rows without mergeKey will be inserted in the dimension table.
Rows with mergekey will be updated in the dimension

In [None]:

# Build the rows that drive the merge, as the union of two selects over detectedChanges:
#   1. Insert rows (mergeKey = null): one new dimension version per detected change,
#      valid from run_timestamp until the far-future date 2100-12-12, flagged current = true.
#   2. Closing rows (mergeKey = salesRepId): only for changes that had a matching
#      dimension row (current is not null). They carry the existing dimension values,
#      with scd_end set to run_timestamp and false in the position of the "current" column.
# The second select leaves its last column unnamed; union matches columns by position,
# so the column names come from the first select.
# "union" (not "union all") also removes fully identical rows from the combined result.
df_upserts = spark.sql(f"select null as mergeKey, \
                                source_salesRepSK as salesRepSK,\
                                source_salesRepId as salesRepId,\
                                source_name as name,\
                                source_office as office,\
                                to_timestamp('{run_timestamp}') as scd_start, \
                                to_timestamp('2100-12-12','yyyy-MM-dd') as scd_end,\
                                source_md5 as md5,\
                                true as current\
                        from  detectedChanges\
                        union \
                        select salesRepId as mergeKey,\
                                salesRepSK,\
                                salesRepId,\
                                name,\
                                office,\
                                scd_start,\
                                to_timestamp('{run_timestamp}') as scd_end,\
                                md5, \
                                false \
                                from detectedChanges \
                        where current is not null")

# Expose the upsert rows to SQL under the name "upserts".
df_upserts.createOrReplaceTempView("upserts")


##### 5 PERFORM MERGE DIMSALESREP AND UPSERTS
merge looks for a matching dwh.salesRepID (in the dimension) for mergeKey
   - when a match is found (the dimension table contains a row where its salesRepId corresponds with one of the mergekeys)  -> perform update of row to close the period and set current to "false"
   - when no match is found (there is no salesRepID in the dimension because the mergeKey is null) -> perform an insert with the data from the upserts table (from the source). The scd_start is filled with the run_timestamp.

In [None]:
# Apply the upserts to the dimension.
# The merge targets the Delta table itself (dt_dimSalesRep), not the filtered
# temp view dimSalesRep_current: Delta requires the merge target to be the
# Delta table, and a filtered view is only a partial scan of it.
# Because the whole table is now the target, the match condition also requires
# target.current = true, so only the open version of a sales rep is closed and
# historical versions are left untouched.
(
    dt_dimSalesRep.alias("target")
    .merge(
        df_upserts.alias("source"),
        "target.salesRepID = source.mergeKey AND target.current = true",
    )
    # Closing rows: every other column of a closing row is a copy of the
    # existing dimension row, so only the period end and the flag change.
    .whenMatchedUpdate(
        set={
            "scd_end": "source.scd_end",
            "current": "source.current",
        }
    )
    # Insert rows (mergeKey is null, so they never match): add the new version.
    .whenNotMatchedInsertAll()
    .execute()
)

#DEBUG CODE TO SHOW CONTENT OF DIMENSION
dt_dimSalesRep.toDF().sort("salesRepID", "scd_start").show(100)


## Delete the spark session

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()