### Config stuff

In [None]:
import random
import pyspark
from pyspark.sql import SparkSession, functions
import ConnectionConfig as cc
from pyspark.sql.functions import *

In [None]:
# Name handed to cc.startLocalCluster for this notebook's session.
APP_NAME = "dimSalesRepInit"

# Prepare the environment through the project's ConnectionConfig helper,
# then start the local cluster and keep the resulting session in `spark`.
cc.setupEnvironment()
spark = cc.startLocalCluster(APP_NAME)

# Last expression of the cell: display the currently active session.
spark.getActiveSession()

# Initial load
We will create a slowly changing dimension type 2 (SCD2) called dimSalesRep, based on a source table in our operational database called dbo.salesrep. SCD2 tables demand extra care because we store the historical values of the dimension with the help of validity ranges (scd_start and scd_end).
This notebook will create the table and fill it with the initial data. A second notebook will be used for increments of new and changed data.

This is an example of the expected output (the salesRepSK values in your run will be different, because the surrogate keys are generated):
```
+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|salesRepID|         name|       office| salesRepSK|          scd_start|            scd_end|                 md5|current|
+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|a46add1...|      Z. Jane|     New York|          0|1990-01-01 00:00:00|2100-12-12 00:00:00|303db545462092a92...|   true|
|s1fedf1...|   P. Chapman|       Berlin|          1|1990-01-01 00:00:00|2100-12-12 00:00:00|14b094c31bf9e4149...|   true|
|d5e6f77...|     T. Crane|     New York|          2|1990-01-01 00:00:00|2100-12-12 00:00:00|6c062f95defda9dc3...|   true|
```




In [None]:
# Point the connection helper at the operational source database.
cc.set_connection("tutorial_op")

# Read dbo.salesrep over JDBC using 4 parallel range reads on salesRepID.
# lowerBound/upperBound only determine the partition stride; they do not
# filter rows, so values outside 0..20 are still loaded.
# NOTE(review): Spark requires partitionColumn to be a numeric, date or
# timestamp column - confirm the type of salesRepID in the source table.
df_operational_sales_rep = (
    spark.read
    .format("jdbc")
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .option("url", cc.create_jdbc())
    .option("dbtable", "dbo.salesrep")
    .option("user", cc.get_Property("username"))
    .option("password", cc.get_Property("password"))
    .option("partitionColumn", "salesRepID")
    .option("numPartitions", 4)
    .option("lowerBound", 0)
    .option("upperBound", 20)
    .load()
)

# Method 1 : Use the DataFrame API (kept as a reference, not executed)
#df_dim_sales_rep = df_operational_sales_rep.withColumn("salesRepSK", expr("uuid()")) \
#    .withColumn("scd_start", lit("1990-01-01").cast("timestamp")) \
#    .withColumn("scd_end", lit("2100-12-12").cast("timestamp")) \
#    .withColumn("md5", md5(concat( col('name'), col('office')))) \
#    .withColumn("current", lit(True))

# Method 2 : Use SQL
# The temp view gets its own name: a temp view called dimSalesRep would shadow
# the dimSalesRep table written below for the rest of this Spark session.
df_operational_sales_rep.createOrReplaceTempView("salesrep_source")

# Add the SCD2 bookkeeping columns to every source row:
#   salesRepSK - generated surrogate key
#   scd_start  - start of the validity range (1990-01-01, matching the
#                documented expected output and Method 1)
#   scd_end    - open-ended upper bound of the validity range
#   md5        - hash over the SCD2-tracked columns, used to detect changes
#   current    - flags the row as the active version
# NOTE(review): concat() yields NULL when name or office is NULL, so md5 is
# NULL for such rows. Keep this expression identical to the one used in the
# incremental notebook, otherwise unchanged rows will be seen as changed.
df_dim_sales_rep = spark.sql("""
    select uuid() as salesRepSK,
           *,
           to_timestamp('1990-01-01', 'yyyy-MM-dd') as scd_start,
           to_timestamp('2100-12-12', 'yyyy-MM-dd') as scd_end,
           md5(concat(name, office)) as md5,
           True as current
    from salesrep_source
""")

# Inspect the result before persisting it.
df_dim_sales_rep.printSchema()
df_dim_sales_rep.show()

# Initial load: (re)create the Delta table; overwrite makes the cell re-runnable.
df_dim_sales_rep.write.format("delta").mode("overwrite").saveAsTable("dimSalesRep")

# Alternative surrogate key via an identity column (not executed, kept for reference):
#spark.sql("ALTER TABLE dimSalesRep ADD columns (salesRepSK long GENERATED ALWAYS AS IDENTITY (START WITH 0 INCREMENT BY 1))")

* The function lit() is used when you want a fixed column value for every row. In this case: scd_start, scd_end and current.
* The function md5() computes an MD5 hash over the chosen columns. This is needed to detect SCD2 changes: when the value of one of those columns changes, the MD5 hash changes as well. Include all SCD2 columns in the md5() call.

## Stop the Spark session

In [None]:
# Stop the Spark session held in `spark` now that the initial load is done.
spark.stop()