### Config stuff

In [1]:
import random
import pyspark
from pyspark.sql import SparkSession, functions
import ConnectionConfig as cc
from delta import DeltaTable
from pyspark.sql.functions import *

In [2]:
from delta import configure_spark_with_delta_pip

# Build a local Spark session with Delta Lake enabled: the Delta SQL extension
# and the Delta catalog are registered, and the jars listed in ConnectionConfig
# are put on the driver classpath.
# NOTE(review): ":" is the POSIX classpath separator; on Windows the JVM expects ";".
builder = (
    SparkSession.builder
    .appName("DimSalesRep")  # named after the dimension this notebook builds
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.driver.extraClassPath", ":".join(cc.jars))
    .master("local[*]")
)
builder = configure_spark_with_delta_pip(builder)
spark = builder.getOrCreate()

# Show the session as the cell output; no second getOrCreate() call is needed.
spark

# Initial load
We will create a slowly changing dimension type 2 (SCD2) called dimSalesRep, based on a source table in our operational database called dbo.salesrep. SCD2 tables demand extra care because we store historical values of the dimension with the help of validity ranges (scd_start and scd_end).
This notebook will create the table and fill it with the initial data. A second notebook will be used for increments of new and changed data.

This is an example of the expected output
```
+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|salesRepID|         name|       office| salesRepSK|          scd_start|            scd_end|                 md5|current|
+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|         1|      Z. Jane|     New York|          0|1990-01-01 00:00:00|2100-12-12 00:00:00|303db545462092a92...|   true|
|         2|   P. Chapman|       Berlin|          1|1990-01-01 00:00:00|2100-12-12 00:00:00|14b094c31bf9e4149...|   true|
|         3|     T. Crane|     New York|          2|1990-01-01 00:00:00|2100-12-12 00:00:00|6c062f95defda9dc3...|   true|
```




In [3]:
cc.set_connection("mydb")

# Read dbo.salesrep over JDBC, split into 4 range-partitioned reads on salesRepID.
jdbc_options = {
    "url": cc.create_jdbc(),
    "dbtable": "dbo.salesrep",
    "user": cc.get_Property("username"),
    "password": cc.get_Property("password"),
    "partitionColumn": "salesRepID",
    "numPartitions": 4,
    "lowerBound": 0,
    "upperBound": 20,
}
salesrep_source = spark.read.format("jdbc").options(**jdbc_options).load()

# Add the SCD2 bookkeeping columns for the initial load: every source row becomes
# the current version, with a fixed validity range.
# NOTE(review): monotonically_increasing_id() is unique and increasing but not
# consecutive across partitions, so salesRepSK may contain gaps.
# NOTE(review): concat() returns NULL when either input is NULL, which makes the
# md5 NULL for such rows - confirm how the incremental notebook treats that.
sales_df = (
    salesrep_source
    .withColumn("salesRepSK", functions.monotonically_increasing_id())
    .withColumn("scd_start", functions.lit("1990-01-01").cast("timestamp"))
    .withColumn("scd_end", functions.lit("2100-12-12").cast("timestamp"))
    .withColumn("md5", functions.md5(functions.concat(functions.col("name"), functions.col("office"))))
    .withColumn("current", functions.lit(True))
)

# Inspect the result, then (re)create the Delta table with the initial data.
sales_df.printSchema()
sales_df.show()

dim_writer = sales_df.write.format("delta").mode("overwrite")
dim_writer.saveAsTable("dimSalesRep")

# Disabled alternative: fill salesRepSK through a Delta identity column instead.
# As written, the statement below is missing a closing parenthesis.
#spark.sql("ALTER TABLE dimSalesRep  ADD columns (salesRepSK long GENERATED ALWAYS AS IDENTITY (START WITH 0 INCREMENT BY 1)")

root
 |-- salesRepID: decimal(11,0) (nullable = true)
 |-- name: string (nullable = true)
 |-- office: string (nullable = true)
 |-- salesRepSK: long (nullable = false)
 |-- scd_start: timestamp (nullable = true)
 |-- scd_end: timestamp (nullable = true)
 |-- md5: string (nullable = true)
 |-- current: boolean (nullable = false)

+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|salesRepID|         name|       office| salesRepSK|          scd_start|            scd_end|                 md5|current|
+----------+-------------+-------------+-----------+-------------------+-------------------+--------------------+-------+
|         1|      Z. Jane|      Chicago|          0|1990-01-01 00:00:00|2100-12-12 00:00:00|cbf61f481bec12d90...|   true|
|         2|   P. Chapman|       Berlin|          1|1990-01-01 00:00:00|2100-12-12 00:00:00|14b094c31bf9e4149...|   true|
|         3|     T. Crane|     New York|          2|1990-0

* The function lit() is used when you want the same fixed value in a column for every row. Here it is used for scd_start, scd_end and current.
* The function md5() computes an MD5 hash over the selected columns (here name and office). The hash is needed to detect changes in the incremental load notebook.

In [10]:
# Stop the Spark session that was created at the top of this notebook.
spark.stop()