### Config stuff

In [11]:

from pyspark.sql import SparkSession, functions
import ConnectionConfig as cc
from pyspark.sql.functions import *


In [3]:
from delta import configure_spark_with_delta_pip

# Build a local Spark session named "FactSales" with Delta Lake enabled.
#  - spark.sql.extensions / spark.sql.catalog.spark_catalog register the Delta
#    SQL extension and the Delta catalog.
#  - spark.sql.shuffle.partitions is set to 4.
#  - spark.driver.extraClassPath puts the jars listed in cc.jars on the driver
#    classpath.
#    NOTE(review): the entries are joined with ":"; confirm this separator is
#    right for the platform the notebook runs on.
builder = (
    SparkSession.builder
    .appName("FactSales")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.driver.extraClassPath", ":".join(cc.jars))
    .master("local[*]")
)

# Let the delta helper finish the builder configuration before the session is created.
builder = configure_spark_with_delta_pip(builder)
spark = builder.getOrCreate()

# Display the session as the cell output. The previous second call to
# builder.getOrCreate() only re-fetched this same session.
spark

# Fact transformations
This notebook creates the sales fact table from scratch, based on the operational source table "sales".
When creating a fact table, always follow the listed steps in order.


#### 1 READ NECESSARY SOURCE TABLE(S) AND PERFORM TRANSFORMATIONS
**When reading from the source table make sure you include all data necessary:**
- to calculate the measure values
- the source table keys that you have to use to lookup the correct surrogate keys in the dimension tables.

**If more than one table is needed to gather the necessary information, you can opt for one of two strategies:**
- Use a select query when reading from the JDBC source with the spark.read operation. Avoid complex queries, because the operational database needs a lot of resources to run those queries.
- Perform a spark.read operation for each table separately and join the tables within Spark. The joins will take place on the cluster instead of the database. You limit the database resources used, but there can be a significant overhead of unnecessary data transferred to the cluster.


In this case we just rename Amount to revenue_mv and create a default count_mv column.
*In this case, the transformations are minimal.
In reality, those transformations can be far more complex. If so, it can be advisable to work out the transforms in more than one step.*



In [12]:
# Select the "mydb" connection settings before any property is read from cc.
cc.set_connection("mydb")

# JDBC read options for the operational dbo.sales table. The read is split
# into 4 partitions on Order_ID, using 0 and 1000 as the partitioning bounds.
sales_jdbc_options = {
    "url": cc.create_jdbc(),
    "dbtable": "dbo.sales",
    "user": cc.get_Property("username"),
    "password": cc.get_Property("password"),
    "partitionColumn": "Order_ID",
    "numPartitions": 4,
    "lowerBound": 0,
    "upperBound": 1000,
}

# Load the source rows, expose Amount as the revenue measure and add a
# constant count measure of 1 per row.
sale_src_df = (
    spark.read
    .format("jdbc")
    .options(**sales_jdbc_options)
    .load()
    .withColumnRenamed("Amount", "revenue_mv")
    .withColumn("count_mv", lit(1))
)

#sale_src_df.show(20)


#### 2 MAKE DIMENSION TABLES AVAILABLE AS VIEWS

In [13]:
# Date dimension: read the Delta table and expose it to Spark SQL as "dimDate".
dim_date = spark.read.format("delta").load("spark-warehouse/dimdate")
dim_date.createOrReplaceTempView("dimDate")

# Sales rep dimension: read the Delta table and expose it as "dimSalesRep".
dim_salesrep = spark.read.format("delta").load("spark-warehouse/dimsalesrep/")
dim_salesrep.createOrReplaceTempView("dimSalesRep")


#### 3 Build the fact table

Within the creation of a fact table always perform these two tasks:
1. Include the measures of the fact.
2. Use the dimension tables to look up the surrogate keys that correspond with the natural key value. In case of an SCD2 dimension, use scd_start and scd_end to find the correct version of the data in the dimension.


In [14]:
# Make the transformed source rows queryable from Spark SQL.
sale_src_df.createOrReplaceTempView("sales_source")

# Fact query:
#  - measures: count_mv and revenue_mv are taken straight from the source rows;
#  - date key: looked up by matching Order_Date against CalendarDate cast to DATE;
#  - sales rep key: looked up on SalesRepID, picking the dimension row for which
#    scd_start < Order_Date <= scd_end.
# Both lookups are left outer joins, so source rows without a match are kept.
fact_query = (
    "select src.Order_ID as OrderID, dd.dateSk as dateSK, ds.salesrepSK, src.count_mv as countMV, src.revenue_mv as revenue_mv "
    "          from sales_source as src "
    "          left outer join dimdate as dd on src.Order_Date = cast(dd.CalendarDate as DATE) "
    "           left outer join dimSalesRep as ds "
    "                      on src.SalesRepID = ds.SalesRepID "
    "                      and src.Order_Date > ds.scd_start "
    "                      and src.Order_Date <= ds.scd_end"
)
salesFactFromSource = spark.sql(fact_query)

# Preview the first 10 fact rows.
salesFactFromSource.show(10)


+-------+------+----------+-------+----------+
|OrderID|dateSK|salesrepSK|countMV|revenue_mv|
+-------+------+----------+-------+----------+
|     57|   866|         1|      1| 778297706|
|     58|   320|         1|      1|2100696115|
|     59|  1012|         1|      1| 642114638|
|     60|  1375|         1|      1|1440206513|
|     61|   954|         1|      1|1244596895|
|     62|   957|         1|      1|1662557955|
|     63|    62|         1|      1| 844183988|
|     64|   229|         1|      1| 681975050|
|     65|   684|         1|      1|1504576144|
|     66|   876|         1|      1| 622992265|
+-------+------+----------+-------+----------+
only showing top 10 rows



## Initial load
The first time you load the fact table, perform a FULL load. All data is written to the Delta table.

In [9]:

#salesFactFromSource.write.format("delta").mode("overwrite").saveAsTable("factSales")


#spark.sql("ALTER TABLE dimSalesRep  ADD columns (salesRepSK long GENERATED ALWAYS AS IDENTITY (START WITH 0 INCREMENT BY 1)")

## Incremental load
When previous runs were performed, you can opt for a 'faster' incremental run that only writes away changes.
In this example two merges are used: one that updates existing records and one that inserts new records.


- *Why two merges? Combining them in one merge did result in creating copies of all parquet files. This is due to the Delta Lake internals.*
- *In our solution we detect a change by comparing every source field with the existing fact field. Using the md5 hash strategy (see SCD2 dimension) would have been a superior solution. The MD5 column can be added in step 3.*
- *To make the loading process even faster you can opt to only read changed rows from the source tables (sales operational table in this case). This however requires a "last_updated" timestamp in the source table.*

In [15]:
from delta import DeltaTable

# Open the existing fact table by path. A forward-slash relative path is used,
# like the dimension loads, so the literal contains no backslash escapes and
# is not tied to one operating system.
fact = DeltaTable.forPath(spark, "spark-warehouse/factsales")

# Merge 1 - updates (TODO: Implement md5 strategy).
# A fact row is updated when its OrderID matches and at least one column
# differs. The null-safe operator <=> is used because the left outer joins
# that build salesFactFromSource can yield NULL surrogate keys; with <> a
# comparison against NULL evaluates to NULL and the change would be missed.
(
    fact.alias("current")
    .merge(
        salesFactFromSource.alias("new"),
        "current.OrderID = new.OrderId "
        "and not (current.dateSK <=> new.dateSK "
        "and current.salesrepSK <=> new.salesrepSK "
        "and current.countMV <=> new.countMV "
        "and current.revenue_MV <=> new.revenue_MV)",
    )
    .whenMatchedUpdateAll()
    .execute()
)

# Merge 2 - inserts: source rows whose OrderID is not yet in the fact table.
(
    fact.alias("current")
    .merge(salesFactFromSource.alias("new"), "current.OrderID = new.OrderId")
    .whenNotMatchedInsertAll()
    .execute()
)

In [17]:
# IMPORTANT: ALWAYS TEST THE CREATED CODE.
# In this example I changed order 498 in the operational database and checked the change after the run.
# spark.sql("select * from factsales f join dimsalesrep ds on f.salesrepSK = ds.salesrepSK where OrderID = 192  ").show()

# Total number of rows in the fact table.
fact_row_count_df = spark.sql("select count(*) from factsales")
fact_row_count_df.show()

# Spot check: the fact row(s) for order 1.
order_check_df = spark.sql("select * from factsales where orderId=1")
order_check_df.show()



+--------+
|count(1)|
+--------+
|    1000|
+--------+

+-------+------+----------+-------+----------+
|OrderID|dateSK|salesrepSK|countMV|revenue_mv|
+-------+------+----------+-------+----------+
|      1|  5405|         0|      1|        10|
+-------+------+----------+-------+----------+



NameError: name 'selec' is not defined

### Checking the history of your delta fact table

In [None]:
# The history is derived from the Delta table's log files, which record the
# actions performed on the table. Here it tells us about the merge operations,
# including statistics on the update and insert counts.
fact_history_df = fact.history()

# Show up to 10 history entries without truncating the column values.
fact_history_df.show(n=10, truncate=False)

In [None]:
# Stop the Spark session now that the notebook is finished.
spark.stop()