## Gold Notebook dim_branch

In [0]:
#Add libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Creating FLAG Parameter

In [0]:
# Register a text widget named 'incremental_flag' whose default value is '0'.
# The flag is read back as a string further down and compared against '0'
# to decide how the surrogate key sequence is started.
dbutils.widgets.text(
    "incremental_flag",
    "0",
)

In [0]:
# Read the widget value (always a string) and show both the value and its type,
# one per line, so it is obvious that comparisons must be made against '0'/'1'
# strings rather than integers.
incremental_flag = dbutils.widgets.get("incremental_flag")
print(incremental_flag, type(incremental_flag), sep="\n")

0
<class 'str'>


## Creating Dimensional Model

### Fetch Relevant Columns

In [0]:
%sql
-- Preview the silver-layer car sales data that feeds this dimension.
SELECT
  *
FROM
  parquet.`abfss://silver@datalake.dfs.core.windows.net/carsales`

### Creating the dimension (Dim_Branch)
**Step 1: Create a new DataFrame using spark.sql with the distinct branch ID and branch name.**

In [0]:
# Source side of the dimension: one row per distinct (Branch_ID, BranchName)
# pair found in the silver car sales data.
branch_source_query = '''
SELECT DISTINCT  
    (Branch_ID) as Branch_ID
    ,BranchName
 FROM parquet.`abfss://silver@datalake.dfs.core.windows.net/carsales`
 '''

df_source = spark.sql(branch_source_query)

**Step 2: Build the sink DataFrame that carries the surrogate key column (dim_branch_key).**

In [0]:
# Sink (target) side of the comparison, with columns
# dim_branch_key, Branch_ID, BranchName.
#
# Initial load (incremental_flag == '0'):
#   The gold table has no usable keys yet, so return an EMPTY DataFrame that
#   only carries the expected schema. `WHERE 1=0` is always false, so no rows
#   are returned; the literal `1` just gives dim_branch_key a numeric type.
#
# Incremental load (any other flag value):
#   Read the keys already stored in the gold dimension. Without this, the join
#   below never finds a match, every branch is treated as new, receives a fresh
#   surrogate key, and the merge on dim_branch_key inserts duplicates.
#
# The same flag drives the surrogate key start value further down, so both
# decisions stay consistent with each other.
if incremental_flag == '0':
    df_sink = spark.sql('''
    SELECT
      1 AS dim_branch_key,
      Branch_ID,
      BranchName
    FROM parquet.`abfss://silver@datalake.dfs.core.windows.net/carsales`
    WHERE 1=0
    ''')
else:
    df_sink = spark.sql('''
    SELECT
      dim_branch_key,
      Branch_ID,
      BranchName
    FROM carsproject_catalog.gold.dim_branch
    ''')

In [0]:
%sql
-- Reference only (intentionally commented out): shape of the schema-only query
-- used for the sink of this dimension. Column names are those of dim_branch.
--SELECT 
--  dim_branch_key, 
--  Branch_ID, 
--  BranchName 
--FROM parquet.`abfss://silver@datalake.dfs.core.windows.net/carsales`
--WHERE 1=0 -- this condition is always false, so the query returns only the schema.

**Filtering new records and old records**

In [0]:
# Left-join source to sink on the business key (Branch_ID). Source rows that
# have no counterpart in the sink come back with a NULL dim_branch_key.
df_filter = (
    df_source
    .join(
        df_sink,
        on=df_source['Branch_ID'] == df_sink['Branch_ID'],
        how='left',
    )
    .select(
        df_source['Branch_ID'],
        df_source['BranchName'],
        df_sink['dim_branch_key'],
    )
)

**df_filter_old: records that already have a surrogate key in the sink**

In [0]:
# Keep the rows that matched the sink, i.e. whose surrogate key is populated.
df_filter_old = df_filter.where(
    col('dim_branch_key').isNotNull()
)

**df_filter_new: records with no surrogate key yet (not present in the sink)**

In [0]:
# Keep the rows that did NOT match the sink (NULL surrogate key) and drop the
# empty key column; a real key is assigned to these rows in a later step.
df_filter_new = (
    df_filter
    .where(col('dim_branch_key').isNull())
    .select(
        df_source['Branch_ID'],
        df_source['BranchName'],
    )
)

## Create Surrogate Key

**Fetch the max Surrogate Key from existing table**

In [0]:
%python
if incremental_flag == '0':
    max_value = 1
else:
    max_value = spark.sql(
        "select max(dim_branch_key) from carsproject_catalog.gold.dim_branch"
    )
    max_value = max_value.collect()[0][0]+1



**Create Surrogate Key Column and ADD the max surrogate key**

In [0]:
# Assign a surrogate key to every new record: the starting value computed above
# plus Spark's monotonically increasing id.
# NOTE(review): per the PySpark docs these ids are unique and increasing but not
# guaranteed to be consecutive, so gaps in dim_branch_key are expected.
df_filter_new = df_filter_new.withColumn(
    'dim_branch_key',
    lit(max_value) + monotonically_increasing_id(),
)

## Create Final DF - df_filter_old + df_filter_new

In [0]:
# Combine the newly keyed records with the records that already had a key.
# unionByName matches columns by NAME rather than by position, so the result
# stays correct even if the column order of either side changes; a positional
# union would silently put values in the wrong columns in that case.
df_final = df_filter_new.unionByName(df_filter_old)

## SCD TYPE - 1 (UPSERT)
- SCD (Slowly Changing Dimension)
- UPSERT (Update + Insert)

In [0]:
from delta.tables import DeltaTable

In [0]:
# SCD Type 1 load of the branch dimension.
if not spark.catalog.tableExists('carsproject_catalog.gold.dim_branch'):
    # Initial run: the table is not registered yet, so write df_final as a
    # Delta table at the gold location and register it in the catalog.
    (
        df_final.write
        .format("delta")
        .mode("overwrite")
        .option("path", "abfss://gold@datalake.dfs.core.windows.net/dim_branch")
        .saveAsTable("carsproject_catalog.gold.dim_branch")
    )
else:
    # Incremental run: upsert into the existing Delta table, matching on the
    # surrogate key. Matched rows are overwritten, unmatched rows are inserted.
    delta_tbl = DeltaTable.forPath(
        spark,
        "abfss://gold@datalake.dfs.core.windows.net/dim_branch",
    )
    (
        delta_tbl.alias("trg")
        .merge(
            df_final.alias("src"),
            "trg.dim_branch_key = src.dim_branch_key",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
%sql
-- Inspect the loaded branch dimension.
SELECT
  *
FROM
  carsproject_catalog.gold.dim_branch