## Create parameter

In [0]:
# Declare the notebook text widget 'incremental_flag'; '0' is its default value.
dbutils.widgets.text('incremental_flag', '0')

In [0]:
# Read the widget's current value. It is kept as returned by the widget and is
# compared against the string '0' when the surrogate-key start value is chosen.
incremental_flag = dbutils.widgets.get('incremental_flag')

# Create dimensional tables

## Dimensional Model

### Fetch Data

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Read the date attributes from the silver-layer car sales parquet data.
# DISTINCT collapses repeated (Date_ID, Day, Month, Year) combinations, so a
# combination that appears on many source rows enters the dimension logic only
# once instead of once per source row.
query = '''Select Distinct Date_ID, Day, Month, Year
           From parquet.`abfss://silver@carsalesdatdpdatalake.dfs.core.windows.net/carsales`
'''
df_src = spark.sql(query)
df_src.display()

Date_ID,Day,Month,Year
DT00143,30,1,2017
DT00305,14,7,2017
DT00632,6,9,2018
DT00862,24,10,2019
DT00933,14,7,2019
DT01093,17,4,2019
DT00194,8,2,2017
DT00195,19,6,2017
DT00351,18,11,2017
DT00536,18,8,2018


### Dim_Date Sink

In [0]:
# Build df_sink, the current contents of the gold dimension table.
# When the table already exists it is read as-is; otherwise an empty frame with
# the same column layout is derived from the silver data (Where 1=0 returns no
# rows) so the downstream join works on the very first run.
sink_exists = spark.catalog.tableExists('car_catalogs.gold.Sub_Date')

if sink_exists:
    new_query = '''
        Select Sub_Date_Key, Date_ID, Day, Month, Year
        From car_catalogs.gold.Sub_Date
    '''
    df_sink = spark.sql(new_query)
else:
    query = '''
    Select 1 as Sub_Date_Key, Date_ID, Day, Month, Year
    From parquet.`abfss://silver@carsalesdatdpdatalake.dfs.core.windows.net/carsales`
    Where 1=0
    '''
    df_sink = spark.sql(query)

In [0]:
# Left-join the source rows to the sink on Date_ID. Source attributes are kept,
# and Sub_Date_Key comes from the sink, so it is NULL for source rows that have
# no match in the dimension yet.
df_filter = (
    df_src
    .join(df_sink, df_src['Date_ID'] == df_sink['Date_ID'], 'left')
    .select(
        df_src['Date_ID'],
        df_src['Day'],
        df_src['Month'],
        df_src['Year'],
        df_sink['Sub_Date_Key'],
    )
)
df_filter.display()

Date_ID,Day,Month,Year,Sub_Date_Key
DT00143,30,1,2017,1674
DT00143,30,1,2017,1
DT00305,14,7,2017,1405
DT00305,14,7,2017,2
DT00632,6,9,2018,1519
DT00632,6,9,2018,3
DT00862,24,10,2019,4
DT00933,14,7,2019,1654
DT00933,14,7,2019,5
DT01093,17,4,2019,1476


In [0]:
# Rows that already exist in the dimension: the join found a surrogate key.
# Keep a single row per Sub_Date_Key.
df_filter_old = (
    df_filter
    .where(col('Sub_Date_Key').isNotNull())
    .dropDuplicates(['Sub_Date_Key'])
)
df_filter_old.display()

Date_ID,Day,Month,Year,Sub_Date_Key
DT00143,2,10,2017,1
DT00305,7,3,2017,2
DT00632,13,1,2018,3
DT00862,24,10,2019,4
DT00933,20,11,2019,5
DT01093,16,8,2019,6
DT00194,13,10,2017,7
DT00195,15,10,2017,8
DT00351,16,3,2017,9
DT00536,14,12,2018,10


In [0]:
# Rows that are new to the dimension: the left join found no surrogate key.
# Only the source attributes are kept; Sub_Date_Key is generated in a later cell.
#
# The rows are de-duplicated on Date_ID, the business key used by the join,
# BEFORE keys are generated. Otherwise a Date_ID that occurs on several source
# rows would receive several different surrogate keys.
# NOTE(review): if one Date_ID carries differing Day/Month/Year values in the
# source, which of them survives is not deterministic - confirm the source grain.
df_filter_new = (
    df_filter
    .filter(col('Sub_Date_Key').isNull())
    .select('Date_ID', 'Day', 'Month', 'Year')
    .dropDuplicates(['Date_ID'])
)
df_filter_new.display()

Date_ID,Day,Month,Year


## Create Surrogate Key

In [0]:
# Choose the first surrogate key value to hand out in this run.
#   incremental_flag == '0' -> start numbering at 1.
#   otherwise               -> continue after the largest key already stored.
if incremental_flag == '0':
    max_value = 1
else:
    df_max_val = spark.sql('Select max(Sub_Date_Key) as Sub_Date_Key From car_catalogs.gold.Sub_Date')
    current_max = df_max_val.collect()[0][0]
    # max() over an empty table yields NULL (None in Python); adding 1 to it would
    # raise a TypeError, so fall back to the same start value as the initial run.
    max_value = 1 if current_max is None else current_max + 1

In [0]:
# Attach a surrogate key to every new row, offset by max_value.
# monotonically_increasing_id() produces unique, increasing 64-bit ids, but they
# are not guaranteed to be consecutive, so the resulting keys may contain gaps.
df_filter_new = df_filter_new.withColumn(
    'Sub_Date_Key',
    lit(max_value) + monotonically_increasing_id(),
)
df_filter_new.display()

Date_ID,Day,Month,Year,Sub_Date_Key


## Combine old and new DataFrames

In [0]:
# Stack the newly keyed rows on top of the rows that already exist in the
# dimension, keeping one row per surrogate key.
# unionByName matches columns by name; plain union() matches by position and
# would silently misalign the data if the two frames' column order ever diverged.
df_final = (
    df_filter_new
    .unionByName(df_filter_old)
    .dropDuplicates(['Sub_Date_Key'])
)

## SCD Type 1

In [0]:
from delta.tables import *

In [0]:
# Persist df_final into the gold dimension table car_catalogs.gold.Sub_Date.
# Init run: the table does not exist yet, so create it as a Delta table at the
# gold storage path.
if not spark.catalog.tableExists('car_catalogs.gold.Sub_Date'):
    df_final.write.format('delta') \
                  .mode('overwrite') \
                  .option('path','abfss://gold@carsalesdatdpdatalake.dfs.core.windows.net/Sub_Date') \
                  .saveAsTable('car_catalogs.gold.Sub_Date')
# Incremental run: upsert into the existing Delta table (SCD Type 1).
else:
    # Temp view kept for ad-hoc inspection; the merge below does not read it.
    df_final.createOrReplaceTempView('updates')
    delta_tbl = DeltaTable.forPath(spark, 'abfss://gold@carsalesdatdpdatalake.dfs.core.windows.net/Sub_Date')
    # SCD Type 1 overwrites attributes in place: rows whose surrogate key already
    # exists are updated from the source, rows with unseen keys are inserted.
    # Without the matched-update clause the existing rows carried in df_final
    # would be ignored and the dimension would never pick up changed attributes.
    delta_tbl.alias('trg').merge(df_final.alias('src'), '''trg.Sub_Date_Key = src.Sub_Date_Key''') \
                          .withSchemaEvolution() \
                          .whenMatchedUpdateAll() \
                          .whenNotMatchedInsertAll() \
                          .execute()


In [0]:
%sql  
-- Row count of the gold dimension table after the load above.
select count(*) from car_catalogs.gold.sub_date

count(1)
1849
