In [1]:
from pyspark.sql.types import *
from delta.tables import *
from pyspark.sql.functions import *

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 3, Finished, Available)

In [2]:
# loading data to a dataframe
df = spark.read.table('holidays.holidays_silver')

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 4, Finished, Available)

In [3]:
df.schema

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 5, Finished, Available)

StructType([StructField('Country_or_region', StringType(), True), StructField('Holiday_name', StringType(), True), StructField('Norm_Holiday_name', StringType(), True), StructField('Paid_time_off', BooleanType(), True), StructField('Country_region_code', StringType(), True), StructField('Date', DateType(), True), StructField('CreatedUTC', DateType(), True), StructField('ModifiedUTC', DateType(), True)])

In [4]:
# schema for Date dimension table

DeltaTable.createIfNotExists(spark) \
    .tableName('holidays.date_gold') \
    .addColumn('Date', DateType()) \
    .addColumn('Day', IntegerType()) \
    .addColumn('Month', IntegerType()) \
    .addColumn('Year', IntegerType()) \
    .execute()


StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 6, Finished, Available)

<delta.tables.DeltaTable at 0x7f9c2801ef80>

In [5]:
# dataframe date_gold

dfdate_gold = df.dropDuplicates(['Date']).select(col('Date'), \
dayofmonth('Date').alias('Day'), \
month('Date').alias('Month'), \
year('Date').alias('Year'), \
).orderBy('Date')

display(dfdate_gold.head(5))


StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 02f50aa0-88f6-4e0a-a206-b35417c4a091)

The code takes your original DataFrame, removes duplicates based on the 'Date' column, extracts the day, month, and year from the 'Date' column, and then orders the resulting DataFrame by the 'Date' column in ascending order.

In [6]:
# updates to the date_gold

deltaTable = DeltaTable.forPath(spark, 'Tables/date_gold')

dfUpdates = dfdate_gold

deltaTable.alias('silver') \
  .merge(
    dfUpdates.alias('updates'),
    'silver.Date = updates.Date'
  ) \
 .whenNotMatchedInsert(values =
    {     
     'Date': 'updates.Date',
     'Day': 'updates.Day',
     'Month': 'updates.Month',
     'Year': 'updates.Year'
    }
  ) \
  .execute()

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 8, Finished, Available)

In [17]:
# schema for Country dimension table

DeltaTable.createIfNotExists(spark) \
    .tableName('holidays.country_gold') \
    .addColumn('Country_or_region', StringType()) \
    .addColumn('Country_region_code', StringType()) \
    .execute()

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 19, Finished, Available)

<delta.tables.DeltaTable at 0x7f9c282f6a40>

In [20]:
# dataframe country_gold

dfcountry_gold = df.dropDuplicates(['Country_or_region', 'Country_region_code']) \
         .select(col('Country_or_region').alias('Country_or_region'), col('Country_region_code').alias('Country_region_code'))

display(dfcountry_gold)

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 22, Finished, Available)

SynapseWidget(Synapse.DataFrame, cc915096-8819-4c26-909a-5098b3e92895)

In [21]:
# updates to the country_gold

deltaTable = DeltaTable.forPath(spark, 'Tables/country_gold')

dfUpdates = dfcountry_gold

deltaTable.alias('silver') \
  .merge(
    dfUpdates.alias('updates'),
    'silver.Country_or_region = updates.Country_or_region AND silver.Country_region_code = updates.Country_region_code'
  ) \
 .whenNotMatchedInsert(values =
    { 
     'Country_or_region': 'updates.Country_or_region',    
     'Country_region_code': 'updates.Country_region_code',
    }
  ) \
  .execute()

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 23, Finished, Available)

In [22]:
# schema for holiname_gold dimension table

DeltaTable.createIfNotExists(spark) \
    .tableName('holidays.holiname_gold') \
    .addColumn('Holiday_name', StringType()) \
    .addColumn('Paid_time_off', BooleanType()) \
    .addColumn('Holiday_ID', LongType()) \
    .execute()

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 24, Finished, Available)

<delta.tables.DeltaTable at 0x7f9c282f4280>

In [23]:
# dataframe holiname_gold

dfholiname_gold  = df.dropDuplicates(['Holiday_name', 'Paid_time_off']) \
    .select(col('Holiday_name').alias('Holiday_name'), col('Paid_time_off').alias('Paid_time_off')) \
    .withColumn('Holiday_ID', monotonically_increasing_id().cast(LongType())) #creating IDs with monotonically_increasing_id() function

display(dfholiname_gold)

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 25, Finished, Available)

SynapseWidget(Synapse.DataFrame, ba925f62-f47c-42f7-ad4e-0d497eb7e5b2)

In [24]:
# updates to the holiname_gold

deltaTable = DeltaTable.forPath(spark, 'Tables/holiname_gold')

dfUpdates = dfholiname_gold

deltaTable.alias('silver') \
  .merge(
    dfUpdates.alias('updates'),
    'silver.Holiday_name = updates.Holiday_name AND silver.Paid_time_off = updates.Paid_time_off AND silver.Holiday_ID = updates.Holiday_ID'
   ) \
  .whenMatchedUpdate(set =
  {
  'Holiday_ID': 'updates.Holiday_ID',
  }
  ) \
 .whenNotMatchedInsert(values =
    {     
     'Holiday_name': 'updates.Holiday_name',
     'Paid_time_off': 'updates.Paid_time_off',
     'Holiday_ID': 'updates.Holiday_ID'
    }
  ) \
  .execute()

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 26, Finished, Available)

In [25]:
# schema for factholidays_gold fact table

DeltaTable.createIfNotExists(spark) \
    .tableName('holidays.factholidays_gold') \
    .addColumn('Date', DateType()) \
    .addColumn('Holiday_ID', LongType()) \
    .addColumn('Country_region_code', StringType()) \
    .execute()

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 27, Finished, Available)

<delta.tables.DeltaTable at 0x7f9c282f67d0>

In [26]:
country_gold_temp = spark.read.table('holidays.country_gold')
holiname_gold_temp = spark.read.table('holidays.holiname_gold')

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 28, Finished, Available)

In [27]:
# create factholidays_gold dataframe
dffactholidays_gold = df.alias('df1') \
    .join(country_gold_temp.alias('df2'), (col('df1.Country_or_region') == col('df2.Country_or_region')), 'left') \
    .join(holiname_gold_temp.alias('df3'), (col('df1.Holiday_name') == col('df3.Holiday_name')), 'left') \
    .select(
        col('df2.Country_region_code').alias('Country_region_code'),
        col('df3.Holiday_ID').alias('Holiday_ID'),
        col('df1.Date')
    ) \
    .orderBy(col('df1.Date'), col('df2.Country_region_code'))

# Show the resulting DataFrame
dffactholidays_gold.show(10)

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 29, Finished, Available)

+-------------------+----------+----------+
|Country_region_code|Holiday_ID|      Date|
+-------------------+----------+----------+
|                 AR|       305|1970-01-01|
|                 AT|       349|1970-01-01|
|                 AU|       372|1970-01-01|
|                 AU|       280|1970-01-01|
|                 BE|       393|1970-01-01|
|                 BR|       277|1970-01-01|
|                 CA|       280|1970-01-01|
|                 CA|       372|1970-01-01|
|                 CH|       271|1970-01-01|
|                 CO|       305|1970-01-01|
+-------------------+----------+----------+
only showing top 10 rows



In [28]:
# updates to the factholidays_gold

deltaTable = DeltaTable.forPath(spark, 'Tables/factholidays_gold')

dfUpdates = dffactholidays_gold

deltaTable.alias('silver') \
  .merge(
    dfUpdates.alias('updates'),
    'silver.Country_region_code = updates.Country_region_code AND silver.Holiday_ID = updates.Holiday_ID AND silver.Date = updates.Date'
   ) \
  .whenMatchedUpdate(set =
  {
  'Holiday_ID': 'updates.Holiday_ID',
  }
  ) \
 .whenNotMatchedInsert(values =
    {     
     'Country_region_code': 'updates.Country_region_code',
     'Holiday_ID': 'updates.Holiday_ID',
     'Date': 'updates.Date'
    }
  ) \
  .execute()

StatementMeta(, 7379f303-56f3-4391-b298-0809e8a991d9, 30, Finished, Available)