In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 3, Finished, Available)

In [9]:
# Explicit schema for the bronze holidays CSV files, applied at read time.
# Every column is read as a string except 'Date', which is parsed as a date.
holidaysSchema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ('Country_or_region', StringType()),
        ('Holiday_name', StringType()),
        ('Norm_Holiday_name', StringType()),
        ('Paid_time_off', StringType()),
        ('Country_region_code', StringType()),
        ('Date', DateType()),
    )
])

# The trailing wildcard matches every path starting with 'holidays_',
# so newly landed files are picked up on the next run.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .schema(holidaysSchema)
    .load('Files/bronze/holidays_*')
)

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 11, Finished, Available)

In [10]:
# Preview the first two rows to sanity-check the parsed columns.
df.show(n=2)

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 12, Finished, Available)

+-----------------+--------------------+--------------------+-------------+-------------------+----------+
|Country_or_region|        Holiday_name|   Norm_Holiday_name|Paid_time_off|Country_region_code|      Date|
+-----------------+--------------------+--------------------+-------------+-------------------+----------+
|        Argentina|Año Nuevo [New Ye...|Año Nuevo [New Ye...|         null|                 AR|1970-01-01|
|        Australia|      New Year's Day|      New Year's Day|         null|                 AU|1970-01-01|
+-----------------+--------------------+--------------------+-------------+-------------------+----------+
only showing top 2 rows



In [11]:
# Add audit columns: the source file of each row, plus creation and
# modification timestamps for the record.
df = (
    df
    .withColumn('FileName', input_file_name())
    .withColumn('CreatedUTC', current_timestamp())
    .withColumn('ModifiedUTC', current_timestamp())
)

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 13, Finished, Available)

In [12]:
# Inspect the distinct values of one column to check value consistency.
column_name = 'Paid_time_off'
un = df.select(column_name).dropDuplicates()
un.show()

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 14, Finished, Available)

+-------------+
|Paid_time_off|
+-------------+
|        False|
|         null|
|         True|
+-------------+



In [16]:
# The distinct-value check showed nulls in Paid_time_off; treat a missing
# value as 'False' (the column is still a string at this point).
df = df.withColumn('Paid_time_off', coalesce(col('Paid_time_off'), lit('False')))

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 18, Finished, Available)

In [17]:
# Re-check the distinct values of the same column after the null fill.
fix = df.select(column_name).dropDuplicates()
fix.show()

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 19, Finished, Available)

+-------------+
|Paid_time_off|
+-------------+
|        False|
|         True|
+-------------+



CREATING DELTA TABLE

In [18]:
from delta.tables import *

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 20, Finished, Available)

In [19]:
# Create the holidays_silver Delta table if it does not exist yet.
#
# CreatedUTC / ModifiedUTC are declared as timestamps because they are filled
# from current_timestamp(); declaring them as dates would discard the time of
# day and make the audit columns far less useful.
#
# NOTE(review): createIfNotExists presumably leaves an already-existing table
# untouched, so a table created earlier with DATE audit columns would need a
# separate migration — confirm before relying on the new types.
(
    DeltaTable.createIfNotExists(spark)
    .tableName('holidays.holidays_silver')
    .addColumn('Country_or_region', StringType())
    .addColumn('Holiday_name', StringType())
    .addColumn('Norm_Holiday_name', StringType())
    .addColumn('Paid_time_off', BooleanType())
    .addColumn('Country_region_code', StringType())
    .addColumn('Date', DateType())
    .addColumn('CreatedUTC', TimestampType())
    .addColumn('ModifiedUTC', TimestampType())
    .execute()
)

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 21, Finished, Available)

<delta.tables.DeltaTable at 0x7fba2e820760>

In [20]:
# Upsert the cleaned bronze rows into the holidays_silver table.
# Rows are matched on Holiday_name, Country_region_code and Date:
#   - matched rows whose non-key attributes differ are refreshed and get a new ModifiedUTC
#   - unmatched rows are inserted as new records

deltaTable = DeltaTable.forPath(spark, 'Tables/holidays_silver')

# Paid_time_off is still the string 'True'/'False' in df, while the silver column
# is boolean; cast explicitly so the comparison and the writes below do not
# depend on implicit string-to-boolean coercion.
dfUpdates = df.withColumn('Paid_time_off', col('Paid_time_off').cast('boolean'))

# NOTE(review): the match condition uses plain '=', so a row whose key column is
# null never matches and would be inserted again on every run — confirm the key
# columns are never null in the bronze data.
merge_condition = (
    'silver.Holiday_name = updates.Holiday_name'
    ' and silver.Country_region_code = updates.Country_region_code'
    ' and silver.Date = updates.Date'
)

# Only touch matched rows that actually changed ('<=>' is null-safe equality),
# so ModifiedUTC reflects a real modification rather than every pipeline run.
changed_condition = (
    'NOT (silver.Country_or_region <=> updates.Country_or_region'
    ' AND silver.Norm_Holiday_name <=> updates.Norm_Holiday_name'
    ' AND silver.Paid_time_off <=> updates.Paid_time_off)'
)

deltaTable.alias('silver') \
    .merge(dfUpdates.alias('updates'), merge_condition) \
    .whenMatchedUpdate(
        condition = changed_condition,
        set =
        {
            # key columns and CreatedUTC are deliberately left unchanged
            'Country_or_region': 'updates.Country_or_region',
            'Norm_Holiday_name': 'updates.Norm_Holiday_name',
            'Paid_time_off': 'updates.Paid_time_off',
            'ModifiedUTC': 'updates.ModifiedUTC'
        }
    ) \
    .whenNotMatchedInsert(values =
        {
            'Country_or_region': 'updates.Country_or_region',
            'Holiday_name': 'updates.Holiday_name',
            'Norm_Holiday_name': 'updates.Norm_Holiday_name',
            'Paid_time_off': 'updates.Paid_time_off',
            'Country_region_code': 'updates.Country_region_code',
            'Date': 'updates.Date',
            'CreatedUTC': 'updates.CreatedUTC',
            'ModifiedUTC': 'updates.ModifiedUTC'
        }
    ) \
    .execute()

StatementMeta(, 24a28986-8e87-466f-ab99-a21357f557d8, 22, Finished, Available)

Additionally:

1. Check whether 'Holiday_name' and 'Norm_Holiday_name' always hold the same values.
2. If 'Norm_Holiday_name' turns out to be a copy of 'Holiday_name', consider dropping it.
3. Introduce the Great Expectations framework for data validation between the bronze and silver layers.
