### **1. Importing the required modules and functions.**

In [0]:
# Importing the required modules and functions
from pyspark.sql.functions import lit, current_date, concat
from delta.tables import DeltaTable
from abc import ABC, abstractmethod

### **2. Abstract base class for factory interface.**

In [0]:
# Creating abstract base class for factory interface
class Gold_Dims(ABC):
    """Interface that every Gold-layer dimension loader has to implement.

    A loader extracts data from a Silver-layer Delta table and loads it into
    the matching Gold-layer dimension Delta table.
    """

    @abstractmethod
    def read_silver(self, spark, src_table_path):
        """Read a Silver-layer Delta table and return it as a PySpark DataFrame."""
        raise NotImplementedError("This method must be overridden by subclasses")

    @abstractmethod
    def load_dimension(self, spark, basePath, silver_df):
        """Merge ``silver_df`` into the Gold-layer dimension Delta table (SCD2)."""
        raise NotImplementedError("This method must be overridden by subclasses")

### **3. Concrete subclasses implementing the abstract methods.**

In [0]:
class Customers(Gold_Dims):
    """Loader for the customers dimension Delta table (SCD Type 2) in the Gold layer."""

    # Attributes whose change closes the current version and opens a new one.
    _TRACKED_COLUMNS = (
        "customer_email_id",
        "customer_state",
        "customer_state_code",
    )

    @classmethod
    def _change_condition(cls, source_alias, target_alias):
        """Build the SQL predicate matching current target rows that differ from the source.

        ``<=>`` is Spark SQL's null-safe equality. A plain ``<>`` evaluates to
        NULL when either side is NULL, so a value changing to or from NULL
        would never be detected as a change.

        Args:
            source_alias: Alias of the incoming (Silver) rows.
            target_alias: Alias of the dimension (Gold) rows.

        Returns:
            The predicate as a SQL expression string.
        """
        differences = " OR ".join(
            f"NOT ({source_alias}.{column} <=> {target_alias}.{column})"
            for column in cls._TRACKED_COLUMNS
        )
        return f"{target_alias}.is_current = 1 AND ({differences})"

    def read_silver(self, spark, src_table_path: str):
        """Read the Silver-layer Delta table and return it as a PySpark DataFrame.

        Args:
            spark: Active Spark session.
            src_table_path: Table location; ``abfss://`` is prepended to it.
        """
        return DeltaTable.forPath(spark, f"abfss://{src_table_path}").toDF()

    def load_dimension(self, spark, basePath: str, silver_df) -> bool:
        """Apply SCD Type 2 to the customers dimension using a Delta merge.

        Args:
            spark: Active Spark session.
            basePath: Location of the Gold dimension table; ``abfss://`` is
                prepended to it.
            silver_df: Incoming customer rows read from the Silver layer.

        Returns:
            True once the merge has executed.
        """
        # Reading customers dimension delta table from gold layer
        dim_olist_customers = DeltaTable.forPath(spark, f"abfss://{basePath}")

        # Incoming rows of customers that already exist in the dimension and
        # whose tracked attributes differ from the current version.
        newRowsToInsert = (
            silver_df.alias("updates")
            .join(dim_olist_customers.toDF().alias("customers"), "customer_id")
            .where(self._change_condition("updates", "customers"))
        )

        # Stage the update by unioning two sets of rows:
        # 1. Changed rows with a NULL mergeKey: they can never match, so the
        #    whenNotMatched clause inserts them as the new current version.
        # 2. All incoming rows keyed by customer_id: they either expire the
        #    current version of an existing customer or insert a new customer.
        stagedUpdates = newRowsToInsert.selectExpr(
            "NULL as mergeKey", "updates.*"
        ).union(silver_df.selectExpr("customer_id as mergeKey", "*"))

        # Apply SCD Type 2 operation using merge
        dim_olist_customers.alias("customers").merge(
            stagedUpdates.alias("staged_updates"),
            "customers.customer_id = mergeKey",
        ).whenMatchedUpdate(
            condition=self._change_condition("staged_updates", "customers"),
            set={  # Set is_current to 0 and eff_end_dt to current date - 1.
                "is_current": lit(0),
                "eff_end_dt": (current_date() - 1),
                "update_date": current_date(),
                "updated_by": lit("olist_etl"),
            },
        ).whenNotMatchedInsert(
            values={
                # NOTE(review): the key is derived from customer_id only, so every
                # version of a customer shares it - confirm this is intended.
                "customer_skey": concat("staged_updates.customer_id", lit("_skey")),
                "customer_id": "staged_updates.customer_id",
                "customer_email_id": "staged_updates.customer_email_id",
                "customer_state": "staged_updates.customer_state",
                "customer_state_code": "staged_updates.customer_state_code",
                # New versions start today, are open-ended and flagged current.
                "eff_start_dt": current_date(),
                "eff_end_dt": lit("9999-12-31").cast("date"),
                "is_current": lit(1),
                "insert_date": current_date(),
                "inserted_by": lit("olist_etl"),
                # NOTE(review): cast to string although the update branch writes
                # current_date() into update_date - confirm the column type.
                "update_date": lit(None).cast("string"),
                "updated_by": lit(None).cast("string"),
            }
        ).execute()

        return True

In [0]:
class Orders(Gold_Dims):
    """Loader for the orders dimension Delta table (SCD Type 2) in the Gold layer."""

    # Attributes whose change closes the current version and opens a new one.
    _TRACKED_COLUMNS = (
        "order_status",
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    )

    @classmethod
    def _change_condition(cls, source_alias, target_alias):
        """Build the SQL predicate matching current target rows that differ from the source.

        ``<=>`` is Spark SQL's null-safe equality. A plain ``<>`` evaluates to
        NULL when either side is NULL, so a timestamp going from NULL to a
        value (or back) would never be detected as a change.

        Args:
            source_alias: Alias of the incoming (Silver) rows.
            target_alias: Alias of the dimension (Gold) rows.

        Returns:
            The predicate as a SQL expression string.
        """
        differences = " OR ".join(
            f"NOT ({source_alias}.{column} <=> {target_alias}.{column})"
            for column in cls._TRACKED_COLUMNS
        )
        return f"{target_alias}.is_current = 1 AND ({differences})"

    def read_silver(self, spark, src_table_path: str):
        """Read the Silver-layer Delta table and return it as a PySpark DataFrame.

        Args:
            spark: Active Spark session.
            src_table_path: Table location; ``abfss://`` is prepended to it.
        """
        return DeltaTable.forPath(spark, f"abfss://{src_table_path}").toDF()

    def load_dimension(self, spark, basePath: str, silver_df) -> bool:
        """Apply SCD Type 2 to the orders dimension using a Delta merge.

        Args:
            spark: Active Spark session.
            basePath: Location of the Gold dimension table; ``abfss://`` is
                prepended to it.
            silver_df: Incoming order rows read from the Silver layer.

        Returns:
            True once the merge has executed.
        """
        # Reading orders dimension delta table from gold layer
        dim_olist_orders = DeltaTable.forPath(spark, f"abfss://{basePath}")

        # Incoming rows of orders that already exist in the dimension and whose
        # tracked attributes differ from the current version. The join uses
        # order_id and customer_id; the merge below keys on order_id only.
        newRowsToInsert = (
            silver_df.alias("updates")
            .join(dim_olist_orders.toDF().alias("orders"), ["order_id", "customer_id"])
            .where(self._change_condition("updates", "orders"))
        )

        # Stage the update by unioning two sets of rows:
        # 1. Changed rows with a NULL mergeKey: they can never match, so the
        #    whenNotMatched clause inserts them as the new current version.
        # 2. All incoming rows keyed by order_id: they either expire the
        #    current version of an existing order or insert a new order.
        stagedUpdates = newRowsToInsert.selectExpr(
            "NULL as mergeKey", "updates.*"
        ).union(silver_df.selectExpr("order_id as mergeKey", "*"))

        # Apply SCD Type 2 operation using merge
        dim_olist_orders.alias("orders").merge(
            stagedUpdates.alias("staged_updates"),
            "orders.order_id = mergeKey",
        ).whenMatchedUpdate(
            condition=self._change_condition("staged_updates", "orders"),
            set={  # Set is_current to 0 and eff_end_dt to current date - 1.
                "is_current": lit(0),
                "eff_end_dt": (current_date() - 1),
                "update_date": current_date(),
                "updated_by": lit("olist_etl"),
            },
        ).whenNotMatchedInsert(
            values={
                # NOTE(review): the key is derived from order_id only, so every
                # version of an order shares it - confirm this is intended.
                "order_skey": concat("staged_updates.order_id", lit("_skey")),
                "order_id": "staged_updates.order_id",
                "customer_id": "staged_updates.customer_id",
                "order_status": "staged_updates.order_status",
                "order_purchase_timestamp": "staged_updates.order_purchase_timestamp",
                "order_approved_at": "staged_updates.order_approved_at",
                "order_delivered_carrier_date": "staged_updates.order_delivered_carrier_date",
                "order_delivered_customer_date": "staged_updates.order_delivered_customer_date",
                "order_estimated_delivery_date": "staged_updates.order_estimated_delivery_date",
                # New versions start today, are open-ended and flagged current.
                "eff_start_dt": current_date(),
                "eff_end_dt": lit("9999-12-31").cast("date"),
                "is_current": lit(1),
                "insert_date": current_date(),
                "inserted_by": lit("olist_etl"),
                # NOTE(review): cast to string although the update branch writes
                # current_date() into update_date - confirm the column type.
                "update_date": lit(None).cast("string"),
                "updated_by": lit(None).cast("string"),
            }
        ).execute()

        return True

In [0]:
class Products(Gold_Dims):
    """Loader for the products dimension Delta table (SCD Type 2) in the Gold layer."""

    # Attributes whose change closes the current version and opens a new one.
    _TRACKED_COLUMNS = (
        "product_category_name",
        "product_category_name_english",
        "price",
        "freight_value",
        "product_name_length",
        "product_description_length",
        "product_photos_qty",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
    )

    @classmethod
    def _change_condition(cls, source_alias, target_alias):
        """Build the SQL predicate matching current target rows that differ from the source.

        ``<=>`` is Spark SQL's null-safe equality. A plain ``<>`` evaluates to
        NULL when either side is NULL, so a value changing to or from NULL
        would never be detected as a change.

        Args:
            source_alias: Alias of the incoming (Silver) rows.
            target_alias: Alias of the dimension (Gold) rows.

        Returns:
            The predicate as a SQL expression string.
        """
        differences = " OR ".join(
            f"NOT ({source_alias}.{column} <=> {target_alias}.{column})"
            for column in cls._TRACKED_COLUMNS
        )
        return f"{target_alias}.is_current = 1 AND ({differences})"

    def read_silver(self, spark, src_table_path: str):
        """Read the Silver-layer Delta table and return it as a PySpark DataFrame.

        Args:
            spark: Active Spark session.
            src_table_path: Table location; ``abfss://`` is prepended to it.
        """
        return DeltaTable.forPath(spark, f"abfss://{src_table_path}").toDF()

    def load_dimension(self, spark, basePath: str, silver_df) -> bool:
        """Apply SCD Type 2 to the products dimension using a Delta merge.

        Args:
            spark: Active Spark session.
            basePath: Location of the Gold dimension table; ``abfss://`` is
                prepended to it.
            silver_df: Incoming product rows read from the Silver layer.

        Returns:
            True once the merge has executed.
        """
        # Reading products dimension delta table from gold layer
        dim_olist_products = DeltaTable.forPath(spark, f"abfss://{basePath}")

        # Incoming rows of products that already exist in the dimension and
        # whose tracked attributes differ from the current version. The join
        # uses product_id and seller_id; the merge below keys on product_id only.
        newRowsToInsert = (
            silver_df.alias("updates")
            .join(dim_olist_products.toDF().alias("products"), ["product_id", "seller_id"])
            .where(self._change_condition("updates", "products"))
        )

        # Stage the update by unioning two sets of rows:
        # 1. Changed rows with a NULL mergeKey: they can never match, so the
        #    whenNotMatched clause inserts them as the new current version.
        # 2. All incoming rows keyed by product_id: they either expire the
        #    current version of an existing product or insert a new product.
        stagedUpdates = newRowsToInsert.selectExpr(
            "NULL as mergeKey", "updates.*"
        ).union(silver_df.selectExpr("product_id as mergeKey", "*"))

        # Apply SCD Type 2 operation using merge
        dim_olist_products.alias("products").merge(
            stagedUpdates.alias("staged_updates"),
            "products.product_id = mergeKey",
        ).whenMatchedUpdate(
            condition=self._change_condition("staged_updates", "products"),
            set={  # Set is_current to 0 and eff_end_dt to current date - 1.
                "is_current": lit(0),
                "eff_end_dt": (current_date() - 1),
                "update_date": current_date(),
                "updated_by": lit("olist_etl"),
            },
        ).whenNotMatchedInsert(
            values={
                # NOTE(review): the key is derived from product_id only, so every
                # version of a product shares it - confirm this is intended.
                "product_skey": concat("staged_updates.product_id", lit("_skey")),
                "product_id": "staged_updates.product_id",
                "seller_id": "staged_updates.seller_id",
                "product_category_name": "staged_updates.product_category_name",
                "product_category_name_english": "staged_updates.product_category_name_english",
                "price": "staged_updates.price",
                "freight_value": "staged_updates.freight_value",
                "product_name_length": "staged_updates.product_name_length",
                "product_description_length": "staged_updates.product_description_length",
                "product_photos_qty": "staged_updates.product_photos_qty",
                "product_weight_g": "staged_updates.product_weight_g",
                "product_length_cm": "staged_updates.product_length_cm",
                "product_height_cm": "staged_updates.product_height_cm",
                "product_width_cm": "staged_updates.product_width_cm",
                # New versions start today, are open-ended and flagged current.
                "eff_start_dt": current_date(),
                "eff_end_dt": lit("9999-12-31").cast("date"),
                "is_current": lit(1),
                "insert_date": current_date(),
                "inserted_by": lit("olist_etl"),
                # NOTE(review): cast to string although the update branch writes
                # current_date() into update_date - confirm the column type.
                "update_date": lit(None).cast("string"),
                "updated_by": lit(None).cast("string"),
            }
        ).execute()

        return True

In [0]:
class Sellers(Gold_Dims):
    """Loader for the sellers dimension Delta table (SCD Type 2) in the Gold layer."""

    # Attributes whose change closes the current version and opens a new one.
    _TRACKED_COLUMNS = (
        "seller_state",
        "seller_state_code",
    )

    @classmethod
    def _change_condition(cls, source_alias, target_alias):
        """Build the SQL predicate matching current target rows that differ from the source.

        ``<=>`` is Spark SQL's null-safe equality. A plain ``<>`` evaluates to
        NULL when either side is NULL, so a value changing to or from NULL
        would never be detected as a change.

        Args:
            source_alias: Alias of the incoming (Silver) rows.
            target_alias: Alias of the dimension (Gold) rows.

        Returns:
            The predicate as a SQL expression string.
        """
        differences = " OR ".join(
            f"NOT ({source_alias}.{column} <=> {target_alias}.{column})"
            for column in cls._TRACKED_COLUMNS
        )
        return f"{target_alias}.is_current = 1 AND ({differences})"

    def read_silver(self, spark, src_table_path: str):
        """Read the Silver-layer Delta table and return it as a PySpark DataFrame.

        Args:
            spark: Active Spark session.
            src_table_path: Table location; ``abfss://`` is prepended to it.
        """
        return DeltaTable.forPath(spark, f"abfss://{src_table_path}").toDF()

    def load_dimension(self, spark, basePath: str, silver_df) -> bool:
        """Apply SCD Type 2 to the sellers dimension using a Delta merge.

        Args:
            spark: Active Spark session.
            basePath: Location of the Gold dimension table; ``abfss://`` is
                prepended to it.
            silver_df: Incoming seller rows read from the Silver layer.

        Returns:
            True once the merge has executed.
        """
        # Reading sellers dimension delta table from gold layer
        dim_olist_sellers = DeltaTable.forPath(spark, f"abfss://{basePath}")

        # Incoming rows of sellers that already exist in the dimension and
        # whose tracked attributes differ from the current version.
        newRowsToInsert = (
            silver_df.alias("updates")
            .join(dim_olist_sellers.toDF().alias("sellers"), "seller_id")
            .where(self._change_condition("updates", "sellers"))
        )

        # Stage the update by unioning two sets of rows:
        # 1. Changed rows with a NULL mergeKey: they can never match, so the
        #    whenNotMatched clause inserts them as the new current version.
        # 2. All incoming rows keyed by seller_id: they either expire the
        #    current version of an existing seller or insert a new seller.
        stagedUpdates = newRowsToInsert.selectExpr(
            "NULL as mergeKey", "updates.*"
        ).union(silver_df.selectExpr("seller_id as mergeKey", "*"))

        # Apply SCD Type 2 operation using merge
        dim_olist_sellers.alias("sellers").merge(
            stagedUpdates.alias("staged_updates"),
            "sellers.seller_id = mergeKey",
        ).whenMatchedUpdate(
            condition=self._change_condition("staged_updates", "sellers"),
            set={  # Set is_current to 0 and eff_end_dt to current date - 1.
                "is_current": lit(0),
                "eff_end_dt": (current_date() - 1),
                "update_date": current_date(),
                "updated_by": lit("olist_etl"),
            },
        ).whenNotMatchedInsert(
            values={
                # NOTE(review): the key is derived from seller_id only, so every
                # version of a seller shares it - confirm this is intended.
                "seller_skey": concat("staged_updates.seller_id", lit("_skey")),
                "seller_id": "staged_updates.seller_id",
                "seller_state": "staged_updates.seller_state",
                "seller_state_code": "staged_updates.seller_state_code",
                # New versions start today, are open-ended and flagged current.
                "eff_start_dt": current_date(),
                "eff_end_dt": lit("9999-12-31").cast("date"),
                "is_current": lit(1),
                "insert_date": current_date(),
                "inserted_by": lit("olist_etl"),
                # NOTE(review): cast to string although the update branch writes
                # current_date() into update_date - confirm the column type.
                "update_date": lit(None).cast("string"),
                "updated_by": lit(None).cast("string"),
            }
        ).execute()

        return True

In [0]:
class Order_Ratings(Gold_Dims):
    """Loader for the order_ratings dimension Delta table (SCD Type 2) in the Gold layer."""

    # Attributes whose change closes the current version and opens a new one.
    _TRACKED_COLUMNS = (
        "rating_score",
        "rating_survey_creation_date",
        "rating_survey_answer_timestamp",
    )

    @classmethod
    def _change_condition(cls, source_alias, target_alias):
        """Build the SQL predicate matching current target rows that differ from the source.

        ``<=>`` is Spark SQL's null-safe equality. A plain ``<>`` evaluates to
        NULL when either side is NULL, so a value changing to or from NULL
        would never be detected as a change.

        Args:
            source_alias: Alias of the incoming (Silver) rows.
            target_alias: Alias of the dimension (Gold) rows.

        Returns:
            The predicate as a SQL expression string.
        """
        differences = " OR ".join(
            f"NOT ({source_alias}.{column} <=> {target_alias}.{column})"
            for column in cls._TRACKED_COLUMNS
        )
        return f"{target_alias}.is_current = 1 AND ({differences})"

    def read_silver(self, spark, src_table_path: str):
        """Read the Silver-layer Delta table and return it as a PySpark DataFrame.

        Args:
            spark: Active Spark session.
            src_table_path: Table location; ``abfss://`` is prepended to it.
        """
        return DeltaTable.forPath(spark, f"abfss://{src_table_path}").toDF()

    def load_dimension(self, spark, basePath: str, silver_df) -> bool:
        """Apply SCD Type 2 to the order_ratings dimension using a Delta merge.

        Args:
            spark: Active Spark session.
            basePath: Location of the Gold dimension table; ``abfss://`` is
                prepended to it.
            silver_df: Incoming rating rows read from the Silver layer.

        Returns:
            True once the merge has executed.
        """
        # Reading order_ratings dimension delta table from gold layer
        dim_olist_order_ratings = DeltaTable.forPath(spark, f"abfss://{basePath}")

        # Incoming rows of ratings that already exist in the dimension and
        # whose tracked attributes differ from the current version. The join
        # uses rating_id and order_id; the merge below keys on rating_id only.
        newRowsToInsert = (
            silver_df.alias("updates")
            .join(
                dim_olist_order_ratings.toDF().alias("order_ratings"),
                ["rating_id", "order_id"],
            )
            .where(self._change_condition("updates", "order_ratings"))
        )

        # Stage the update by unioning two sets of rows:
        # 1. Changed rows with a NULL mergeKey: they can never match, so the
        #    whenNotMatched clause inserts them as the new current version.
        # 2. All incoming rows keyed by rating_id: they either expire the
        #    current version of an existing rating or insert a new rating.
        stagedUpdates = newRowsToInsert.selectExpr(
            "NULL as mergeKey", "updates.*"
        ).union(silver_df.selectExpr("rating_id as mergeKey", "*"))

        # Apply SCD Type 2 operation using merge
        dim_olist_order_ratings.alias("order_ratings").merge(
            stagedUpdates.alias("staged_updates"),
            "order_ratings.rating_id = mergeKey",
        ).whenMatchedUpdate(
            condition=self._change_condition("staged_updates", "order_ratings"),
            set={  # Set is_current to 0 and eff_end_dt to current date - 1.
                "is_current": lit(0),
                "eff_end_dt": (current_date() - 1),
                "update_date": current_date(),
                "updated_by": lit("olist_etl"),
            },
        ).whenNotMatchedInsert(
            values={
                # NOTE(review): the key is derived from rating_id only, so every
                # version of a rating shares it - confirm this is intended.
                "rating_skey": concat("staged_updates.rating_id", lit("_skey")),
                "rating_id": "staged_updates.rating_id",
                "order_id": "staged_updates.order_id",
                "rating_score": "staged_updates.rating_score",
                "rating_survey_creation_date": "staged_updates.rating_survey_creation_date",
                "rating_survey_answer_timestamp": "staged_updates.rating_survey_answer_timestamp",
                # New versions start today, are open-ended and flagged current.
                "eff_start_dt": current_date(),
                "eff_end_dt": lit("9999-12-31").cast("date"),
                "is_current": lit(1),
                "insert_date": current_date(),
                "inserted_by": lit("olist_etl"),
                # NOTE(review): cast to string although the update branch writes
                # current_date() into update_date - confirm the column type.
                "update_date": lit(None).cast("string"),
                "updated_by": lit(None).cast("string"),
            }
        ).execute()

        return True

### **4. Factory class with static method**

In [0]:
# Factory class with static method
class Factory:
    """Factory that hands out the loader object for a Gold-layer dimension table."""

    @staticmethod
    def get_gold_dims(tgt_table_name: str) -> Gold_Dims:
        """Return a new loader instance for the given target dimension table.

        Args:
            tgt_table_name: Name of the target dimension table, e.g.
                ``"dim_olist_customers"``.

        Returns:
            An instance of the concrete ``Gold_Dims`` subclass registered for
            ``tgt_table_name``.

        Raises:
            ValueError: If ``tgt_table_name`` is not a known dimension table.
        """
        # Map names to classes (not instances) so that only the requested
        # loader is instantiated.
        tables = {
            "dim_olist_customers": Customers,
            "dim_olist_orders": Orders,
            "dim_olist_products": Products,
            "dim_olist_sellers": Sellers,
            "dim_olist_order_ratings": Order_Ratings,
        }

        try:
            dimension_cls = tables[tgt_table_name]
        except KeyError:
            # Fail fast: an unknown name can never become valid by retrying,
            # so looping on it would spin forever.
            raise ValueError(f"Unknown target table : {tgt_table_name}.") from None

        return dimension_cls()