dim_Product

In [0]:
%run "/Workspace/Utils/Utils"

In [0]:
from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as F
from pyspark.sql.types import (
    IntegerType, StringType, TimestampType, StructType, StructField
)

In [0]:
# Load the silver-layer product table that feeds the Product dimension.
# `spark` is not defined in this notebook; it is presumably the session
# object provided by the Databricks runtime.

df = spark.table("adlslmcompany_silver.managed_silver.saleslt_product")

**Now that the data has been cleaned, it must be adapted to its final requirements: a Product dimension table.**

In [0]:
# Render the source DataFrame in the notebook for a visual check before
# it is transformed.

df.display()

In [0]:
def gold_clean_dimproduct(df):
    """Shape the silver product DataFrame into the gold Product dimension.

    Steps, in order:
      1. Drop bookkeeping columns and stamp a ``processed_timestamp``.
      2. Left-join a category lookup (sub-category -> parent category) on
         ``ProductCategoryID`` to add ``Category`` and ``SubCategory``.
      3. Left-join the product model table on ``ProductModelID`` to add
         ``Model``.
      4. Project, cast and rename the final set of dimension columns.

    Args:
        df: Product DataFrame providing at least ProductID, Name, Color,
            StandardCost, ListPrice, Size, Weight, ProductCategoryID and
            ProductModelID.

    Returns:
        DataFrame with the columns ProductID, ProductName, Color,
        StandardCost, ListPrice, Size, Weight, Category, SubCategory, Model
        and processed_timestamp.
    """
    # Columns that are not needed in the dimension table.
    # NOTE(review): "silves_transformed_timestamp" looks like a misspelling of
    # "silver_transformed_timestamp"; confirm the real column name upstream.
    unused_columns = (
        "rowguid",
        "ModifiedDate",
        "bronze_ingestion_timestamp",
        "silves_transformed_timestamp",
    )
    products = df.drop(*unused_columns).withColumn(
        "processed_timestamp", F.current_timestamp()
    )

    # Build the category lookup by self-joining the category table: "pc1" is
    # the row a product points at, "pc2" is its parent. The inner join keeps
    # only categories whose ParentProductCategoryID matches another category.
    categories = spark.table(
        "adlslmcompany_silver.managed_silver.saleslt_productcategory"
    )
    child = categories.alias("pc1")
    parent = categories.alias("pc2")
    parent_match = F.col("pc1.ParentProductCategoryID") == F.col("pc2.ProductCategoryID")
    category_lookup = child.join(parent, parent_match, "inner").select(
        F.col("pc1.ProductCategoryID"),
        F.col("pc1.Name").alias("SubCategory"),
        F.col("pc2.Name").alias("Category"),
    )

    # Left join so products without a match in the lookup are kept; their
    # Category / SubCategory come out as null.
    with_category = products.join(
        category_lookup,
        products.ProductCategoryID == category_lookup.ProductCategoryID,
        "left",
    ).select(
        products["*"],
        category_lookup["Category"],
        category_lookup["SubCategory"],
    )

    # Attach the model name; again a left join keeps products with no model.
    models = spark.table("adlslmcompany_silver.managed_silver.saleslt_productmodel")
    with_model = with_category.join(
        models,
        with_category.ProductModelID == models.ProductModelID,
        "left",
    ).select(
        with_category["*"],
        models["Name"].alias("Model"),
    )

    # Final projection: (source column, target type, output name).
    # NOTE(review): StandardCost, ListPrice and Weight are cast to integers;
    # if the source values carry a fractional part it is lost here. Confirm
    # this is intended.
    projection = (
        ("ProductID", IntegerType(), "ProductID"),
        ("Name", StringType(), "ProductName"),
        ("Color", StringType(), "Color"),
        ("StandardCost", IntegerType(), "StandardCost"),
        ("ListPrice", IntegerType(), "ListPrice"),
        ("Size", StringType(), "Size"),
        ("Weight", IntegerType(), "Weight"),
        ("Category", StringType(), "Category"),
        ("SubCategory", StringType(), "SubCategory"),
        ("Model", StringType(), "Model"),
        ("processed_timestamp", TimestampType(), "processed_timestamp"),
    )
    return with_model.select(
        *[
            F.col(source).cast(target_type).alias(output_name)
            for source, target_type, output_name in projection
        ]
    )

In [0]:
# Expected layout of the gold Product dimension, written as
# (column name, Spark type) pairs. Every field is declared nullable=False.
# NOTE(review): Category, SubCategory and Model are produced by left joins
# and may contain nulls; confirm whether the schema check compares
# nullability.
expected_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("ProductID", IntegerType()),
        ("ProductName", StringType()),
        ("Color", StringType()),
        ("StandardCost", IntegerType()),
        ("ListPrice", IntegerType()),
        ("Size", StringType()),
        ("Weight", IntegerType()),
        ("Category", StringType()),
        ("SubCategory", StringType()),
        ("Model", StringType()),
        ("processed_timestamp", TimestampType()),
    )
])

In [0]:
# Apply the gold transformation to the silver product DataFrame; the result
# is the DataFrame that is validated and loaded in the cells below.

gold_df = gold_clean_dimproduct(df)

In [0]:
# Compare the lengths of the source and transformed DataFrames.
# NOTE(review): compare_lengths is not defined in this notebook; presumably
# it comes from the Utils notebook loaded via %run and compares row counts
# to detect rows dropped or duplicated by the joins. Confirm.

compare_lengths(df, gold_df)

In [0]:
# Check the transformed DataFrame against expected_schema.
# NOTE(review): _validate_schema is not defined in this notebook; presumably
# it comes from the Utils notebook loaded via %run. Confirm how it reports a
# mismatch.
_validate_schema(gold_df, expected_schema)

**IMPORTANT: Please note that this is a simulated project, so the upsert operation is executed within this notebook. In a production environment, a dedicated notebook containing only the function and its validations would be developed, and all function notebooks would be orchestrated by Azure Data Factory (ADF) pipelines or Azure Databricks (ADB) workflows. The upsert method may vary depending on whether Auto Loader, streaming, or Change Data Feed (CDF) is used.**


In [0]:
# Load the transformed DataFrame into the gold layer.
# NOTE(review): upsert_table is not defined in this notebook; presumably it
# comes from the Utils notebook loaded via %run and merges on primary_keys.
# Confirm.


# Name of the destination table.
target_table= "dim_product"   

# Schema that holds the destination table.
schema = "star_schema"

# Catalog that holds the destination schema.
# NOTE(review): the silver tables are read from "adlslmcompany_silver" while
# this catalog is "adlscompany_gold" (no "lm"); confirm the name is intended.
catalog = "adlscompany_gold"

# Column(s) passed to upsert_table as the key of the dimension.
primary_keys = ["ProductID"]


upsert_table(gold_df, target_table, primary_keys, schema, catalog )