SalesLT_Product

In [0]:
# Enable Delta schema auto-merge by issuing a SQL SET statement for the
# spark.databricks.delta.schema.autoMerge.enabled configuration key.

spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")

In [0]:
from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as F
from pyspark.sql.types import (
    DecimalType, IntegerType, StringType, TimestampType, StructType, StructField
)
from pyspark.sql.functions import count
from pyspark.sql.functions import col, count
from pyspark.sql.functions import count, desc
from pyspark.sql.functions import col, when, datediff, current_date
from pyspark.sql.types import IntegerType

In [0]:
%run "/Workspace/Utils/Utils"

In [0]:
# Load the bronze-layer SalesLT product table into the working DataFrame `df`.

df = spark.table("adlslmcompany_bronze.managed_bronze.saleslt_product")

In [0]:
# Preview the first 10 rows of the bronze table.

df.limit(10).display()

In [0]:
# Check the ProductID column for duplicated IDs.
# NOTE(review): checkduplicates is not defined in this notebook; presumably it
# comes from the Utils notebook pulled in with %run - confirm.

checkduplicates(df, "ProductID" )

In [0]:
# Check the Name column for duplicated product names (same helper as the
# ProductID check, applied to a different column).

checkduplicates(df, "Name" )


In [0]:
# Collect every row whose Name is shared with at least one other row, so the
# duplicated products can be inspected side by side.

# Row count per Name, keeping only the names that appear more than once.
name_counts = df.groupBy("Name").count().where(F.col("count") > 1)

# Inner-joining on Name keeps just the rows with a repeated name; selecting
# df["*"] restores the original columns and leaves out the helper "count".
df_duplicates = (
    df.join(name_counts, on="Name", how="inner")
    .select(df["*"])
)

# Sorted by Name so that copies of the same product sit next to each other.
df_duplicates.sort("Name").display()

Since this dataset contains only 68 rows, the values can be verified manually. For larger datasets, however, a function that compares all columns should be used instead.

In [0]:
# The analysis above shows that rows sharing a Name are exact copies with no
# differing values, so the extra copies can be dropped.
# NOTE(review): deduplicate is defined outside this notebook; "Name" is passed
# as the key and None as the third argument - confirm what None selects there.

df = deduplicate(df, "Name", None)

In [0]:
def silver_clean_salesproduct(df: "DataFrame", *, include_sales_time: bool = False) -> "DataFrame":
    """Clean the bronze SalesLT product data and shape it for the silver layer.

    The steps run in this order:

    1. Drop the thumbnail and product-number columns.
    2. Optionally derive ``SalesTime`` (see ``include_sales_time``).
    3. Deduplicate on ``Name`` through the ``deduplicate`` helper.
    4. Stamp every row with ``silves_transformed_timestamp``.
    5. Project the silver columns, casting each one to an explicit type.

    Args:
        df: Bronze product DataFrame. It must contain every source column
            named in the final projection.
        include_sales_time: When True, append an integer ``SalesTime`` column
            holding the number of days between ``SellStartDate`` and
            ``SellEndDate`` (or the current date when ``SellEndDate`` is
            null). Defaults to False so the returned columns stay exactly
            the sixteen silver columns listed below. Previously this value
            was always computed and then discarded by the final projection.

    Returns:
        A DataFrame with the explicitly typed silver columns, plus
        ``SalesTime`` as the last column when requested.
    """
    # Removed before deduplication so these columns never reach that step.
    df = df.drop("ThumbNailPhoto", "ThumbnailPhotoFileName", "ProductNumber")

    if include_sales_time:
        # Products still on sale have no SellEndDate; measure those up to today.
        df = df.withColumn(
            "SalesTime",
            F.when(
                F.col("SellEndDate").isNull(),
                F.datediff(F.current_date(), F.col("SellStartDate")),
            )
            .otherwise(F.datediff(F.col("SellEndDate"), F.col("SellStartDate")))
            .cast(IntegerType()),
        )

    # NOTE(review): deduplicate is defined outside this block (presumably in
    # the Utils notebook); "Name" is the key and None the third argument.
    df = deduplicate(df, "Name", None)

    # Record when the silver transformation ran.
    # NOTE(review): "silves" looks like a typo for "silver"; the name is kept
    # unchanged because it is part of the schema this function returns.
    df = df.withColumn("silves_transformed_timestamp", F.current_timestamp())

    # Output columns, in order, with the type each one is cast to.
    silver_columns = [
        ("ProductID", IntegerType()),
        ("Name", StringType()),
        ("Color", StringType()),
        ("ProductModelID", StringType()),
        ("StandardCost", DecimalType(19, 2)),
        ("ListPrice", DecimalType(19, 2)),
        ("Size", StringType()),
        ("Weight", DecimalType(8, 2)),
        ("ProductCategoryID", IntegerType()),
        ("SellStartDate", TimestampType()),
        ("SellEndDate", TimestampType()),
        ("DiscontinuedDate", TimestampType()),
        ("rowguid", StringType()),
        ("ModifiedDate", TimestampType()),
        ("bronze_ingestion_timestamp", TimestampType()),
        ("silves_transformed_timestamp", TimestampType()),
    ]
    if include_sales_time:
        silver_columns.append(("SalesTime", IntegerType()))

    # Cast every column explicitly so the result has a fixed set of types.
    return df.select(
        *[
            F.col(column_name).cast(column_type).alias(column_name)
            for column_name, column_type in silver_columns
        ]
    )

In [0]:
# Schema that the cleaned silver DataFrame is validated against.
# Each entry is (column name, data type, nullable).
# NOTE(review): SellEndDate is declared non-nullable here, while
# silver_clean_salesproduct explicitly handles a null SellEndDate - confirm
# how _validate_schema treats the nullable flag.
expected_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ("ProductID", IntegerType(), False),
        ("Name", StringType(), True),
        ("Color", StringType(), True),
        ("ProductModelID", StringType(), True),
        ("StandardCost", DecimalType(19, 2), True),
        ("ListPrice", DecimalType(19, 2), False),
        ("Size", StringType(), False),
        ("Weight", DecimalType(8, 2), False),
        ("ProductCategoryID", IntegerType(), False),
        ("SellStartDate", TimestampType(), False),
        ("SellEndDate", TimestampType(), False),
        ("DiscontinuedDate", TimestampType(), False),
        ("rowguid", StringType(), False),
        ("ModifiedDate", TimestampType(), False),
        ("bronze_ingestion_timestamp", TimestampType(), False),
        ("silves_transformed_timestamp", TimestampType(), False),
    )
])

In [0]:
# Apply the silver cleaning function to the (already deduplicated) bronze data.
silver_df = silver_clean_salesproduct(df)

In [0]:
# Re-run the duplicate check on Name, this time against the cleaned silver
# DataFrame, to confirm the deduplication step worked.

checkduplicates(silver_df, "Name" )


In [0]:
# Validate the silver DataFrame against expected_schema.
# NOTE(review): _validate_schema is defined outside this notebook; what it
# compares (names, types, nullability) and how it reports mismatches is not
# visible here - confirm.
_validate_schema(silver_df, expected_schema)

In [0]:
# Compare the lengths of the deduplicated source DataFrame and the silver
# DataFrame (presumably row counts - confirm against the compare_lengths helper).

compare_lengths(df, silver_df)

**IMPORTANT: Because this is a simulated project, the upsert operation is executed within this notebook. In a production environment, a dedicated notebook containing only the function and its validations would be developed, and all function notebooks would be orchestrated by Azure Data Factory (ADF) pipelines or Azure Databricks (ADB) workflows. The upsert method may also vary depending on whether Auto Loader, streaming, or Change Data Feed (CDF) is used.**

In [0]:
 #Loading into the Silver Layer   

# Destination of the upsert, from the outermost name (catalog) to the table.
catalog = "adlslmcompany_silver"
schema = "managed_silver"
target_table = "saleslt_product"

# Key column(s) passed to the upsert helper.
primary_keys = ["ProductID"]

# NOTE(review): _upsert_silver_table is defined outside this notebook; the
# arguments are passed positionally in the same order as before.
_upsert_silver_table(silver_df, target_table, primary_keys, schema, catalog)