SalesLT_SalesOrderDetail

In [0]:
# Enable Delta schema auto-merge for this Spark session by setting
# spark.databricks.delta.schema.autoMerge.enabled to true.

spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")


In [0]:
from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as F
from pyspark.sql.types import (
    DecimalType, IntegerType, StringType, TimestampType, StructType, StructField
)
from pyspark.sql.functions import col, count, desc, when, datediff, current_date

In [0]:
%run "/Workspace/Utils/Utils"

In [0]:
# Load the bronze-layer SalesOrderDetail table into a DataFrame.

df = spark.table("adlslmcompany_bronze.managed_bronze.saleslt_salesorderdetail")

In [0]:
# Render the bronze DataFrame in the notebook for visual inspection.

df.display()

In [0]:
# Check for duplicated IDs in the SalesOrderDetailID column.
# NOTE(review): checkduplicates is not defined in this notebook; presumably it
# comes from the Utils notebook loaded via %run - confirm.

checkduplicates(df, "SalesOrderDetailID" )

In [0]:
# Identify outliers in the numerical column UnitPrice using the IQR method.
# NOTE(review): iqr_outlier is not defined in this notebook; presumably it
# comes from the Utils notebook loaded via %run - confirm.
iqr_outlier(df, "UnitPrice")

In this context, although we observe some outliers, these values are not input errors; they correspond to products with a high unit price.
Therefore, for the silver layer, they should not be removed.

In [0]:
# Identify outliers in the numerical column UnitPriceDiscount using the IQR method.
iqr_outlier(df, "UnitPriceDiscount" ) 

In this context, although we observe some outliers, these values do not represent input errors; rather, they pertain to standard discounts. Therefore, for the silver layer, these should not be removed.

In [0]:
# Identify outliers in the numerical column LineTotal using the IQR method.
iqr_outlier(df, "LineTotal") 

In this context, although we observe some outliers, these values do not represent input errors; rather, they pertain to expensive products or large orders. Therefore, for the silver layer, these should not be removed.

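In this case, no duplicated values were identified and the observed outliers are legitimate values rather than input errors; therefore, no rows are removed. The silver transformation only casts each column to its expected data type and adds the silver timestamp column.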

In [0]:
def silver_clean_salesorderdetail(df):
    """Build the silver-layer projection of the SalesOrderDetail DataFrame.

    Adds a transformation timestamp column populated with the current
    timestamp, then selects a fixed list of columns, casting each one to an
    explicit Spark type. Columns not listed are dropped by the select.

    Args:
        df: DataFrame containing the source columns named below, including
            ``bronze_ingestion_timestamp``.

    Returns:
        A DataFrame with exactly the listed columns, in the listed order.
    """
    # Output columns in order, each paired with the type it is cast to.
    # NOTE: the "silves_transformed_timestamp" spelling is kept as-is because
    # it is the column name this notebook's expected schema uses.
    column_types = (
        ("SalesOrderID", IntegerType()),
        ("SalesOrderDetailID", IntegerType()),
        ("OrderQty", IntegerType()),
        ("ProductID", IntegerType()),
        ("UnitPrice", DecimalType(19, 4)),
        ("UnitPriceDiscount", DecimalType(19, 4)),
        ("LineTotal", DecimalType(38, 6)),
        ("rowguid", StringType()),
        ("ModifiedDate", TimestampType()),
        ("bronze_ingestion_timestamp", TimestampType()),
        ("silves_transformed_timestamp", TimestampType()),
    )

    # Stamp the rows first so the timestamp column exists for the projection.
    stamped = df.withColumn(
        "silves_transformed_timestamp", F.current_timestamp()
    )

    # Cast every column and keep its original name.
    projection = [
        F.col(name).cast(dtype).alias(name) for name, dtype in column_types
    ]
    return stamped.select(*projection)

In [0]:
# Expected schema of the silver DataFrame: every field is declared
# non-nullable, and names/types follow the casts applied by
# silver_clean_salesorderdetail.
expected_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("SalesOrderID", IntegerType()),
        ("SalesOrderDetailID", IntegerType()),
        ("OrderQty", IntegerType()),
        ("ProductID", IntegerType()),
        ("UnitPrice", DecimalType(19, 4)),
        ("UnitPriceDiscount", DecimalType(19, 4)),
        ("LineTotal", DecimalType(38, 6)),
        ("rowguid", StringType()),
        ("ModifiedDate", TimestampType()),
        ("bronze_ingestion_timestamp", TimestampType()),
        ("silves_transformed_timestamp", TimestampType()),
    )
])

In [0]:
# Apply the silver cleaning function to the bronze DataFrame.
silver_df = silver_clean_salesorderdetail(df)

In [0]:
# Comparing lengths of the bronze and silver DataFrames.
# NOTE(review): compare_lengths is not defined in this notebook; presumably it
# comes from the Utils notebook and compares row counts - confirm.

compare_lengths(df, silver_df)

**IMPORTANT: Please note that this is a simulated project; the upsert operation will be executed within this notebook. In a production environment, a dedicated notebook containing only the function and validations would be developed. All function notebooks would be orchestrated by Azure Data Factory (ADF) pipelines or Azure Databricks (ADB) workflows. The method of upsert may vary based on the utilization of auto loader, streaming, or Change Data Feed (CDF).**

In [0]:
# Loading into the Silver layer: upsert the transformed DataFrame into
# adlslmcompany_silver.managed_silver.sales_orderdetail.

target_table = "sales_orderdetail"

schema = "managed_silver"

catalog = "adlslmcompany_silver"

# The merge key must identify exactly one row. SalesOrderID alone is shared by
# every line item of an order, and this notebook only validates uniqueness on
# SalesOrderDetailID, so the detail ID is part of the key. Keying on
# SalesOrderID alone would match several source rows to one target row.
# NOTE(review): assumes _upsert_silver_table accepts multiple key columns, as
# its list-typed primary_keys argument suggests - confirm in Utils.
primary_keys = ["SalesOrderID", "SalesOrderDetailID"]


_upsert_silver_table(silver_df, target_table, primary_keys, schema, catalog)