## 1. Import needed packages

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [0]:
# Load the raw sales-details table from the bronze layer.
df = spark.read.table("workspace.bronze.sales_details")

In [0]:
# Preview the first rows; these are show()'s default arguments, spelled out.
df.show(n=20, truncate=True)

In [0]:
# Print the column names and data types so the casts below can be checked against them.
df.printSchema()

Transformations:
- convert the date columns from integer to the date datatype
- check that sls_sales and sls_price are consistent with sls_quantity (sls_price * sls_quantity = sls_sales)

In [0]:
# Cast the monetary columns to double before doing arithmetic on them.
for amount_col in ("sls_price", "sls_sales"):
    df = df.withColumn(amount_col, F.col(amount_col).cast("double"))

In [0]:
# Strip leading and trailing whitespace from every string-typed column.
string_cols = []
for field in df.schema:
    if isinstance(field.dataType, StringType):
        string_cols.append(field.name)

for col_name in string_cols:
    trimmed = F.trim(F.col(col_name))
    df = df.withColumn(col_name, trimmed)

In [0]:
# Convert the yyyyMMdd-encoded date columns to real dates.
# A value of 0, or one that is not exactly 8 characters long, becomes null.
for date_col in ("sls_order_dt", "sls_ship_dt", "sls_due_dt"):
    raw_value = F.col(date_col)
    is_invalid = (raw_value == 0) | (F.length(raw_value) != 8)
    parsed_date = F.to_date(raw_value.cast("string"), "yyyyMMdd")
    df = df.withColumn(date_col, F.when(is_invalid, None).otherwise(parsed_date))

In [0]:
# Reconcile price and sales so that sls_price * sls_quantity = sls_sales.

quantity = F.col("sls_quantity")

# Step 1: a missing or non-positive price is re-derived as sales / quantity.
# The guard is on the divisor (sls_quantity), not on the price being replaced:
# a null or zero quantity gives a null price instead of dividing by zero.
price_is_unusable = F.col("sls_price").isNull() | (F.col("sls_price") <= 0)
derived_price = F.when(quantity != 0, F.col("sls_sales") / quantity).otherwise(None)

df = df.withColumn(
    "sls_price",
    F.when(price_is_unusable, derived_price).otherwise(F.col("sls_price")),
)

# Step 2: sales is recomputed from the (already repaired) price whenever it
# disagrees with price * quantity. Recomputing is only possible when both
# factors are present and non-zero; otherwise the existing value is kept.
expected_sales = F.col("sls_price") * quantity
can_recompute = (
    F.col("sls_price").isNotNull()
    & (F.col("sls_price") != 0)
    & quantity.isNotNull()
    & (quantity != 0)
)
# eqNullSafe never returns NULL, so a null sls_sales counts as a mismatch
# (a plain != would evaluate to NULL and leave the null sales untouched).
sales_mismatch = ~F.col("sls_sales").eqNullSafe(expected_sales)

df = df.withColumn(
    "sls_sales",
    F.when(can_recompute & sales_mismatch, expected_sales).otherwise(F.col("sls_sales")),
)

In [0]:
# Rename the source-system (sls_*) columns to the names used in the silver table.

_RENAME_MAP = {
    "sls_ord_num": "order_number",
    "sls_prd_key": "product_number",
    "sls_cust_id": "customer_id",
    "sls_order_dt": "order_date",
    "sls_ship_dt": "ship_date",
    "sls_due_dt": "due_date",
    "sls_sales": "sales_amount",
    "sls_quantity": "quantity",
    "sls_price": "price",
}

# withColumnRenamed is a no-op for a name that is not present, so the map can
# list columns defensively.
for source_name in _RENAME_MAP:
    target_name = _RENAME_MAP[source_name]
    df = df.withColumnRenamed(source_name, target_name)

## 2. Write to silver table

In [0]:
# Persist the cleaned frame as a Delta table, overwriting any existing contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.crm_sales_details")
)