# Import libraries and modules

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import trim, col
from pyspark.sql.types import StringType



# Rename map


In [0]:
# Maps each source column name (sls_* prefix) to the name used downstream.
RENAME_MAP = {
    # Identifiers
    "sls_ord_num": "order_number",
    "sls_prd_key": "product_key",
    "sls_cust_id": "customer_id",
    # Dates
    "sls_order_dt": "order_date",
    "sls_ship_dt": "ship_date",
    "sls_due_dt": "due_date",
    # Measures
    "sls_sales": "sales",
    "sls_quantity": "quantity",
    "sls_price": "price",
}

# Read data from the Bronze table and check the schema

In [0]:
# Load the bronze CRM sales-details table; `spark` is the session object
# supplied by the notebook runtime (it is not created in this file).
df = spark.table("workspace.bronze.crm_sales_details")
# Print the column names and types as loaded, before any transformation.
df.printSchema()


#Renaming columns


In [0]:
def rename_columns(df, RENAME_MAP: dict):
    """Return `df` with columns renamed according to `RENAME_MAP`.

    Args:
        df: Object exposing ``withColumnRenamed(old, new)`` that returns a
            new frame (a Spark DataFrame in this notebook).
        RENAME_MAP: Mapping of existing column name -> new column name.
            Entries are applied one at a time, in the mapping's iteration order.

    Returns:
        The frame produced after applying every rename; `df` itself when the
        mapping is empty.
    """
    renamed = df
    for source_name in RENAME_MAP:
        renamed = renamed.withColumnRenamed(source_name, RENAME_MAP[source_name])
    return renamed
# Apply the rename mapping to the loaded frame, rebinding `df` to the result.
df = rename_columns(df, RENAME_MAP)
# Print the schema again so the new column names can be checked.
df.printSchema()

# Trim string columns

In [0]:
def trim_string_columns(df):
    """Return `df` with spaces trimmed from both ends of every string column.

    Only top-level fields whose type is ``StringType`` are trimmed; all other
    columns pass through unchanged. Column order and names are preserved.

    The result is built with a single ``select`` rather than one
    ``withColumn`` call per column: each ``withColumn`` adds its own
    projection to the query plan, which PySpark's documentation warns can
    produce very large plans when done in a loop.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new DataFrame with the same columns, string columns trimmed.
    """
    projection = []
    for field in df.schema.fields:
        # Backtick-quote the name so a column containing dots or spaces is
        # resolved as one column instead of being parsed as a nested path.
        quoted_name = "`" + field.name.replace("`", "``") + "`"
        column = col(quoted_name)
        if isinstance(field.dataType, StringType):
            # trim() yields an unnamed expression; restore the original name.
            column = trim(column).alias(field.name)
        projection.append(column)
    return df.select(*projection)
# Trim every string column of the renamed frame, rebinding `df` to the result.
df = trim_string_columns(df)
# Show the trimmed data in the notebook for a visual check.
df.display()
# Optional schema check, left disabled:
#df.printSchema()

# CHECK. We want to understand how bad the data is before fixing.
## This code loops through date columns, counts nulls, zeros, and wrong-length values.
## Then shows all those counts in one row to spot data problems.


In [0]:


# Date columns to profile before conversion. The later cells parse them with
# the "yyyyMMdd" pattern, so a usable value is expected to be 8 characters
# long once cast to string.
date_cols = ["order_date", "ship_date", "due_date"]

# Three aggregate expressions per date column. Each one counts the rows that
# match a condition by summing the boolean condition cast to 0/1.
checks = []
for c in date_cols:
    value = F.col(c)
    checks.extend([
        # Rows where the value is missing.
        F.sum(value.isNull().cast("int")).alias(f"{c}_nulls"),
        # Rows where the value is the placeholder 0.
        F.sum((value == 0).cast("int")).alias(f"{c}_zeros"),
        # Rows with a non-null, non-zero value that is not 8 characters long.
        F.sum(
            (
                value.isNotNull()
                & (value != 0)
                & (F.length(value.cast("string")) != 8)
            ).cast("int")
        ).alias(f"{c}_wrong_len"),
    ])

# Evaluate all checks in one aggregation and show the single-row result.
# Previously the expressions were built but never run.
df.agg(*checks).display()




# CHECK. Use copy of df to check if data looks good after the conversion

In [0]:
# Preview frame: keep the original columns and append a parsed *_dt column for
# each date column so raw and converted values can be compared side by side.
df2 = df
for _raw_name, _parsed_name in (
    ("order_date", "order_date_dt"),
    ("ship_date", "ship_date_dt"),
    ("due_date", "due_date_dt"),
):
    _as_text = F.col(_raw_name).cast("string")
    df2 = df2.withColumn(_parsed_name, F.to_date(_as_text, "yyyyMMdd"))
df2.display()
#df2.printSchema()

#Convert int to date

In [0]:



def convert_int_to_date(col_name: str):
    """Build a Column expression that parses `col_name` as a yyyyMMdd date.

    The expression evaluates to:
      * NULL when the value is NULL or equal to 0;
      * the date parsed with pattern "yyyyMMdd" when the value, cast to
        string, is exactly 8 characters long;
      * NULL in every other case.

    Args:
        col_name: Name of the column to convert.

    Returns:
        A Spark Column expression; nothing is evaluated until it is used in
        a DataFrame operation.
    """
    raw = F.col(col_name)
    as_text = raw.cast("string")

    is_missing = raw.isNull() | (raw == 0)
    has_eight_chars = F.length(as_text) == 8

    return (
        F.when(is_missing, None)
        .when(has_eight_chars, F.to_date(as_text, "yyyyMMdd"))
        .otherwise(None)
    )

#df.withColumn("test_date", convert_int_to_date("order_date")).show(5)


## Overwrite the columns

In [0]:
# Replace each raw date column, in place, with its parsed date equivalent.
df = (
    df
    .withColumn("order_date", convert_int_to_date("order_date"))
    .withColumn("ship_date", convert_int_to_date("ship_date"))
    .withColumn("due_date", convert_int_to_date("due_date"))
)

In [0]:
# Print the schema to confirm the three date columns now have a date type.
df.printSchema()                          # see date type
# Show the first 5 rows of the converted columns without truncating values.
df.select("order_date", "ship_date", "due_date").show(5, truncate=False)

# Write to Silver Table

In [0]:
# Persist the cleaned frame as a Delta table, replacing any existing data.
# The table name is fully qualified with the `workspace` catalog, matching the
# Bronze read and the verification queries below, so the write no longer
# depends on whichever catalog is current in the session.
(
    df.write
    .format("delta")
    .mode("overwrite")
    # Allow the stored schema to be replaced, since the date columns changed type.
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.silver.crm_sales_details")
)

In [0]:
%sql
-- Sanity check on the Silver table: total row count, plus how many rows have
-- a non-NULL order_date and how many have a NULL order_date.
SELECT
  COUNT(*) AS total_rows,
  SUM(CASE WHEN order_date IS NOT NULL THEN 1 ELSE 0 END) AS valid_dates,
  SUM(CASE WHEN order_date IS NULL THEN 1 ELSE 0 END) AS null_dates
FROM workspace.silver.crm_sales_details;



In [0]:
%sql
-- List every Silver row that has no customer_id.
SELECT *
FROM workspace.silver.crm_sales_details
WHERE customer_id IS NULL;