In [0]:
%sql
-- Set the session defaults for the rest of the notebook: current catalog
-- `analytics`, current schema `silver`.
-- NOTE(review): later cells use two-part names (bronze.*, silver.*), which
-- presumably resolve inside the `analytics` catalog selected here -- confirm.
USE CATALOG analytics;
USE SCHEMA silver;

In [0]:
%sql
-- Check marital status values: list every distinct raw value of
-- cst_marital_status in the bronze customer table, so the codes handled by the
-- "Normalize marital status" step (S and M) can be compared with what is
-- actually present in the data.
SELECT DISTINCT cst_marital_status FROM bronze.crm_cust_info;


## Read Bronze table

In [0]:
# Source table for this notebook; every later cell transforms `df` in turn.
BRONZE_CUSTOMER_TABLE = "bronze.crm_cust_info"

df = spark.table(BRONZE_CUSTOMER_TABLE)
df.display()

## Remove duplicate customers

In [0]:
# Keep a single row per customer id.
# NOTE(review): dropDuplicates does not let us choose WHICH of the duplicate
# rows survives, and rows with a NULL cst_id are collapsed together as well.
# If "latest record wins" is required, rank by a timestamp column instead --
# confirm the intended rule and the column to order by.
dedup_keys = ["cst_id"]
df = df.dropDuplicates(dedup_keys)
df.display()


## Trim all string columns

In [0]:
from pyspark.sql.functions import trim, col
from pyspark.sql.types import StringType

# Strip leading/trailing whitespace from every string-typed column.
# This is done in ONE select instead of calling withColumn once per column:
# each withColumn call adds another projection to the query plan, and Spark's
# documentation recommends a single select when touching many columns.
# Column names and column order are unchanged; non-string columns pass through.
df = df.select([
    trim(col(field.name)).alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in df.schema.fields
])

df.display()


## Normalize gender values

In [0]:
from pyspark.sql.functions import col, upper, when

# Map the single-letter gender code to a readable label.
# The code is upper-cased before comparison so that "m"/"f" are recognised as
# well as "M"/"F"; a case-sensitive match would silently turn them into NULL.
# Any other value, including NULL, yields NULL in this step.
gender_code = upper(col("cst_gndr"))

df = df.withColumn(
    "cst_gndr",
    when(gender_code == "M", "Male")
    .when(gender_code == "F", "Female")
    .otherwise(None)
)
df.display()


## Normalize marital status

In [0]:
from pyspark.sql.functions import col, upper, when

# Map the single-letter marital status code to a readable label.
# The code is upper-cased before comparison so that "s"/"m" are recognised as
# well as "S"/"M"; a case-sensitive match would silently turn them into NULL.
# Any other value, including NULL, yields NULL in this step.
marital_code = upper(col("cst_marital_status"))

df = df.withColumn(
    "cst_marital_status",
    when(marital_code == "S", "Single")
    .when(marital_code == "M", "Married")
    .otherwise(None)
)
df.display()

## Handle missing categorical values

In [0]:

# Replace NULLs in the categorical columns with an explicit placeholder label.
missing_label = "Unknown"
categorical_columns = ("cst_gndr", "cst_marital_status")

df = df.fillna({column_name: missing_label for column_name in categorical_columns})
df.display()

## Rename columns to friendly names

In [0]:
# Source column name -> friendly name, applied in this order.
# NOTE(review): withColumnRenamed does nothing when the source column is not in
# the schema, so a misspelled source name here passes silently -- verify each
# key against the bronze table's actual columns.
friendly_names = {
    "cst_id": "customer_id",
    "cst_key": "customer_key",
    "cst_firstname": "first_name",
    "cst_lastname": "last_name",
    "cst_gndr": "gender",
    "cst_marital_status": "marital_status",
    "cst_created_date": "created_date",
}

for source_name, friendly_name in friendly_names.items():
    df = df.withColumnRenamed(source_name, friendly_name)

df.display()

## Write Silver table

In [0]:
# Persist the cleaned frame as a Delta table; "overwrite" replaces whatever the
# table currently holds.
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("silver.crm_customers")


## Validate Silver table

In [0]:
# Read the table back from the metastore and show it, to confirm the write landed.
silver_customers = spark.table("silver.crm_customers")
silver_customers.display()