## 1. Import needed packages & check data for transformations

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [0]:
# Read the bronze-layer CRM customer table into a DataFrame.
cust_crm_df = spark.read.table("workspace.bronze.cust_info")
# Print the first rows (show() defaults to 20) for a quick visual check.
cust_crm_df.show()

In [0]:
# Print the column names and data types of the loaded table.
cust_crm_df.printSchema()

Transformations:
- trim spaces of both firstname & lastname columns
- remove customers with null customer ids
- drop duplicate rows that share the same cst_id and cst_key
- replace abbreviations with full words in both cst_marital_status and cst_gndr
- rename columns

In [0]:
#trimming string columns
# Collect the names of every StringType column from the DataFrame schema.
string_cols = [field.name for field in cust_crm_df.schema.fields if isinstance(field.dataType, StringType)]
# Trim all string columns in a single projection instead of calling
# withColumn once per column in a loop: each withColumn call adds another
# projection and can produce very large query plans on wide tables.
# Non-string columns pass through unchanged and the column order is kept.
cust_crm_df = cust_crm_df.select(
    [
        F.trim(F.col(name)).alias(name) if name in string_cols else F.col(name)
        for name in cust_crm_df.columns
    ]
)

In [0]:
#remove customers with no ids
# Keep only the rows whose cst_id is not NULL.
has_customer_id = F.col("cst_id").isNotNull()
cust_crm_df = cust_crm_df.where(has_customer_id)

In [0]:
# Deduplicate on the (cst_id, cst_key) pair.
# NOTE(review): no ordering is applied before deduplicating, so which of the
# duplicate rows survives is not controlled here — confirm that is acceptable.
dedup_keys = ["cst_id", "cst_key"]
cust_crm_df = cust_crm_df.drop_duplicates(dedup_keys)

In [0]:
#replace abbreviations
# Lower-cased values so the comparisons below are case-insensitive.
marital_code = F.lower(F.col("cst_marital_status"))
gender_code = F.lower(F.col("cst_gndr"))

# Besides the one-letter abbreviations, the already-expanded words are accepted.
# This keeps the cell idempotent: re-running it on its own output leaves
# "Married"/"Single"/"Male"/"Female" intact instead of turning them into "Unknown".
# NULLs and every other value still fall through to "Unknown".
cust_crm_df = (
    cust_crm_df.withColumn(
        "cst_marital_status",
        F.when(marital_code.isin("m", "married"), "Married")
         .when(marital_code.isin("s", "single"), "Single")
         .otherwise("Unknown")
    )
    .withColumn(
        "cst_gndr",
        F.when(gender_code.isin("m", "male"), "Male")
         .when(gender_code.isin("f", "female"), "Female")
         .otherwise("Unknown")
    )
)

In [0]:
#rename columns
# Maps each source column name (cst_* prefix) to the name it is renamed to.
_RENAME_MAP = dict(
    cst_id="customer_id",
    cst_key="customer_number",
    cst_firstname="first_name",
    cst_lastname="last_name",
    cst_marital_status="marital_status",
    cst_gndr="gender",
    cst_create_date="creation_date",
)

# Apply the renames one at a time, in the order the mapping declares them.
for source_name in _RENAME_MAP:
    cust_crm_df = cust_crm_df.withColumnRenamed(source_name, _RENAME_MAP[source_name])

## 2. Write Transformed DF to delta table

In [0]:
# Persist the transformed DataFrame as a Delta table, overwriting the existing table data.
# NOTE(review): the source table is read with a three-part name (workspace.bronze.…)
# while this target uses a two-part name, so it resolves against the session's
# current catalog — confirm that is intended.
(
    cust_crm_df.write
    .format("delta")
    .mode("overWrite")
    .saveAsTable("silver.crm_cust_info")
)