In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [0]:
# Load the bronze-layer source table that the cells below transform.
df = spark.read.table("workspace.bronze.cust_az12")

In [0]:
# Preview the first 20 rows (long cell values are truncated) as a quick sanity check.
df.show(n=20, truncate=True)

In [0]:
# Print the column names and data types of the loaded table.
df.printSchema()

In [0]:
%sql
-- List the distinct raw GEN values present in the bronze table.
SELECT DISTINCT
  GEN
FROM workspace.bronze.cust_az12

Transformations:
- Remove the "NAS" prefix from the CID column
- Set BDATE to null when the birthdate exceeds the current date
- Trim spaces and expand abbreviations in the GEN column (unrecognized values become "Unknown")

In [0]:
# Remove the leading "NAS" prefix from CID.
# Values that do not start with "NAS" (and nulls) pass through unchanged.
# An anchored regexp_replace is used instead of F.substring(col, 4, F.length(col)):
# PySpark releases before 4.0 only accept plain ints for substring's pos/len,
# so passing a Column as the length raises an error there.
df = df.withColumn("CID", F.regexp_replace(F.col("CID"), "^NAS", ""))

In [0]:
# Birthdate sanity check: null out any BDATE that lies after the current date.
# The comparison is strictly greater-than so that a birthdate equal to today
# (which does not exceed the current date) is kept. Null BDATE values stay null.
df = df.withColumn(
    "BDATE",
    F.when(F.col("BDATE") > F.current_date(), None).otherwise(F.col("BDATE")),
)

In [0]:
# Normalise GEN: compare on the trimmed, upper-cased value and map it to a full label.
# Anything that is not a recognised code (including null) becomes "Unknown".
_gen_code = F.upper(F.trim(F.col("GEN")))
_gen_label = (
    F.when(_gen_code.isin("F", "FEMALE"), "Female")
    .when(_gen_code.isin("M", "MALE"), "Male")
    .otherwise("Unknown")
)
df = df.withColumn("GEN", _gen_label)

In [0]:
# Source column name -> target column name for the output table.
# Keys use the same upper-case spelling as the transformation cells (CID, BDATE, GEN):
# withColumnRenamed silently does nothing when the source name is not found, so a
# case mismatch would go unnoticed if spark.sql.caseSensitive were enabled.
_RENAME_MAP = {
    "CID": "customer_number",
    "BDATE": "birth_date",
    "GEN": "gender",
}
for old_name, new_name in _RENAME_MAP.items():
    df = df.withColumnRenamed(old_name, new_name)

In [0]:
# Save the transformed DataFrame as the Delta table workspace.silver.erp_cust_az12,
# overwriting whatever the table currently holds.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.erp_cust_az12")
)