In [0]:

import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col
     

# Read table from bronze

In [0]:
# Load the bronze table `workspace.bronze.cust_erp` as the starting DataFrame.
df = spark.read.table("workspace.bronze.cust_erp")

# Data cleaning

## Trimming

In [0]:
# Trim leading/trailing whitespace from every string column in ONE projection.
# Calling withColumn() in a loop adds a projection per column, which the
# PySpark documentation warns can create very large query plans; a single
# select() produces the same columns, in the same order, in one step.
# Non-string columns are passed through unchanged.
df = df.select(
    [
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)
        for field in df.schema.fields
    ]
)


## Clean up customer ID

In [0]:

# Strip a leading "NAS" prefix from the customer ID; values without the prefix
# (and NULLs) are left unchanged.
# An anchored regexp_replace is used rather than
# F.substring(col, 4, F.length(col)) because substring() only accepts a
# Column for its length argument on newer PySpark releases, whereas
# regexp_replace with plain string pattern/replacement works everywhere.
# "^" matches only at the very start of the value, so at most one prefix is removed.
df = df.withColumn("cid", F.regexp_replace(col("cid"), "^NAS", ""))


## Check the quality of birth dates (bdate)


In [0]:

# Null out birth dates that lie after today's date; every other value
# (including an existing NULL) is kept as-is.
future_birth_date = col("bdate") > F.current_date()
df = df.withColumn(
    "bdate",
    F.when(future_birth_date, F.lit(None)).otherwise(col("bdate")),
)

## Standardize gender

In [0]:
# Map the gender codes to canonical labels, comparing case-insensitively:
#   F / FEMALE -> "Female",  M / MALE -> "Male",  anything else (incl. NULL) -> "n/a"
gender_code = F.upper(col("gen"))
gender_label = (
    F.when(gender_code.isin("F", "FEMALE"), "Female")
    .when(gender_code.isin("M", "MALE"), "Male")
    .otherwise("n/a")
)
df = df.withColumn("gen", gender_label)

## Rename columns

In [0]:

# Source column name -> name used in the output table.
rename_map = {
    "cid": "customer_number",
    "bdate": "birth_date",
    "gen": "gender",
}

# Rename in one pass: columns listed in rename_map get their new name,
# all other columns keep their current name and position.
df = df.toDF(*[rename_map.get(name, name) for name in df.columns])

In [0]:
# Save the cleaned DataFrame as the Delta table `workspace.silver.erpcustomers`,
# overwriting whatever the table held before.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.erpcustomers")
)

In [0]:
%sql
-- Preview the contents of the silver customer table.
SELECT
  *
FROM
  workspace.silver.erpcustomers

customer_number,birth_date,gender
AW00011000,1971-10-06,Male
AW00011001,1976-05-10,Male
AW00011002,1971-02-09,Male
AW00011003,1973-08-14,Female
AW00011004,1979-08-05,Female
AW00011005,1976-08-01,Male
AW00011006,1976-12-02,Female
AW00011007,1969-11-06,Male
AW00011008,1975-07-04,Female
AW00011009,1969-09-29,Male
