# Silver transformation

# Import modules

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import trim, col
from pyspark.sql.types import StringType
from pyspark.sql import DataFrame


# Column renames applied to the ERP customer table: old name -> new name.
RENAME_MAP = dict(
    cid="customer_id",
    bdate="birth_date",
    gen="gender",
)

# Read from Bronze layer

In [0]:
# Load the Bronze table into `df` (the working DataFrame for the rest of the
# notebook) and render it in the notebook output.
df=spark.table("workspace.bronze.erp_cust_az12")
display(df)

# Rename columns and normalization

In [0]:
def rename_columns(df: DataFrame, rename_map: dict) -> DataFrame:
    """Return ``df`` with columns renamed according to ``rename_map``.

    Args:
        df: DataFrame whose columns should be renamed.
        rename_map: Mapping of existing column name -> new column name.
            Renames are applied one at a time, in the mapping's iteration
            order.

    Returns:
        The DataFrame produced by chaining one ``withColumnRenamed`` call
        per mapping entry; ``df`` itself when the mapping is empty.
    """
    renamed = df
    for current_name in rename_map:
        renamed = renamed.withColumnRenamed(current_name, rename_map[current_name])
    return renamed


In [0]:
# Render `df` again; nothing has been applied to it since the Bronze read
# (rename_columns above is only defined, not called yet).
display(df)

In [0]:

# Apply the Bronze -> Silver column renames.
# RENAME_MAP (defined with the imports) and rename_columns() already hold the
# mapping and the loop, so reuse them instead of keeping a second copy here
# that could drift out of sync.
df = rename_columns(df, RENAME_MAP)

# Trimming columns

In [0]:
def trim_string_columns(df):
    """Return ``df`` with every string-typed column trimmed.

    Columns whose schema type is ``StringType`` are replaced in place (same
    name) by ``F.trim`` of their value; all other columns are left untouched.
    """
    # Decide which columns to touch from the incoming schema, up front.
    string_column_names = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, StringType)
    ]
    for name in string_column_names:
        df = df.withColumn(name, F.trim(F.col(name)))
    return df

In [0]:
def standardize_gender(df: DataFrame, col_name: str = "gender") -> DataFrame:
    """Normalize a gender column to ``"Female"`` / ``"Male"`` / null.

    The value is trimmed and upper-cased before matching, so the comparison
    ignores case and surrounding whitespace.

    Args:
        df: Input DataFrame.
        col_name: Name of the column to standardize (overwritten in place).

    Returns:
        A DataFrame where ``col_name`` is ``"Female"`` for F/FEMALE,
        ``"Male"`` for M/MALE, and null for anything else.
    """
    normalized = F.upper(F.trim(F.col(col_name)))
    is_female = normalized.isin("F", "FEMALE")
    is_male = normalized.isin("M", "MALE")

    canonical = (
        F.when(is_female, F.lit("Female"))
        .when(is_male, F.lit("Male"))
        .otherwise(F.lit(None))
    )
    return df.withColumn(col_name, canonical)

# Check how gender validation worked

In [0]:
# Trim all string columns first so padded values do not form separate groups.
df = trim_string_columns(df)

# For each distinct gender value show its row count, character length and hex
# encoding; length/hex make stray whitespace or odd characters visible.
(
    df.groupBy("gender")
    .agg(
        F.count("*").alias("cnt"),
        F.length("gender").alias("len"),
        F.hex("gender").alias("hex"),
    )
    .orderBy(F.col("cnt").desc())
    .show(truncate=False)
)

In [0]:
# Render the current state of `df` in the notebook output.
display(df)

# Check data types

In [0]:
# Print column names and data types of `df` to verify the expected types.
df.printSchema()

# Primary key uniqueness

In [0]:
%sql
-- Primary-key check: `rows` equals `distinct_customers` only when every row
-- has a distinct, non-null customer_id (COUNT(DISTINCT ...) skips NULLs).
-- NOTE: this reads the persisted table silver.erp_customers, not the
-- in-memory `df` being transformed in this notebook.
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT customer_id) AS distinct_customers
FROM silver.erp_customers;


# Duplicates

In [0]:
%sql
-- List every customer_id that occurs more than once, with its occurrence count.
-- An empty result means there are no duplicate keys.
-- NOTE: reads the persisted table silver.erp_customers, not the in-memory `df`.
SELECT customer_id, COUNT(*) cnt
FROM silver.erp_customers
GROUP BY customer_id
HAVING COUNT(*) > 1;

# Null / missing values

In [0]:
%sql
-- Count NULLs per column (one output column per checked field).
-- NOTE: reads the persisted table silver.erp_customers, not the in-memory `df`.
SELECT
  SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END) AS null_customer_id,
  SUM(CASE WHEN birth_date IS NULL THEN 1 ELSE 0 END) AS null_birth_date,
  SUM(CASE WHEN gender IS NULL THEN 1 ELSE 0 END) AS null_gender
FROM silver.erp_customers;


# Domain and standardization (gender): only allowed values, in a consistent format.

In [0]:


# Restrict gender to the allowed domain: "Female", "Male", or null.
# The previous inline expression referenced `g`, which only exists as a local
# variable inside standardize_gender(), so this cell raised NameError.
# Calling the helper applies the same F/FEMALE -> Female, M/MALE -> Male
# mapping (after trim + upper-case) and sets every other value to null.
df = standardize_gender(df)



In [0]:


# Re-trim string columns, then re-profile gender to confirm that only the
# expected values remain after standardization.
df = trim_string_columns(df)

# Row count, character length and hex encoding per distinct gender value.
(
    df.groupBy("gender")
    .agg(
        F.count("*").alias("cnt"),
        F.length("gender").alias("len"),
        F.hex("gender").alias("hex"),
    )
    .orderBy(F.col("cnt").desc())
    .show(truncate=False)
)


In [0]:
%sql
-- Distribution of gender values, most frequent first.
-- NOTE: reads the persisted table silver.erp_customers, not the in-memory `df`.
SELECT gender, COUNT(*) cnt
FROM silver.erp_customers
GROUP BY gender
ORDER BY cnt DESC;

In [0]:
# Drop a leading "NAS" prefix from customer_id; values that do not start with
# "NAS" (and nulls) are left unchanged.
# An anchored regexp_replace is used instead of F.substring(col, 4, F.length(col)):
# per the PySpark docs, F.substring only accepts Column arguments for pos/len
# from version 4.0, so the previous form fails on older runtimes. The "^"
# anchor keeps the match case-sensitive and limited to the start of the value,
# matching the old startswith("NAS") condition.
df = df.withColumn(
    "customer_id",
    F.regexp_replace(F.col("customer_id"), r"^NAS", ""),
)

In [0]:
# Render `df` one more time before it is written out.
display(df)

# Write to Silver layer

In [0]:
# Persist `df` as the Delta table silver.erp_customers, replacing both the
# existing rows (mode "overwrite") and the existing schema (overwriteSchema).
# NOTE(review): the Bronze read uses a catalog-qualified name
# ("workspace.bronze....") while this target has no catalog prefix, so it
# resolves against the session's current catalog - confirm that is intended.
(
    df.write
    .format("delta")
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .saveAsTable("silver.erp_customers")
)