# BUILDING CUSTOMER DIMENSION TABLE

# Import libraries 

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import trim, col
from pyspark.sql.types import StringType
from pyspark.sql.window import Window


# Mapping/rules 

In [0]:
# Bronze (source-system) column name -> business-friendly column name.
# Pairs are listed in the order the renames are applied.
_RENAME_PAIRS = (
    ("cst_id", "customer_id"),
    ("cst_key", "customer_key"),
    ("cst_firstname", "first_name"),
    ("cst_lastname", "last_name"),
    ("cst_marital_status", "marital_status"),
    ("cst_gndr", "gender"),
    ("cst_create_date", "create_date"),
)
RENAME_MAP = dict(_RENAME_PAIRS)

# Reading from Bronze

In [0]:
# Load the raw CRM customer table from the bronze layer.
df = spark.read.table("workspace.bronze.crm_cust_info")

# Data Transformations



## Renaming the columns

In [0]:
# Walk the mapping and rename one bronze column per iteration.
for source_col in RENAME_MAP:
    target_col = RENAME_MAP[source_col]
    df = df.withColumnRenamed(source_col, target_col)

## Trim whitespace for every string column

In [0]:
# Trim leading and trailing spaces from every string-typed column in ONE projection.
# Calling withColumn once per column inside a loop adds a projection per call and
# can produce very large query plans on wide tables; a single select avoids that
# while keeping column names and column order unchanged.
df = df.select(
    *[
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)  # non-string columns pass through untouched
        for field in df.schema.fields
    ]
)

## Normalization: standardize coded columns into friendly values

In [0]:
# Upper-case the raw codes once so the comparisons below are case-insensitive.
marital_code = F.upper(F.col("marital_status"))
gender_code = F.upper(F.col("gender"))

# S -> Single, M -> Married; any other value (including NULL) becomes NULL.
marital_label = (
    F.when(marital_code == "S", "Single")
    .when(marital_code == "M", "Married")
    .otherwise(None)
)

# M -> Male, F -> Female; any other value (including NULL) becomes NULL.
gender_label = (
    F.when(gender_code == "M", "Male")
    .when(gender_code == "F", "Female")
    .otherwise(None)
)

df = df.withColumn("marital_status", marital_label).withColumn("gender", gender_label)

# Focus on the business key (customer_key): check it for duplicates and nulls. If duplicates exist, score each row by how many of its important columns are populated and keep the most complete row per key; when several rows are equally complete, keep the one with the latest create_date.

## Create a "completeness score": how many important columns are NOT null

In [0]:


# One point for each descriptive column that is not NULL (0 to 4 in total).
# NOTE(review): only NULL is tested, so an empty string still earns a point.
first_name_point = F.when(F.col("first_name").isNotNull(), 1).otherwise(0)
last_name_point = F.when(F.col("last_name").isNotNull(), 1).otherwise(0)
marital_status_point = F.when(F.col("marital_status").isNotNull(), 1).otherwise(0)
gender_point = F.when(F.col("gender").isNotNull(), 1).otherwise(0)

df_scored = df.withColumn(
    "completeness_score",
    first_name_point + last_name_point + marital_status_point + gender_point,
)



## Decide which record is the best. Rank rows per customer_key: best completeness first, then newest date


In [0]:
# Ranking priority inside each customer_key group:
#   1. highest completeness_score
#   2. most recent create_date (NULL dates last)
#   3. highest customer_id (NULL ids last), used as the final tie-breaker
# NOTE(review): rows whose customer_key is NULL all fall into one partition,
# so only one of them is ranked 1. Confirm that is the intended handling.
ranking_order = [
    F.desc("completeness_score"),
    F.desc_nulls_last("create_date"),
    F.desc_nulls_last("customer_id"),
]
w = Window.partitionBy("customer_key").orderBy(*ranking_order)

# rn == 1 marks the preferred row of each customer_key group.
df_ranked = df_scored.withColumn("rn", F.row_number().over(w))

##  Keep only the best row per customer_key (remove duplicates)

In [0]:
# Keep the top-ranked row of every customer_key, then discard the helper columns.
best_rows = df_ranked.where(F.col("rn") == 1)
df_deduped = best_rows.drop("rn", "completeness_score")

## Check customer_key for duplicates 

In [0]:
# Sanity check: list every non-null, non-blank customer_key that still occurs
# more than once after deduplication, most frequent first.
key_col = F.col("customer_key")
has_usable_key = key_col.isNotNull() & (F.trim(key_col) != "")

key_counts = df_deduped.where(has_usable_key).groupBy("customer_key").count()
dups_after_df = key_counts.where(F.col("count") > 1).orderBy(F.desc("count"))

display(dups_after_df)


# Write Into Silver Table

In [0]:
# Persist the deduplicated frame as the silver Delta table, replacing any
# previous contents. The source must be df_deduped: writing df would store the
# rows from before deduplication and throw away the ranking/filtering above.
# The table name is fully qualified with the workspace catalog so the write
# targets the same table that is read back in the check cell, regardless of
# the session's current catalog.
(
    df_deduped.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("workspace.silver.crm_customers")
)

# Check data

In [0]:
%sql
-- Return every row and column of the silver customer table for inspection.
select * from workspace.silver.crm_customers