# BUILDING CUSTOMER DIMENSION TABLE

# Import libraries 

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import trim, col
from pyspark.sql.types import StringType, IntegerType, DateType
from pyspark.sql.window import Window
import logging

# Logging configuration


In [0]:
import logging

# Root configuration: INFO and above, each record stamped with time and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Named logger used by every cell of this notebook.
logger = logging.getLogger("silver_crm_cust_info")
logger.setLevel(logging.INFO)

# Mapping/rules 

In [0]:



# Source (Bronze) column name -> Silver column name.
# Insertion order is the order in which the renames are applied.

RENAME_MAP = dict(
    cst_id="customer_id",
    cst_key="customer_key",
    cst_firstname="first_name",
    cst_lastname="last_name",
    cst_marital_status="marital_status",
    cst_gndr="gender",
    cst_create_date="create_date",
)

# Transformations / All columns arrive as string

## Trimming columns

In [0]:
def trim_string_columns(df):
    """Strip leading and trailing whitespace from every string column.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A DataFrame with the same column names in the same order. Columns whose
        schema type is StringType are trimmed; all other columns pass through
        unchanged.
    """
    logger.info("Running trim_string_columns")

    # Build one projection for the whole frame instead of calling withColumn
    # once per column: every withColumn call adds another projection to the
    # query plan, which grows needlessly on wide tables.
    projection = [
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)
        for field in df.schema.fields
    ]
    return df.select(*projection)

## Renaming columns

In [0]:
def rename_columns(df):
    """Rename source columns to their Silver names using RENAME_MAP.

    Entries whose source column is not present in the frame are skipped.
    The column list is logged before and after the renames.

    Args:
        df: Spark DataFrame carrying the source column names.

    Returns:
        The DataFrame with every matching column renamed.
    """
    logger.info("Running rename_columns")
    logger.info(f"Columns before rename: {df.columns}")

    renamed = df
    for source_name, target_name in RENAME_MAP.items():
        # Checked against the current frame on each pass, so earlier renames
        # are taken into account.
        if source_name not in renamed.columns:
            continue
        renamed = renamed.withColumnRenamed(source_name, target_name)

    logger.info(f"Columns after rename: {renamed.columns}")
    return renamed

## Casting columns

In [0]:
def cast_columns(df):
    """Cast the typed columns of the customer frame.

    ``customer_id`` is cast to IntegerType and ``create_date`` to DateType.
    The resulting schema is printed after the casts.

    Args:
        df: Spark DataFrame containing ``customer_id`` and ``create_date``.

    Returns:
        The DataFrame with both columns cast.
    """
    logger.info("Running cast_columns")

    customer_id_as_int = col("customer_id").cast(IntegerType())
    create_date_as_date = col("create_date").cast(DateType())

    typed = (
        df.withColumn("customer_id", customer_id_as_int)
          .withColumn("create_date", create_date_as_date)
    )

    logger.info("Schema after casting:")
    typed.printSchema()

    return typed

## Standardization/Normalization
## for columns: 'marital_status' AND 'gender' 

In [0]:
def standardize_categorical(df):
    """Map raw ``marital_status`` and ``gender`` codes to canonical labels.

    Matching is case-insensitive (values are upper-cased before comparison):
      * marital_status: S / SINGLE -> "Single", M / MARRIED -> "Married"
      * gender: M / MALE / 1 -> "Male", F / FEMALE / 2 -> "Female"
    Any other value, including NULL, becomes NULL.

    Args:
        df: Spark DataFrame containing ``marital_status`` and ``gender``.

    Returns:
        The DataFrame with both columns replaced by their standardized form.
    """
    logger.info("Running standardize_categorical")

    marital_upper = F.upper(F.col("marital_status"))
    gender_upper = F.upper(F.col("gender"))

    marital_label = (
        F.when(marital_upper.isin("S", "SINGLE"), "Single")
         .when(marital_upper.isin("M", "MARRIED"), "Married")
         .otherwise(None)
    )
    gender_label = (
        F.when(gender_upper.isin("M", "MALE", "1"), "Male")
         .when(gender_upper.isin("F", "FEMALE", "2"), "Female")
         .otherwise(None)
    )

    return (
        df.withColumn("marital_status", marital_label)
          .withColumn("gender", gender_label)
    )

In [0]:
# Main transformation flow: read Bronze, then trim -> rename -> cast -> standardize.

logger.info("Reading Bronze table: workspace.bronze.crm_cust_info")

# Cleaning steps in the order they must run; each takes and returns a DataFrame.
cleaning_steps = (
    trim_string_columns,
    rename_columns,
    cast_columns,
    standardize_categorical,
)

df = spark.table("workspace.bronze.crm_cust_info")
for cleaning_step in cleaning_steps:
    df = cleaning_step(df)

logger.info("Transformation completed successfully")




Focus on the business key and check it for nulls and duplicates. If duplicates exist, resolve them by counting how many fields are populated in each row and keeping the row with the most populated fields. When several rows share a key, keep the most complete one and, among equally complete rows, the one with the latest date.

# Business Key Enforcement


## A business key is the value that identifies something uniquely in real life.

## Step 1: Check NULL business keys


In [0]:
logger.info("Starting business key validation")

# A business key must carry a value. NULL, empty and whitespace-only keys are
# all treated as missing: an isNull()-only test would let "" through, and that
# is exactly what a whitespace-only key becomes once string columns are trimmed.
key_is_missing = (
    F.col("customer_key").isNull()
    | (F.trim(F.col("customer_key")) == "")
)

null_keys = df.filter(key_is_missing)
null_count = null_keys.count()

logger.info(f"Null or blank customer_key count: {null_count}")

# Hard stop: rows without a business key cannot be deduplicated or joined.
if null_count > 0:
    logger.error("Null or blank business keys detected. Failing Silver layer.")
    raise Exception("Silver DQ FAILED: Null customer_key detected")

## Step 2: Detect duplicate business keys


In [0]:

# Business keys that appear on more than one row, with their row counts.
duplicate_keys = (
    df.groupBy("customer_key")
      .count()
      .where(F.col("count") > 1)
)

# Number of distinct keys that are duplicated (not the number of extra rows).
dup_key_count = duplicate_keys.count()
logger.info(f"Duplicate customer_key count: {dup_key_count}")

## Step 3: Create completeness score (safer ranking logic)

In [0]:
# Descriptive columns whose presence makes a customer record "more complete".
important_cols = [
    "first_name",
    "last_name",
    "marital_status",
    "gender"
]

# completeness_score = number of important columns that actually carry a value.
# A blank string is not a value: it is non-null, so an isNotNull()-only test
# would count it as populated and could let an emptier row win the ranking.
# length(trim(NULL)) is NULL, so NULLs fall through to the otherwise(0) branch.
# Starting the sum from F.lit(0) keeps the result a Column even if
# important_cols is empty.
df = df.withColumn(
    "completeness_score",
    sum(
        (
            F.when(F.length(F.trim(F.col(c))) > 0, 1).otherwise(0)
            for c in important_cols
        ),
        F.lit(0),
    ),
)

logger.info("Completeness score column created")

## Step 4: Apply deterministic ranking

In [0]:

# Order rows inside each customer_key: highest completeness first, then the
# newest create_date, then the highest customer_id (NULLs sorted last for the
# two tie-breakers).
w = (
    Window.partitionBy("customer_key")
          .orderBy(
              F.desc("completeness_score"),
              F.desc_nulls_last("create_date"),
              F.desc_nulls_last("customer_id"),
          )
)

# rn == 1 marks the preferred row of each business key.
df_ranked = df.withColumn("rn", F.row_number().over(w))

logger.info("Ranking applied using completeness + create_date + customer_id")

## Step 5: Keep best record per business key


In [0]:
# Keep only the top-ranked row per customer_key, then drop the helper columns
# that were added for ranking.
best_rows = df_ranked.where(F.col("rn") == 1)
df_silver = best_rows.drop("rn", "completeness_score")

logger.info("Duplicate resolution completed")

## Final validation

In [0]:
# Re-count business keys that still occur on more than one row after the
# deduplication step; the expected result is zero.
remaining_dups = (
    df_silver
    .groupBy("customer_key")
    .count()
    .where(F.col("count") > 1)
    .count()
)

logger.info(f"Remaining duplicates after resolution:---------> {remaining_dups}")

# Fail loudly rather than continue with a non-unique business key.
if remaining_dups > 0:
    logger.error("Duplicate keys still exist after resolution.")
    raise Exception("Silver DQ FAILED: Duplicate keys remain")

logger.info("Business key enforcement completed successfully")

In [0]:
%sql
-- Preview the Silver customer table.
-- NOTE(review): no visible cell writes df_silver to workspace.silver.crm_customers;
-- confirm the table is populated elsewhere, otherwise this query shows stale or missing data.
SELECT * FROM workspace.silver.crm_customers