In [0]:
%run "/Workspace/Users/dimaklimenko90@gmail.com/databricks_bootcamp_2026/script/silver/crm/silver_crm_cust_info"


In [0]:
# =====================================================
# SILVER LAYER – CUSTOMER DIMENSION BUILD
# =====================================================
# Flow: read Bronze -> clean/standardize -> enforce the business key
# (customer_key) -> overwrite the Silver Delta table.
# `logger`, the helper functions, ALLOWED_PREFIXES and EXPECTED_LENGTH are not
# defined in this cell; presumably they come from the notebook loaded via %run.

logger.info("Reading Bronze table: workspace.bronze.crm_cust_info")

# -----------------------------------------------------
# Step 1 – Load the raw Bronze data
# -----------------------------------------------------
bronze_df = spark.table("workspace.bronze.crm_cust_info")


# -----------------------------------------------------
# Step 2 – Clean & standardize
# -----------------------------------------------------
# Order matters: strings are trimmed first so later validation sees
# whitespace-free values.
trimmed_df = trim_string_columns(bronze_df)

# Switch to business-friendly column names.
renamed_df = rename_columns(trimmed_df)

# Apply target data types (e.g. id -> int, date -> date).
typed_df = cast_columns(renamed_df)

# Normalize the categorical columns (gender, marital_status).
standardized_df = standardize_categorical(typed_df)


# -----------------------------------------------------
# Step 3 – Business key enforcement (layered)
# -----------------------------------------------------
# Every validation step below is given the same quarantine table.
business_key = "customer_key"
quarantine_target = "workspace.quarantine.crm_cust_info"

# 3.1 Existence: NULL / empty keys.
present_keys_df = validate_not_null_business_key(
    standardized_df,
    key_col=business_key,
    quarantine_table=quarantine_target,
)

# 3.2 Structure: prefix + length + numeric suffix.
well_formed_df = validate_business_key_format(
    present_keys_df,
    key_col=business_key,
    allowed_prefixes=ALLOWED_PREFIXES,
    expected_length=EXPECTED_LENGTH,
    quarantine_table=quarantine_target,
)

# 3.3 Deterministic duplicate resolution: intended to keep the most complete,
# most recent record per key.
deduplicated_df = resolve_duplicate_business_keys(
    well_formed_df,
    key_col=business_key,
    important_cols=["first_name", "last_name", "marital_status", "gender"],
    date_col="create_date",
    id_col="customer_id",
    quarantine_table=quarantine_target,
)

# 3.4 Last line of defence: the pipeline is expected to fail here if any
# duplicate keys are still present.
df = final_duplicate_check(deduplicated_df, key_col=business_key)


# -----------------------------------------------------
# Step 4 – Persist the clean Silver table
# -----------------------------------------------------
# Guarantees expected at this point:
# - business keys are present and well-formed
# - one row per business key
# - values are cleaned & standardized
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.crm_customers")
)

logger.info("Customer dimension successfully built.")

In [0]:


# Diagnostics: rebuild the cleaned frame straight from Bronze so the checks
# below look at the data BEFORE any business-key enforcement.
# NOTE: this rebinds `df`, replacing the validated frame left by the pipeline cell.
df = spark.table("workspace.bronze.crm_cust_info")

# Apply the helpers in the same order as the pipeline cell
# (trim -> rename -> cast -> standardize) so these checks inspect the same
# prepared data that the pipeline validates.
df = trim_string_columns(df)
df = rename_columns(df)
df = cast_columns(df)
df = standardize_categorical(df)



# Check missing business keys.
# The pipeline's first key validation is described as covering NULL *and*
# empty keys, so blank strings are listed here too; otherwise this preview
# would under-report the rows that validation is expected to reject.
# NOTE(review): assumes customer_key is a string column — confirm.
df_null_keys = df.filter(
    F.col("customer_key").isNull() | (F.trim(F.col("customer_key")) == "")
)
display(df_null_keys)


# Business keys that appear on more than one row.
df_duplicates = df.groupBy("customer_key").count().where(col("count") > 1)

# Fetch every full row belonging to those keys so the conflicting records can
# be compared side by side.
duplicate_keys = df_duplicates.select("customer_key")
df_duplicates_rows = df.join(duplicate_keys, on="customer_key", how="inner")

display(df_duplicates_rows.orderBy("customer_key"))




# De-duplication preview: keep one row per customer_key, preferring the newest
# create_date.
# row_number() picks an arbitrary row among ties, so ordering by create_date
# alone can select a different "winner" on each run when two records for the
# same key share a date. customer_id is added as a tie-breaker so the choice
# is repeatable (as long as customer_id differs between the tied rows).
# NOTE(review): the highest customer_id wins here; confirm this matches the
# tie-break rule used inside resolve_duplicate_business_keys.
w = Window.partitionBy("customer_key").orderBy(
    F.col("create_date").desc(),
    F.col("customer_id").desc(),
)

# Rank the rows within each key; rn == 1 is the row to keep.
df_ranked = df.withColumn("rn", F.row_number().over(w))

df_silver = df_ranked.filter(F.col("rn") == 1).drop("rn")

# Sanity check: list any key that still occurs more than once.
# An empty result is the expected outcome.
df_silver.groupBy("customer_key").count().filter(F.col("count") > 1).show()

