# Notebook to cleanse bronze.customers and merge it into silver.customers

Import dependencies, load the shared configuration, and define constants

In [0]:
from pyspark.sql.functions import col, trim

In [0]:
%run ../../config/variables

In [0]:
# Unqualified name of the bronze table this notebook reads from.
ORIGIN_TABLE = "brz_customers"
# Unqualified name of the silver table this notebook merges into.
TARGET_TABLE = "slv_customers"

###  Load data from bronze.customers 

In [0]:
# Fully qualified (catalog.schema.table) name of the bronze source table.
bronze_table_name = f"{catalog_name}.{bronze_schema_name}.{ORIGIN_TABLE}"
# Read the bronze table into a DataFrame.
bronze_df = spark.read.table(bronze_table_name)

### Transform and check quality 

In [0]:
# Names of every string-typed column in the bronze frame; only these get trimmed.
string_cols = [f.name for f in bronze_df.schema.fields if f.dataType.simpleString() == "string"]

# Strip leading/trailing whitespace from the string columns and pass every
# other column through unchanged, preserving the original column order.
bronze_df = bronze_df.select(
    *[trim(col(c)).alias(c) if c in string_cols else col(c) for c in bronze_df.columns]
)

# QUALITY
# Drop rows that have no customer_id. The MERGE further down joins on
# `target.customer_id = source.customer_id`; NULL = NULL is never true, so a
# NULL-key row would never match and would be inserted again on every run.
bronze_df = bronze_df.filter(col("customer_id").isNotNull())

# Keep a single row per customer_id.
# NOTE(review): dropDuplicates keeps an arbitrary row for each key; if bronze
# can hold several versions of a customer, pick the winner explicitly (e.g. by
# an ingestion timestamp) — confirm which column to order by.
bronze_df = bronze_df.dropDuplicates(['customer_id'])

# Expose the cleansed frame to Spark SQL under the view name used by the MERGE.
bronze_df.createOrReplaceTempView("customers")




### Write into silver.customers using MERGE

In [0]:
# Upsert the `customers` temp view into the silver target table, keyed on
# customer_id:
#   * Matched rows are updated only when at least one of name / phone / email /
#     address differs. IS DISTINCT FROM is a null-safe comparison, so a change
#     between NULL and a value counts as a difference, and unchanged rows are
#     not rewritten.
#   * Source rows with no match in the target are inserted.
#   * Target rows that are absent from the source are left untouched (there is
#     no WHEN NOT MATCHED BY SOURCE clause).
# catalog_name and silver_schema_name are not defined in this notebook;
# presumably they come from the `%run ../../config/variables` cell — confirm.
spark.sql(f"""
  MERGE INTO {catalog_name}.{silver_schema_name}.{TARGET_TABLE} AS target
  USING customers AS source
  ON target.customer_id = source.customer_id
  WHEN MATCHED AND (
      target.name    IS DISTINCT FROM source.name OR
      target.phone   IS DISTINCT FROM source.phone OR
      target.email   IS DISTINCT FROM source.email OR
      target.address IS DISTINCT FROM source.address
  ) THEN
    UPDATE SET
      target.name    = source.name,
      target.phone   = source.phone,
      target.email   = source.email,
      target.address = source.address
  WHEN NOT MATCHED THEN
    INSERT (
      customer_id,
      name,
      phone,
      email,
      address
    )
    VALUES (
      source.customer_id,
      source.name,
      source.phone,
      source.email,
      source.address
    )
""")