# Notebook to cleansed bronze.customers into silver.customers

Define constant variables

In [0]:
ORIGIN_SCHEMA="bronze"
ORIGIN_TABLE="customers"
TARGET_SCHEMA="silver"
TARGET_TABLE=ORIGIN_TABLE

###  Load most recent time-window data from bronze.customers 

Doing it this way, there's a time process improvement when fetching the data related to matching the needed partitions not the entire entity.

In [0]:
from pyspark.sql.functions import col,current_timestamp, expr

bronze_df= spark.read.table(f"{ORIGIN_SCHEMA}.{ORIGIN_TABLE}").filter(col("ingestime")>= current_timestamp() - expr("INTERVAL 12 HOUR"))

Add silver_ingest timestamp to keep track of the data

In [0]:
bronze_df=bronze_df.withColumn("silver_ingestime",current_timestamp())
columns = ["silver_ingestime"] + [col for col in bronze_df.columns if col != "silver_ingestime"]
bronze_df = bronze_df.select(columns)

### Create Schema in Catalog

In [0]:
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {TARGET_SCHEMA}")

#### Create Empty table in the schema before merge

Get schema of bronze DataFrame

In [0]:
bronze_schema=bronze_df.schema

Create empty table

In [0]:
empty_df = spark.createDataFrame([], schema=bronze_schema)
empty_df.write.partitionBy("silver_ingestime").format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(f"{TARGET_SCHEMA}.{TARGET_TABLE}")

### Write into silver.customers using MERGE

Load silver.customers table as Delta Table

In [0]:
from delta.tables import DeltaTable
silver_table = DeltaTable.forName(spark, f"{TARGET_SCHEMA}.{TARGET_TABLE}")

Perform the MERGE

In [0]:
silver_table.alias("target").merge(
    source=bronze_df.alias("source"),
    condition="target.customer_id IS NOT NULL AND target.customer_id = source.customer_id"
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()