### Silver Layer

In [0]:
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType
from pyspark.sql.functions import to_date, to_timestamp

In [0]:
# Storage prefixes for the bronze and silver layers.
# Both end with '/' because table names are appended directly to them.
bronze_path, silver_path = (
    'gs://my-bucket-deep/Medallion/bronze/',
    'gs://my-bucket-deep/Medallion/silver/',
)

In [0]:
# Read the bronze `crm_cust_info` Delta table into a DataFrame.
bronze_df = spark.read.load(f'{bronze_path}crm_cust_info', format='delta')

In [0]:
# Write the DataFrame as-is to the silver location, replacing any existing data there.
bronze_df.write.save(
    f'{silver_path}crm_cust_info',
    format='delta',
    mode='overwrite',
)

In [0]:
# Names of the bronze tables to load; each is appended to `bronze_path`.
bronze_tables = [
    # crm_* tables
    "crm_cust_info", "crm_prd_info", "crm_sales_details",
    # erp_* tables
    "erp_cust_az12", "erp_loc_a101", "erp_px_cat_g1v2",
]

In [0]:
# Map each bronze table name to a DataFrame read from its Delta location.
bronze_dfs = {
    table_name: spark.read.load(f'{bronze_path}{table_name}', format='delta')
    for table_name in bronze_tables
}

In [0]:
# Show the table-name -> DataFrame mapping so each loaded schema can be checked.
bronze_dfs

{'crm_cust_info': DataFrame[cst_id: int, cst_key: string, cst_firstname: string, cst_lastname: string, cst_marital_status: string, cst_gndr: string, cst_create_date: date],
 'crm_prd_info': DataFrame[prd_id: int, prd_key: string, prd_nm: string, prd_cost: int, prd_line: string, prd_start_dt: timestamp, prd_end_dt: timestamp],
 'crm_sales_details': DataFrame[sls_ord_num: string, sls_prd_key: string, sls_cust_id: int, sls_order_dt: int, sls_ship_dt: int, sls_due_dt: int, sls_sales: int, sls_quantity: int, sls_price: int],
 'erp_cust_az12': DataFrame[cid: string, bdate: date, gen: string],
 'erp_loc_a101': DataFrame[cid: string, cntry: string],
 'erp_px_cat_g1v2': DataFrame[id: string, cat: string, subcat: string, maintenance: string]}

In [0]:
# Utility logger
def log(msg):
    """Print ``msg`` prefixed with the current local timestamp in ISO format.

    Args:
        msg: Value to log; it is interpolated into the output line via an f-string.

    Returns:
        None. The formatted line is written to standard output.
    """
    timestamp = datetime.now().isoformat()
    print(f"[{timestamp}] {msg}")

In [0]:
def load_to_silver_table():
    """Placeholder for the silver-layer load routine (not implemented yet).

    The body has not been written. Judging by the name, this will presumably
    load the bronze tables into the silver layer -- TODO confirm the intended
    behavior before implementing.

    Raises:
        NotImplementedError: Always, so an accidental call fails loudly
            instead of silently doing nothing.
    """
    raise NotImplementedError("load_to_silver_table is not implemented yet")

Transformation `crm_cust_info`

In [0]:
# Preview the bronze customer table before any cleaning is applied.
# NOTE(review): .display() is not part of core PySpark; presumably provided by the
# notebook runtime (e.g. Databricks) -- confirm before running elsewhere.
bronze_dfs['crm_cust_info'].display()

cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date
11000,AW00011000,Jon,Yang,M,M,2025-10-06
11001,AW00011001,Eugene,Huang,S,M,2025-10-06
11002,AW00011002,Ruben,Torres,M,M,2025-10-06
11003,AW00011003,Christy,Zhu,S,F,2025-10-06
11004,AW00011004,Elizabeth,Johnson,S,F,2025-10-06
11005,AW00011005,Julio,Ruiz,S,M,2025-10-06
11006,AW00011006,Janet,Alvarez,S,F,2025-10-06
11007,AW00011007,Marco,Mehta,M,M,2025-10-06
11008,AW00011008,Rob,Verhoff,S,F,2025-10-06
11009,AW00011009,Shannon,Carlson,S,M,2025-10-06


In [0]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window

In [0]:
# Start from the bronze customer table and strip surrounding spaces from
# the first-name and last-name columns, keeping the column names unchanged.
df = (
    bronze_dfs['crm_cust_info']
    .withColumn('cst_firstname', f.trim(f.col('cst_firstname')))
    .withColumn('cst_lastname', f.trim(f.col('cst_lastname')))
)

In [0]:
# Data-quality check on the bronze table: list rows whose first name differs
# from its trimmed form, i.e. it has leading or trailing spaces.
(
    bronze_dfs['crm_cust_info']
    .where(f.col('cst_firstname') != f.trim(f.col('cst_firstname')))
    .show()
)


+------+----------+-------------+------------+------------------+--------+---------------+
|cst_id|   cst_key|cst_firstname|cst_lastname|cst_marital_status|cst_gndr|cst_create_date|
+------+----------+-------------+------------+------------------+--------+---------------+
| 11000|AW00011000|          Jon|       Yang |                 M|       M|     2025-10-06|
| 11004|AW00011004|    Elizabeth|     Johnson|                 S|       F|     2025-10-06|
| 11012|AW00011012|       Lauren|      Walker|                 M|       F|     2025-10-06|
| 11013|AW00011013|         Ian |    Jenkins |                 M|       M|     2025-10-06|
| 11015|AW00011015|        Chloe|       Young|                 S|       F|     2025-10-06|
| 11021|AW00011021|      Destiny|     Wilson |                 S|       F|     2025-10-07|
| 11063|AW00011063|     Angela  |    Murphy  |                 S|       F|     2025-10-07|
| 11065|AW00011065|    Jessica  |  Henderson |                 M|       F|     2025-10-07|