# 🪙 Silver Layer: 
## Data Cleansing, Enrichment, and Aggregation

**Purpose:**  
This notebook reads data from the **Bronze** tables and performs:
- Data cleansing  
- AQI enrichment using EPA/AirNow-style breakpoints (0–500 scale)  
- Idempotent upserts into the **Silver** tables  
- Daily aggregation of hourly air-quality data into a **Silver Air Daily** table  

It also updates the `pipeline_log` table for every execution — marking both **success** and **failure**.

## 🧰 Step 1: Setup and Imports


In [0]:
from datetime import datetime, timezone

from delta.tables import DeltaTable
from pyspark.sql import SparkSession, Window, functions as F, types as T
from pyspark.sql.utils import AnalysisException

# Attach to the session that is already running for this notebook, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

## ⚙️ Step 2: Configuration — Table Names and Globals

In [0]:
# Every table touched by this notebook lives in one catalog.schema namespace.
catalog = "env_catalog"
schema = "env_data"


def _qualified(table: str) -> str:
    """Return the fully qualified `catalog.schema.table` name."""
    return ".".join((catalog, schema, table))


# Reference / control tables
city_master_tbl = _qualified("city_master")
pipeline_log_tbl = _qualified("pipeline_log")
silver_meta_tbl = _qualified("source_last_processed_ts")

# Bronze sources
bronze_weather_hourly_tbl = _qualified("bronze_weather_hourly")
bronze_weather_daily_tbl = _qualified("bronze_weather_daily")
bronze_air_hourly_tbl = _qualified("bronze_air_hourly")

# Silver targets
silver_weather_hourly_tbl = _qualified("silver_weather_hourly")
silver_weather_daily_tbl = _qualified("silver_weather_daily")
silver_air_hourly_tbl = _qualified("silver_air_hourly")
silver_air_daily_tbl = _qualified("silver_air_daily")

In [0]:
# Run metadata written to pipeline_log and reused by every later cell.
pipeline_name = "TRANSFORM"
# TODO: Determine if Manual run or Scheduled run
triggered_by = "Manual"  # Can be Scheduled/Event driven if automated
run_type = "HOURLY"
start_ts = datetime.now(timezone.utc)
# Keep a `run_id` that already exists in the notebook globals; otherwise build one
# from the run type, pipeline name and UTC start time.
# NOTE(review): re-running this cell in the same kernel therefore reuses the earlier
# run_id and inserts a second pipeline_log row with it — confirm this is intended.
run_id = globals().get("run_id", f"{run_type}_{pipeline_name.upper()}_{start_ts.strftime('%Y%m%d_%H%M%S')}")
status = "RUNNING"
remarks = "Silver Transformation layer job started"

# Initial pipeline log entry. Best effort: a failure here is printed and the
# notebook carries on without a log row.
try:
    spark.sql(f"""
        INSERT INTO {pipeline_log_tbl} 
        (run_id, pipeline_name, run_type, start_time, status, triggered_by, remarks, created_ts)
        VALUES ('{run_id}', '{pipeline_name}', '{run_type}', TIMESTAMP '{start_ts}', '{status}', '{triggered_by}', '{remarks}', current_timestamp())
    """)
    print(f"🪶 Created pipeline_log entry for run_id={run_id}")
except Exception as e:
    print("⚠️ Could not log start in pipeline_log:", e)

🪶 Created pipeline_log entry for run_id=HOURLY_TRANSFORM_20251113_061943


## 🧩 Step 3: Utility Functions

In [0]:
def table_exists(name: str) -> bool:
    """
    Report whether `name` can be resolved as a table.

    Two probes are tried in order: the catalog metadata API, then an actual
    read of zero rows. Any exception raised by either probe is treated as
    "not found", so this function never raises.

    Note: the second probe calls `count()`, which does run a (tiny) Spark job.
    """
    # Probe 1: catalog metadata.
    try:
        known_to_catalog = spark.catalog.tableExists(name)
    except Exception:
        known_to_catalog = False
    if known_to_catalog:
        return True

    # Probe 2: resolve the table and count an empty slice of it.
    try:
        spark.table(name).limit(0).count()
    except Exception:
        return False
    return True
    
def ensure_table(name: str, schema_sdf, partition_col=None):
    """
    Create an empty Delta table shaped like `schema_sdf` if `name` does not exist.

    Args:
        name: Fully qualified table name.
        schema_sdf: DataFrame whose schema defines the table; only
            `schema_sdf.limit(0)` is written, so no rows are copied.
        partition_col: Optional column to partition the new table by.

    The write uses save mode "ignore" rather than "overwrite". `table_exists`
    reports False on *any* error, so a spurious False (or a concurrent run that
    created the table in the meantime) must not replace an existing table with
    an empty one. With "ignore" the write is a no-op when the table is present.
    """
    if table_exists(name):
        return
    writer = schema_sdf.limit(0).write.format("delta").mode("ignore")
    if partition_col:
        writer = writer.partitionBy(partition_col)
    writer.saveAsTable(name)
    print(f"Created table {name}")

def update_pipeline_log(status, remarks, records_processed=0, earliest_ts=None, latest_ts=None):
    """
    Update this run's row in `pipeline_log` (matched on the global `run_id`).

    Args:
        status: New status text, e.g. "SUCCESS" or "FAILED".
        remarks: Free-text note; may contain quotes (exception messages do).
        records_processed: Count added to the value already stored.
        earliest_ts: Earliest data timestamp; only fills the column when it is
            still NULL.
        latest_ts: Latest data timestamp; always replaces the stored value.

    Best effort: any failure is printed and swallowed so that logging can never
    break the pipeline itself.
    """

    def _sql_str(value) -> str:
        # Spark SQL string literals use backslash escapes by default, so both
        # the backslash and the single quote must be escaped before splicing.
        # Without this, a remark such as "can't resolve 'city'" breaks the
        # UPDATE and the run stays at RUNNING.
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{text}'"

    try:
        earliest_expr = f"TIMESTAMP '{earliest_ts}'" if earliest_ts else "NULL"
        latest_expr = f"TIMESTAMP '{latest_ts}'" if latest_ts else "NULL"
        spark.sql(f"""
            UPDATE {pipeline_log_tbl}
            SET end_time = current_timestamp(),
                status = {_sql_str(status)},
                records_processed = COALESCE(records_processed, 0) + {int(records_processed or 0)},
                earliest_ts = COALESCE(earliest_ts, {earliest_expr}),
                latest_ts = {latest_expr},
                remarks = {_sql_str(remarks)}
            WHERE run_id = {_sql_str(run_id)}
        """)
        print(f"✅ pipeline_log updated: {status}")
    except Exception as e:
        print("⚠️ Could not update pipeline_log:", e)

## 🧼 Step 4: Data Cleansing and Data Enrichment Utilities

In [0]:
def basic_cleanse(df, key_cols):
    """
    Apply the minimal cleansing steps shared by all sources.

    1. Drop rows where any column in `key_cols` is NULL.
    2. If a `timestamp` column exists, parse it to a Spark timestamp.
    3. Keep one row per distinct combination of `key_cols`.
    """
    cleaned = df
    for key in key_cols:
        cleaned = cleaned.filter(F.col(key).isNotNull())

    has_timestamp = "timestamp" in cleaned.columns
    if has_timestamp:
        cleaned = cleaned.withColumn("timestamp", F.to_timestamp(F.col("timestamp")))

    return cleaned.dropDuplicates(key_cols)

def derive_weather_description(sdf):
    """
    Add a `weather_description` column derived from `weather_code`.

    `weather_code` is cast to int and mapped with the WMO weather
    interpretation codes (WW) as published by Open-Meteo
    (https://open-meteo.com/en/docs#daily_weather_variables):

        0            Clear sky
        1, 2, 3      Mainly clear, partly cloudy, and overcast
        45, 48       Fog and depositing rime fog
        51, 53, 55   Drizzle: light, moderate, and dense intensity
        56, 57       Freezing drizzle: light and dense intensity
        61, 63, 65   Rain: slight, moderate and heavy intensity
        66, 67       Freezing rain: light and heavy intensity
        71, 73, 75   Snow fall: slight, moderate, and heavy intensity
        77           Snow grains
        80, 81, 82   Rain showers: slight, moderate, and violent
        85, 86       Snow showers: slight and heavy
        95           Thunderstorm: slight or moderate
        96, 99       Thunderstorm with slight and heavy hail

    Codes are matched by inclusive range, so unlisted values inside a range
    (e.g. 46, 97) take that range's label. Anything else, including NULL,
    becomes "Unknown / Other".
    """
    code = F.col("weather_code").cast("int")

    # (first_code, last_code, label), evaluated top to bottom.
    # NOTE(review): the 85-86 label says "slight / moderate" while the reference
    # table above says "slight and heavy"; label kept as-is because it is stored data.
    rules = [
        (0, 0, "Clear sky"),
        (1, 3, "Mainly clear / partly cloudy / overcast"),
        (45, 48, "Fog / depositing rime fog"),
        (51, 55, "Drizzle (light / moderate / dense)"),
        (56, 57, "Freezing drizzle"),
        (61, 65, "Rain (light / moderate / heavy)"),
        (66, 67, "Freezing rain"),
        (71, 75, "Snow fall (light / moderate / heavy)"),
        (77, 77, "Snow grains"),
        (80, 82, "Rain showers (slight / moderate / violent)"),
        (85, 86, "Snow showers (slight / moderate)"),
        (95, 95, "Thunderstorm (slight / moderate rain)"),
        (96, 99, "Thunderstorm with hail"),
    ]

    def _matches(first, last):
        # Single codes use equality, ranges use inclusive bounds.
        if first == last:
            return code == first
        return (code >= first) & (code <= last)

    head_first, head_last, head_label = rules[0]
    description = F.when(_matches(head_first, head_last), head_label)
    for first, last, label in rules[1:]:
        description = description.when(_matches(first, last), label)

    return sdf.withColumn("weather_description", description.otherwise("Unknown / Other"))

# Compute single climate_index (0-100) + climate_label 
# It will run only for weather tables (hourly/daily) and is safe if columns are missing.
def compute_climate_index_and_label(sdf):
    """
    Append a composite comfort score and its label to a weather DataFrame.

    Adds two columns:
      - climate_index (double) in [0, 100], rounded to 2 decimals; NULL when
        none of the inputs is available
      - climate_label (string): Poor (<=20), Fair (<=40), Moderate (<=60),
        Good (<=80), Excellent (>80); NULL when climate_index is NULL

    Inputs (the hourly column name is used if present, otherwise the daily one;
    a measurement whose column is missing is treated as NULL):
      - temperature_2m        / temperature_2m_mean
      - relative_humidity_2m  / relative_humidity_2m_mean
      - pressure_msl          / pressure_msl_mean
      - precipitation         / precipitation_sum
      - wind_speed_10m        / wind_speed_10m_max

    Snowfall is not part of the score.

    Each input becomes a 0..100 sub-score; the index is their weighted average
    (temperature 35%, humidity 25%, precipitation 20%, pressure 10%, wind 10%)
    with weights re-normalised over the sub-scores that are not NULL.
    """

    def _measure(hourly_name, daily_name):
        """Column for the hourly name if present, else the daily one, else a NULL double."""
        name = hourly_name if hourly_name in sdf.columns else daily_name
        return sdf[name] if name in sdf.columns else F.lit(None).cast("double")

    def _clamp(expr, upper=100.0):
        """Clamp a score expression to the range [0, upper]."""
        return F.least(F.lit(upper), F.greatest(F.lit(0.0), expr))

    t_col = _measure("temperature_2m", "temperature_2m_mean")
    rh_col = _measure("relative_humidity_2m", "relative_humidity_2m_mean")
    p_col = _measure("pressure_msl", "pressure_msl_mean")
    pr_col = _measure("precipitation", "precipitation_sum")
    ws_col = _measure("wind_speed_10m", "wind_speed_10m_max")

    # Temperature: 18..26 scores 100; outside, lose 100 points per 40 degrees of distance.
    temp_gap = F.when(t_col < 18.0, 18.0 - t_col).otherwise(t_col - 26.0)
    temp_sub = (F.when(t_col.between(18.0, 26.0), F.lit(100.0))
                 .when(t_col.isNull(), F.lit(None))
                 .otherwise(_clamp(100.0 * (1 - temp_gap / F.lit(40.0)))))

    # Humidity: 30..60 scores 100; outside, lose 100 points per 60 points of distance.
    hum_gap = F.when(rh_col < 30.0, 30.0 - rh_col).otherwise(rh_col - 60.0)
    hum_sub = (F.when(rh_col.between(30.0, 60.0), F.lit(100.0))
                .when(rh_col.isNull(), F.lit(None))
                .otherwise(_clamp(100.0 * (1 - hum_gap / F.lit(60.0)))))

    # Pressure: within 20 of 1013.25 scores 100; beyond that the score falls
    # linearly over the next 60 units, reaching 0 at a deviation of 80.
    pres_dev = F.abs(p_col - F.lit(1013.25))
    pres_sub = (F.when(p_col.isNull(), F.lit(None))
                 .when(pres_dev <= F.lit(20.0), F.lit(100.0))
                 .otherwise(_clamp(100.0 * (1 - (pres_dev - F.lit(20.0)) / F.lit(60.0)))))

    # Precipitation: 0 scores 100, minus 1.5 points per unit, floored at 0.
    precip_sub = (F.when(pr_col.isNull(), F.lit(None))
                   .otherwise(_clamp(F.lit(100.0) - pr_col * F.lit(1.5))))

    # Wind: <=2 scores 100, then a linear drop to 50 at 5, then a linear drop
    # from 50 to 0 at 25. The last segment starts at 50 (not 100) so the score
    # stays continuous and never rises as wind increases.
    # NOTE(review): thresholds read as m/s; confirm the unit ingested for wind_speed_10m.
    wind_sub = (F.when(ws_col.isNull(), F.lit(None))
                 .when(ws_col <= F.lit(2.0), F.lit(100.0))
                 .when(ws_col <= F.lit(5.0),
                       F.lit(100.0) * (1 - ((ws_col - F.lit(2.0)) / F.lit(3.0)) * F.lit(0.5)))
                 .otherwise(_clamp(F.lit(50.0) * (1 - ((ws_col - F.lit(5.0)) / F.lit(20.0))),
                                   upper=50.0)))

    # Weighted average over the sub-scores that are present.
    weighted_subs = [
        (temp_sub, 0.35),
        (hum_sub, 0.25),
        (precip_sub, 0.20),
        (pres_sub, 0.10),
        (wind_sub, 0.10),
    ]
    numer = F.lit(0.0)
    denom = F.lit(0.0)
    for sub, weight in weighted_subs:
        numer = numer + F.when(sub.isNotNull(), sub * F.lit(weight)).otherwise(F.lit(0.0))
        denom = denom + F.when(sub.isNotNull(), F.lit(weight)).otherwise(F.lit(0.0))

    climate_index = F.when(denom == 0, F.lit(None)).otherwise(F.round(numer / denom, 2))

    # Label bands are inclusive at the upper edge.
    climate_label = (F.when(climate_index.isNull(), F.lit(None))
                      .when(climate_index <= F.lit(20.0), F.lit("Poor"))
                      .when(climate_index <= F.lit(40.0), F.lit("Fair"))
                      .when(climate_index <= F.lit(60.0), F.lit("Moderate"))
                      .when(climate_index <= F.lit(80.0), F.lit("Good"))
                      .otherwise(F.lit("Excellent")))

    return (sdf
            .withColumn("climate_index", climate_index)
            .withColumn("climate_label", climate_label))

# Compute AQI (0–500) + category + color for hourly & daily -----
# It will run only for air pollution tables (hourly/daily) and is safe if columns are missing.
def compute_aqi_index_and_label(sdf):
    """
    Append an AQI value and its descriptors to an air-quality DataFrame.

    Adds:
      - aqi_value (int, 0–500): the highest per-pollutant sub-index; NULL when
        no pollutant yields one
      - aqi_primary_pollutant (string): the pollutant that produced aqi_value
        (first match in the order pm25, pm10, o3, no2, so2, co)
      - aqi_category (string) and aqi_color (string)

    Hourly vs daily input is detected from the column names: if any column ends
    in `_mean`, the `<pollutant>_mean` columns are used, otherwise the base
    columns. Pollutant columns that are missing are added as NULL doubles and
    remain in the returned DataFrame.

    Sub-indices are interpolated linearly inside EPA/AirNow breakpoint bands.
    The published bands leave small gaps between one band's upper bound and the
    next band's lower bound (e.g. PM10 54 -> 55). A fractional reading that
    falls in such a gap is assigned to the lower band and capped at that band's
    upper index, instead of producing no sub-index at all.

    NOTE(review): the breakpoint tables are applied to the column values as
    they are; confirm the stored units match the units the tables assume.
    """

    # Hourly columns carry the bare name, daily aggregates carry a _mean suffix.
    use_suffix = "_mean" if any(c.endswith("_mean") for c in sdf.columns) else ""

    pollutants = {
        "pm25": f"pm2_5{use_suffix}",
        "pm10": f"pm10{use_suffix}",
        "o3": f"ozone{use_suffix}",
        "no2": f"nitrogen_dioxide{use_suffix}",
        "so2": f"sulphur_dioxide{use_suffix}",
        "co": f"carbon_monoxide{use_suffix}",
    }

    # Guarantee every pollutant column exists so the expressions below resolve.
    for col_name in pollutants.values():
        if col_name not in sdf.columns:
            sdf = sdf.withColumn(col_name, F.lit(None).cast("double"))

    # Breakpoint tables (EPA/AirNow): (conc_low, conc_high, index_low, index_high)
    breakpoints = {
        "pm25": [(0, 12, 0, 50), (12.1, 35.4, 51, 100), (35.5, 55.4, 101, 150), (55.5, 150.4, 151, 200),
                 (150.5, 250.4, 201, 300), (250.5, 350.4, 301, 400), (350.5, 500.4, 401, 500)],
        "pm10": [(0, 54, 0, 50), (55, 154, 51, 100), (155, 254, 101, 150), (255, 354, 151, 200),
                 (355, 424, 201, 300), (425, 504, 301, 400), (505, 604, 401, 500)],
        "o3": [(0, 54, 0, 50), (55, 70, 51, 100), (71, 85, 101, 150), (86, 105, 151, 200), (106, 200, 201, 300)],
        "no2": [(0, 53, 0, 50), (54, 100, 51, 100), (101, 360, 101, 150), (361, 649, 151, 200),
                (650, 1249, 201, 300), (1250, 1649, 301, 400), (1650, 2049, 401, 500)],
        "so2": [(0, 35, 0, 50), (36, 75, 51, 100), (76, 185, 101, 150), (186, 304, 151, 200),
                (305, 604, 201, 300), (605, 804, 301, 400), (805, 1004, 401, 500)],
        "co": [(0, 4499, 0, 50), (4500, 9499, 51, 100), (9500, 12499, 101, 150), (12500, 15499, 151, 200),
               (15500, 30499, 201, 300), (30500, 40499, 301, 400), (40500, 50499, 401, 500)],
    }

    def _build_aqi_expr(col, breaks):
        """Sub-index expression for one pollutant column."""
        conc = F.col(col)
        banded = None
        for pos, (c_low, c_high, i_low, i_high) in enumerate(breaks):
            slope = (i_high - i_low) / (c_high - c_low)
            # Cap at the band's upper index: readings in the gap above c_high
            # would otherwise interpolate slightly past it.
            sub_index = F.least(
                F.round(slope * (conc - F.lit(c_low)) + F.lit(i_low)),
                F.lit(float(i_high)),
            )
            if pos + 1 < len(breaks):
                # Band runs up to (but excluding) the next band's lower bound,
                # which closes the gap between published bands.
                in_band = (conc >= F.lit(c_low)) & (conc < F.lit(breaks[pos + 1][0]))
            else:
                in_band = conc.between(c_low, c_high)
            banded = F.when(in_band, sub_index) if banded is None else banded.when(in_band, sub_index)

        # NULL stays NULL; anything above the last band is pinned to 500;
        # values below the first band (negative) match nothing and stay NULL.
        return (F.when(conc.isNull(), F.lit(None))
                 .when(conc > F.lit(breaks[-1][1]), F.lit(500))
                 .otherwise(banded))

    # Per-pollutant sub-indices (temporary columns, dropped before returning).
    sub_index_cols = {}
    for key, col_name in pollutants.items():
        tmp_name = f"aqi_{key}"
        sdf = sdf.withColumn(tmp_name, _build_aqi_expr(col_name, breakpoints[key]))
        sub_index_cols[key] = tmp_name

    # Final AQI = max across pollutants; -1 stands in for "no sub-index".
    worst = F.greatest(*[F.coalesce(F.col(c), F.lit(-1)) for c in sub_index_cols.values()])
    sdf = sdf.withColumn(
        "aqi_value",
        F.when(worst < 0, F.lit(None)).otherwise(worst.cast("int")),
    )

    # Primary pollutant: first pollutant whose sub-index equals the final AQI.
    primary = F.when(F.col("aqi_value") < 0, F.lit(None))
    for key, tmp_name in sub_index_cols.items():
        primary = primary.when(
            F.col("aqi_value") == F.coalesce(F.col(tmp_name), F.lit(-1)), F.lit(key)
        )
    sdf = sdf.withColumn("aqi_primary_pollutant", primary.otherwise(F.lit(None)))

    # Category & color mapping (upper bounds inclusive).
    sdf = (sdf
        .withColumn("aqi_category",
            F.when(F.col("aqi_value").isNull(), F.lit(None))
             .when(F.col("aqi_value") <= 50, "Good")
             .when(F.col("aqi_value") <= 100, "Moderate")
             .when(F.col("aqi_value") <= 150, "Unhealthy for Sensitive Groups")
             .when(F.col("aqi_value") <= 200, "Unhealthy")
             .when(F.col("aqi_value") <= 300, "Very Unhealthy")
             .otherwise("Hazardous")
        )
        .withColumn("aqi_color",
            F.when(F.col("aqi_value").isNull(), F.lit(None))
             .when(F.col("aqi_value") <= 50, "Green")     # "#00E400"
             .when(F.col("aqi_value") <= 100, "Yellow")   # "#FFFF00"
             .when(F.col("aqi_value") <= 150, "Orange")   # "#FF7E00"
             .when(F.col("aqi_value") <= 200, "Red")      # "#FF0000"
             .when(F.col("aqi_value") <= 300, "Purple")   # "#99004C"
             .otherwise("Maroon")                         # "#7E0023"
        )
    )

    # Drop temporary pollutant-specific AQIs to keep the schema clean.
    return sdf.drop(*sub_index_cols.values())

## ⚡ Step 5: Functions to Read and Commit the Last Processed Load Timestamp per Source Table

In [0]:
def read_last_processed_load_ts(source_table: str):
    """
    Look up the stored watermark for `source_table`.

    Returns the `last_processed_load_ts` value from the meta table, or None
    when the meta table is missing, has no row for this source, or the lookup
    raises for any reason.
    """
    if not table_exists(silver_meta_tbl):
        return None
    try:
        meta = spark.table(silver_meta_tbl)
        matches = (
            meta.filter(F.col("source_table") == source_table)
                .select("last_processed_load_ts")
                .limit(1)
                .collect()
        )
        if not matches:
            return None
        return matches[0]["last_processed_load_ts"]
    except Exception:
        # Treated the same as "no watermark yet".
        return None

def commit_last_processed_load_ts(source_table: str, last_ts):
    """
    Store `last_ts` as the watermark for `source_table` in the meta table.

    Meant to be called only after the corresponding MERGE has succeeded.
    Does nothing when `last_ts` is None. If the meta table does not exist yet
    it is created from the single new row; otherwise the row is upserted on
    `source_table`.
    """
    if last_ts is None:
        return

    watermark_rows = [(source_table, last_ts, datetime.now(timezone.utc))]
    staging = spark.createDataFrame(
        watermark_rows,
        schema="source_table STRING, last_processed_load_ts TIMESTAMP, updated_ts TIMESTAMP",
    )

    if table_exists(silver_meta_tbl):
        meta = DeltaTable.forName(spark, silver_meta_tbl)
        (
            meta.alias("t")
                .merge(staging.alias("s"), "t.source_table = s.source_table")
                .whenMatchedUpdateAll()
                .whenNotMatchedInsertAll()
                .execute()
        )
        print(f"Updated last processed timestamp for {source_table}: {last_ts}")
    else:
        # First watermark ever: the staging row becomes the table.
        staging.write.format("delta").mode("overwrite").saveAsTable(silver_meta_tbl)

## 🔄 Step 6: Merge Incremental Data into Silver Tables

In [0]:
# MERGE on (city, data_timestamp)
def merge_into_silver(target_table: str, staging_sdf, merge_keys=("city", "data_timestamp"), exclude_update_cols=None, partition_col=None):
    """
    Idempotent upsert of `staging_sdf` into `target_table` via Delta MERGE.

    Args:
        target_table: Fully qualified Silver table; created if missing.
        staging_sdf: Rows to upsert. None means "nothing to do".
        merge_keys: Columns identifying a row (tuple or list).
        exclude_update_cols: Columns that are inserted but never overwritten on
            a match.
        partition_col: Partition column, used only when the table is created.

    Returns:
        (rows_processed, min_data_ts, max_data_ts, max_load_ts). The timestamp
        bounds are None when the corresponding column is absent or staging is
        empty; max_load_ts comes from `load_timestamp`, else `max_load_timestamp`.

    Raises:
        ValueError: if every staging column is a merge key or excluded.

    Staging is reduced to one row per merge key before merging, because a MERGE
    whose source has several rows for the same key either fails (matched rows)
    or inserts duplicates (unmatched rows). When a load timestamp column exists
    the most recently loaded row wins.
    """
    if staging_sdf is None:
        print(f"Skipping {target_table} - no staging")
        return 0, None, None, None

    # Accept lists as well as tuples for the key / exclusion arguments.
    merge_keys = tuple(merge_keys)
    not_updatable = set(merge_keys) | set(exclude_update_cols or [])

    # Coerce data_timestamp/load_timestamp types if present.
    for ts_col in ("data_timestamp", "load_timestamp"):
        if ts_col in staging_sdf.columns:
            staging_sdf = staging_sdf.withColumn(ts_col, F.to_timestamp(ts_col))

    cols = staging_sdf.columns
    update_cols = [c for c in cols if c not in not_updatable]
    if not update_cols:
        raise ValueError("No columns to update in MERGE")

    # One source row per key: newest load first when a load timestamp exists.
    load_col = next((c for c in ("load_timestamp", "max_load_timestamp") if c in cols), None)
    if load_col is not None:
        newest_first = Window.partitionBy(*merge_keys).orderBy(F.col(load_col).desc_nulls_last())
        staging_sdf = (staging_sdf
                       .withColumn("__merge_rank", F.row_number().over(newest_first))
                       .filter(F.col("__merge_rank") == 1)
                       .drop("__merge_rank"))
    else:
        staging_sdf = staging_sdf.dropDuplicates(list(merge_keys))

    # Ensure target exists.
    ensure_table(target_table, staging_sdf, partition_col=partition_col)

    # Schema evolution workaround: the autoMerge session setting is not
    # available in this environment, so append zero rows with mergeSchema to
    # add any new staging columns to the target before the MERGE.
    staging_sdf.limit(0).write.format("delta") \
        .mode("append") \
        .option("mergeSchema", "true") \
        .saveAsTable(target_table)

    on_clause = " AND ".join([f"t.`{k}` = s.`{k}`" for k in merge_keys])
    update_set = ", ".join([f"t.`{c}` = s.`{c}`" for c in update_cols])
    insert_cols = ", ".join([f"`{c}`" for c in cols])
    insert_vals = ", ".join([f"s.`{c}`" for c in cols])

    # Expose staging to SQL under a per-call view name and always clean it up.
    view = f"temp_{target_table.split('.')[-1]}_{int(datetime.now().timestamp())}"
    staging_sdf.createOrReplaceTempView(view)
    try:
        merge_sql = f"""
    MERGE INTO {target_table} t
    USING {view} s
    ON {on_clause}
    WHEN MATCHED THEN UPDATE SET {update_set}
    WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals})
    """
        print(f"Executing MERGE into {target_table} ...")
        spark.sql(merge_sql)
    finally:
        spark.catalog.dropTempView(view)

    # Row count and timestamp bounds in a single pass over staging.
    stat_exprs = [F.count(F.lit(1)).alias("recs")]
    if "data_timestamp" in cols:
        stat_exprs.append(F.min("data_timestamp").alias("min_dt"))
        stat_exprs.append(F.max("data_timestamp").alias("max_dt"))
    if load_col is not None:
        stat_exprs.append(F.max(load_col).alias("max_lt"))
    stats = staging_sdf.agg(*stat_exprs).collect()[0].asDict()

    recs = stats["recs"]
    min_data_ts = stats.get("min_dt")
    max_data_ts = stats.get("max_dt")
    max_load_ts = stats.get("max_lt")

    print(f"MERGE completed for {target_table}: {recs} rows, data_ts range=({min_data_ts},{max_data_ts}), max_load_ts={max_load_ts}")
    return recs, min_data_ts, max_data_ts, max_load_ts

## 🧮 Step 7: Process Weather (Hourly and Daily) and Air Hourly Data (Bronze ➜ Silver)

In [0]:
def _merge_bronze_source(label, source_tbl, target_tbl, enrichers):
    """
    Incrementally load one Bronze table into its Silver table.

    Reads only rows loaded after the stored watermark, applies each function in
    `enrichers` in order, stamps `transform_ts` and `run_id`, merges on
    (city, data_timestamp) and then advances the watermark.

    Returns (rows_merged, min_data_ts, max_data_ts); (0, None, None) when the
    source table does not exist.
    """
    if not table_exists(source_tbl):
        print(f"Source {source_tbl} not present - skipping {label.lower()}")
        return 0, None, None

    last_load = read_last_processed_load_ts(source_tbl)
    print(f"{label}: last processed load_timestamp = {last_load}")

    staged = spark.table(source_tbl)
    if last_load:
        staged = staged.filter(F.col("load_timestamp") > F.lit(last_load))
    for enrich in enrichers:
        staged = enrich(staged)

    # Add transform_ts and this run's id to the staging rows.
    staged = staged.withColumn("transform_ts", F.current_timestamp()) \
                   .withColumn("run_id", F.lit(run_id))

    recs, min_dt, max_dt, max_lt = merge_into_silver(target_tbl, staged,
                                                     merge_keys=("city", "data_timestamp"))
    # Commit the watermark only after a successful merge.
    if max_lt:
        commit_last_processed_load_ts(source_tbl, max_lt)
    return recs, min_dt, max_dt


# (label, bronze source, silver target, enrichment functions) in processing order.
_bronze_to_silver_jobs = [
    ("Weather Hourly", bronze_weather_hourly_tbl, silver_weather_hourly_tbl,
     (derive_weather_description, compute_climate_index_and_label)),
    ("Air Hourly", bronze_air_hourly_tbl, silver_air_hourly_tbl,
     (compute_aqi_index_and_label,)),
    ("Weather Daily", bronze_weather_daily_tbl, silver_weather_daily_tbl,
     (derive_weather_description, compute_climate_index_and_label)),
]

try:
    # Run-level totals; later cells add to these and write them to pipeline_log.
    total_processed = 0
    agg_min_data_ts = None
    agg_max_data_ts = None

    for label, source_tbl, target_tbl, enrichers in _bronze_to_silver_jobs:
        recs, min_dt, max_dt = _merge_bronze_source(label, source_tbl, target_tbl, enrichers)
        total_processed += recs
        if min_dt:
            agg_min_data_ts = min_dt if agg_min_data_ts is None else min(agg_min_data_ts, min_dt)
        if max_dt:
            agg_max_data_ts = max_dt if agg_max_data_ts is None else max(agg_max_data_ts, max_dt)

    print(f"✅ Total rows merged into Silver: {total_processed}")
except Exception as e:
    print("❌ Silver merge failed:", e)
    update_pipeline_log("FAILED", f"Silver layer failed: {str(e)}")
    raise

Weather Hourly: last processed load_timestamp = 2025-11-13 06:03:07.528229
Executing MERGE into env_catalog.env_data.silver_weather_hourly ...
MERGE completed for env_catalog.env_data.silver_weather_hourly: 0 rows, data_ts range=(None,None), max_load_ts=None
Air Hourly: last processed load_timestamp = 2025-11-13 06:03:07.530387
Executing MERGE into env_catalog.env_data.silver_air_hourly ...
MERGE completed for env_catalog.env_data.silver_air_hourly: 0 rows, data_ts range=(None,None), max_load_ts=None
Weather Daily: last processed load_timestamp = 2025-11-13 06:03:07.529117
Executing MERGE into env_catalog.env_data.silver_weather_daily ...
MERGE completed for env_catalog.env_data.silver_weather_daily: 0 rows, data_ts range=(None,None), max_load_ts=None
✅ Total rows merged into Silver: 0


## 🌤️ Step 8: Create Silver Air Daily Aggregations

In [0]:
# Build Silver Air Daily aggregates: mean/max/min of every float/double/decimal
# column of Silver Air Hourly, per city and calendar day, plus a daily AQI.
try:
    src = silver_air_hourly_tbl
    if not table_exists(src):
        print("No hourly air table found; skipping daily aggregation.")
    else:
        last_load = read_last_processed_load_ts(src)
        print(f"Air Daily: last processed load_timestamp = {last_load}")

        # Collapse the hourly timestamp to its calendar date. The column keeps
        # its name because it is the merge key of the daily table.
        air = spark.table(src).withColumn("data_timestamp", F.to_date("data_timestamp"))

        if last_load:
            # Daily statistics must cover *all* hours of a day. Use the newly
            # loaded rows only to find which (city, day) pairs changed, then
            # aggregate every hourly row of those days. Aggregating just the
            # new rows would overwrite a day's row with partial-day values.
            # TODO: Update this to use transform_ts from Silver Air Hourly
            touched_days = (air.filter(F.col("load_timestamp") > F.lit(last_load))
                               .select("city", "data_timestamp")
                               .distinct())
            air = air.join(touched_days, on=["city", "data_timestamp"], how="left_semi")

        # Aggregate only float/double/decimal columns. Long and integer columns
        # are skipped on purpose so that dim_key and aqi_value are not averaged.
        numeric_cols = [f.name for f in air.schema.fields
                        if isinstance(f.dataType, (T.FloatType, T.DoubleType, T.DecimalType))]

        agg_exprs = []
        for c in numeric_cols:
            agg_exprs.extend([
                F.avg(F.col(c)).alias(f"{c}_mean"),
                F.max(F.col(c)).alias(f"{c}_max"),
                F.min(F.col(c)).alias(f"{c}_min"),
            ])
        # Latest load timestamp of the day (drives the watermark) and the dim key.
        agg_exprs.append(F.max(F.col("load_timestamp")).alias("max_load_timestamp"))
        agg_exprs.append(F.max(F.col("dim_key")).alias("dim_key"))

        # Hours in the day with AQI > 200, i.e. above the "Unhealthy" band
        # (which is inclusive of 200 in aqi_category).
        agg_exprs.append(F.sum(F.when(F.col("aqi_value") > 200, 1).otherwise(0)).alias("unhealthy_hours"))

        daily = air.groupBy("city", "data_timestamp").agg(*agg_exprs)

        daily = daily.withColumn("transform_ts", F.current_timestamp()) \
            .withColumn("run_id", F.lit(run_id))

        # Daily AQI computed from the *_mean pollutant columns.
        daily = compute_aqi_index_and_label(daily)

        recs, min_dt, max_dt, max_lt = merge_into_silver(silver_air_daily_tbl, daily,
                                                         merge_keys=("city", "data_timestamp"))

        total_processed += recs
        if min_dt:
            agg_min_data_ts = min_dt if agg_min_data_ts is None else min(agg_min_data_ts, min_dt)
        if max_dt:
            agg_max_data_ts = max_dt if agg_max_data_ts is None else max(agg_max_data_ts, max_dt)

        # Commit the watermark only after a successful merge.
        if max_lt:
            commit_last_processed_load_ts(src, max_lt)
        print("🌤️ Daily air aggregates updated.")
except Exception as e:
    print(f"❌ ERROR while building {silver_air_daily_tbl}: {str(e)}")
    update_pipeline_log("FAILED", "Air daily aggregation failed")
    # Re-raise so the run stops here; otherwise the final cell would overwrite
    # the FAILED status with SUCCESS.
    raise

Air Daily: last processed load_timestamp = 2025-11-13 06:22:16.797663
Executing MERGE into env_catalog.env_data.silver_air_daily ...
MERGE completed for env_catalog.env_data.silver_air_daily: 0 rows, data_ts range=(None,None), max_load_ts=None
🌤️ Daily air aggregates updated.


## ✅ Step 9: Finalize and Log Success

In [0]:
# Close the run in pipeline_log with the totals gathered by the earlier cells.
try:
    update_pipeline_log(
        status="SUCCESS",
        remarks="Silver layer completed successfully",
        records_processed=total_processed,
        earliest_ts=agg_min_data_ts,
        latest_ts=agg_max_data_ts,
    )
    print(f"✅ Silver layer completed successfully. Rows processed: {total_processed}")
except Exception as e:
    print("⚠️ Final log update failed:", e)


✅ pipeline_log updated: SUCCESS
✅ Silver layer completed successfully. Rows processed: 0
