# Notebook to cleanse bronze.municipalities into silver.municipalities

Import dependencies, load shared configuration, and define constants

In [0]:
from pyspark.sql.functions import col, trim, udf, from_json, when
from pyspark.sql.types import StringType, StructType, StructField, ArrayType, DoubleType
from shapely.wkb import loads as wkb_loads
import json

In [0]:
%run ../../config/variables

In [0]:
# Unqualified names of the table read from and the table merged into.
ORIGIN_TABLE = "brz_municipalities"
TARGET_TABLE = "slv_municipalities"

###  Load data from bronze.municipalities 

In [0]:
# Read the origin table, addressed as <catalog>.<bronze schema>.<table>.
bronze_df = spark.read.table(
    f"{catalog_name}.{bronze_schema_name}.{ORIGIN_TABLE}"
)

### Transform and check quality 

In [0]:
def wkb_to_geojson(wkb_bytes):
    """Serialize a WKB (Well-Known Binary) geometry as a GeoJSON string.

    Args:
        wkb_bytes: The WKB payload; it is passed through ``bytes()`` before
            being parsed.

    Returns:
        The geometry's ``__geo_interface__`` mapping dumped as a JSON string,
        or ``None`` if any step of the conversion raises.
    """
    try:
        geometry = wkb_loads(bytes(wkb_bytes))
        geo_mapping = geometry.__geo_interface__
        geojson_text = json.dumps(geo_mapping)
    except Exception:
        # Best effort: any failure is reported as None rather than propagated.
        return None
    return geojson_text

# Wrap wkb_to_geojson as a Spark UDF whose declared return type is string.
wkb_to_geojson_udf = udf(f=wkb_to_geojson, returnType=StringType())

In [0]:
# Derive 'geojson' from 'geometry' through the UDF, then discard 'geometry'.
bronze_df = (
    bronze_df
    .withColumn("geojson", wkb_to_geojson_udf(col("geometry")))
    .drop("geometry")
)

In [0]:
# Existing column name -> new column name, as used by the MERGE below.
renamed_colums = {
    "DPTOMPIO": "code",
    "DPTO_CCDGO": "department_code",
    "MPIO_CCDGO": "municipality_code",
    "MPIO_CNMBR": "municipality",
    "MPIO_CCNCT": "ccnct_code",
}

bronze_df = bronze_df.withColumnsRenamed(renamed_colums)

# Register the renamed frame as a temp view so Spark SQL can reference it
# by the name "municipalities".
bronze_df.createOrReplaceTempView("municipalities")

### Write into silver.municipalities using MERGE

In [0]:
# Upsert the "municipalities" temp view into the silver table, matching rows
# on `code`.
#
# Matched rows are updated only when at least one non-key column differs.
# The comparison uses the null-safe equality operator `<=>` instead of `<>`:
# with `<>`, any comparison involving NULL evaluates to NULL, so a row whose
# value changed to or from NULL would never satisfy the condition and the
# target would silently keep the stale value. `geojson` in particular is NULL
# whenever wkb_to_geojson fails to convert a geometry.
spark.sql(f"""
  MERGE INTO {catalog_name}.{silver_schema_name}.{TARGET_TABLE} AS target
  USING municipalities AS source
  ON target.code = source.code
  WHEN MATCHED AND NOT (
      target.department_code   <=> source.department_code AND
      target.municipality_code <=> source.municipality_code AND
      target.municipality      <=> source.municipality AND
      target.ccnct_code        <=> source.ccnct_code AND
      target.geojson           <=> source.geojson
  ) THEN
    UPDATE SET
      target.department_code   = source.department_code,
      target.municipality_code = source.municipality_code,
      target.municipality      = source.municipality,
      target.ccnct_code        = source.ccnct_code,
      target.geojson           = source.geojson
  WHEN NOT MATCHED THEN
    INSERT (
      code,
      department_code,
      municipality_code,
      municipality,
      ccnct_code,
      geojson
    )
    VALUES (
      source.code,
      source.department_code,
      source.municipality_code,
      source.municipality,
      source.ccnct_code,
      source.geojson
    )
""")