# Notebook to cleanse bronze.municipalities into silver.municipalities

In [0]:
%pip install shapely
# shapely supplies the WKB parser (shapely.wkb.loads) used by the geometry-to-GeoJSON step of this notebook.
# NOTE(review): the version is not pinned — consider pinning it for reproducible runs.

Define constant variables

In [0]:
# Source table: where the rows are read from.
ORIGIN_SCHEMA = "bronze"
ORIGIN_TABLE = "municipalities"

# Destination table: same table name, different schema.
TARGET_SCHEMA = "silver"
TARGET_TABLE = ORIGIN_TABLE

### Load the most recent time window of data from bronze.municipalities

Filtering on the ingestion time shortens the read: only the partitions that match the time window are fetched, not the entire table.

In [0]:
from pyspark.sql.functions import col,current_timestamp, expr

# Keep only the bronze rows whose `ingestime` is within the last 12 hours.
ingest_cutoff = current_timestamp() - expr("INTERVAL 12 HOUR")
bronze_df = (
    spark.read.table(f"{ORIGIN_SCHEMA}.{ORIGIN_TABLE}")
    .filter(col("ingestime") >= ingest_cutoff)
)

Add a `silver_ingestime` timestamp column to keep track of when the data was loaded

In [0]:
# Stamp every row with the current timestamp.
bronze_df = bronze_df.withColumn("silver_ingestime", current_timestamp())

# Put the new column first; the remaining columns keep their existing order.
columns = ["silver_ingestime"]
columns.extend(name for name in bronze_df.columns if name != "silver_ingestime")
bronze_df = bronze_df.select(columns)

#### Deserialize Geometry to get GeoJSON

Define the function to deserialize

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from shapely.wkb import loads as wkb_loads
import json

def wkb_to_geojson(wkb_bytes):
    """Convert a WKB-encoded geometry into a GeoJSON string.

    Args:
        wkb_bytes: Bytes-like WKB payload; it is passed through ``bytes()``
            before being parsed.

    Returns:
        The JSON-serialised ``__geo_interface__`` mapping of the parsed
        geometry, or ``None`` when the input is missing or cannot be
        converted for any reason.
    """
    # A missing geometry simply yields a missing GeoJSON value.
    if wkb_bytes is None:
        return None
    try:
        geometry = wkb_loads(bytes(wkb_bytes))
        geo_mapping = geometry.__geo_interface__
        return json.dumps(geo_mapping)
    except Exception:
        # Best effort: a row that cannot be converted gets a null value
        # instead of failing the whole job.
        return None

# Expose the converter as a Spark UDF that returns a string column.
wkb_to_geojson_udf = udf(wkb_to_geojson, returnType=StringType())

Define the schema for GeoJSON - Polygons and Multipolygons

In [0]:
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType

def _nested_double_array(depth):
    """Return DoubleType wrapped in `depth` levels of ArrayType."""
    nested_type = DoubleType()
    for _ in range(depth):
        nested_type = ArrayType(nested_type)
    return nested_type


def _geojson_schema(coordinates_depth):
    """Build a struct with a string `type` and a nested-array `coordinates` field."""
    return StructType([
        StructField("type", StringType(), True),
        StructField("coordinates", _nested_double_array(coordinates_depth), True),
    ])


# Polygon coordinates are nested three arrays deep.
geojson_schema_polygon = _geojson_schema(3)

# MultiPolygon coordinates add one more level of nesting.
geojson_schema_multipolygon = _geojson_schema(4)

Add GeoJSON column

In [0]:
from pyspark.sql.functions import from_json, when

# bronze_df=bronze_df.withColumn("geojson_raw", wkb_to_geojson_udf(col("geometry")))
# bronze_df = bronze_df.withColumn("geojson", from_json(col("geojson_raw"), geojson_schema_polygon))
# bronze_df=bronze_df.drop("geometry","geojson_raw")

# Derive the `geojson` string column from `geometry`, then drop the original column.
bronze_df = (
    bronze_df
    .withColumn("geojson", wkb_to_geojson_udf(col("geometry")))
    .drop("geometry")
)


### Rename the data's attributes to the standard names

In [0]:
from pyspark.sql.functions import col
# Mapping of source column name -> standardised column name.
renamed_colums = {
    "DPTOMPIO": "code",
    "DPTO_CCDGO": "department_code",
    "MPIO_CCDGO": "municipality_code",
    "MPIO_CNMBR": "municipality",
    "MPIO_CCNCT": "ccnct_code",
}

# Apply all renames in a single call.
bronze_df = bronze_df.withColumnsRenamed(renamed_colums)

### Create Schema in Catalog

In [0]:
# Make sure the target schema exists; this is a no-op when it already does.
create_schema_sql = f"CREATE SCHEMA IF NOT EXISTS {TARGET_SCHEMA}"
spark.sql(create_schema_sql)

#### Create Empty table in the schema before merge

Get schema of bronze DataFrame

In [0]:
# Capture the transformed DataFrame's schema so the target table can be created with it.
bronze_schema = bronze_df.schema

Create empty table

In [0]:
# Create the Delta target table from an empty DataFrame that carries the
# transformed bronze schema, so the MERGE below has a table to write into.
#
# The save mode is "ignore" rather than "overwrite": overwriting with an empty
# DataFrame would wipe every previously merged row on each run, and the MERGE
# would then only ever see an empty target. With "ignore" the table is created
# on the first run and left untouched (data and schema) on later runs.
# NOTE(review): because an existing table is left as-is, a change in the bronze
# schema is not propagated here — handle schema evolution explicitly if needed.
empty_df = spark.createDataFrame([], schema=bronze_schema)
(
    empty_df.write
    .partitionBy("silver_ingestime")
    .format("delta")
    .mode("ignore")
    .saveAsTable(f"{TARGET_SCHEMA}.{TARGET_TABLE}")
)

### Write into silver.municipalities using MERGE

Load the silver.municipalities table as a Delta table

In [0]:
from delta.tables import DeltaTable
# Get a DeltaTable handle on the target table by its qualified name.
target_table_name = f"{TARGET_SCHEMA}.{TARGET_TABLE}"
silver_table = DeltaTable.forName(spark, target_table_name)

Perform the MERGE

In [0]:
# Upsert keyed on `code`: matched rows are fully updated, unmatched source rows are inserted.
merge_condition = "target.code IS NOT NULL AND target.code = source.code"
(
    silver_table.alias("target")
    .merge(source=bronze_df.alias("source"), condition=merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)