# Notebook to cleanse bronze.neighborhoods into silver.neighborhoods

Import dependencies, load shared utilities and define constants

In [0]:
from pyspark.sql.functions import col, trim, udf, from_json, when

In [0]:
%run ../../config/utils

In [0]:
# Unqualified table names: the bronze table that is read and the silver table
# that is written. Catalog and schema are prepended where the names are used.
ORIGIN_TABLE = "brz_neighborhoods"
TARGET_TABLE = "slv_neighborhoods"

###  Load data from bronze.neighborhoods 

In [0]:
# Fully qualified name of the bronze table (catalog.schema.table).
# catalog_name and bronze_schema_name are not defined in this notebook —
# presumably they come from the utils notebook executed via %run; confirm.
bronze_table_fqn = f"{catalog_name}.{bronze_schema_name}.{ORIGIN_TABLE}"
bronze_df = spark.read.table(bronze_table_fqn)

### Transform and check quality 

In [0]:
# Derive a 'geojson' column from 'geometry' with wkb_to_geojson_udf, then drop
# the original 'geometry' column so only the converted representation remains.
# NOTE(review): wkb_to_geojson_udf is not defined here — presumably provided by
# the utils notebook; its name suggests the input is WKB, confirm.
bronze_df = (
    bronze_df
    .withColumn("geojson", wkb_to_geojson_udf(col("geometry")))
    .drop("geometry")
)

In [0]:
# Mapping of original bronze column names -> snake_case names used downstream.
# Note that SUBTIPO_COMUNACORREGIMIENTO becomes `sub_type_community_district`,
# which is the source column name the MERGE statement reads.
# NOTE(review): the identifier keeps its original spelling ("colums") in case
# other cells or notebooks reference it.
renamed_colums = {
    "OBJECTID": "object_id",
    "CODIGO": "code",
    "NOMBRE": "name",
    "IDENTIFICACION": "identification",
    "LIMITEMUNICIPIOID": "limit_municipality_id",
    "SUBTIPO_COMUNACORREGIMIENTO": "sub_type_community_district",
    "LINK_DOCUMENTO": "link_document",
    "SHAPEAREA": "shape_area",
    "SHAPELEN": "shape_len",
}

# Rename all mapped columns in a single call.
bronze_df = bronze_df.withColumnsRenamed(renamed_colums)

Based on the following image, the null records in `bronze_df` (marked with red squares) belong to the *Santa Elena (object_id=340)* and *San Cristóbal (object_id=342)* districts. That is why those null records are mapped to their respective districts.
![Poligono](/Volumes/workspace/default/staging/PoligonosMedellin_Nulls.png)

In [0]:
# Apply substitute_fields once per id pair, in the original order.
# NOTE(review): substitute_fields is not defined in this notebook (presumably
# in the utils notebook). Each pair looks like (object_id of the null record,
# object_id of the district it is mapped to: 340 = Santa Elena,
# 342 = San Cristóbal) — confirm against the helper's signature.
for record_id, district_id in ((339, 340), (337, 342)):
    bronze_df = substitute_fields(bronze_df, record_id, district_id)

# Expose the transformed DataFrame to Spark SQL under the name "neighborhoods".
bronze_df.createOrReplaceTempView("neighborhoods")

### Write into silver.neighborhoods using MERGE

In [0]:
# Upsert the "neighborhoods" temp view into the silver table, matching on object_id.
# - Matched rows are updated only when at least one non-key column differs.
#   IS DISTINCT FROM is a null-safe comparison, so NULL vs NULL is treated as
#   "equal" and does not trigger a needless update.
# - Source rows without a matching object_id in the target are inserted.
# The source column `sub_type_community_district` is written to the target
# column `subtype_community_district` (the two names differ by one underscore).
# catalog_name and silver_schema_name are not defined in this notebook —
# presumably they come from the utils notebook executed via %run; confirm.
# NOTE(review): assuming the target is a Delta table, MERGE raises an error when
# several source rows match the same target row, so object_id should be unique
# in the view — confirm this still holds after the substitute_fields step.
spark.sql(f"""
  MERGE INTO {catalog_name}.{silver_schema_name}.{TARGET_TABLE} AS target
  USING neighborhoods AS source
  ON target.object_id = source.object_id
  WHEN MATCHED AND (
      target.code                       IS DISTINCT FROM source.code OR
      target.name                       IS DISTINCT FROM source.name OR
      target.identification             IS DISTINCT FROM source.identification OR
      target.limit_municipality_id      IS DISTINCT FROM source.limit_municipality_id OR
      target.subtype_community_district IS DISTINCT FROM source.sub_type_community_district OR
      target.link_document              IS DISTINCT FROM source.link_document OR
      target.shape_area                 IS DISTINCT FROM source.shape_area OR
      target.shape_len                  IS DISTINCT FROM source.shape_len OR
      target.geojson                    IS DISTINCT FROM source.geojson
  ) THEN
    UPDATE SET
      target.code                       = source.code,
      target.name                       = source.name,
      target.identification             = source.identification,
      target.limit_municipality_id      = source.limit_municipality_id,
      target.subtype_community_district = source.sub_type_community_district,
      target.link_document              = source.link_document,
      target.shape_area                 = source.shape_area,
      target.shape_len                  = source.shape_len,
      target.geojson                    = source.geojson
  WHEN NOT MATCHED THEN
    INSERT (
      object_id,
      code,
      name,
      identification,
      limit_municipality_id,
      subtype_community_district,
      link_document,
      shape_area,
      shape_len,
      geojson
    )
    VALUES (
      source.object_id,
      source.code,
      source.name,
      source.identification,
      source.limit_municipality_id,
      source.sub_type_community_district,
      source.link_document,
      source.shape_area,
      source.shape_len,
      source.geojson
    )
""")