# Notebook to cleanse bronze.sales into silver.sales

Import dependencies, load the shared configuration variables, and define constants

In [0]:
from pyspark.sql.functions import col,current_timestamp, expr
from pyspark.sql.functions import (
    col, to_timestamp, date_format, dayofmonth, hour,
    minute, month, second, year
)

In [0]:
%run ../../config/variables

In [0]:
# Table names (without catalog/schema) used by this notebook.
ORIGIN_TABLE = "brz_sales"  # bronze table the stream reads from
TARGET_TABLE = "slv_sales"  # silver table the stream appends to

### Load data from bronze.sales

In [0]:
# Open the bronze sales table as a Delta streaming source. maxFilesPerTrigger=1
# caps how many new files are picked up per micro-batch.
bronze_table = f"{catalog_name}.{bronze_schema_name}.{ORIGIN_TABLE}"
bronze_df = (
    spark.readStream
    .format("delta")
    .option("maxFilesPerTrigger", 1)
    .table(bronze_table)
)

### Transform data

Rename the `date` column to `event_date`.

In [0]:
bronze_df = bronze_df.withColumnRenamed(
    "date",        # column name as it arrives in bronze
    "event_date",  # column name used from here on
)

#### Date/Time enrichment

In [0]:
# Parse the raw "dd/MM/yyyy HH:mm:ss" text into a proper timestamp, in place.
event_ts = col("event_date")
bronze_df = bronze_df.withColumn(
    "event_date", to_timestamp(event_ts, "dd/MM/yyyy HH:mm:ss")
)

# Columns derived from the parsed timestamp. The dict's insertion order is the
# order in which the columns are appended to the DataFrame.
derived_columns = {
    "partition_date": date_format(event_ts, "ddMMyyyy"),
    "event_day": dayofmonth(event_ts),
    "event_hour": hour(event_ts),
    "event_minute": minute(event_ts),
    "event_month": month(event_ts),
    "event_second": second(event_ts),
    "event_year": year(event_ts),
}
for column_name, expression in derived_columns.items():
    bronze_df = bronze_df.withColumn(column_name, expression)

#### Location enrichment

In [0]:
# Static (non-streaming) lookup of neighborhood polygons.
# The table name is fully qualified with the catalog, like the bronze read and
# the silver write, so the lookup does not depend on the session's current
# catalog.
# NOTE(review): assumes slv_neighborhoods lives in `catalog_name` — confirm.
neighborhoods_df = (
    spark.read.table(f"{catalog_name}.{silver_schema_name}.slv_neighborhoods")
    # Keep only what the point-in-polygon match needs, already exposed under
    # the column names used downstream.
    .select(
        col("name").alias("district"),
        col("identification").alias("neighborhood"),
        col("geojson"),
    )
)


Perform cross join between sales and neighborhoods

In [0]:
# Pair every sale with every neighborhood polygon; the non-matching pairs are
# filtered out further down by the point-in-polygon check.
complete_df = bronze_df.crossJoin(neighborhoods_df)

Define a function to validate if a given point is inside a Polygon/Multipolygon

In [0]:
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import BooleanType
from shapely.geometry import Point, shape
import json

# UDF logic: receives lat/lon and the neighborhood GeoJSON for each row.
# The @udf decorator already wraps the function as a Spark UDF, so no separate
# registration step is needed.
@udf(BooleanType())
def point_in_polygon(lat, lon, geojson):
    """Tell whether the point (lon, lat) lies inside a GeoJSON geometry.

    Args:
        lat: Latitude of the point (used as the y coordinate).
        lon: Longitude of the point (used as the x coordinate).
        geojson: GeoJSON geometry serialized as a JSON string
            (Polygon/MultiPolygon expected).

    Returns:
        True when the geometry contains the point. False when any input is
        missing, when the GeoJSON cannot be parsed, or when the point is
        outside the geometry.
    """
    # Missing coordinates or geometry can never match; skip parsing entirely.
    if lat is None or lon is None or not geojson:
        return False
    try:
        polygon = shape(json.loads(geojson))
        # Coerce to a plain Python bool so Spark's BooleanType accepts it.
        return bool(polygon.contains(Point(lon, lat)))
    except Exception:
        # Best effort: a malformed geometry counts as "no match" and must not
        # fail the stream. `Exception` (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return False

Apply the function and return the district and neighborhood for each record.

In [0]:
# Flag each sale/neighborhood pair, keep only the pairs where the sale falls
# inside the polygon, then drop the helper columns.
is_inside = point_in_polygon(col("latitude"), col("longitude"), col("geojson"))
complete_df = (
    complete_df
    .withColumn("match_location", is_inside)
    .filter(col("match_location"))  # boolean column used directly as predicate
    .drop("geojson", "match_location")
)

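Validate against the original bronze_df to find the records whose location is unknown.

**TODO:** this validation is not implemented yet; sales that match no neighborhood are currently dropped by the filter above.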

### Write Stream

In [0]:
# Append the enriched sales to the silver Delta table.
# trigger(availableNow=True) processes everything currently available and then
# stops, so awaitTermination() returns once the backlog is written.
# toTable() is the documented DataStreamWriter API for starting a stream into
# a table; it returns the StreamingQuery handle.
silver_sales_query = (
    complete_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", silver_checkpoint_path)
    .trigger(availableNow=True)
    .toTable(f"{catalog_name}.{silver_schema_name}.{TARGET_TABLE}")
)
silver_sales_query.awaitTermination()