# Notebook to cleanse bronze.sales into silver.sales

In [0]:
%pip install shapely

Define constant variables

In [0]:
# Source (bronze) table read by this notebook.
ORIGIN_SCHEMA = "bronze"
ORIGIN_TABLE = "sales"

# Destination (silver) table; it reuses the source table's name.
TARGET_SCHEMA = "silver"
TARGET_TABLE = ORIGIN_TABLE

# Names of the two lookup tables read from the target schema further down.
TABLE_TO_MERGE_1 = "neighborhoods"
TABLE_TO_MERGE_2 = "municipalities"

### Load most recent time-window data from bronze.sales

Loading only the most recent time window speeds up the read, because only the matching partitions are fetched rather than the entire table.

In [0]:
from pyspark.sql.functions import col,current_timestamp, expr

# Keep only the bronze rows whose `ingestime` falls within the last 12 hours.
ingest_cutoff = current_timestamp() - expr("INTERVAL 12 HOUR")
bronze_df = (
    spark.read.table(f"{ORIGIN_SCHEMA}.{ORIGIN_TABLE}")
    .filter(col("ingestime") >= ingest_cutoff)
)

Add a `silver_ingestime` timestamp column to keep track of when the data was loaded into silver

In [0]:
# Stamp every row with the current timestamp, then move that column to the front.
stamp_column = "silver_ingestime"
bronze_df = bronze_df.withColumn(stamp_column, current_timestamp())

# `name` (not `col`) is used as the loop variable so it does not shadow pyspark's `col`.
columns = [stamp_column, *(name for name in bronze_df.columns if name != stamp_column)]
bronze_df = bronze_df.select(columns)

### Data set enrichment

Rename the `date` column to `event_date`

In [0]:
# Rename `date` to `event_date`; the date/time enrichment below reads `event_date`.
bronze_df = bronze_df.withColumnRenamed(existing="date", new="event_date")

In [0]:
# Show the DataFrame after the column rename.
bronze_df.display()

#### Date/Time enrichment

In [0]:
from pyspark.sql.functions import (
    col, to_timestamp, date_format, dayofmonth, hour,
    minute, month, second, year
)

# Parse the textual `event_date` into a temporary timestamp column.
parsed_column = "timestamp_date"
bronze_df = bronze_df.withColumn(
    parsed_column,
    to_timestamp(col("event_date"), "dd/MM/yyyy HH:mm:ss"),
)

# Derived columns, listed in the order they are appended to the DataFrame.
parsed_ts = col(parsed_column)
derived_columns = [
    ("partition_date", date_format(parsed_ts, "ddMMyyyy")),
    ("event_day", dayofmonth(parsed_ts)),
    ("event_hour", hour(parsed_ts)),
    ("event_minute", minute(parsed_ts)),
    ("event_month", month(parsed_ts)),
    ("event_second", second(parsed_ts)),
    ("event_year", year(parsed_ts)),
]
for derived_name, derived_expr in derived_columns:
    bronze_df = bronze_df.withColumn(derived_name, derived_expr)

# The temporary timestamp column is only a stepping stone; remove it again.
bronze_df = bronze_df.drop(parsed_column)

bronze_df.display()



#### Location enrichment

Get **neighborhoods** and **municipalities** from **silver**

In [0]:
# Neighborhoods: keep the four needed columns and rename two of them so that
# the municipality key matches the municipalities table and the geojson
# columns of both tables stay distinguishable after the join.
neighborhoods_df = (
    spark.read.table(f"{TARGET_SCHEMA}.{TABLE_TO_MERGE_1}")
    .select(["name", "identification", "limit_municipality_id", "geojson"])
    .withColumnRenamed("geojson", "geojson_neighborhoods")
    .withColumnRenamed("limit_municipality_id", "municipality_code")
)

# Municipalities: only the join key and the geometry are kept.
municipalities_df = (
    spark.read.table(f"{TARGET_SCHEMA}.{TABLE_TO_MERGE_2}")
    .select(["municipality_code", "geojson"])
    .withColumnRenamed("geojson", "geojson_municipalities")
)

# Inner join: neighborhoods without a matching municipality are dropped.
silver_locations = neighborhoods_df.join(
    municipalities_df,
    on="municipality_code",
    how="inner",
)

Define a function to validate if a given point is inside a Polygon/Multipolygon

In [0]:
from shapely.geometry import Point, shape
# NOTE(review): this cell is unfinished. The `withColumn("` call on the last line
# opens a string literal that is never closed, so the cell cannot be parsed or run.
# TODO: finish the point-in-polygon check or remove the cell.
def validate_point(latitude,longitude,df):
    # The point is built with longitude first, then latitude.
    point = Point(longitude, latitude)
    df=df.withColumn("


In [0]:
# Count the rows produced by the neighborhoods/municipalities join.
silver_locations.count()

### Create Schema in Catalog

In [0]:
# Make sure the target schema exists; the statement is a no-op when it already does.
create_schema_stmt = f"CREATE SCHEMA IF NOT EXISTS {TARGET_SCHEMA}"
spark.sql(create_schema_stmt)

#### Create Empty table in the schema before merge

Get schema of bronze DataFrame

In [0]:
# Capture the schema of the enriched DataFrame; it is used to create the empty target table.
bronze_schema=bronze_df.schema

Create empty table

In [0]:
# Create the silver table only if it does not exist yet.
#
# The previous version wrote the empty DataFrame with mode("overwrite"), which
# replaced the whole silver table with an empty one on every run. All rows
# merged by earlier runs were lost, so the MERGE below only ever saw an empty target.
# mode("ignore") leaves an existing table (and its data) untouched and only
# creates the table when it is missing.
#
# NOTE(review): because an existing table is no longer replaced, its schema is
# no longer reset to `bronze_schema` either. If the enrichment columns change,
# the table must be migrated explicitly — TODO confirm the desired policy.
empty_df = spark.createDataFrame([], schema=bronze_schema)
(
    empty_df.write
    .partitionBy("silver_ingestime")
    .format("delta")
    .mode("ignore")
    .saveAsTable(f"{TARGET_SCHEMA}.{TARGET_TABLE}")
)

### Write into silver.sales using MERGE

Load the silver.sales table as a Delta table

In [0]:
from delta.tables import DeltaTable
# Look up the target table by its qualified name and wrap it as a DeltaTable.
target_table_name = f"{TARGET_SCHEMA}.{TARGET_TABLE}"
silver_table = DeltaTable.forName(spark, target_table_name)

Perform the MERGE

In [0]:
# Upsert keyed on order_id: matching rows are fully updated from the source,
# and source rows without a match are inserted.
merge_condition = "target.order_id IS NOT NULL AND target.order_id = source.order_id"
(
    silver_table.alias("target")
    .merge(source=bronze_df.alias("source"), condition=merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)