#Imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

#Widgets

In [0]:
# Reset the notebook widgets, then declare each text widget with its default
# value. The defaults are used when the notebook runs without parameters.
dbutils.widgets.removeAll()
for widget_name, default_value in (
    ("catalog", "catalog_ecommerce"),
    ("schema_source", "bronze"),
    ("schema_sink", "silver"),
):
    dbutils.widgets.text(widget_name, default_value)

#Constantes

In [0]:
# Read the widget values once so the rest of the notebook works with plain
# names; the reads happen in the same order as the names listed below.
catalog, schema_source, schema_sink = (
    dbutils.widgets.get(widget)
    for widget in ("catalog", "schema_source", "schema_sink")
)

#Preparacion

In [0]:
# Load the source `orders` table and mark it for caching. `cache()` returns
# the same DataFrame, so it can be chained onto the table lookup.
orders_source_table = f"{catalog}.{schema_source}.orders"
df_orders = spark.table(orders_source_table).cache()

#Limpieza

In [0]:
# Keep only rows that carry both an order id and a customer id, then collapse
# repeated order ids to a single row.
# NOTE(review): no ordering is applied before dropDuplicates, so which of the
# duplicate rows survives is not controlled here -- confirm that is acceptable.
df_orders_clean = (
    df_orders
    .where(col("order_id").isNotNull() & col("customer_id").isNotNull())
    .dropDuplicates(["order_id"])
)

###UDF

In [0]:
def is_late(days_vs_estimated) -> str:
    """Classify a delivery relative to its estimated delivery date.

    Args:
        days_vs_estimated: Estimated delivery date minus actual delivery date,
            in whole days, or ``None`` when that difference is unavailable.
            Negative values mean the order arrived after the estimate.

    Returns:
        ``"No entregado"`` for ``None``, ``"Tarde"`` for any negative value,
        ``"Antes de la fecha estimada"`` for any positive value, and
        ``"Mismo dia"`` for exactly zero.
    """
    if days_vs_estimated is None:
        return "No entregado"
    # Any negative difference is late. The previous `< -1` threshold labelled
    # a delivery one day after the estimate as "Mismo dia".
    if days_vs_estimated < 0:
        return "Tarde"
    if days_vs_estimated > 0:
        return "Antes de la fecha estimada"
    return "Mismo dia"

# Wrap the Python classifier as a Spark UDF that yields a string column.
is_late_udf = udf(f=is_late, returnType=StringType())

#Conversion

In [0]:
# Column expressions for the derived metrics. datediff(end, start) yields
# `end - start` in days.
delivered_to_customer = col("order_delivered_customer_date")

# Days from purchase to delivery at the customer.
days_to_deliver = datediff(delivered_to_customer, col("order_purchase_timestamp"))

# Estimated date minus actual delivery date: positive when the order arrived
# before the estimate, negative when it arrived after it.
days_vs_estimate = datediff(col("order_estimated_delivery_date"), delivered_to_customer)

# Days from approval to the carrier hand-off date.
days_to_carrier = datediff(col("order_delivered_carrier_date"), col("order_approved_at"))

# Output column names are part of the sink table schema and are kept verbatim.
df_orders_clean = (
    df_orders_clean
    .withColumn("delivery_days", days_to_deliver)
    .withColumn("delivery_vs_estimated_time", days_vs_estimate)
    # Label each order from the difference computed in the previous step.
    .withColumn("Is_late", is_late_udf(col("delivery_vs_estimated_time")))
    .withColumn("Proccesing_time", days_to_carrier)
    # Timestamp taken when this transformation is evaluated.
    .withColumn("Ingestion_date", current_timestamp())
)
                

In [0]:
# Save the enriched orders to the sink schema, replacing any existing table
# contents.
orders_sink_table = f"{catalog}.{schema_sink}.orders"
df_orders_clean.write.saveAsTable(orders_sink_table, mode="overwrite")