In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
def _promotion_struct():
    """Return a fresh struct type describing a single promotion record.

    The same four nullable fields are used both for the ``promotion_info``
    field nested in each ordered product and for the elements of the
    ``promo_info`` array, so they are declared once here.
    """
    return (
        StructType()
        .add("promo_id", IntegerType(), True)
        .add("promo_qty", IntegerType(), True)
        .add("promo_disc", DecimalType(3, 2), True)
        .add("promo_item", StringType(), True)
    )


# Schema used to parse `ordered_products`: an array of product line structs,
# each carrying an optional nested promotion struct.
schema = ArrayType(
    StructType()
    .add("qty", IntegerType(), True)
    .add("unit", StringType(), True)
    .add("curr", StringType(), True)
    .add("id", StringType(), True)
    .add("name", StringType(), True)
    .add("price", IntegerType(), True)
    .add("promotion_info", _promotion_struct(), True)
)

# Schema used to parse `promo_info`: an array of promotion structs.
schema2 = ArrayType(_promotion_struct(), True)

# Schema used to parse `clicked_items`: an array of string arrays.
inner_string_array = ArrayType(StringType(), True)
schema3 = ArrayType(inner_string_array, True)


@dlt.table(
    comment="Load data to sales_order cleansed table",
    table_properties={"pipelines.reset.allowed": "true"},
    spark_conf={"pipelines.trigger.interval": "60 seconds"},
    temporary=False,
)
def sales_orders_cleansed():
    """Parse and flatten raw sales orders into one row per ordered product.

    Reads the ``sales_orders_raw`` dataset through ``dlt.read``, decodes the
    JSON string columns ``clicked_items``, ``promo_info`` and
    ``ordered_products`` with the module-level schemas, explodes
    ``ordered_products`` so that each array element becomes its own row, and
    copies the product id, price and quantity out of the exploded struct into
    top-level columns.

    Returns:
        DataFrame: the input columns (JSON columns parsed, ``order_datetime``
        converted with ``from_unixtime``) plus ``product_id``, ``unit_price``
        and ``quantity``.
    """
    # Single source: the pipeline dataset `sales_orders_raw`. dlt.read()
    # already yields a DataFrame, and DataFrames expose no `.table()` reader
    # method, so no further reader call belongs in this chain.
    orders_raw = dlt.read("sales_orders_raw")

    # After the explode below, `ordered_products` holds a single struct.
    product = col("ordered_products")

    return (
        orders_raw
        .withColumn("clicked_items", from_json(col("clicked_items"), schema3))
        .withColumn("promo_info", from_json(col("promo_info"), schema2))
        .withColumn("ordered_products", from_json(col("ordered_products"), schema))
        # One output row per array element; explode() emits no row for a
        # null or empty array.
        .withColumn("ordered_products", explode("ordered_products"))
        .withColumn("order_datetime", from_unixtime("order_datetime"))
        # getField() is used instead of attribute access so a struct field
        # name can never be mistaken for a Column method.
        .withColumn("product_id", product.getField("id"))
        .withColumn("unit_price", product.getField("price"))
        .withColumn("quantity", product.getField("qty"))
    )

In [0]:
@dlt.table(
    comment="Load data to customers cleansed table",
    table_properties={"pipelines.reset.allowed": "true"},
    spark_conf={"pipelines.trigger.interval": "60 seconds"},
    temporary=False,
)
def customers_cleansed():
    """Filter the raw customers by state and cast selected columns.

    Reads ``customers_raw`` through ``dlt.read``, keeps the rows whose
    ``state`` matches the regular expression ``[A-Z]``, then casts
    ``customer_id`` and ``loyalty_segment`` to integer and ``lon`` / ``lat``
    to double.

    Returns:
        DataFrame: the filtered customers with the four columns re-typed.
    """
    # (column, target type) pairs, applied in this order.
    cast_plan = (
        ("customer_id", "integer"),
        ("lon", "double"),
        ("lat", "double"),
        ("loyalty_segment", "integer"),
    )

    state_matches = col("state").rlike('[A-Z]')
    customers = dlt.read("customers_raw").select("*").where(state_matches)

    for column_name, target_type in cast_plan:
        customers = customers.withColumn(
            column_name, col(column_name).cast(target_type)
        )
    return customers

In [0]:
@dlt.table(
    comment="Load data to a products cleansed table",
    table_properties={"pipelines.reset.allowed": "true"},
    spark_conf={"pipelines.trigger.interval": "60 seconds"},
    temporary=False,
)
def products_cleansed():
    """Return the raw products with ``ean13`` cast to double.

    Reads ``products_raw`` through ``dlt.read`` and replaces the ``ean13``
    column with its value cast to ``double``; every other column is passed
    through unchanged.

    Returns:
        DataFrame: the products with ``ean13`` re-typed.
    """
    products_raw = dlt.read("products_raw")
    ean13_as_double = col("ean13").cast("double")
    return products_raw.select("*").withColumn("ean13", ean13_as_double)