In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Target layout of the customer dimension: comma-separated "name type" pairs,
# listed in output column order. The last entry is the generated key column.
customers_schema = "customer_id integer,tax_id STRING,tax_code STRING, customer_name STRING,state STRING,city STRING,postcode STRING,street STRING,number STRING,unit STRING,region STRING,district STRING,lon double,lat double,ship_to_address STRING,valid_from STRING,valid_to STRING,units_purchased STRING,loyalty_segment integer, customer_key BIGINT"

customers_df = (
    spark.read
    .format("delta")
    .load("dbfs:/user/hive/warehouse/example.db/customers_cleansed_dt")
)

# Generated key column. monotonically_increasing_id() yields unique values that
# are not guaranteed to be consecutive; the +1 only shifts the first id off zero.
customers_df = customers_df.withColumn("customer_key", expr("monotonically_increasing_id() + 1"))

# Apply the target schema without a round trip through a Python RDD:
#   * toDF(...) renames the columns positionally and raises if the column
#     count differs from the schema, so a layout mismatch still fails fast;
#   * each column is then cast to its declared type inside Spark.
# NOTE(review): a cast converts values instead of verifying them, so a value
# that cannot be converted becomes null (outside ANSI mode) rather than raising.
customer_fields = [field.split() for field in customers_schema.split(",")]
dim_customers = customers_df.toDF(*[name for name, _ in customer_fields]).select(
    [col(name).cast(dtype) for name, dtype in customer_fields]
)

# Overwrite so the cell can be re-run: the default save mode raises once the
# table already exists.
dim_customers.write.format("delta").mode("overwrite").saveAsTable("example.dim_customers_dt")

In [0]:
# Target layout of the product dimension: comma-separated "name type" pairs,
# listed in output column order. The last entry is the generated key column.
products_schema = "product_id STRING,product_category STRING,product_name STRING,sales_price STRING,ean13 DOUBLE,ean5 STRING,product_unit STRING, product_key BIGINT"

# Read the cleansed products and keep only the columns the dimension exposes,
# in the same order as products_schema.
products_df = (
    spark.read
    .format("delta")
    .load("dbfs:/user/hive/warehouse/example.db/products_cleansed_dt")
    .select("product_id", "product_category", "product_name", "sales_price", "ean13", "ean5", "product_unit")
)

# Generated key column. monotonically_increasing_id() yields unique values that
# are not guaranteed to be consecutive; the +1 only shifts the first id off zero.
products_df = products_df.withColumn("product_key", expr("monotonically_increasing_id() + 1"))

# Cast every column to its declared type inside Spark instead of rebuilding the
# DataFrame from a Python RDD. The column names already match the schema
# because of the explicit select above plus the key column just added.
# NOTE(review): a cast converts values instead of verifying them, so a value
# that cannot be converted becomes null (outside ANSI mode) rather than raising.
product_fields = [field.split() for field in products_schema.split(",")]
dim_products = products_df.select([col(name).cast(dtype) for name, dtype in product_fields])

# Overwrite so the cell can be re-run: the default save mode raises once the
# table already exists.
dim_products.write.format("delta").mode("overwrite").saveAsTable("example.dim_products_dt")

In [0]:
# Cleansed sales orders, read from their Delta location.
s = spark.read.format("delta").load("dbfs:/user/hive/warehouse/example.db/sales_orders_cleansed_dt").alias("s")

# The dimensions are written with saveAsTable, so read them back by table name
# instead of assuming where the warehouse stores them on DBFS.
p = spark.table("example.dim_products_dt").alias("p")
c = spark.table("example.dim_customers_dt").alias("c")

# Orders joined to both dimensions. The inner joins drop any order whose
# product_id or customer_id has no match in the corresponding dimension.
# Built once and shared by both fact tables below.
sales_enriched = (
    s.join(p, s.product_id == p.product_id, "inner")
    .join(c, s.customer_id == c.customer_id, "inner")
)

# Order-level fact: one output row per joined order row, carrying the two
# dimension keys and the extended price.
fact_sales_orders = sales_enriched.select(
    "s.order_number",
    "c.customer_key",
    "p.product_key",
    col("s.order_datetime").cast("date").alias("order_date"),
    "s.unit_price",
    "s.quantity",
    expr("s.unit_price * s.quantity").alias("total_price"),
)

# Overwrite so the cell can be re-run: the default save mode raises once the
# table already exists.
fact_sales_orders.write.format("delta").mode("overwrite").saveAsTable("example.fact_sales_orders_dt")

# Aggregated fact: quantity and sales amount per customer/product pair.
# `sum` here is pyspark.sql.functions.sum (brought in by the star import),
# not the Python builtin.
fact_customer_sales = (
    sales_enriched
    .groupBy("c.customer_key", "p.product_key")
    .agg(
        sum("s.quantity").alias("total_quantity"),
        sum(expr("s.unit_price * s.quantity")).alias("sale_amount"),
    )
)

fact_customer_sales.write.format("delta").mode("overwrite").saveAsTable("example.fact_customer_sales_dt")