In [1]:
%pip uninstall delta_spark -y
%pip install delta_spark -U

Found existing installation: delta-spark 2.4.0
Uninstalling delta-spark-2.4.0:
  Successfully uninstalled delta-spark-2.4.0
Note: you may need to restart the kernel to use updated packages.
Collecting delta_spark
  Using cached delta_spark-2.4.0-py3-none-any.whl (20 kB)
Installing collected packages: delta_spark
Successfully installed delta_spark-2.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from typing import List
from pathlib import Path

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession, DataFrame, Row
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType

In [2]:
# Comma-separated list of every *.jar under ../../jars, used as the value of the
# `spark.jars` setting when the session is built.
# The paths are sorted because Path.glob yields entries in filesystem order,
# which is not guaranteed to be the same from one machine or run to the next.
JAR_PACKAGES = ",".join(sorted(str(jar_path) for jar_path in Path("../../jars").glob("*.jar")))
JAR_PACKAGES

'../../jars/antlr4-runtime-4.9.3.jar,../../jars/aws-java-sdk-bundle-1.12.392.jar,../../jars/delta-core_2.12-2.4.0.jar,../../jars/delta-storage-2.4.0.jar,../../jars/hadoop-aws-3.3.1.jar,../../jars/wildfly-openssl-1.0.7.Final.jar'

In [None]:
# Endpoints of the services this notebook talks to.
SPARK_URI = "spark://spark:7077"
HIVE_URI = "thrift://hive-metastore:9083"
MINIO_URI = "http://minio:9000"

# Object-store credentials: read from the environment when provided, falling
# back to the local development values so existing setups keep working.
# Do not rely on the fallback outside a throwaway dev stack.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "datalake")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "datalake")

# Build the session configuration: local JARs, Hive metastore, Delta Lake
# SQL extension/catalog, and S3A access to the MinIO endpoint.
builder = (
    SparkSession.builder.appName("olist_silver_transformer")
    .master(SPARK_URI)
    .config("spark.jars", JAR_PACKAGES)
    .config("spark.sql.warehouse.dir", "s3a:///")
    .config("spark.hadoop.hive.metastore.uris", HIVE_URI)
    # Delta Lake SQL extension and catalog implementation (each key set once).
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A filesystem pointed at MinIO; path-style access is enabled explicitly.
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_URI)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# Let delta-spark augment the builder, enable Hive support, then create
# (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

In [None]:
# Sanity check: list the catalogs visible to this Spark session.
spark.catalog.listCatalogs()

[CatalogMetadata(name='spark_catalog', description=None)]

In [6]:
# Sanity check: list the databases visible through the session catalog.
spark.catalog.listDatabases()

[Database(name='bronze', catalog='spark_catalog', description='', locationUri='s3a://bronze/'),
 Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='file:/user/hive/warehouse'),
 Database(name='delta_lake', catalog='spark_catalog', description='', locationUri='s3a://delta-lake/'),
 Database(name='gold', catalog='spark_catalog', description='', locationUri='s3a://gold/'),
 Database(name='silver', catalog='spark_catalog', description='', locationUri='s3a://silver/'),
 Database(name='tmp', catalog='spark_catalog', description='', locationUri='s3a://tmp/')]

In [18]:
# Delivery-performance aggregate over silver.fact_orders.
# Grouping keys: order status, customer state/city, and the calendar attributes
# (year, year_month, month, year_week, time_of_day) of the customer-delivery date.
# dim_date aliases:
#   d_date1 = delivered-to-customer date
#   d_date2 = purchase date
#   d_date3 = estimated delivery date
# Measures:
#   total_orders            distinct order_id count
#   total_amount            SUM(price), rounded to 2 decimals
#   avg_freight             AVG(freight_value), rounded to 2 decimals
#   avg_days_delivery_delay mean of DATEDIFF(delivered, estimated), replaced by 0
#                           when that mean is negative
#   avg_days_to_deliver     mean of DATEDIFF(delivered, purchase)
# The three dim_date joins are LEFT JOINs, so fact rows whose date id has no
# match in dim_date are kept with NULL calendar columns.
# NOTE(review): the zero clamp is applied to the group average, not per order,
# so early deliveries offset late ones within a group — confirm this is the
# intended definition of "delay".
# NOTE(review): `price` and `freight_value` are unqualified; presumably they are
# fact_orders columns — verify, and qualify them to avoid ambiguity.
orders_performance_report_df = spark.sql("""
    SELECT d_date1.year
        ,d_date1.year_month
        ,d_date1.month
        ,d_date1.year_week
        ,d_date1.time_of_day
        ,dos.order_status
        ,dc.customer_state
        ,dc.customer_city
        ,COUNT(DISTINCT fact_orders.order_id) AS total_orders
        ,ROUND(SUM(price), 2) AS total_amount
        ,ROUND(AVG(freight_value), 2) AS avg_freight
        ,ROUND(IF(AVG(DATEDIFF(d_date1.timestamp, d_date3.timestamp)) < 0, 0, AVG(DATEDIFF(d_date1.timestamp, d_date3.timestamp))), 2) AS avg_days_delivery_delay
        ,ROUND(AVG(DATEDIFF(d_date1.timestamp, d_date2.timestamp)), 2) AS avg_days_to_deliver
    FROM silver.fact_orders
    JOIN silver.dim_order_status dos ON dos.order_status_id = fact_orders.order_status_id
    LEFT JOIN silver.dim_date d_date1 ON d_date1.date_id = fact_orders.order_delivered_customer_date_id
    LEFT JOIN silver.dim_date d_date2 ON d_date2.date_id = fact_orders.order_purchase_timestamp_id
    LEFT JOIN silver.dim_date d_date3 ON d_date3.date_id = fact_orders.order_estimated_delivery_date_id
    JOIN silver.dim_customers dc ON dc.customer_id = fact_orders.customer_id
    GROUP BY dos.order_status
        ,dc.customer_state
        ,dc.customer_city
        ,d_date1.year
        ,d_date1.year_month
        ,d_date1.month
        ,d_date1.year_week
        ,d_date1.time_of_day
"""
)

In [24]:
# Payment aggregate over silver.fact_payments.
# Grouping keys: order status, customer state/city, and year / year_month / month
# of the customer-delivery date (d_date1).
# Measures:
#   total_transactions  COUNT(fact_payments.order_id) — counts rows with a
#                       non-NULL order_id, not distinct orders
#   total_payment       SUM(payment_value), rounded to 2 decimals
# The dim_date join is a LEFT JOIN, so payment rows whose delivery date id has
# no match in dim_date are kept with NULL calendar columns.
order_payment_performance_df = spark.sql("""
    SELECT d_date1.year
        ,d_date1.year_month
        ,d_date1.month
        ,dos.order_status
        ,dc.customer_state
        ,dc.customer_city
        ,COUNT(fact_payments.order_id) AS total_transactions
        ,ROUND(SUM(fact_payments.payment_value), 2) AS total_payment
    FROM silver.fact_payments
    JOIN silver.dim_order_status dos ON dos.order_status_id = fact_payments.order_status_id
    LEFT JOIN silver.dim_date d_date1 ON d_date1.date_id = fact_payments.order_delivered_customer_date_id
    JOIN silver.dim_customers dc ON dc.customer_id = fact_payments.customer_id
    GROUP BY dos.order_status
        ,dc.customer_state
        ,dc.customer_city
        ,d_date1.year
        ,d_date1.year_month
        ,d_date1.month
"""
)

In [25]:
# Print a tabular preview of the aggregated payment report.
order_payment_performance_df.show()

+----+----------+-----+------------+--------------+-------------------+------------------+-------------+
|year|year_month|month|order_status|customer_state|      customer_city|total_transactions|total_payment|
+----+----------+-----+------------+--------------+-------------------+------------------+-------------+
|2018|    201805|    5|   delivered|            SP|        sao vicente|                14|       952.67|
|2018|    201806|    6|   delivered|            SP|    mogi das cruzes|                14|      1558.53|
|2018|    201803|    3|   delivered|            SP|    mogi das cruzes|                22|       2793.5|
|2018|    201806|    6|   delivered|            AM|            humaita|                 1|       141.65|
|2017|    201712|   12|   delivered|            SC|      florianopolis|                45|      9948.34|
|2018|    201802|    2|   delivered|            SE|           estancia|                 1|       500.13|
|2017|    201704|    4|   delivered|            SP|    