In [2]:
import os
import pyspark
import pandas as pd
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, count, col, sum, round
# from delta.tables import DeltaTable

In [24]:
# Build (or reuse) the Spark session used by every cell below.
# The settings pull in the Delta Lake package, register its SQL extension and
# catalog, and select the Hive catalog implementation.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)


In [25]:

# Print the warehouse directory setting for this session.
# truncate=False keeps the full key and path visible; the default show() cuts
# each cell off at 20 characters, which hides the path this cell is meant to show.
spark.sql('SET spark.sql.warehouse.dir').show(truncate=False)

+--------------------+--------------------+
|                 key|               value|
+--------------------+--------------------+
|spark.sql.warehou...|file:/D:/Data_Pro...|
+--------------------+--------------------+



In [27]:
# Start from a clean slate so the CREATE statements below can be re-run:
# drop retail_db if it exists, together with everything inside it (CASCADE).
spark.sql("DROP DATABASE IF EXISTS retail_db CASCADE").show()

++
||
++
++



In [28]:
# Create the database that will hold the tables built in this notebook.
spark.sql("CREATE DATABASE retail_db").show()

++
||
++
++



In [29]:
# Make retail_db the current database so unqualified table names resolve to it.
spark.sql("USE DATABASE retail_db")

DataFrame[]

In [30]:
# Sanity check: confirm which database the session is currently using.
spark.sql("SELECT CURRENT_DATABASE()").show()

+------------------+
|current_database()|
+------------------+
|         retail_db|
+------------------+



In [31]:
# Create the Delta table that stores orders.
# order_date is declared DATE (not STRING) so it matches the orders_v staging
# view and the daily_revenue table; this avoids a DATE -> STRING cast on load
# and a STRING -> DATE cast when daily revenue is written.
spark.sql('''
    CREATE TABLE orders (
          order_id INT,
          order_date DATE,
          order_customer_id INT,
          order_status STRING
    ) USING DELTA
''').show()

++
||
++
++



In [32]:
spark.sql(''' 
    CREATE OR REPLACE TEMPORARY VIEW orders_v (
          order_id INT,
          order_date DATE,
          order_customer_id INT,
          order_status STRING
    ) USING CSV
    OPTIONS (
          path='data/retail_db/orders'
    )
''').show()

++
||
++
++



In [33]:
# Preview the staged CSV data (show() prints the first 20 rows by default).
orders_view_preview = spark.sql("SELECT * FROM orders_v")
orders_view_preview.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO

In [34]:
# Copy every row from the CSV-backed view into the Delta table.
# SELECT * maps columns by position, so the view and table column order must match.
# INSERT INTO appends: re-running this cell alone would load the rows again.
load_orders_sql = """ 
INSERT INTO orders 
SELECT * FROM orders_v
"""
spark.sql(load_orders_sql).show()

++
||
++
++



In [35]:
# Verify the load by reading rows back from the orders table.
orders_preview = spark.sql("SELECT * FROM orders")
orders_preview.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO

In [36]:
# Create the Delta table that stores individual order line items.
# NOTE(review): the money columns are FLOAT, an approximate type; consider
# DECIMAL if exact cents matter -- confirm with the data owner.
order_items_ddl = """ 
    CREATE TABLE order_items (
          order_item_id INT,
          order_item_order_id INT,
          order_item_product_id INT,
          order_item_quantity INT,
          order_item_subtotal FLOAT,
          order_item_product_price FLOAT

    )USING DELTA
"""
spark.sql(order_items_ddl).show()

++
||
++
++



In [37]:
# Expose the CSV files under data/retail_db/order_items as a typed temporary
# view; its columns mirror the order_items table one-for-one.
order_items_view_ddl = """ 
    CREATE OR REPLACE TEMPORARY VIEW order_items_v (
          order_item_id INT,
          order_item_order_id INT,
          order_item_product_id INT,
          order_item_quantity INT,
          order_item_subtotal FLOAT,
          order_item_product_price FLOAT
    ) USING CSV 
    OPTIONS (
          path='data/retail_db/order_items'
    )
"""
spark.sql(order_items_view_ddl).show()

++
||
++
++



In [38]:
# Copy every row from the CSV-backed view into the order_items Delta table.
# INSERT INTO appends: re-running this cell alone would load the rows again.
load_order_items_sql = """ 
INSERT INTO order_items
SELECT * FROM order_items_v

"""
spark.sql(load_order_items_sql).show()

++
||
++
++



In [39]:
# Verify the load by reading rows back from the order_items table.
order_items_preview = spark.sql("SELECT * FROM order_items")
order_items_preview.show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|       130539|              52251|                 1014|                  5|              249.9|                   49.98|
|       130540|              52251|                  703|                  4|              79.96|                   19.99|
|       130541|              52253|                  403|                  1|             129.99|                  129.99|
|       130542|              52253|                 1073|                  1|             199.99|                  199.99|
|       130543|              52253|                  957|                  1|             299.98|                  299.98|
|       130544| 

In [40]:
# Row count of order_items, as a quick completeness check on the load.
spark.sql("SELECT COUNT(*) FROM order_items").show()

+--------+
|count(1)|
+--------+
|  172198|
+--------+



In [41]:
# List the tables and views visible in the current database/session.
spark.sql("show tables").show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|retail_db|  order_items|      false|
|retail_db|       orders|      false|
|         |order_items_v|      false|
|         |     orders_v|      false|
+---------+-------------+-----------+



In [42]:
# Revenue per order date, counting only orders whose status is COMPLETE or
# CLOSED. Line-item subtotals are summed per date, rounded to 2 decimals, and
# the result is listed from lowest to highest revenue.
daily_revenue_sql = """ 
SELECT 
    o.order_date,
    round(sum(oi.order_item_subtotal), 2) as daily_revenue
FROM orders o
INNER JOIN order_items oi
ON o.order_id = oi.order_item_order_id
WHERE o.order_status in ('COMPLETE', 'CLOSED')
GROUP BY 1
ORDER BY daily_revenue
"""
spark.sql(daily_revenue_sql).show()

+----------+-------------+
|order_date|daily_revenue|
+----------+-------------+
|2013-12-16|     10255.68|
|2014-07-06|     16451.76|
|2014-05-30|     17890.91|
|2013-08-13|     17956.88|
|2013-12-02|     19278.83|
|2014-05-27|      19599.9|
|2014-04-19|     20096.88|
|2014-01-08|     20812.66|
|2014-06-21|     20886.61|
|2014-06-05|     21356.55|
|2013-08-19|     21397.59|
|2013-09-13|     21773.71|
|2014-01-02|      21872.7|
|2013-10-01|     22066.39|
|2013-12-14|     22296.74|
|2013-09-11|     22322.13|
|2014-04-24|     22412.48|
|2013-08-28|     22637.86|
|2014-04-17|     22801.21|
|2013-09-04|     22946.52|
+----------+-------------+
only showing top 20 rows



In [49]:
# Create the Delta table that stores one revenue figure per order date.
# order_revenue is DOUBLE because the value written into it,
# round(sum(order_item_subtotal), 2), is a DOUBLE in Spark SQL; a FLOAT column
# (about 7 significant digits) can lose cents once totals grow large.
spark.sql('''
CREATE TABLE daily_revenue (
    order_date DATE,
    order_revenue DOUBLE
) USING DELTA
''')

DataFrame[]

In [51]:
# Append one row per order date to daily_revenue: the summed line-item
# subtotals (rounded to 2 decimals) of orders with status COMPLETE or CLOSED.
# INSERT INTO appends, so re-running this cell adds the same rows again.
# Deliberately no ORDER BY: rows read back from a table carry no guaranteed
# order, so sorting before the write would only add a shuffle.
# The aggregate is aliased order_revenue to match the target column and to
# avoid reusing the table's own name as a column alias.
spark.sql('''
INSERT INTO daily_revenue
SELECT
    o.order_date,
    round(sum(oi.order_item_subtotal), 2) AS order_revenue
FROM orders o
INNER JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY o.order_date
''')

DataFrame[]

In [52]:
# Row count of daily_revenue after the INSERT INTO above.
spark.sql("SELECT COUNT(*) FROM daily_revenue").show()

+--------+
|count(1)|
+--------+
|     364|
+--------+



In [53]:
# Rebuild daily_revenue from scratch: INSERT OVERWRITE replaces the table's
# existing rows with the query result, so this cell can be re-run without
# duplicating data (unlike INSERT INTO).
# Deliberately no ORDER BY: rows read back from a table carry no guaranteed
# order, so sorting before the write would only add a shuffle.
# The aggregate is aliased order_revenue to match the target column and to
# avoid reusing the table's own name as a column alias.
spark.sql('''
INSERT OVERWRITE daily_revenue
SELECT
    o.order_date,
    round(sum(oi.order_item_subtotal), 2) AS order_revenue
FROM orders o
INNER JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY o.order_date
''').show()

++
||
++
++



In [54]:
# Row count of daily_revenue after the INSERT OVERWRITE above; compare it
# with the count taken after the plain INSERT INTO.
spark.sql("SELECT COUNT(*) FROM daily_revenue").show()

+--------+
|count(1)|
+--------+
|     364|
+--------+



In [55]:
# Shut down the Spark session now that the notebook is finished; any cell run
# after this needs the session to be created again first.
spark.stop()