In [30]:
import os
import pyspark
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, round, col, date_format

In [31]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing
# session when there is one).
spark = (
    SparkSession.builder
    .appName('Spark SQL application')
    .getOrCreate()
)

# JDBC connection string for the Postgres database used by the reads and the
# final write in this notebook.
url = "jdbc:postgresql://localhost:5432/retail_db"

In [32]:
def read_from_pdb(table_name):
    """Load a database table into a Spark DataFrame over JDBC.

    Uses the notebook-level ``spark`` session and ``url`` connection string,
    and authenticates with the ``user_name`` and ``pdb_pass`` environment
    variables.

    Args:
        table_name: Value passed to the JDBC reader as its ``dbtable`` option.

    Returns:
        The DataFrame returned by the JDBC reader's ``load()``.

    Raises:
        RuntimeError: If ``user_name`` or ``pdb_pass`` is not set in the
            environment.
    """
    # Resolve credentials up front so a missing variable produces a clear
    # error here instead of a None value being handed to the JDBC reader.
    credentials = {
        name: os.environ.get(name) for name in ("user_name", "pdb_pass")
    }
    missing = [name for name, value in credentials.items() if value is None]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    return (
        spark.read.format("jdbc")
        .option("url", url)
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", table_name)
        .option("user", credentials["user_name"])
        .option("password", credentials["pdb_pass"])
        .load()
    )


# Orders: load the table, register it as the temp view "orders" for Spark SQL,
# and print a preview.
orders_df = read_from_pdb("orders")
orders_df.createOrReplaceTempView("orders")
orders_df.show()

# Order items: load only; the view is registered in the next cell.
# NOTE: the variable name is misspelled ("oder"), but it is kept as-is because
# the following cell refers to it by this name.
oder_items_df = read_from_pdb("order_items")

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [33]:
# Register the order items DataFrame as the temp view "order_items" so Spark
# SQL queries can refer to it by that name.
oder_items_df.createOrReplaceTempView("order_items")

# Print a preview of the order items rows.
oder_items_df.show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|            1|                  1|                  957|                  1|             299.98|                  299.98|
|            2|                  2|                 1073|                  1|             199.99|                  199.99|
|            3|                  2|                  502|                  5|              250.0|                    50.0|
|            4|                  2|                  403|                  1|             129.99|                  129.99|
|            5|                  4|                  897|                  2|              49.98|                   24.99|
|            6| 

In [34]:
# Daily revenue for January 2014, counting only COMPLETE and CLOSED orders,
# sorted from highest to lowest revenue.
#
# The month filter is a half-open range on the raw order_date column instead
# of date_format(order_date, 'yyyyMM') = '201401'. A plain column comparison
# is the kind of predicate Spark can push down to the JDBC source, whereas a
# function wrapped around the column has to be evaluated in Spark after the
# rows are fetched.
# NOTE(review): assumes order_date is a timestamp column, as the preview of
# the orders table suggests - confirm against the table schema.
result = spark.sql('''
    SELECT
        o.order_date,
        ROUND(SUM(oi.order_item_subtotal), 2) AS daily_revenue
    FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
    WHERE o.order_status IN ('COMPLETE', 'CLOSED')
        AND o.order_date >= TIMESTAMP '2014-01-01 00:00:00'
        AND o.order_date < TIMESTAMP '2014-02-01 00:00:00'
    GROUP BY o.order_date
    ORDER BY daily_revenue DESC
''')

In [35]:
# Print a preview of the daily revenue result.
result.show()

+-------------------+-------------+
|         order_date|daily_revenue|
+-------------------+-------------+
|2014-01-05 00:00:00|     59093.58|
|2014-01-11 00:00:00|     58913.51|
|2014-01-30 00:00:00|     58597.63|
|2014-01-21 00:00:00|     56125.68|
|2014-01-03 00:00:00|      53080.1|
|2014-01-10 00:00:00|     52781.11|
|2014-01-15 00:00:00|     50622.14|
|2014-01-22 00:00:00|     49359.94|
|2014-01-20 00:00:00|     43416.32|
|2014-01-16 00:00:00|     42362.41|
|2014-01-12 00:00:00|     41235.96|
|2014-01-19 00:00:00|      41023.1|
|2014-01-23 00:00:00|     39708.67|
|2014-01-31 00:00:00|     39644.18|
|2014-01-29 00:00:00|     39289.22|
|2014-01-09 00:00:00|     38871.87|
|2014-01-07 00:00:00|     38545.64|
|2014-01-28 00:00:00|     38419.64|
|2014-01-14 00:00:00|     37722.19|
|2014-01-13 00:00:00|     35398.77|
+-------------------+-------------+
only showing top 20 rows



In [29]:
# Persist the daily revenue result to the "sql_daily_revenue" table over JDBC.
#
# mode("overwrite") replaces the table if it already exists; without an
# explicit mode the writer raises when the target exists, so this cell would
# fail every time the notebook is re-run.
#
# Credentials are read with os.environ[...] rather than .get() so that a
# missing variable raises a KeyError naming it instead of passing None to the
# JDBC writer.
(
    result.write.format("jdbc")
    .mode("overwrite")
    .option("url", url)
    .option("driver", "org.postgresql.Driver")
    .option("dbtable", "sql_daily_revenue")
    .option("user", os.environ["user_name"])
    .option("password", os.environ["pdb_pass"])
    .save()
)