In [21]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, col, round, sum
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType, DateType

In [None]:
spark = SparkSession. \
                    builder. \
                    getOrCreate()

# spark = SparkSession.builder. \
#                     appName('postgres connect'). \
#                     config("spark.jars", r"C:\Users\Admin\Downloads\postgresql-42.6.0.jar"). \
#                     getOrCreate()

url = "jdbc:postgresql://localhost:5432/retail_db"
table_name = 'test_daily_revenue_2'

pdb_user = os.environ.get("user_name")
pdb_password = os.environ.get("pdb_pass")

In [16]:
orders_schema = StructType([
    StructField('order_id', IntegerType(), True),
    StructField('order_date', DateType(), True),
    StructField('order_customer_id', IntegerType(), True),
    StructField('order_status', StringType(), True)
])

orders_df = spark.read.csv(r'data\retail_db\orders\part-00000', header=True,schema=orders_schema)

orders_df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       COMPLETE|
|      16|2013-07-25|             7276|PENDING_P

In [17]:
order_items_schema = StructType([
    StructField('order_item_id', IntegerType(), True),
    StructField('order_item_order_id', IntegerType(), True),
    StructField('order_item_product_id', IntegerType(), True),
    StructField('order_item_quantity', IntegerType(), True),
    StructField('order_item_subtotal', FloatType(), True),
    StructField('order_item_product_price', FloatType(), True)
])

order_items_df = spark.read.csv(r'data/retail_db/order_items/part-00000', header=True, schema=order_items_schema)

order_items_df.show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|            2|                  2|                 1073|                  1|             199.99|                  199.99|
|            3|                  2|                  502|                  5|              250.0|                    50.0|
|            4|                  2|                  403|                  1|             129.99|                  129.99|
|            5|                  4|                  897|                  2|              49.98|                   24.99|
|            6|                  4|                  365|                  5|             299.95|                   59.99|
|            7| 

In [18]:
merged = orders_df.join(order_items_df, order_items_df['order_item_order_id'] == orders_df['order_id'], how='inner')


merged = merged.withColumn('order_month', date_format('order_date', 'yyyyMM').cast('int'))

agg_df = merged.filter('order_month == 201401 and order_status in ("COMPLETE", "CLOSED")'). \
    groupby('order_date'). \
    agg(round(sum('order_item_subtotal'), 2).alias('daily_revenue')). \
    orderBy(col('daily_revenue').desc())

agg_df.show()

+----------+-------------+
|order_date|daily_revenue|
+----------+-------------+
|2014-01-05|     59093.58|
|2014-01-11|     58913.51|
|2014-01-30|     58597.63|
|2014-01-21|     56125.68|
|2014-01-03|      53080.1|
|2014-01-10|     52781.11|
|2014-01-15|     50622.14|
|2014-01-22|     49359.94|
|2014-01-20|     43416.32|
|2014-01-16|     42362.41|
|2014-01-12|     41235.96|
|2014-01-19|      41023.1|
|2014-01-23|     39708.67|
|2014-01-31|     39644.18|
|2014-01-29|     39289.22|
|2014-01-09|     38871.87|
|2014-01-07|     38545.64|
|2014-01-28|     38419.64|
|2014-01-14|     37722.19|
|2014-01-13|     35398.77|
+----------+-------------+
only showing top 20 rows



In [19]:
agg_df.write.format("jdbc"). \
                option("url", url). \
                option("driver", "org.postgresql.Driver"). \
                option("dbtable", table_name). \
                option("user", pdb_user). \
                option("password", pdb_password). \
                save()

In [20]:

ui_url = spark.sparkContext.uiWebUrl
print(ui_url)

http://DESKTOP-G87TS7V:4040
