In [1]:
import findspark
findspark.init()
import pyspark
sc=pyspark.SparkContext(appName="MyAppName")
sc

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, min as _min, max as _max, asc, month, dayofmonth

# Create Spark Session
spark = SparkSession.builder.appName("Assignment1").getOrCreate()


In [3]:
# Product Data
product_data = [
    (101, "Laptop", "Electronics", 60000),
    (102, "Mobile", "Electronics", 20000),
    (103, "Book", "Stationery", 500),
    (104, "Tablet", "Electronics", 30000)
]
product_cols = ["product_id", "pname", "ptype", "price"]
product_df = spark.createDataFrame(product_data, product_cols)

# Customer Data
customer_data = [
    (1, "Amit", "9999999999", "Pune"),
    (2, "Priya", "8888888888", "Mumbai"),
    (3, "Ravi", "7777777777", "Pune"),
    (4, "Sneha", "6666666666", "Delhi")
]
customer_cols = ["cust_id", "cname", "mobileno", "city"]
customer_df = spark.createDataFrame(customer_data, customer_cols)

# Orders Data
orders_data = [
    (1001, "2013-08-01", 1, "COMPLETE"),
    (1002, "2013-08-15", 2, "CLOSED"),
    (1003, "2013-09-01", 3, "PENDING"),
    (1004, "2013-09-10", 4, "COMPLETE")
]
orders_cols = ["order_id", "order_date", "order_customer_id", "order_status"]
orders_df = spark.createDataFrame(orders_data, orders_cols)

# Order Items Data
order_items_data = [
    (1, 1001, 101, 1, 60000),
    (2, 1001, 103, 2, 1000),
    (3, 1002, 102, 1, 20000),
    (4, 1003, 104, 2, 60000),
    (5, 1004, 103, 3, 1500)
]
order_items_cols = ["order_item_id", "order_item_order_id", "order_item_product_id", "order_item_quantity", "order_item_subtotal"]
order_items_df = spark.createDataFrame(order_items_data, order_items_cols)


(a) Get details of all customers from Pune city

In [4]:
customer_df.filter(col("city") == "Pune").show()


+-------+-----+----------+----+
|cust_id|cname|  mobileno|city|
+-------+-----+----------+----+
|      1| Amit|9999999999|Pune|
|      3| Ravi|7777777777|Pune|
+-------+-----+----------+----+



(b) Get details of orders with subtotal > 20000 in month 8 (August)

In [5]:
orders_df.join(order_items_df, orders_df.order_id == order_items_df.order_item_order_id) \
    .filter((order_items_df.order_item_subtotal > 20000) & (month(col("order_date")) == 8)) \
    .show()


+--------+----------+-----------------+------------+-------------+-------------------+---------------------+-------------------+-------------------+
|order_id|order_date|order_customer_id|order_status|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|
+--------+----------+-----------------+------------+-------------+-------------------+---------------------+-------------------+-------------------+
|    1001|2013-08-01|                1|    COMPLETE|            1|               1001|                  101|                  1|              60000|
+--------+----------+-----------------+------------+-------------+-------------------+---------------------+-------------------+-------------------+



(c) Print orders in ascending order of subtotal

In [6]:
orders_df.join(order_items_df, orders_df.order_id == order_items_df.order_item_order_id) \
    .orderBy(asc("order_item_subtotal")).show()


+--------+----------+-----------------+------------+-------------+-------------------+---------------------+-------------------+-------------------+
|order_id|order_date|order_customer_id|order_status|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|
+--------+----------+-----------------+------------+-------------+-------------------+---------------------+-------------------+-------------------+
|    1001|2013-08-01|                1|    COMPLETE|            2|               1001|                  103|                  2|               1000|
|    1004|2013-09-10|                4|    COMPLETE|            5|               1004|                  103|                  3|               1500|
|    1002|2013-08-15|                2|      CLOSED|            3|               1002|                  102|                  1|              20000|
|    1001|2013-08-01|                1|    COMPLETE|            1|               1001|                  10

(d) Print customer details with min and max order amount

In [7]:
cust_orders = orders_df.join(order_items_df, orders_df.order_id == order_items_df.order_item_order_id) \
                       .join(customer_df, orders_df.order_customer_id == customer_df.cust_id)

cust_orders.groupBy("cname").agg(_min("order_item_subtotal").alias("min_order"),
                                 _max("order_item_subtotal").alias("max_order")).show()


+-----+---------+---------+
|cname|min_order|max_order|
+-----+---------+---------+
| Ravi|    60000|    60000|
|Sneha|     1500|     1500|
|Priya|    20000|    20000|
| Amit|     1000|    60000|
+-----+---------+---------+



(e) Get orders which are either COMPLETE or CLOSED

In [8]:
orders_df.filter((col("order_status") == "COMPLETE") | (col("order_status") == "CLOSED")).show()


+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|    1001|2013-08-01|                1|    COMPLETE|
|    1002|2013-08-15|                2|      CLOSED|
|    1004|2013-09-10|                4|    COMPLETE|
+--------+----------+-----------------+------------+



(f) Get orders which are COMPLETE or CLOSED and placed in August 2013

In [9]:
orders_df.filter(((col("order_status") == "COMPLETE") | (col("order_status") == "CLOSED")) &
                 (month(col("order_date")) == 8)).show()


+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|    1001|2013-08-01|                1|    COMPLETE|
|    1002|2013-08-15|                2|      CLOSED|
+--------+----------+-----------------+------------+



(g) Get order items where subtotal != product(price * quantity)

In [10]:
check_items = order_items_df.join(product_df, order_items_df.order_item_product_id == product_df.product_id)
check_items.filter(col("order_item_subtotal") != (col("order_item_quantity") * col("price"))).show()


+-------------+-------------------+---------------------+-------------------+-------------------+----------+-----+-----+-----+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|product_id|pname|ptype|price|
+-------------+-------------------+---------------------+-------------------+-------------------+----------+-----+-----+-----+
+-------------+-------------------+---------------------+-------------------+-------------------+----------+-----+-----+-----+



(h) Get all orders placed on the 1st of every month

In [11]:
orders_df.filter(dayofmonth(col("order_date")) == 1).show()


+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|    1001|2013-08-01|                1|    COMPLETE|
|    1003|2013-09-01|                3|     PENDING|
+--------+----------+-----------------+------------+



(i) Get count by status from orders

In [12]:
orders_df.groupBy("order_status").count().show()


+------------+-----+
|order_status|count|
+------------+-----+
|    COMPLETE|    2|
|      CLOSED|    1|
|     PENDING|    1|
+------------+-----+



(j) Get revenue for each order id from order items

In [13]:
order_items_df.groupBy("order_item_order_id").agg(_sum("order_item_subtotal").alias("order_revenue")).show()


+-------------------+-------------+
|order_item_order_id|order_revenue|
+-------------------+-------------+
|               1001|        61000|
|               1002|        20000|
|               1003|        60000|
|               1004|         1500|
+-------------------+-------------+



(k) Get daily product revenue

In [14]:
daily_revenue = orders_df.join(order_items_df, orders_df.order_id == order_items_df.order_item_order_id) \
    .groupBy("order_date", "order_item_product_id") \
    .agg(_sum("order_item_subtotal").alias("daily_product_revenue"))

daily_revenue.show()


+----------+---------------------+---------------------+
|order_date|order_item_product_id|daily_product_revenue|
+----------+---------------------+---------------------+
|2013-08-15|                  102|                20000|
|2013-09-10|                  103|                 1500|
|2013-08-01|                  101|                60000|
|2013-09-01|                  104|                60000|
|2013-08-01|                  103|                 1000|
+----------+---------------------+---------------------+

