In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Run Spark locally under the "daily revenue" app name and pin the Spark UI port.
# The UI port property is "spark.ui.port"; nothing in Spark reads a key named
# "conf.ui.port", so the original setting had no effect.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("daily revenue")
    .set("spark.ui.port", "12908")
)

In [3]:
# getOrCreate reuses the running context when this cell is executed again;
# calling SparkContext(conf=conf) a second time in the same kernel raises ValueError.
# NOTE: when a context already exists it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Read the order_items extract from local storage as an RDD of text lines.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
order_items_path = "file:///C:/samp_db/retail_db-master/order_items/part-00000"
orderItems = sc.textFile(order_items_path)

In [6]:
# Peek at the first five raw lines of the file.
for line in orderItems.take(5):
    print(line)

1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99


In [7]:
# Total number of lines in the order_items RDD.
orderItems.count()

172198

In [23]:
def _order_id_and_subtotal(line):
    """Turn one comma-separated line into an (int(field 1), float(field 4)) pair.

    Presumably field 1 is the order id and field 4 the item subtotal
    (retail_db order_items layout) - confirm against the schema.
    """
    fields = line.split(",")
    return int(fields[1]), float(fields[4])


orderItemsMap = orderItems.map(_order_id_and_subtotal)

In [9]:
# map() emits one pair per input line, so this should equal orderItems.count().
orderItemsMap.count()

172198

In [10]:
# Show the first four (int key, float value) pairs.
orderItemsMap.take(4)

[(1, 299.98), (2, 199.99), (2, 250.0), (2, 129.99)]

In [11]:
# Gather every value that shares a key into a single iterable per key.
orderItemsGBK= orderItemsMap.groupByKey()

In [12]:
# One element per distinct key, so this is the number of distinct keys.
orderItemsGBK.count()

57431

In [13]:
# The grouped values come back as ResultIterable objects, so only their repr is displayed.
orderItemsGBK.take(2)

[(1, <pyspark.resultiterable.ResultIterable at 0x2a94eb3df70>),
 (2, <pyspark.resultiterable.ResultIterable at 0x2a94eb3deb0>)]

In [14]:
def _total_for_key(grouped_pair):
    """Collapse a (key, iterable of values) pair into (int(key), float(sum of values))."""
    key, values = grouped_pair[0], grouped_pair[1]
    return int(key), float(sum(values))


orderItemsRevenue = orderItemsGBK.map(_total_for_key)

In [15]:
# First two (key, summed value) pairs from the groupByKey-based result.
orderItemsRevenue.take(2)

[(1, 299.98), (2, 579.98)]

Note: the cells above use `groupByKey`, which is expensive in Spark because every value for a key is shuffled across the cluster before anything is combined.
Next I use `reduceByKey` instead, which combines values within each partition first and therefore shuffles much less data. For aggregations such as this sum, `reduceByKey` is recommended over `groupByKey`.

In [16]:
# Add up the values per key; reduceByKey merges pairs of values with the given function.
orderItemsRBK = orderItemsMap.reduceByKey(
    lambda running_total, value: running_total + value
)

In [17]:
# reduceByKey has already collapsed each key to a single float total, so there is
# nothing left to sum(): sum() on a float raises TypeError as soon as an action
# runs on this RDD. Only normalise the types of the (key, total) pair.
# (The variable name keeps its original spelling so existing references still work.)
orderITemsRevenue = orderItemsRBK.map(lambda pair: (int(pair[0]), float(pair[1])))

In [18]:
# First two (key, summed value) pairs straight from the reduceByKey result.
orderItemsRBK.take(2)

[(1, 299.98), (2, 579.98)]

# Next, for each order id we compute the total revenue and the number of order items.

In [26]:
def _key_with_value_and_one(line):
    """Turn one comma-separated line into (int(field 1), (float(field 4), 1)).

    The trailing 1 lets a later reduce count elements per key while summing values.
    """
    fields = line.split(",")
    return int(fields[1]), (float(fields[4]), 1)


# NOTE: this rebinds orderItemsMap to (key, (value, 1)) elements; cells that
# expect plain (key, value) pairs must not be re-run against this version.
orderItemsMap = orderItems.map(_key_with_value_and_one)

In [27]:
# For each key, add up both the values and the 1s to get (total revenue, item count).
def _add_pairs(left, right):
    """Element-wise sum of two (value, count) pairs."""
    return left[0] + right[0], left[1] + right[1]


RevenueAndCountPerOrder = orderItemsMap.reduceByKey(_add_pairs)

In [28]:
# Each element is (key, (summed field 4, number of lines for that key)).
# The key is CSV field 1, which looks like the order id rather than the
# order item id - confirm against the order_items schema.
RevenueAndCountPerOrder.take(5)  # e.g. (1, (299.98, 1)): key 1, total 299.98 over 1 item

[(1, (299.98, 1)),
 (2, (579.98, 3)),
 (4, (699.85, 4)),
 (5, (1129.8600000000001, 5)),
 (7, (579.9200000000001, 3))]

In [29]:
# The reduceByKey approach above works; the same result can also be computed
# with the aggregateByKey API. Re-inspect the raw lines before rebuilding the pairs.
orderItems.take(3)

['1,1,957,1,299.98,299.98', '2,2,1073,1,199.99,199.99', '3,2,502,5,250.0,50.0']

In [30]:
# Rebind orderItemsMap to plain (int(field 1), float(field 4)) pairs, replacing the
# (key, (value, 1)) shape it was given for the reduceByKey count example.
orderItemsMap = (
    orderItems
    .map(lambda line: line.split(","))
    .map(lambda fields: (int(fields[1]), float(fields[4])))
)

In [31]:
# Confirm the elements are plain (int key, float value) pairs again.
orderItemsMap.take(3)

[(1, 299.98), (2, 199.99), (2, 250.0)]

In [32]:
# Starting accumulator for every key: (running total, running count).
_zero_total_and_count = (0.0, 0)


def _fold_value(acc, value):
    """Fold one value into a (total, count) accumulator within a partition."""
    return acc[0] + value, acc[1] + 1


def _merge_accumulators(left, right):
    """Combine two (total, count) accumulators built on different partitions."""
    return left[0] + right[0], left[1] + right[1]


RevenueAndCountPerOrderAGBK = orderItemsMap.aggregateByKey(
    _zero_total_and_count,
    _fold_value,
    _merge_accumulators,
)

In [33]:
# First five (key, (total, count)) pairs from the aggregateByKey result.
RevenueAndCountPerOrderAGBK.take(5)

[(1, (299.98, 1)),
 (2, (579.98, 3)),
 (4, (699.85, 4)),
 (5, (1129.8600000000001, 5)),
 (7, (579.9200000000001, 3))]