In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("2-rdd-aggregations").getOrCreate()
sc = spark.sparkContext

In [3]:
spark

In [4]:
ord_rdd = sc.textFile("data/orders.csv").map( lambda x: tuple(x.split(',')) )
ordItems_rdd = sc.textFile("./data/orderItems.csv").map( lambda x: tuple(x.split(",")) )

In [5]:
ord_rdd.getNumPartitions()

2

In [6]:
print(sc.defaultParallelism)

8


In [11]:
ord_rdd.take(10)

[('1', '2013-07-25 00:00:00.0', '11599', 'CLOSED'),
 ('2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT'),
 ('3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE'),
 ('4', '2013-07-25 00:00:00.0', '8827', 'CLOSED'),
 ('5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE'),
 ('6', '2013-07-25 00:00:00.0', '7130', 'COMPLETE'),
 ('7', '2013-07-25 00:00:00.0', '4530', 'COMPLETE'),
 ('8', '2013-07-25 00:00:00.0', '2911', 'PROCESSING'),
 ('9', '2013-07-25 00:00:00.0', '5657', 'PENDING_PAYMENT'),
 ('10', '2013-07-25 00:00:00.0', '5648', 'PENDING_PAYMENT')]

In [17]:
ordItems_rdd.take(10)

[('1', '1', '957', '1', '299.98', '299.98'),
 ('2', '2', '1073', '1', '199.99', '199.99'),
 ('3', '2', '502', '5', '250.0', '50.0'),
 ('4', '2', '403', '1', '129.99', '129.99'),
 ('5', '4', '897', '2', '49.98', '24.99'),
 ('6', '4', '365', '5', '299.95', '59.99'),
 ('7', '4', '502', '3', '150.0', '50.0'),
 ('8', '4', '1014', '4', '199.92', '49.98'),
 ('9', '5', '957', '1', '299.98', '299.98'),
 ('10', '5', '365', '5', '299.95', '59.99')]

In [9]:
ord_rdd.getStorageLevel()

StorageLevel(False, False, False, False, 1)

### 1. Numbers of orders that are closed:-
--------------------------------------------

In [10]:
ord_rdd.filter( lambda x: x[-1]=='CLOSED' ).count()

7556

### 2. Find the total quanty sold for OrderID 1 to 10:-
------------------------------------------------------------
In the <u>orderItems</u> dataset-
- The 2nd column is the ***orderId*** column.
- The 4th column is the ***quantity*** column.

In [18]:
ordItems_rdd.take(5)

[('1', '1', '957', '1', '299.98', '299.98'),
 ('2', '2', '1073', '1', '199.99', '199.99'),
 ('3', '2', '502', '5', '250.0', '50.0'),
 ('4', '2', '403', '1', '129.99', '129.99'),
 ('5', '4', '897', '2', '49.98', '24.99')]

In [26]:
q2_rdd = ordItems_rdd.map( lambda x: (int(x[1]), int(x[3])) ).filter( lambda x: x[0] in range(1,11) )
q2_rdd.collect()

[(1, 1),
 (2, 1),
 (2, 5),
 (2, 1),
 (4, 2),
 (4, 5),
 (4, 3),
 (4, 4),
 (5, 1),
 (5, 5),
 (5, 2),
 (5, 1),
 (5, 1),
 (7, 1),
 (7, 1),
 (7, 5),
 (8, 3),
 (8, 5),
 (8, 4),
 (8, 1),
 (9, 2),
 (9, 1),
 (9, 1),
 (10, 1),
 (10, 2),
 (10, 1),
 (10, 1),
 (10, 1)]

In [27]:
q2_rdd.reduceByKey( lambda x,y: x+y ).collect()

[(2, 7), (4, 14), (8, 13), (10, 6), (1, 1), (5, 10), (7, 7), (9, 4)]

In [51]:
#Total order:
total_qty = q2_rdd.map( lambda x: x[1] ).reduce( lambda x,y: x+y )
print( "Total quantity of orders 1-10 is:", total_qty )

Total quantity of orders 1-10 is: 62


### 3. For a given order (say 10), find the maximum subtotal out of all orders:
--------------------------------------------------------------------------------
In the *"orderItems.csv"* dataset-
- The 2nd column is orderId, and it denotes a particular order(here, we want 10 as orderId)
- The 5th column contains the subtotal.

In [31]:
ordItems_rdd.take(5)

[('1', '1', '957', '1', '299.98', '299.98'),
 ('2', '2', '1073', '1', '199.99', '199.99'),
 ('3', '2', '502', '5', '250.0', '50.0'),
 ('4', '2', '403', '1', '129.99', '129.99'),
 ('5', '4', '897', '2', '49.98', '24.99')]

In [45]:
q3_rdd = ordItems_rdd.filter( lambda x: x[1]=='10' ).map( lambda x: float(x[4]) )

In [46]:
type( q3_rdd )

pyspark.rdd.PipelinedRDD

In [48]:
q3_rdd.collect()

[199.99, 99.96, 129.99, 21.99, 199.99]

In [50]:
max_subtotal_for_order_10 = q3_rdd.reduce( lambda x,y: x if x>y else y )
print( "Max Subtotal for order 10 is:", max_subtotal_for_order_10 )

Max Subtotal for order 10 is: 199.99


### 4. For each product, find its aggregated(total) revenue:
<hr>

In [20]:
ordItems_rdd.take(5)

[('1', '1', '957', '1', '299.98', '299.98'),
 ('2', '2', '1073', '1', '199.99', '199.99'),
 ('3', '2', '502', '5', '250.0', '50.0'),
 ('4', '2', '403', '1', '129.99', '129.99'),
 ('5', '4', '897', '2', '49.98', '24.99')]

In [11]:
ordItems_schema = ["order_item_id", "order_item_order_id", "order_item_product_id", "order_item_quantity", "order_item_revenue", "order_item_price"]

ordItems_df = ordItems_rdd.toDF(ordItems_schema)

In [12]:
ordItems_df.show(10)

+-------------+-------------------+---------------------+-------------------+------------------+----------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_revenue|order_item_price|
+-------------+-------------------+---------------------+-------------------+------------------+----------------+
|            1|                  1|                  957|                  1|            299.98|          299.98|
|            2|                  2|                 1073|                  1|            199.99|          199.99|
|            3|                  2|                  502|                  5|             250.0|            50.0|
|            4|                  2|                  403|                  1|            129.99|          129.99|
|            5|                  4|                  897|                  2|             49.98|           24.99|
|            6|                  4|                  365|                  5|           

In [19]:
prod_rev_rdd = ordItems_rdd.map( lambda x: ( int(x[2]), float(x[4]) ) )
prod_rev_rdd.take(5)

[(957, 299.98), (1073, 199.99), (502, 250.0), (403, 129.99), (897, 49.98)]

In [41]:
prod_rev_rdd.reduceByKey( lambda x,y: x+y ).sortBy( lambda x: x[1], ascending=False ).take(10)

[(1004, 6929653.499999708),
 (365, 4421143.019999638),
 (957, 4118425.419999785),
 (191, 3667633.1999997487),
 (502, 3147800.0),
 (1073, 3099844.999999871),
 (403, 2891757.5399998166),
 (1014, 2888993.9399996493),
 (627, 1269082.649999932),
 (565, 67830.0)]

In [44]:
prod_rev_grp = prod_rev_rdd.groupByKey()

In [48]:
for i in prod_rev_grp.collect()[0]:
    print(i)

502
<pyspark.resultiterable.ResultIterable object at 0x0000022ED60BE0D0>


### 5. For each product, find its max revenue:
<hr>

In [50]:
prod_rev_rdd.reduceByKey( lambda x,y: x if x>y else y ).take(20)

[(502, 250.0),
 (1014, 249.9),
 (926, 79.95),
 (134, 125.0),
 (276, 159.95),
 (1004, 399.98),
 (828, 159.95),
 (810, 99.95),
 (906, 124.95),
 (924, 79.95),
 (886, 124.95),
 (572, 199.95),
 (778, 124.95),
 (278, 224.95),
 (642, 150.0),
 (804, 99.95),
 (564, 150.0),
 (792, 74.95),
 (172, 150.0),
 (822, 239.95)]

In [21]:
prod_rev_rdd.aggregateByKey( 0, lambda x,y: x if x>y else y, lambda x,y: x if x>y else y ).take(20)

[(502, 250.0),
 (1014, 249.9),
 (926, 79.95),
 (134, 125.0),
 (276, 159.95),
 (1004, 399.98),
 (828, 159.95),
 (810, 99.95),
 (906, 124.95),
 (924, 79.95),
 (886, 124.95),
 (572, 199.95),
 (778, 124.95),
 (278, 224.95),
 (642, 150.0),
 (804, 99.95),
 (564, 150.0),
 (792, 74.95),
 (172, 150.0),
 (822, 239.95)]

### Using Accumulators:-
-----------------------------

In [12]:
accum_var = sc.accumulator(0)

In [13]:
print( accum_var.value )

0


In [14]:
test_rdd = sc.parallelize( [1,2,3,4,5,6] )
test_rdd.collect()

[1, 2, 3, 4, 5, 6]

In [16]:
test_rdd.foreach( lambda x: accum_var.add(x) )

In [17]:
print( accum_var.value )

21


### Find max revenue for each order, and also print the customer names:-
<hr>

In [4]:
ord_cust_rdd = sc.parallelize([
(2,"Joseph",200), (2, "Jimmy",250), (2, "Tina",130), (4,"Jimmy",50),
(4,"Tina",300), (4,"Joseph",150), (4, "Ram",200), (7,"Tina",200),
(7, "Joseph",300), (7,"Jimmy",80)
] , 2)

In [5]:
ord_cust_rdd.collect()

[(2, 'Joseph', 200),
 (2, 'Jimmy', 250),
 (2, 'Tina', 130),
 (4, 'Jimmy', 50),
 (4, 'Tina', 300),
 (4, 'Joseph', 150),
 (4, 'Ram', 200),
 (7, 'Tina', 200),
 (7, 'Joseph', 300),
 (7, 'Jimmy', 80)]

In [6]:
ord_cust_rdd = ord_cust_rdd.map( lambda x: (x[0], (x[1],x[2])) )

In [7]:
ord_cust_rdd.collect()

[(2, ('Joseph', 200)),
 (2, ('Jimmy', 250)),
 (2, ('Tina', 130)),
 (4, ('Jimmy', 50)),
 (4, ('Tina', 300)),
 (4, ('Joseph', 150)),
 (4, ('Ram', 200)),
 (7, ('Tina', 200)),
 (7, ('Joseph', 300)),
 (7, ('Jimmy', 80))]

In [17]:
max_rev_cust_rdd = ord_cust_rdd.aggregateByKey( ('',0), lambda x,y: x if x[1]>y[1] else y , lambda x,y: x if x[1]>y[1] else y )

In [18]:
# type( max_rev_cust_rdd )
max_rev_cust_rdd.collect()

[(2, ('Jimmy', 250)), (4, ('Tina', 300)), (7, ('Joseph', 300))]

In [55]:
print( max_rev_cust_rdd )

PythonRDD[58] at RDD at PythonRDD.scala:53


### Sum up all revenues and number of records for each order:-
<hr>

In [19]:
data = [
(2,"Joseph",200), (2, "Jimmy",250), (2, "Tina",130), (4,"Jimmy",50),
(4,"Tina",300), (4,"Joseph",150), (4, "Ram",200), (7,"Tina",200),
(7, "Joseph",300), (7,"Jimmy",80)
]

ord_cust_rdd = sc.parallelize( data, 2)

In [20]:
ord_cust_rdd.collect()

[(2, 'Joseph', 200),
 (2, 'Jimmy', 250),
 (2, 'Tina', 130),
 (4, 'Jimmy', 50),
 (4, 'Tina', 300),
 (4, 'Joseph', 150),
 (4, 'Ram', 200),
 (7, 'Tina', 200),
 (7, 'Joseph', 300),
 (7, 'Jimmy', 80)]

In [21]:
rev_sum_count_rdd = ord_cust_rdd.map( lambda x: ( x[0], (x[2],1) ) )

In [22]:
rev_sum_count_rdd.collect()

[(2, (200, 1)),
 (2, (250, 1)),
 (2, (130, 1)),
 (4, (50, 1)),
 (4, (300, 1)),
 (4, (150, 1)),
 (4, (200, 1)),
 (7, (200, 1)),
 (7, (300, 1)),
 (7, (80, 1))]

In [23]:
rev_sum_count_rdd.reduceByKey( lambda x,y: ( x[0]+y[0], x[1]+y[1] ) ).collect()

[(2, (580, 3)), (4, (700, 4)), (7, (580, 3))]

In [25]:
rev_sum_count_rdd.aggregateByKey( (0,0), lambda x,y: (x[0]+y[0], x[1]+y[1]), lambda x,y: (x[0]+y[0], x[1]+y[1]) ).collect()

[(2, (580, 3)), (4, (700, 4)), (7, (580, 3))]

In [26]:
ord_rev_df = spark.createDataFrame( data=data, schema = ["ordId","custName","revenue"] )

In [27]:
ord_rev_df.show()

+-----+--------+-------+
|ordId|custName|revenue|
+-----+--------+-------+
|    2|  Joseph|    200|
|    2|   Jimmy|    250|
|    2|    Tina|    130|
|    4|   Jimmy|     50|
|    4|    Tina|    300|
|    4|  Joseph|    150|
|    4|     Ram|    200|
|    7|    Tina|    200|
|    7|  Joseph|    300|
|    7|   Jimmy|     80|
+-----+--------+-------+



In [28]:
ord_rev_df.isEmpty()

False

In [29]:
ord_rev_df.printSchema()

root
 |-- ordId: long (nullable = true)
 |-- custName: string (nullable = true)
 |-- revenue: long (nullable = true)



In [30]:
from pyspark.sql import functions as F

In [32]:
total_rev_count_df = ord_rev_df.groupBy("ordId").agg( F.sum(F.col("revenue")).alias("total_revenue"), F.count(F.col("ordId")) ).show() 

+-----+-------------+------------+
|ordId|total_revenue|count(ordId)|
+-----+-------------+------------+
|    2|          580|           3|
|    4|          700|           4|
|    7|          580|           3|
+-----+-------------+------------+



#### - using seperate sequence_operation and combiner_operation function methodology:-
<hr>

In [9]:
data = [
(2,"Joseph",200), (2, "Jimmy",250), (2, "Tina",130), (4,"Jimmy",50),
(4,"Tina",300), (4,"Joseph",150), (4, "Ram",200), (7,"Tina",200),
(7, "Joseph",300), (7,"Jimmy",80)
]

cust_rev_rdd = sc.parallelize(data, 2)

In [10]:
cust_rev_rdd.collect()

[(2, 'Joseph', 200),
 (2, 'Jimmy', 250),
 (2, 'Tina', 130),
 (4, 'Jimmy', 50),
 (4, 'Tina', 300),
 (4, 'Joseph', 150),
 (4, 'Ram', 200),
 (7, 'Tina', 200),
 (7, 'Joseph', 300),
 (7, 'Jimmy', 80)]

In [11]:
cust_rev_rdd = cust_rev_rdd.map( lambda x: (x[0],x[2]) )

In [15]:
cust_rev_rdd.collect()

[(2, 200),
 (2, 250),
 (2, 130),
 (4, 50),
 (4, 300),
 (4, 150),
 (4, 200),
 (7, 200),
 (7, 300),
 (7, 80)]

In [13]:
cust_rev_rdd.getNumPartitions()

2

- When using aggregateByKey(), for each unique key, a separate accumulator is created, of the datatype mentioned in the zero_value (1st) parameter of the function. For each key, that accumulator goes on collecting the aggregated value, updated at each step of the function execution, and at last, the accumulator contains the total aggregated value for each key, and thus that value is being placed in the resultant rdd, with respect to their corresponding keys.

In [16]:
# seq_op() function will acct on each partition, and will receive the zero_value accumulator as the 1st parameter, and each 
# value of the key value rdd will be passed one-by-one as the 2nd parameter.

zero_val = (0,0) # As each element of cust_rev_rdd is a tuple with 2 integer elements
# 0th index of zero_val will be used for counting the total revenue for each id
# 1st index of zero_val will be used for counting the number of records for each order_id

def seq_op( accum, value ):
    # in this case, the value and key, both are integers.
    return ( accum[0]+value, accum[1]+1 )

def comb_op( accum1, accum2 ):
    return ( accum1[0]+accum2[0], accum1[1]+accum2[1] )

cust_rev_rdd.aggregateByKey( zero_val, seq_op, comb_op ).collect()

[(2, (580, 3)), (4, (700, 4)), (7, (580, 3))]

<hr><hr>

In [20]:
rec_count = cust_rev_rdd.countByKey()
print( rec_count )

defaultdict(<class 'int'>, {2: 3, 4: 4, 7: 3})


In [24]:
for i in rec_count:
    print("Key:",i,"->    Record Count:",rec_count[i])

Key: 2 ->    Record Count: 3
Key: 4 ->    Record Count: 4
Key: 7 ->    Record Count: 3


### Count number of Orders for each status:-
---------------------------------------------

In [37]:
ord_rdd = sc.textFile("./data/orders.csv").map( lambda x: (x.split(",")[-1], x.split(",")) )

In [38]:
ord_rdd.take(10)

[('CLOSED', ['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED']),
 ('PENDING_PAYMENT', ['2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT']),
 ('COMPLETE', ['3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE']),
 ('CLOSED', ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED']),
 ('COMPLETE', ['5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE']),
 ('COMPLETE', ['6', '2013-07-25 00:00:00.0', '7130', 'COMPLETE']),
 ('COMPLETE', ['7', '2013-07-25 00:00:00.0', '4530', 'COMPLETE']),
 ('PROCESSING', ['8', '2013-07-25 00:00:00.0', '2911', 'PROCESSING']),
 ('PENDING_PAYMENT',
  ['9', '2013-07-25 00:00:00.0', '5657', 'PENDING_PAYMENT']),
 ('PENDING_PAYMENT',
  ['10', '2013-07-25 00:00:00.0', '5648', 'PENDING_PAYMENT'])]

In [39]:
status_count = ord_rdd.countByKey()

In [43]:
for i in sorted(status_count):
    print("Status: ", i, "->   Count: ",status_count[i])

Status:  CANCELED ->   Count:  1428
Status:  CLOSED ->   Count:  7556
Status:  COMPLETE ->   Count:  22899
Status:  ON_HOLD ->   Count:  3798
Status:  PAYMENT_REVIEW ->   Count:  729
Status:  PENDING ->   Count:  7610
Status:  PENDING_PAYMENT ->   Count:  15030
Status:  PROCESSING ->   Count:  8275
Status:  SUSPECTED_FRAUD ->   Count:  1558


## Sorting:
<hr><hr>

### Sort orders by customer_id (1st value in each line):

In [49]:
ord_rdd = sc.textFile("./data/orders.csv").map( lambda x: x.split(",") ).map( lambda x: (int(x[0]), x[1], int(x[2]), x[3]) )

In [50]:
ord_rdd.take(5)

[(1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'),
 (2, '2013-07-25 00:00:00.0', 256, 'PENDING_PAYMENT'),
 (3, '2013-07-25 00:00:00.0', 12111, 'COMPLETE'),
 (4, '2013-07-25 00:00:00.0', 8827, 'CLOSED'),
 (5, '2013-07-25 00:00:00.0', 11318, 'COMPLETE')]

In [55]:
# sorting by customer_id in descending order:-
ord_rdd.sortBy( lambda x: x[2] ).take(20)

[(22945, '2013-12-13 00:00:00.0', 1, 'COMPLETE'),
 (15192, '2013-10-29 00:00:00.0', 2, 'PENDING_PAYMENT'),
 (33865, '2014-02-18 00:00:00.0', 2, 'COMPLETE'),
 (57963, '2013-08-02 00:00:00.0', 2, 'ON_HOLD'),
 (67863, '2013-11-30 00:00:00.0', 2, 'COMPLETE'),
 (22646, '2013-12-11 00:00:00.0', 3, 'COMPLETE'),
 (23662, '2013-12-19 00:00:00.0', 3, 'COMPLETE'),
 (35158, '2014-02-26 00:00:00.0', 3, 'COMPLETE'),
 (46399, '2014-05-09 00:00:00.0', 3, 'PROCESSING'),
 (56178, '2014-07-15 00:00:00.0', 3, 'PENDING'),
 (57617, '2014-07-24 00:00:00.0', 3, 'COMPLETE'),
 (61453, '2013-12-14 00:00:00.0', 3, 'COMPLETE'),
 (9023, '2013-09-19 00:00:00.0', 4, 'COMPLETE'),
 (9704, '2013-09-24 00:00:00.0', 4, 'COMPLETE'),
 (17253, '2013-11-09 00:00:00.0', 4, 'PENDING_PAYMENT'),
 (37878, '2014-03-15 00:00:00.0', 4, 'COMPLETE'),
 (49339, '2014-05-28 00:00:00.0', 4, 'COMPLETE'),
 (51157, '2014-06-10 00:00:00.0', 4, 'CLOSED'),
 (13705, '2013-10-18 00:00:00.0', 5, 'COMPLETE'),
 (36472, '2014-03-06 00:00:00.0', 5, 'PR

In [59]:
# sort by customer_id and status
ord_rdd.map( lambda x: ( (x[-2],x[-1]), x ) ).sortByKey().take(20)

[((1, 'COMPLETE'), (22945, '2013-12-13 00:00:00.0', 1, 'COMPLETE')),
 ((2, 'COMPLETE'), (33865, '2014-02-18 00:00:00.0', 2, 'COMPLETE')),
 ((2, 'COMPLETE'), (67863, '2013-11-30 00:00:00.0', 2, 'COMPLETE')),
 ((2, 'ON_HOLD'), (57963, '2013-08-02 00:00:00.0', 2, 'ON_HOLD')),
 ((2, 'PENDING_PAYMENT'),
  (15192, '2013-10-29 00:00:00.0', 2, 'PENDING_PAYMENT')),
 ((3, 'COMPLETE'), (22646, '2013-12-11 00:00:00.0', 3, 'COMPLETE')),
 ((3, 'COMPLETE'), (23662, '2013-12-19 00:00:00.0', 3, 'COMPLETE')),
 ((3, 'COMPLETE'), (35158, '2014-02-26 00:00:00.0', 3, 'COMPLETE')),
 ((3, 'COMPLETE'), (57617, '2014-07-24 00:00:00.0', 3, 'COMPLETE')),
 ((3, 'COMPLETE'), (61453, '2013-12-14 00:00:00.0', 3, 'COMPLETE')),
 ((3, 'PENDING'), (56178, '2014-07-15 00:00:00.0', 3, 'PENDING')),
 ((3, 'PROCESSING'), (46399, '2014-05-09 00:00:00.0', 3, 'PROCESSING')),
 ((4, 'CLOSED'), (51157, '2014-06-10 00:00:00.0', 4, 'CLOSED')),
 ((4, 'COMPLETE'), (9023, '2013-09-19 00:00:00.0', 4, 'COMPLETE')),
 ((4, 'COMPLETE'), (970

In [62]:
ord_rdd.takeOrdered(2)

[(1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'),
 (2, '2013-07-25 00:00:00.0', 256, 'PENDING_PAYMENT')]

## Global Ranking:
<hr><hr>

### Find top 5 products with highest prices:-
---------------------------------------------

In [35]:
prod_rdd = sc.textFile("./data/products.csv") \
             .map( lambda x: x.split(",") ) \
             .filter( lambda x: x[4] != '' ) \
             .map( lambda x: (int(x[0]),int(x[1]),x[2],x[3],float(x[4]),x[5]) )

In [34]:
prod_rdd.take(5)

[(1,
  2,
  'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U',
  '',
  59.98,
  'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy'),
 (2,
  2,
  "Under Armour Men's Highlight MC Football Clea",
  '',
  129.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat'),
 (3,
  2,
  "Under Armour Men's Renegade D Mid Football Cl",
  '',
  89.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat'),
 (4,
  2,
  "Under Armour Men's Renegade D Mid Football Cl",
  '',
  89.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat'),
 (5,
  2,
  'Riddell Youth Revolution Speed Custom Footbal',
  '',
  199.99,
  'http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet')]

In [7]:
schema = "prod_id LONG, prod_category_id LONG, prod_name STRING, prod_desc STRING, prod_price DOUBLE, prod_image STRING"

prod_df = spark.read.format("csv").schema(schema).load("./data/products.csv")

In [8]:
prod_df.show(5, truncate=35)

+-------+----------------+-----------------------------------+---------+----------+-----------------------------------+
|prod_id|prod_category_id|                          prod_name|prod_desc|prod_price|                         prod_image|
+-------+----------------+-----------------------------------+---------+----------+-----------------------------------+
|      1|               2|Quest Q64 10 FT. x 10 FT. Slant ...|     null|     59.98|http://images.acmesports.sports/...|
|      2|               2|Under Armour Men's Highlight MC ...|     null|    129.99|http://images.acmesports.sports/...|
|      3|               2|Under Armour Men's Renegade D Mi...|     null|     89.99|http://images.acmesports.sports/...|
|      4|               2|Under Armour Men's Renegade D Mi...|     null|     89.99|http://images.acmesports.sports/...|
|      5|               2|Riddell Youth Revolution Speed C...|     null|    199.99|http://images.acmesports.sports/...|
+-------+----------------+--------------

In [9]:
prod_df.printSchema()

root
 |-- prod_id: long (nullable = true)
 |-- prod_category_id: long (nullable = true)
 |-- prod_name: string (nullable = true)
 |-- prod_desc: string (nullable = true)
 |-- prod_price: double (nullable = true)
 |-- prod_image: string (nullable = true)



In [23]:
# prod_rdd.filter( lambda x: bool(x[4])==False ).collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 19) (DEBANJAN executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 686, in main
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 678, in process
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\DEBANJ~1\AppData\Local\Temp/ipykernel_25188/1422624839.py", line 3, in <lambda>
ValueError: could not convert string to float: ''

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2278)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 686, in main
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 678, in process
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "D:\Softwares\Apache_Spark\spark\python\lib\pyspark.zip\pyspark\util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\DEBANJ~1\AppData\Local\Temp/ipykernel_25188/1422624839.py", line 3, in <lambda>
ValueError: could not convert string to float: ''

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2278)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more


In [37]:
# ord_rdd.sortBy( lambda x: x[2] ).take(20)

prod_rdd.sortBy( lambda x: x[4], ascending=False ).take(5)

# prod_df.rdd.sortBy( lambda x: x.prod_price, ascending=False ).take(5)

[(208,
  10,
  'SOLE E35 Elliptical',
  '',
  1999.99,
  'http://images.acmesports.sports/SOLE+E35+Elliptical'),
 (66,
  4,
  'SOLE F85 Treadmill',
  '',
  1799.99,
  'http://images.acmesports.sports/SOLE+F85+Treadmill'),
 (199,
  10,
  'SOLE F85 Treadmill',
  '',
  1799.99,
  'http://images.acmesports.sports/SOLE+F85+Treadmill'),
 (496,
  22,
  'SOLE F85 Treadmill',
  '',
  1799.99,
  'http://images.acmesports.sports/SOLE+F85+Treadmill'),
 (1048,
  47,
  '"Spalding Beast 60"" Glass Portable Basketball "',
  '',
  1099.99,
  'http://images.acmesports.sports/Spalding+Beast+60%22+Glass+Portable+Basketball+Hoop')]

In [97]:
prod_df.orderBy( prod_df.prod_price.desc() ).show(10)

+-------+----------------+--------------------+---------+----------+--------------------+
|prod_id|prod_category_id|           prod_name|prod_desc|prod_price|          prod_image|
+-------+----------------+--------------------+---------+----------+--------------------+
|    208|              10| SOLE E35 Elliptical|     null|   1999.99|http://images.acm...|
|    199|              10|  SOLE F85 Treadmill|     null|   1799.99|http://images.acm...|
|     66|               4|  SOLE F85 Treadmill|     null|   1799.99|http://images.acm...|
|    496|              22|  SOLE F85 Treadmill|     null|   1799.99|http://images.acm...|
|   1048|              47|"Spalding Beast 6...|     null|   1099.99|http://images.acm...|
|     60|               4| SOLE E25 Elliptical|     null|    999.99|http://images.acm...|
|    197|              10| SOLE E25 Elliptical|     null|    999.99|http://images.acm...|
|    694|              32|Callaway Women's ...|     null|    999.99|http://images.acm...|
|    488| 

## Ranking by Group ( windowed ranking ):
<hr><hr>

### Find top 2 products by price for each product_category:

In [23]:
prod_rdd = sc.textFile('data/products.csv') \
             .map( lambda x: x.split(",") ) \
             .filter( lambda x: x[4] != '' ) \
             .map( lambda x: ( int(x[0]), int(x[1]), x[2], x[3], float(x[4]), x[5] ) )

In [24]:
prod_rdd.take(3)

[(1,
  2,
  'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U',
  '',
  59.98,
  'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy'),
 (2,
  2,
  "Under Armour Men's Highlight MC Football Clea",
  '',
  129.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat'),
 (3,
  2,
  "Under Armour Men's Renegade D Mid Football Cl",
  '',
  89.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat')]

In [25]:
#Building a key-value rdd of prod_rdd so that we can apply groupByKey()
prod_kv = prod_rdd.map( lambda x: ( x[1], x ) )

In [26]:
prod_kv.take(3)

[(2,
  (1,
   2,
   'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U',
   '',
   59.98,
   'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy')),
 (2,
  (2,
   2,
   "Under Armour Men's Highlight MC Football Clea",
   '',
   129.99,
   'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat')),
 (2,
  (3,
   2,
   "Under Armour Men's Renegade D Mid Football Cl",
   '',
   89.99,
   'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat'))]

In [27]:
prod_kv_grp = prod_kv.groupByKey()

In [36]:
prod_kv_grp.collect()

[(2, <pyspark.resultiterable.ResultIterable at 0x1a062153d00>),
 (4, <pyspark.resultiterable.ResultIterable at 0x1a062153f40>),
 (6, <pyspark.resultiterable.ResultIterable at 0x1a062153100>),
 (8, <pyspark.resultiterable.ResultIterable at 0x1a062070940>),
 (10, <pyspark.resultiterable.ResultIterable at 0x1a0621737f0>),
 (12, <pyspark.resultiterable.ResultIterable at 0x1a062173100>),
 (38, <pyspark.resultiterable.ResultIterable at 0x1a0621730a0>),
 (16, <pyspark.resultiterable.ResultIterable at 0x1a062173880>),
 (18, <pyspark.resultiterable.ResultIterable at 0x1a062173760>),
 (20, <pyspark.resultiterable.ResultIterable at 0x1a062173ca0>),
 (22, <pyspark.resultiterable.ResultIterable at 0x1a062173970>),
 (24, <pyspark.resultiterable.ResultIterable at 0x1a062173b20>),
 (26, <pyspark.resultiterable.ResultIterable at 0x1a062070100>),
 (30, <pyspark.resultiterable.ResultIterable at 0x1a062075190>),
 (32, <pyspark.resultiterable.ResultIterable at 0x1a06055d790>),
 (34, <pyspark.resultiterable

<hr><hr>
- Each element of the 'prod_kv_grp' is a tuple, and the tuples contain the category (key) in their 0th index, and an iterator in their 1st index.
- The iterator is actually a group/collection of the value elements in the rdd that has been grouped. Using for loop on that iterator, eah value of that group can be extracted, as shown in the cell below:

In [31]:
for i in prod_kv_grp.take(2)[0][1]:
    print(i, type(i))

(1, 2, 'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U', '', 59.98, 'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy') <class 'tuple'>
(2, 2, "Under Armour Men's Highlight MC Football Clea", '', 129.99, 'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat') <class 'tuple'>
(3, 2, "Under Armour Men's Renegade D Mid Football Cl", '', 89.99, 'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat') <class 'tuple'>
(4, 2, "Under Armour Men's Renegade D Mid Football Cl", '', 89.99, 'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat') <class 'tuple'>
(5, 2, 'Riddell Youth Revolution Speed Custom Footbal', '', 199.99, 'http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet') <class 'tuple'>
(6, 2, "Jordan Men's VI Retro TD Football Cleat", '', 134.99, 'http://images.acmesports.sports/Jordan+Men%27s+VI+Retro+TD+Football+Cleat') <class 'tu

 #### Converting the iterable obtained by *groupByKey()* into another rdd, using *sc.parallelize()* :
 <hr>

In [34]:
test_rdd = sc.parallelize( prod_kv_grp.take(2)[0][1] )

In [38]:
test_rdd.collect()

[(1,
  2,
  'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U',
  '',
  59.98,
  'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy'),
 (2,
  2,
  "Under Armour Men's Highlight MC Football Clea",
  '',
  129.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat'),
 (3,
  2,
  "Under Armour Men's Renegade D Mid Football Cl",
  '',
  89.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat'),
 (4,
  2,
  "Under Armour Men's Renegade D Mid Football Cl",
  '',
  89.99,
  'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat'),
 (5,
  2,
  'Riddell Youth Revolution Speed Custom Footbal',
  '',
  199.99,
  'http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet'),
 (6,
  2,
  "Jordan Men's VI Retro TD Football Cleat",
  '',
  134.99,
  'http://images.acmesports.sports/Jordan+Men%27s+VI+Retro+TD+Football+Cleat'),
 (7,
  2,
  'Schut

#### Defining the logic, that will use the iterables from the rdd containing key and group iterabless, and will return the rdd with 2 top-priced products from each rdd:-
<hr>

In [37]:
for i in prod_kv_grp.collect():
    category = i[0]
    grp = i[1]
    grp_rdd = sc.parallelize( grp )
    for top_prod in grp_rdd.sortBy( lambda x: x[4], ascending=False ).take(2):
        print( top_prod )

(16, 2, 'Riddell Youth 360 Custom Football Helmet', '', 299.99, 'http://images.acmesports.sports/Riddell+Youth+360+Custom+Football+Helmet')
(11, 2, 'Fitness Gear 300 lb Olympic Weight Set', '', 209.99, 'http://images.acmesports.sports/Fitness+Gear+300+lb+Olympic+Weight+Set')
(66, 4, 'SOLE F85 Treadmill', '', 1799.99, 'http://images.acmesports.sports/SOLE+F85+Treadmill')
(60, 4, 'SOLE E25 Elliptical', '', 999.99, 'http://images.acmesports.sports/SOLE+E25+Elliptical')
(117, 6, 'YETI Tundra 65 Chest Cooler', '', 399.99, 'http://images.acmesports.sports/YETI+Tundra+65+Chest+Cooler')
(106, 6, 'Teeter Hang Ups NXT-S Inversion Table', '', 299.99, 'http://images.acmesports.sports/Teeter+Hang+Ups+NXT-S+Inversion+Table')
(162, 8, 'YETI Tundra 65 Chest Cooler', '', 399.99, 'http://images.acmesports.sports/YETI+Tundra+65+Chest+Cooler')
(153, 8, 'Teeter Hang Ups NXT-S Inversion Table', '', 299.99, 'http://images.acmesports.sports/Teeter+Hang+Ups+NXT-S+Inversion+Table')
(208, 10, 'SOLE E35 Elliptica

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "D:\Softwares\Apache_Spark\spark\python\lib\py4j-0.10.9.5-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "D:\Softwares\Apache_Spark\spark\python\lib\py4j-0.10.9.5-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "D:\Softwares\Anaconda3\anaconda3\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [44]:
#slightly better logic

top_prod = []

for i in prod_kv_grp.collect():
    category = i[0]
    grp = i[1]
    for prod in sorted( grp, key = lambda x: x[4] ,reverse=True )[0:2]:
        top_prod.append(prod)

prod_category_rdd = sc.parallelize( top_prod )

In [45]:
prod_category_rdd.collect()

[(16,
  2,
  'Riddell Youth 360 Custom Football Helmet',
  '',
  299.99,
  'http://images.acmesports.sports/Riddell+Youth+360+Custom+Football+Helmet'),
 (11,
  2,
  'Fitness Gear 300 lb Olympic Weight Set',
  '',
  209.99,
  'http://images.acmesports.sports/Fitness+Gear+300+lb+Olympic+Weight+Set'),
 (66,
  4,
  'SOLE F85 Treadmill',
  '',
  1799.99,
  'http://images.acmesports.sports/SOLE+F85+Treadmill'),
 (60,
  4,
  'SOLE E25 Elliptical',
  '',
  999.99,
  'http://images.acmesports.sports/SOLE+E25+Elliptical'),
 (117,
  6,
  'YETI Tundra 65 Chest Cooler',
  '',
  399.99,
  'http://images.acmesports.sports/YETI+Tundra+65+Chest+Cooler'),
 (106,
  6,
  'Teeter Hang Ups NXT-S Inversion Table',
  '',
  299.99,
  'http://images.acmesports.sports/Teeter+Hang+Ups+NXT-S+Inversion+Table'),
 (162,
  8,
  'YETI Tundra 65 Chest Cooler',
  '',
  399.99,
  'http://images.acmesports.sports/YETI+Tundra+65+Chest+Cooler'),
 (153,
  8,
  'Teeter Hang Ups NXT-S Inversion Table',
  '',
  299.99,
  'http:/

### Finding top 2 products using '*flatMap()*':
<hr>

In [47]:
prod_kv_grp.take(10)

[(2, <pyspark.resultiterable.ResultIterable at 0x1a06228f280>),
 (4, <pyspark.resultiterable.ResultIterable at 0x1a06228fd30>),
 (6, <pyspark.resultiterable.ResultIterable at 0x1a06228f1c0>),
 (8, <pyspark.resultiterable.ResultIterable at 0x1a06228f820>),
 (10, <pyspark.resultiterable.ResultIterable at 0x1a06228fa90>),
 (12, <pyspark.resultiterable.ResultIterable at 0x1a06228f100>),
 (38, <pyspark.resultiterable.ResultIterable at 0x1a06228fcd0>),
 (16, <pyspark.resultiterable.ResultIterable at 0x1a06228fca0>),
 (18, <pyspark.resultiterable.ResultIterable at 0x1a06228f5b0>),
 (20, <pyspark.resultiterable.ResultIterable at 0x1a06228f250>)]

In [53]:
top_2_prod_rdd = prod_kv_grp.flatMap( lambda x: sorted( x[1], key = lambda y: y[4], reverse=True )[0:2] )

In [54]:
top_2_prod_rdd.take(10)

[(16,
  2,
  'Riddell Youth 360 Custom Football Helmet',
  '',
  299.99,
  'http://images.acmesports.sports/Riddell+Youth+360+Custom+Football+Helmet'),
 (11,
  2,
  'Fitness Gear 300 lb Olympic Weight Set',
  '',
  209.99,
  'http://images.acmesports.sports/Fitness+Gear+300+lb+Olympic+Weight+Set'),
 (66,
  4,
  'SOLE F85 Treadmill',
  '',
  1799.99,
  'http://images.acmesports.sports/SOLE+F85+Treadmill'),
 (60,
  4,
  'SOLE E25 Elliptical',
  '',
  999.99,
  'http://images.acmesports.sports/SOLE+E25+Elliptical'),
 (117,
  6,
  'YETI Tundra 65 Chest Cooler',
  '',
  399.99,
  'http://images.acmesports.sports/YETI+Tundra+65+Chest+Cooler'),
 (106,
  6,
  'Teeter Hang Ups NXT-S Inversion Table',
  '',
  299.99,
  'http://images.acmesports.sports/Teeter+Hang+Ups+NXT-S+Inversion+Table'),
 (162,
  8,
  'YETI Tundra 65 Chest Cooler',
  '',
  399.99,
  'http://images.acmesports.sports/YETI+Tundra+65+Chest+Cooler'),
 (153,
  8,
  'Teeter Hang Ups NXT-S Inversion Table',
  '',
  299.99,
  'http:/

# Sampling:-
<hr><hr>
<ul>
<li><b><i>sample(withReplacement, fraction, seed=None)</i></b>: Transformation <br>
<li><b><i>takeSample()</i></b>: Action
</ul>

In [61]:
full_rdd = sc.parallelize(range(100),4)

In [62]:
full_rdd.take(20)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [63]:
full_rdd.getNumPartitions()

4

In [64]:
full_rdd.getStorageLevel()

StorageLevel(False, False, False, False, 1)

In [68]:
# Elements in the sample may get repeated:-
full_rdd.sample(True, 0.2).collect()

[7,
 8,
 15,
 17,
 19,
 31,
 33,
 34,
 37,
 43,
 47,
 61,
 64,
 65,
 74,
 81,
 81,
 82,
 86,
 90,
 90,
 93,
 96,
 97,
 99]

In [71]:
# No elements in the sample will get repeated
full_rdd.sample(False,0.15).collect()

[4, 17, 24, 26, 34, 42, 44, 45, 47, 50, 51, 54, 71, 72, 75]

In [76]:
# Will give same results for each call, due to same fixed seed:
full_rdd.sample(False,0.15, seed=21).collect()

[0, 3, 17, 19, 21, 25, 33, 40, 53, 54, 60, 72, 82]

In [86]:
# using takeSample() directly shows result, as it an action.
full_rdd.takeSample(False, 20)

[75, 60, 2, 85, 98, 11, 68, 81, 54, 83, 67, 12, 17, 47, 44, 62, 1, 7, 23, 20]

# Set Operations:-
<hr><hr>

## 1. <b><i>union()</i></b>:-
<hr>

#### Usage:-
<hr>

In [90]:
rdd1 = sc.parallelize(range(50))
rdd2 = sc.parallelize(range(30,70))

In [94]:
print( rdd1.collect(), rdd2.collect(), sep='\n' )

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69]


In [95]:
rdd1.union(rdd2).collect()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69]

In [98]:
rdd1.union(rdd2).distinct().collect()

[0,
 16,
 32,
 48,
 64,
 1,
 17,
 33,
 49,
 65,
 2,
 18,
 34,
 50,
 66,
 3,
 19,
 35,
 51,
 67,
 4,
 20,
 36,
 52,
 68,
 5,
 21,
 37,
 53,
 69,
 6,
 22,
 38,
 54,
 7,
 23,
 39,
 55,
 8,
 24,
 40,
 56,
 9,
 25,
 41,
 57,
 10,
 26,
 42,
 58,
 11,
 27,
 43,
 59,
 12,
 28,
 44,
 60,
 13,
 29,
 45,
 61,
 14,
 30,
 46,
 62,
 15,
 31,
 47,
 63]

### Number of customers placed orders in July or in August months:-
<hr>

In [102]:
ord_rdd = sc.textFile('data/orders.csv').map( lambda x: x.split(",") ).map( lambda x: ( int(x[1].split('-')[1]) ,x ) )

In [103]:
ord_rdd.take(5)

[(7, ['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED']),
 (7, ['2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT']),
 (7, ['3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE']),
 (7, ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED']),
 (7, ['5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE'])]

In [104]:
jul_aug_ord = ord_rdd.filter( lambda x: x[0] in (7,8) )

In [109]:
jul_aug_ord.take(10)

[(7, ['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED']),
 (7, ['2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT']),
 (7, ['3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE']),
 (7, ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED']),
 (7, ['5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE']),
 (7, ['6', '2013-07-25 00:00:00.0', '7130', 'COMPLETE']),
 (7, ['7', '2013-07-25 00:00:00.0', '4530', 'COMPLETE']),
 (7, ['8', '2013-07-25 00:00:00.0', '2911', 'PROCESSING']),
 (7, ['9', '2013-07-25 00:00:00.0', '5657', 'PENDING_PAYMENT']),
 (7, ['10', '2013-07-25 00:00:00.0', '5648', 'PENDING_PAYMENT'])]

In [114]:
num_cust = jul_aug_ord.map( lambda x: int( x[1][2] ) ).distinct().count()
print("Number of customers ordered in July and August:",num_cust)

Number of customers ordered in July and August: 7633


### <b><i>subtract()</i></b>:-
Performs Left Anti-Join
<hr>

In [116]:
print( rdd1.collect(), rdd2.collect(), sep='\n\n' )

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]

[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69]


In [118]:
print( sorted( rdd1.subtract( rdd2 ).collect() ) )

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [None]:
print( sorted( rdd2.subtract( rdd1 ).collect() ) )