In [2]:
import findspark

findspark.init()

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("rdd-operations").getOrCreate()
sc = spark.sparkContext

In [4]:
from pyspark.sql import functions as F

In [5]:
ls = [
    (1,45,1),
    (2,33,1),
    (1,15,2),
    (3,99,1),
    (3,-33,2),
    (4,13,1),
    (5,21,1),
    (4,21,2),
    (1,10,3)
]

col_names = ["roll","marks","rank"]
df1 = spark.createDataFrame( data = ls, schema = col_names )
df1.show()

+----+-----+----+
|roll|marks|rank|
+----+-----+----+
|   1|   45|   1|
|   2|   33|   1|
|   1|   15|   2|
|   3|   99|   1|
|   3|  -33|   2|
|   4|   13|   1|
|   5|   21|   1|
|   4|   21|   2|
|   1|   10|   3|
+----+-----+----+



In [8]:
rdd1 = sc.parallelize(ls)
rdd1.collect()

[(1, 45, 1),
 (2, 33, 1),
 (1, 15, 2),
 (3, 99, 1),
 (3, -33, 2),
 (4, 13, 1),
 (5, 21, 1),
 (4, 21, 2),
 (1, 10, 3)]

In [9]:
print(type(rdd1))

<class 'pyspark.rdd.RDD'>


In [10]:
pair_rdd = rdd1.map( lambda x: (x[0],[x[1],x[2]]) )
pair_rdd.collect()

[(1, [45, 1]),
 (2, [33, 1]),
 (1, [15, 2]),
 (3, [99, 1]),
 (3, [-33, 2]),
 (4, [13, 1]),
 (5, [21, 1]),
 (4, [21, 2]),
 (1, [10, 3])]

In [11]:
pair_rdd.join(pair_rdd).collect()

[(2, ([33, 1], [33, 1])),
 (4, ([13, 1], [13, 1])),
 (4, ([13, 1], [21, 2])),
 (4, ([21, 2], [13, 1])),
 (4, ([21, 2], [21, 2])),
 (1, ([45, 1], [45, 1])),
 (1, ([45, 1], [15, 2])),
 (1, ([45, 1], [10, 3])),
 (1, ([15, 2], [45, 1])),
 (1, ([15, 2], [15, 2])),
 (1, ([15, 2], [10, 3])),
 (1, ([10, 3], [45, 1])),
 (1, ([10, 3], [15, 2])),
 (1, ([10, 3], [10, 3])),
 (3, ([99, 1], [99, 1])),
 (3, ([99, 1], [-33, 2])),
 (3, ([-33, 2], [99, 1])),
 (3, ([-33, 2], [-33, 2])),
 (5, ([21, 1], [21, 1]))]

In [17]:
df1.groupBy("roll").agg( F.sum("marks"), F.avg("rank"), F.count("roll"), F.mean("marks") ).show()

+----+----------+---------+-----------+------------------+
|roll|sum(marks)|avg(rank)|count(roll)|        avg(marks)|
+----+----------+---------+-----------+------------------+
|   5|        21|      1.0|          1|              21.0|
|   1|        70|      2.0|          3|23.333333333333332|
|   3|        66|      1.5|          2|              33.0|
|   2|        33|      1.0|          1|              33.0|
|   4|        34|      1.5|          2|              17.0|
+----+----------+---------+-----------+------------------+



## Word Counting by RDD
--------------------------------

In [3]:
ord_rdd = sc.textFile("data/orders.csv",4)
ord_rdd.take(10)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']

In [8]:
ord_rdd.getNumPartitions()

4

In [4]:
ord_rdd.getStorageLevel()

StorageLevel(False, False, False, False, 1)

In [5]:
print( ord_rdd.getStorageLevel() )

Serialized 1x Replicated


In [11]:
ord_rdd.glom().map(len).collect()

[17414, 17150, 17165, 17154]

In [20]:
words = ord_rdd.map( lambda x: x.split(',')[-1] )
words.take(5)

['CLOSED', 'PENDING_PAYMENT', 'COMPLETE', 'CLOSED', 'COMPLETE']

In [23]:
wordsCount = words.map( lambda x: (x,1) ).reduceByKey( lambda x,y: x+y ).sortBy( lambda x: x[1], ascending=False )
wordsCount.collect()

[('COMPLETE', 22899),
 ('PENDING_PAYMENT', 15030),
 ('PROCESSING', 8275),
 ('PENDING', 7610),
 ('CLOSED', 7556),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558),
 ('CANCELED', 1428),
 ('PAYMENT_REVIEW', 729)]

 ## Words Count by using Dataframes
 ------------------------------------

In [12]:
ordrDF = spark.read.format("csv").option("header",False).option("inferSchema",True).load("data/orders.csv")
ordrDF.show(10)

+---+-------------------+-----+---------------+
|_c0|                _c1|  _c2|            _c3|
+---+-------------------+-----+---------------+
|  1|2013-07-25 00:00:00|11599|         CLOSED|
|  2|2013-07-25 00:00:00|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:00|12111|       COMPLETE|
|  4|2013-07-25 00:00:00| 8827|         CLOSED|
|  5|2013-07-25 00:00:00|11318|       COMPLETE|
|  6|2013-07-25 00:00:00| 7130|       COMPLETE|
|  7|2013-07-25 00:00:00| 4530|       COMPLETE|
|  8|2013-07-25 00:00:00| 2911|     PROCESSING|
|  9|2013-07-25 00:00:00| 5657|PENDING_PAYMENT|
| 10|2013-07-25 00:00:00| 5648|PENDING_PAYMENT|
+---+-------------------+-----+---------------+
only showing top 10 rows



In [13]:
ordrDF.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: timestamp (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)



In [61]:
from pyspark.sql.functions import col

wrd_countDF = ordrDF.select("_c3").groupBy( ordrDF._c3 ).count().withColumnRenamed("count","word_count")
wrd_countDF.orderBy( wrd_countDF.word_count.desc() ).show()

+---------------+----------+
|            _c3|word_count|
+---------------+----------+
|       COMPLETE|     22899|
|PENDING_PAYMENT|     15030|
|     PROCESSING|      8275|
|        PENDING|      7610|
|         CLOSED|      7556|
|        ON_HOLD|      3798|
|SUSPECTED_FRAUD|      1558|
|       CANCELED|      1428|
| PAYMENT_REVIEW|       729|
+---------------+----------+



## Print all the orders which are *CLOSED* or *COMPLETE*, and in the year 2013

### 1. Using RDDS

In [None]:
ord_rdd.take(5)

In [8]:
rdd1 = ord_rdd.map( lambda x: ( x.split(',')[1].split('-')[0], x.split(',')[-1], x ) )
rdd1.filter( lambda x: ( x[0]=='2013' ) & ( x[1] in ['CLOSED','COMPLETED'] ) ).take(10)

[('2013', 'CLOSED', '1,2013-07-25 00:00:00.0,11599,CLOSED'),
 ('2013', 'CLOSED', '4,2013-07-25 00:00:00.0,8827,CLOSED'),
 ('2013', 'CLOSED', '12,2013-07-25 00:00:00.0,1837,CLOSED'),
 ('2013', 'CLOSED', '18,2013-07-25 00:00:00.0,1205,CLOSED'),
 ('2013', 'CLOSED', '24,2013-07-25 00:00:00.0,11441,CLOSED'),
 ('2013', 'CLOSED', '25,2013-07-25 00:00:00.0,9503,CLOSED'),
 ('2013', 'CLOSED', '37,2013-07-25 00:00:00.0,5863,CLOSED'),
 ('2013', 'CLOSED', '51,2013-07-25 00:00:00.0,12271,CLOSED'),
 ('2013', 'CLOSED', '57,2013-07-25 00:00:00.0,7073,CLOSED'),
 ('2013', 'CLOSED', '61,2013-07-25 00:00:00.0,4791,CLOSED')]

### 2. Using Dataframes

In [11]:
ordrDF.show(5)

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
|  4|2013-07-25 00:00:...| 8827|         CLOSED|
|  5|2013-07-25 00:00:...|11318|       COMPLETE|
+---+--------------------+-----+---------------+
only showing top 5 rows



In [37]:
from pyspark.sql.functions import year

ordrDF = ordrDF.withColumn("year", year( ordrDF._c1 ))
ordrDF = ordrDF.filter( ( ordrDF._c3 == 'COMPLETED' ) | ( ordrDF._c3 == 'CLOSED' )  )
ordrDF.filter( ordrDF.year == 2013 ).show(10)

+---+-------------------+-----+------+----+
|_c0|                _c1|  _c2|   _c3|year|
+---+-------------------+-----+------+----+
|  1|2013-07-25 00:00:00|11599|CLOSED|2013|
|  4|2013-07-25 00:00:00| 8827|CLOSED|2013|
| 12|2013-07-25 00:00:00| 1837|CLOSED|2013|
| 18|2013-07-25 00:00:00| 1205|CLOSED|2013|
| 24|2013-07-25 00:00:00|11441|CLOSED|2013|
| 25|2013-07-25 00:00:00| 9503|CLOSED|2013|
| 37|2013-07-25 00:00:00| 5863|CLOSED|2013|
| 51|2013-07-25 00:00:00|12271|CLOSED|2013|
| 57|2013-07-25 00:00:00| 7073|CLOSED|2013|
| 61|2013-07-25 00:00:00| 4791|CLOSED|2013|
+---+-------------------+-----+------+----+
only showing top 10 rows



# Using Joins
-------------------

## Q. Find the subtotal for each order-customer-id
----------------------------------------------------

### 1. Using Rdds
-----------------------

In [24]:
ord_rdd = sc.textFile("./data/orders.csv").map( lambda x: x.split(",") )
ordItems_rdd = sc.textFile("./data/orderItems.csv").map( lambda x: x.split(",") )

In [25]:
ord_rdd.take(10)

[['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED'],
 ['2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT'],
 ['3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE'],
 ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED'],
 ['5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE'],
 ['6', '2013-07-25 00:00:00.0', '7130', 'COMPLETE'],
 ['7', '2013-07-25 00:00:00.0', '4530', 'COMPLETE'],
 ['8', '2013-07-25 00:00:00.0', '2911', 'PROCESSING'],
 ['9', '2013-07-25 00:00:00.0', '5657', 'PENDING_PAYMENT'],
 ['10', '2013-07-25 00:00:00.0', '5648', 'PENDING_PAYMENT']]

In [26]:
ordItems_rdd.take(10)

[['1', '1', '957', '1', '299.98', '299.98'],
 ['2', '2', '1073', '1', '199.99', '199.99'],
 ['3', '2', '502', '5', '250.0', '50.0'],
 ['4', '2', '403', '1', '129.99', '129.99'],
 ['5', '4', '897', '2', '49.98', '24.99'],
 ['6', '4', '365', '5', '299.95', '59.99'],
 ['7', '4', '502', '3', '150.0', '50.0'],
 ['8', '4', '1014', '4', '199.92', '49.98'],
 ['9', '5', '957', '1', '299.98', '299.98'],
 ['10', '5', '365', '5', '299.95', '59.99']]

In [28]:
ord_rdd = ord_rdd.map( lambda x: (x[0],x[1::]) )
ord_rdd.take(10)

[('1', ['2013-07-25 00:00:00.0', '11599', 'CLOSED']),
 ('2', ['2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT']),
 ('3', ['2013-07-25 00:00:00.0', '12111', 'COMPLETE']),
 ('4', ['2013-07-25 00:00:00.0', '8827', 'CLOSED']),
 ('5', ['2013-07-25 00:00:00.0', '11318', 'COMPLETE']),
 ('6', ['2013-07-25 00:00:00.0', '7130', 'COMPLETE']),
 ('7', ['2013-07-25 00:00:00.0', '4530', 'COMPLETE']),
 ('8', ['2013-07-25 00:00:00.0', '2911', 'PROCESSING']),
 ('9', ['2013-07-25 00:00:00.0', '5657', 'PENDING_PAYMENT']),
 ('10', ['2013-07-25 00:00:00.0', '5648', 'PENDING_PAYMENT'])]

In [29]:
ordItems_rdd = ordItems_rdd.map( lambda x: (x[1],[x[0],x[2],x[3],x[4],x[5]]) )
ordItems_rdd.take(10)

[('1', ['1', '957', '1', '299.98', '299.98']),
 ('2', ['2', '1073', '1', '199.99', '199.99']),
 ('2', ['3', '502', '5', '250.0', '50.0']),
 ('2', ['4', '403', '1', '129.99', '129.99']),
 ('4', ['5', '897', '2', '49.98', '24.99']),
 ('4', ['6', '365', '5', '299.95', '59.99']),
 ('4', ['7', '502', '3', '150.0', '50.0']),
 ('4', ['8', '1014', '4', '199.92', '49.98']),
 ('5', ['9', '957', '1', '299.98', '299.98']),
 ('5', ['10', '365', '5', '299.95', '59.99'])]

In [30]:
joined_ord_ordItems_rdd = ord_rdd.join( ordItems_rdd )
joined_ord_ordItems_rdd.take(10)

[('1',
  (['2013-07-25 00:00:00.0', '11599', 'CLOSED'],
   ['1', '957', '1', '299.98', '299.98'])),
 ('4',
  (['2013-07-25 00:00:00.0', '8827', 'CLOSED'],
   ['5', '897', '2', '49.98', '24.99'])),
 ('4',
  (['2013-07-25 00:00:00.0', '8827', 'CLOSED'],
   ['6', '365', '5', '299.95', '59.99'])),
 ('4',
  (['2013-07-25 00:00:00.0', '8827', 'CLOSED'],
   ['7', '502', '3', '150.0', '50.0'])),
 ('4',
  (['2013-07-25 00:00:00.0', '8827', 'CLOSED'],
   ['8', '1014', '4', '199.92', '49.98'])),
 ('8',
  (['2013-07-25 00:00:00.0', '2911', 'PROCESSING'],
   ['17', '365', '3', '179.97', '59.99'])),
 ('8',
  (['2013-07-25 00:00:00.0', '2911', 'PROCESSING'],
   ['18', '365', '5', '299.95', '59.99'])),
 ('8',
  (['2013-07-25 00:00:00.0', '2911', 'PROCESSING'],
   ['19', '1014', '4', '199.92', '49.98'])),
 ('8',
  (['2013-07-25 00:00:00.0', '2911', 'PROCESSING'],
   ['20', '502', '1', '50.0', '50.0'])),
 ('9',
  (['2013-07-25 00:00:00.0', '5657', 'PENDING_PAYMENT'],
   ['21', '191', '2', '199.98', '99.

In [31]:
custId_subtotal_rdd = joined_ord_ordItems_rdd.map( lambda x: (x[1][0][1], x[1][1][3]) )
custId_subtotal_rdd.take(10)

[('11599', '299.98'),
 ('8827', '49.98'),
 ('8827', '299.95'),
 ('8827', '150.0'),
 ('8827', '199.92'),
 ('2911', '179.97'),
 ('2911', '299.95'),
 ('2911', '199.92'),
 ('2911', '50.0'),
 ('5657', '199.98')]

### 2. Using Dataframes
----------------------------

In [10]:
ord_df = spark.read.format("csv").option("inferSchema", True).load("./data/orders.csv")
ord_df = ord_df.select(F.col("_c0").alias("orderId"), F.col("_c2").alias("custId"))
ord_df.show(5)

+-------+------+
|orderId|custId|
+-------+------+
|      1| 11599|
|      2|   256|
|      3| 12111|
|      4|  8827|
|      5| 11318|
+-------+------+
only showing top 5 rows



In [14]:
ordItems_df = spark.read.format("csv").option("inferSchema", True).load("./data/orderItems.csv")
ordItems_df = ordItems_df.select( F.col("_c1").alias("orderId"), F.col("_c4").alias("subtotal") )
ordItems_df.show(5)

+-------+--------+
|orderId|subtotal|
+-------+--------+
|      1|  299.98|
|      2|  199.99|
|      2|   250.0|
|      2|  129.99|
|      4|   49.98|
+-------+--------+
only showing top 5 rows



In [24]:
customer_subtotal_df = ord_df.join(ordItems_df, ord_df.orderId==ordItems_df.orderId, "inner" ).drop("orderId")
customer_subtotal_df.count()

172198

In [26]:
customer_subtotal_df.show(10)

+------+--------+
|custId|subtotal|
+------+--------+
| 11599|  299.98|
|   256|  199.99|
|   256|   250.0|
|   256|  129.99|
|  8827|   49.98|
|  8827|  299.95|
|  8827|   150.0|
|  8827|  199.92|
| 11318|  299.98|
| 11318|  299.95|
+------+--------+
only showing top 10 rows



In [28]:
customer_subtotal_df.rdd.getNumPartitions()

1

In [29]:
customer_subtotal_df.rdd.getStorageLevel()

StorageLevel(False, False, False, False, 1)

## Using cogroup() rdd method:-
----------------------------------
Experimentation for understanding purpose

In [30]:
co_ls1 = [(1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e')]
co_rdd1 = sc.parallelize(co_ls1)
co_rdd1.collect()

[(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')]

In [32]:
co_ls2 = [(1,'w'),(2,'x'),(3,'y'),(4,'z')]
co_rdd2 = sc.parallelize(co_ls2)
co_rdd2.collect()

[(1, 'w'), (2, 'x'), (3, 'y'), (4, 'z')]

In [33]:
co_main_rdd = co_rdd1.cogroup(co_rdd2)
co_main_rdd.collect()

[(2,
  (<pyspark.resultiterable.ResultIterable at 0x274d177eee0>,
   <pyspark.resultiterable.ResultIterable at 0x274d25e5250>)),
 (4,
  (<pyspark.resultiterable.ResultIterable at 0x274d25e52b0>,
   <pyspark.resultiterable.ResultIterable at 0x274d25e5310>)),
 (1,
  (<pyspark.resultiterable.ResultIterable at 0x274d25e5370>,
   <pyspark.resultiterable.ResultIterable at 0x274d25e53d0>)),
 (3,
  (<pyspark.resultiterable.ResultIterable at 0x274d25e5430>,
   <pyspark.resultiterable.ResultIterable at 0x274d25e5490>)),
 (5,
  (<pyspark.resultiterable.ResultIterable at 0x274d25e54f0>,
   <pyspark.resultiterable.ResultIterable at 0x274d25e5550>))]