In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
import os

In [2]:
# Reuse the kernel's active SparkContext when there is one; otherwise start a
# new one with the default configuration. A bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" if this cell is executed twice,
# so getOrCreate() keeps the cell safe to re-run.
sc = SparkContext.getOrCreate()

22/05/06 14:09:31 WARN Utils: Your hostname, Nathans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.247.137.235 instead (on interface en0)
22/05/06 14:09:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/06 14:09:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 0. Pair RDDs

- Real-world datasets are often in `key-value` format.
- Each row is a key that maps to one or more values.
- In a Pair RDD, the `key` is the `identifier` and the `value` is the `data`.

### 1. Create RDDs

#### 1.1. Pair RDDs from key-value tuple list

In [6]:
# Every (id, name) tuple in the list becomes one key-value record of the RDD.
key_val_tup_lst = [
    ('ID001', 'Nathan Ngo'),
    ('ID002', 'Elon Musk'),
    ('ID003', 'Jisoo'),
]
pairRDD_tup = sc.parallelize(key_val_tup_lst)

# Bring the records back to the driver to display them.
pairRDD_tup.collect()

[('ID001', 'Nathan Ngo'), ('ID002', 'Elon Musk'), ('ID003', 'Jisoo')]

#### 1.2. Pair RDDs from other RDDs
Use the `map` method to turn each element of an existing RDD into a `(key, value)` tuple.

In [7]:
# Start from an ordinary RDD whose elements are two-item lists ...
RDD = sc.parallelize([
    ['ID001', 'Nathan Ngo'],
    ['ID002', 'Elon Musk'],
    ['ID003', 'Jisoo'],
])

# ... and map each list to a (key, value) tuple to obtain a pair RDD.
pairRDD_tup = RDD.map(lambda record: (record[0], record[1]))
pairRDD_tup.collect()

[('ID001', 'Nathan Ngo'), ('ID002', 'Elon Musk'), ('ID003', 'Jisoo')]

### 2. Transformation on Pair RDDs

- All transformations on RDDs can run on Pair RDDs.
- The functions passed to them must operate on `key-value` pairs (tuples) rather than on single elements.

Some special transformations for Pair RDDs:

#### 2.1. Transformation - reduceByKey()
`reduceByKey()`: Combine the values that share the same key using the given function.

In [8]:
# Two scores per student id.
PairRDD_1 = sc.parallelize([
    ('ID001', 8), ('ID002', 9), ('ID003', 10),
    ('ID001', 9), ('ID002', 7), ('ID003', 10),
])

# Sum the scores that belong to the same id.
PairRDD_1_reduce = PairRDD_1.reduceByKey(lambda acc, score: acc + score)

for student_id, total in PairRDD_1_reduce.collect():
    print(f'{student_id} with total score is {total}')

ID003 with total score is 20
ID002 with total score is 16
ID001 with total score is 17


Sort the RDD by key with the `sortByKey()` method.

In [13]:
# Order the per-id totals by key, smallest id first.
PairRDD_1_reduce_sort = PairRDD_1_reduce.sortByKey(ascending=True)

for student_id, total in PairRDD_1_reduce_sort.collect():
    print(f'{student_id} with total score is {total}')

ID001 with total score is 17
ID002 with total score is 16
ID003 with total score is 20


#### 2.2. Transformation - groupByKey()

`groupByKey()`: Group values with the same key.

In [16]:
# groupByKey() on its own only yields another RDD object; displaying it shows
# the RDD's repr, not the grouped data (an action such as collect() is needed).
PairRDD_1.groupByKey()

PythonRDD[54] at RDD at PythonRDD.scala:53

In [15]:
# Collecting returns one (key, values) tuple per key. The values arrive as a
# ResultIterable, so they must be iterated (e.g. with list()) to be seen.
PairRDD_1.groupByKey().collect()

[('ID003', <pyspark.resultiterable.ResultIterable at 0x10a7e4a60>),
 ('ID002', <pyspark.resultiterable.ResultIterable at 0x10a7e4b50>),
 ('ID001', <pyspark.resultiterable.ResultIterable at 0x10a7e4be0>)]

In [18]:
# Collect the (key, grouped values) pairs to the driver.
PairRDD_1_groupby = PairRDD_1.groupByKey().collect()

# Unpack each grouped iterable into a list so the scores are printable.
for student_id, scores in PairRDD_1_groupby:
    print(student_id, [*scores])

ID003 [10, 10]
ID002 [9, 7]
ID001 [8, 9]


#### 2.3. Transformation - join

`PairRDD_1.join(PairRDD_2)`: Join two Pair RDDs on their keys. Each result has the form `(key, (value_1, value_2))`, one for every matching combination of values.

In [19]:
# Left side of the join: one name per id.
key_val_tup_lst = [
    ('ID001', 'Nathan Ngo'),
    ('ID002', 'Elon Musk'),
    ('ID003', 'Jisoo'),
]
pairRDD_1 = sc.parallelize(key_val_tup_lst)

# Right side of the join: two scores per id.
pairRDD_2 = sc.parallelize([
    ('ID001', 8), ('ID002', 9), ('ID003', 10),
    ('ID001', 9), ('ID002', 7), ('ID003', 10),
])

In [20]:
# Match records of the two pair RDDs on their key. Every value of pairRDD_1 is
# paired with every value of pairRDD_2 that has the same key, giving elements
# shaped like (key, (value_from_pairRDD_1, value_from_pairRDD_2)).
pairRDD_join = pairRDD_1.join(pairRDD_2)
pairRDD_join.collect()

[('ID003', ('Jisoo', 10)),
 ('ID003', ('Jisoo', 10)),
 ('ID002', ('Elon Musk', 9)),
 ('ID002', ('Elon Musk', 7)),
 ('ID001', ('Nathan Ngo', 8)),
 ('ID001', ('Nathan Ngo', 9))]

### 3. Action on Pair RDDs

- Actions available on regular RDDs can also be applied to Pair RDDs.
- Some actions work specifically on `key-value` data:

#### 3.1. Action - countByKey()
`countByKey()`: return the number of elements for each key.

In [22]:
# 'ID003' appears three times, the other ids twice.
pairRDD_3 = sc.parallelize([
    ('ID001', 8), ('ID002', 9), ('ID003', 10),
    ('ID001', 9), ('ID002', 7), ('ID003', 10), ('ID003', 9),
])

# countByKey() is an action: it returns a dict-like mapping key -> count.
counts_by_key = pairRDD_3.countByKey()
for student_id, n_records in counts_by_key.items():
    print(student_id, n_records)

ID001 2
ID002 2
ID003 3


#### 3.2. Action - collectAsMap()
`collectAsMap()`: return the key-value pairs as a dictionary. If a key appears more than once, only one value is kept for it (the last pair collected), so the other values for that key are dropped.

In [24]:
# Collect the pair RDD into a plain dict. A dict holds one value per key, so
# for keys that occur several times in pairRDD_3 only a single value survives.
pairRDD_3.collectAsMap()

{'ID001': 9, 'ID002': 7, 'ID003': 9}