In [1]:
import pyspark 
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second context in the same kernel fails with
# "Cannot run multiple SparkContexts at once".
spark_conf = pyspark.SparkConf().setMaster('local[*]')
sc = pyspark.SparkContext.getOrCreate(spark_conf)

### Creating Paired RDDs

In [11]:
def to_pair(line):
    """Return the first two '|'-separated fields of `line` as a (key, value) tuple."""
    fields = line.split('|')
    return fields[0], fields[1]


# Turn each 'key|value' string into a (key, value) tuple to obtain a pair RDD.
rdd = sc.parallelize(['a|c', 'b|d', 'b|e', 'c|e'])
pair_rdd = rdd.map(to_pair)
pair_rdd.collect()

[('a', 'c'), ('b', 'd'), ('b', 'e'), ('c', 'e')]

In [12]:
# A pair RDD can also be built directly from a list of (key, value) tuples.
key_value_pairs = [('a', 'b'), ('b', 'b'), ('b', 'e'), ('c', 'e')]
pair_rdd2 = sc.parallelize(key_value_pairs)
pair_rdd2.collect()

[('a', 'b'), ('b', 'b'), ('b', 'e'), ('c', 'e')]

### Paired RDD Transformations

In [5]:
# Merge all values that share a key, here by concatenating the strings.
# NOTE(review): Spark documents reduceByKey as expecting an associative and
# commutative function; string concatenation is not commutative, so the
# character order inside a merged value (e.g. 'de' vs 'ed') may depend on
# partitioning -- confirm before relying on it.
concatenated_by_key = pair_rdd.reduceByKey(lambda left, right: left + right)
concatenated_by_key.collect()

[('b', 'de'), ('c', 'e'), ('a', 'c')]

In [8]:
# Group the values of each key, then bring the (key, values) pairs to the driver.
grouped_rdd = pair_rdd.groupByKey()
grouped_collection = grouped_rdd.collect()

# Each group's values arrive as an iterable; wrap it in list() to print its contents.
for key, values in grouped_collection:
    print(key, list(values))

b ['d', 'e']
c ['e']
a ['c']


In [9]:
# Order the pairs by their key; the output below shows the keys in ascending order.
sorted_by_key = pair_rdd.sortByKey()
sorted_by_key.collect()

[('a', 'c'), ('b', 'd'), ('b', 'e'), ('c', 'e')]

In [13]:
# Join the two pair RDDs on key: each value of a key in pair_rdd is combined
# with each value of the same key in pair_rdd2, giving (key, (left, right)).
joined = pair_rdd.join(pair_rdd2)
joined.collect()

[('a', ('c', 'b')),
 ('b', ('d', 'b')),
 ('b', ('d', 'e')),
 ('b', ('e', 'b')),
 ('b', ('e', 'e')),
 ('c', ('e', 'e'))]

### Paired RDD Actions

In [14]:
# Count how many elements exist for each key; the result is a dict-like
# object returned to the driver rather than another RDD.
key_counts = pair_rdd.countByKey()
key_counts

defaultdict(int, {'a': 1, 'b': 2, 'c': 1})

In [15]:
# Collect the pairs into a plain dict on the driver. A dict holds one value per
# key, so for the duplicated key 'b' only a single value survives
# (('b', 'd') is absent from the output below).
pairs_as_map = pair_rdd.collectAsMap()
pairs_as_map

{'a': 'c', 'b': 'e', 'c': 'e'}