In [1]:
from pyspark import SparkContext
# Reuse the active SparkContext if one already exists so this cell can be re-run
# safely; calling SparkContext() a second time in the same kernel raises an error
# because only one context may be active at a time.
sc = SparkContext.getOrCreate()

## CREATING RDDs USING PYTHON COLLECTIONS

In [11]:
# Build an RDD from a local Python list; the cell displays the RDD's repr, not its contents
sc.parallelize(['this', 'is', 'an', 'example'])

ParallelCollectionRDD[9] at parallelize at PythonRDD.scala:480

An RDD was created. No computation runs yet: Spark evaluates an RDD lazily, only when an action is called on it.

In [4]:
# Build an RDD from a Python range; range(1,100) covers the integers 1 through 99
range_rdd = sc.parallelize(range(1,100))
# Uncomment to inspect the contents:
#range_rdd.collect()

In [5]:
# Action: number of elements in range_rdd
range_rdd.count()

99

## CREATING RDDs USING TEXT FILES

In [6]:
# Create an RDD from a text file; each line of the file becomes one element
rdd_hp = sc.textFile('data/HP.txt')

In [7]:
rdd_hp # Show the RDD's repr to check its type

data/HP.txt MapPartitionsRDD[6] at textFile at NativeMethodAccessorImpl.java:0

In [8]:
# Check the contents of the RDD created (collect() returns all elements as a Python list)
rdd_hp.collect()

['The place where things are hidden',
 'If you have to ask you will never know',
 'If you know you need only ask',
 'The hidden room of requirement']

In [9]:
# Check the total number of elements in the RDD
rdd_hp.count()

4

## SPARK TRANSFORMATIONS & ACTIONS

#### map(func) : Return a new distributed dataset formed by passing each element of the source through a function func. 

In [17]:
ten_range_rdd = sc.parallelize(range(1, 10))  # [1, 2, ..., 9] -- range() excludes its upper bound
# Two chained map transformations, each producing a new RDD from the previous one:
# n -> (n, n) -> (n + 1, n * 2)
ten_range_rdd_mapped = (
    ten_range_rdd
    .map(lambda n: (n, n))
    .map(lambda pair: (pair[0] + 1, pair[1] * 2))
)
ten_range_rdd_mapped.collect()  # collect is an action!

[(2, 2), (3, 4), (4, 6), (5, 8), (6, 10), (7, 12), (8, 14), (9, 16), (10, 18)]

In [18]:
# collect() returns a plain Python list, so this name holds a local collection, not an RDD
ten_range_rdd_mapped_again = (
    ten_range_rdd_mapped
    .map(lambda pair: (pair[0] * 2, pair[1]))
    .collect()
)
print(ten_range_rdd_mapped_again)

[(4, 2), (6, 4), (8, 6), (10, 8), (12, 10), (14, 12), (16, 14), (18, 16), (20, 18)]


#### flatMap(func) : Similar to map, but flattens the final result. 

In [19]:
# Each (a, b) pair yields the two values a and b + 1; flatMap splices them into one flat RDD
ten_range_rdd_flatmapped = ten_range_rdd_mapped.flatMap(
    lambda pair: (pair[0], pair[1] + 1)
)
ten_range_rdd_flatmapped.collect()

[2, 3, 3, 5, 4, 7, 5, 9, 6, 11, 7, 13, 8, 15, 9, 17, 10, 19]

#### filter(func) : Return a new dataset formed by selecting those elements of the source on which func returns true.

In [20]:
rdd_hp.collect() # Revisit the text-file RDD created earlier

['The place where things are hidden',
 'If you have to ask you will never know',
 'If you know you need only ask',
 'The hidden room of requirement']

In [21]:
# Keep only the lines that begin with 'If'
rdd_hp_filtered = rdd_hp.filter(lambda text: text.startswith('If'))
rdd_hp_filtered.collect()

['If you have to ask you will never know', 'If you know you need only ask']

##### USER EXERCISE

In [25]:
# Convert every line to uppercase
rdd_hp1 = rdd_hp.map(lambda text: text.upper())
rdd_hp1.collect()

['THE PLACE WHERE THINGS ARE HIDDEN',
 'IF YOU HAVE TO ASK YOU WILL NEVER KNOW',
 'IF YOU KNOW YOU NEED ONLY ASK',
 'THE HIDDEN ROOM OF REQUIREMENT']

In [44]:
# Remove the lines which contain the word "HIDDEN"
rdd_hp2 = rdd_hp1.filter(lambda text: 'HIDDEN' not in text)
rdd_hp2.collect()

['IF YOU HAVE TO ASK YOU WILL NEVER KNOW', 'IF YOU KNOW YOU NEED ONLY ASK']

In [51]:
# Create a new RDD whose elements are tuples (x, y), where x is a line and y is the length of that line
rdd_hp1.map(lambda text: (text, len(text))).collect()


[('THE PLACE WHERE THINGS ARE HIDDEN', 33),
 ('IF YOU HAVE TO ASK YOU WILL NEVER KNOW', 38),
 ('IF YOU KNOW YOU NEED ONLY ASK', 29),
 ('THE HIDDEN ROOM OF REQUIREMENT', 30)]

In [57]:
# Remove lines that are longer than 30 characters (i.e. keep lines of at most 30 characters)
rdd_hp4 = rdd_hp1.filter(lambda text: len(text) <= 30)
rdd_hp4.collect()
rdd_hp4.collect()

['IF YOU KNOW YOU NEED ONLY ASK', 'THE HIDDEN ROOM OF REQUIREMENT']

In [65]:
# Arrange all the words in a single list using a one-liner and count the total number of words
# flatMap splits each line on whitespace and flattens the per-line word lists into one RDD of words.
# NOTE(review): the word count asked for above is not computed in this cell;
# rdd_hp5.count() would return it.
rdd_hp5 = rdd_hp.flatMap(lambda line: line.split())

In [66]:
# Show every word of the file as one flat list
rdd_hp5.collect()

['The',
 'place',
 'where',
 'things',
 'are',
 'hidden',
 'If',
 'you',
 'have',
 'to',
 'ask',
 'you',
 'will',
 'never',
 'know',
 'If',
 'you',
 'know',
 'you',
 'need',
 'only',
 'ask',
 'The',
 'hidden',
 'room',
 'of',
 'requirement']

In [73]:
# Pair each word with its length in a tuple, e.g. ("word", 4)
rdd_hp5.map(lambda word: (word, len(word))).collect()

[('The', 3),
 ('place', 5),
 ('where', 5),
 ('things', 6),
 ('are', 3),
 ('hidden', 6),
 ('If', 2),
 ('you', 3),
 ('have', 4),
 ('to', 2),
 ('ask', 3),
 ('you', 3),
 ('will', 4),
 ('never', 5),
 ('know', 4),
 ('If', 2),
 ('you', 3),
 ('know', 4),
 ('you', 3),
 ('need', 4),
 ('only', 4),
 ('ask', 3),
 ('The', 3),
 ('hidden', 6),
 ('room', 4),
 ('of', 2),
 ('requirement', 11)]

#### groupByKey() : When called on a dataset of (K, V) pairs, returns a dataset of (K, Iterable &lt;V&gt;) pairs. 

In [74]:
# (country, city) pairs; a country may appear several times with different cities
rdd_places = sc.parallelize([
    ("Finland", "Helsinki"),
    ("Norway", "Oslo"),
    ("Sweden", "Stockholm"),
    ("Denmark", "Copenhagen"),
    ("Norway", "Bergen"),
    ("Finland", "Tampere"),
    ("Denmark", "Aarhus"),
    ("Finland", "Turku"),
])
rdd_places.collect()

[('Finland', 'Helsinki'),
 ('Norway', 'Oslo'),
 ('Sweden', 'Stockholm'),
 ('Denmark', 'Copenhagen'),
 ('Norway', 'Bergen'),
 ('Finland', 'Tampere'),
 ('Denmark', 'Aarhus'),
 ('Finland', 'Turku')]

In [116]:
# Group the cities by country; each grouped value is a ResultIterable, not a list
rdd_places_grouped = rdd_places.groupByKey()
rdd_places_grouped.collect()

[('Finland', <pyspark.resultiterable.ResultIterable at 0x7f27261fbb38>),
 ('Norway', <pyspark.resultiterable.ResultIterable at 0x7f27261fb7f0>),
 ('Denmark', <pyspark.resultiterable.ResultIterable at 0x7f27261fb588>),
 ('Sweden', <pyspark.resultiterable.ResultIterable at 0x7f27261fbf98>)]

In [125]:
# User-readable format: materialize each ResultIterable into a list
rdd_places_grouped.map(lambda kv: (kv[0], list(kv[1]))).collect()

[('Finland', ['Helsinki', 'Tampere', 'Turku']),
 ('Norway', ['Oslo', 'Bergen']),
 ('Denmark', ['Copenhagen', 'Aarhus']),
 ('Sweden', ['Stockholm'])]

In [128]:
# Same result as the previous cell, using mapValues to convert only the value of each pair
rdd_places_grouped.mapValues(list).collect()

[('Finland', ['Helsinki', 'Tampere', 'Turku']),
 ('Norway', ['Oslo', 'Bergen']),
 ('Denmark', ['Copenhagen', 'Aarhus']),
 ('Sweden', ['Stockholm'])]

#### join(otherDataset, [numTasks]) : When called on datasets of type (K, V) and (K, W), returns a dataset of (K, (V, W)) pairs with all pairs of elements for each key. Outer joins are also supported through leftOuterJoin, rightOuterJoin, and fullOuterJoin.

In [76]:
# A second (country, city) dataset; it has no entry for Denmark
rdd_places_other = sc.parallelize([
    ("Finland", "Espoo"),
    ("Norway", "Stavanger"),
    ("Sweden", "Gothenburg"),
    ("Finland", "Vantaa"),
])
# Inner join: one output pair per combination of matching values for each shared key
rdd_places.join(rdd_places_other).collect()

[('Norway', ('Oslo', 'Stavanger')),
 ('Norway', ('Bergen', 'Stavanger')),
 ('Finland', ('Helsinki', 'Espoo')),
 ('Finland', ('Helsinki', 'Vantaa')),
 ('Finland', ('Tampere', 'Espoo')),
 ('Finland', ('Tampere', 'Vantaa')),
 ('Finland', ('Turku', 'Espoo')),
 ('Finland', ('Turku', 'Vantaa')),
 ('Sweden', ('Stockholm', 'Gothenburg'))]

In [77]:
# Join the grouped city lists with the other dataset: each list is paired with every matching value
rdd_places_grouped.mapValues(list).join(rdd_places_other).collect()

[('Norway', (['Oslo', 'Bergen'], 'Stavanger')),
 ('Finland', (['Helsinki', 'Tampere', 'Turku'], 'Espoo')),
 ('Finland', (['Helsinki', 'Tampere', 'Turku'], 'Vantaa')),
 ('Sweden', (['Stockholm'], 'Gothenburg'))]

#### cogroup(otherDataset, [numTasks]) : When called on datasets of type (K, V) and (K, W), returns a dataset of (K, (Iterable&lt;V&gt; , Iterable&lt;W&gt;)) tuples.

In [78]:
# For each key, gather the values from rdd_places and from rdd_places_other into a pair of iterables
rdd_places_other_cogrouped = rdd_places.cogroup(rdd_places_other)

In [79]:
# Convert both iterables of each cogrouped entry to lists so they can be displayed
rdd_places_other_cogrouped.map(
    lambda kv: (kv[0], (list(kv[1][0]), list(kv[1][1])))
).collect()

[('Norway', (['Oslo', 'Bergen'], ['Stavanger'])),
 ('Finland', (['Helsinki', 'Tampere', 'Turku'], ['Espoo', 'Vantaa'])),
 ('Denmark', (['Copenhagen', 'Aarhus'], [])),
 ('Sweden', (['Stockholm'], ['Gothenburg']))]

#### sortByKey([ascending], [numTasks]) : When called on a dataset of (K, V) pairs, returns a dataset of (K, V) pairs sorted by keys in ascending or descending order, as specified in the boolean ascending argument.

In [80]:
# Sort by key (ascending order is the default)
rdd_places.sortByKey().collect()

[('Denmark', 'Copenhagen'),
 ('Denmark', 'Aarhus'),
 ('Finland', 'Helsinki'),
 ('Finland', 'Tampere'),
 ('Finland', 'Turku'),
 ('Norway', 'Oslo'),
 ('Norway', 'Bergen'),
 ('Sweden', 'Stockholm')]

In [81]:
# Sort by key in descending order
rdd_places.sortByKey(ascending=False).collect()

[('Sweden', 'Stockholm'),
 ('Norway', 'Oslo'),
 ('Norway', 'Bergen'),
 ('Finland', 'Helsinki'),
 ('Finland', 'Tampere'),
 ('Finland', 'Turku'),
 ('Denmark', 'Copenhagen'),
 ('Denmark', 'Aarhus')]

#### reduceByKey(func, [numTasks]) : When called on a dataset of (K, V) pairs, returns a dataset of (K, V) pairs where the values for each key are aggregated using the given reduce function func, which must be of type (V,V) => V.

In [133]:
# (player, value) pairs; some players appear more than once
rdd_reduce_example = sc.parallelize([
    ("Messi", 2),
    ("Suarez", 2),
    ("Ronaldo", 0),
    ("Ronaldo", 3),
    ("Messi", 2),
])
# Sum the values of each key
rdd_reduce_example.reduceByKey(lambda total, value: total + value).collect()

[('Ronaldo', 3), ('Suarez', 2), ('Messi', 4)]

In [134]:
# Sort by value, highest first.
# sortByKey() would order the pairs by player name (the key), not by the summed value;
# sortBy with a key function on the second tuple element orders them by value.
(
    rdd_reduce_example
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
    .collect()
)

[('Messi', 4), ('Ronaldo', 3), ('Suarez', 2)]

In [137]:
# Sort the original (unreduced) pairs by key
rdd_reduce_example.sortByKey().collect()

[('Messi', 2), ('Messi', 2), ('Ronaldo', 0), ('Ronaldo', 3), ('Suarez', 2)]

## SOME RDD ACTIONS

In [85]:
# Revisit the (country, city) RDD used in the examples below
rdd_places.collect()

[('Finland', 'Helsinki'),
 ('Norway', 'Oslo'),
 ('Sweden', 'Stockholm'),
 ('Denmark', 'Copenhagen'),
 ('Norway', 'Bergen'),
 ('Finland', 'Tampere'),
 ('Denmark', 'Aarhus'),
 ('Finland', 'Turku')]

In [87]:
# Check the total number of elements in rdd_places
rdd_places.count()

8

In [92]:
# Output the first element in the RDD (take(1) returns it wrapped in a list)
rdd_places.take(1)

[('Finland', 'Helsinki')]

In [93]:
# Output a list with the first two elements
rdd_places.take(2)

[('Finland', 'Helsinki'), ('Norway', 'Oslo')]

In [126]:
# Output a sample of 2 random elements without replacement
# (the first argument, withReplacement, is False); the result varies between runs
rdd_places.takeSample(False,2)

[('Denmark', 'Aarhus'), ('Finland', 'Turku')]

In [104]:
# Output the first 2 elements according to their natural ordering
# (tuples compare by country first, then by city)
rdd_places.takeOrdered(2)

[('Denmark', 'Aarhus'), ('Denmark', 'Copenhagen')]

In [None]:
# Output the total number of elements for each key

In [108]:
# Count the elements for each key; the result is a local defaultdict, not an RDD
rdd_places.countByKey()

defaultdict(int, {'Denmark': 2, 'Finland': 3, 'Norway': 2, 'Sweden': 1})