In [12]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RDD_Lab").master("spark://spark-master:7077").getOrCreate() 
# A SparkContext represents the connection to a Spark cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
sc = spark.sparkContext
sc.setLogLevel('FATAL')

In [14]:
data = [1, 2, 3, 4, 5]
data_rdd = spark.sparkContext.parallelize(data) # create a RDD from an array 
data_rdd = data_rdd.map(lambda x: x*x) # apply a transformation
print(data_rdd.collect()) # collection data

[1, 4, 9, 16, 25]


In [15]:
lines_rdd = spark.sparkContext.textFile("/home/jovyan/data/wordcount.txt") # create a RDD from a text file
# Differences between Flatmap and Map
print(lines_rdd.map(lambda x: x.split(' ')).take(5))
print("\n")
print(lines_rdd.flatMap(lambda x: x.split(' ')).take(5))

[["We've", 'all', 'heard', 'the', 'scare', 'stories', 'about', 'North', 'Korea:', 'the', 'homemade', 'nuclear', 'arsenal', 'built', 'while', 'their', 'people', 'starve', 'and', 'then', 'aimed', 'imprecisely', 'at', 'the', 'rest', 'of', 'the', 'world,', 'a', ''], ['leader', 'so', 'deluded', 'he', 'makes', 'L', 'Ron', 'Hubbard', 'look', 'like', 'a', 'man', 'excessively', 'overburdened', 'with', 'self-doubt', 'and', 'their', 'deep-seated', 'belief', 'that', 'foreign', 'capitalists', 'will', 'invade', 'at', 'any', ''], ['moment', 'and', 'steal', 'all', 'their', 'bauxite.'], ['The', 'popular', 'portrayal', 'of', 'this', 'Marxist', 'nation', 'is', 'something', 'like', 'one', 'of', 'the', 'more', 'harrowing', 'episodes', 'of', 'M*A*S*H,', 'only', 'with', 'the', 'cast', 'of', 'wacky', 'characters', 'replaced', 'by', 'twitchy,', ''], ['heavily', 'armed', 'Stalinist', 'meth', 'addicts']]


["We've", 'all', 'heard', 'the', 'scare']


In [16]:
from operator import add
# https://spark.apache.org/docs/latest/rdd-programming-guide.html#passing-functions-to-spark
def myMapFun(x):
    return (x, 1)

# Word Couting MapReduce
counts = lines_rdd.flatMap(lambda x: x.split(' ')) \
                  .map(myMapFun) \
                  .reduceByKey(add)
output = counts.collect()
for (word, count) in output:
    print("%s: %i" % (word, count))

heard: 1
North: 5
homemade: 1
nuclear: 1
starve: 1
imprecisely: 1
at: 4
rest: 1
of: 11
world,: 1
: 11
leader: 2
he: 2
L: 1
Ron: 1
look: 1
like: 4
self-doubt: 1
deep-seated: 1
belief: 1
foreign: 1
invade: 1
steal: 1
bauxite.: 1
The: 1
portrayal: 1
this: 2
Marxist: 1
nation: 1
is: 2
something: 1
more: 2
M*A*S*H,: 1
only: 1
cast: 1
characters: 1
replaced: 1
twitchy,: 1
heavily: 1
armed: 1
Stalinist: 1
meth: 1
Cracked: 1
would: 2
take: 1
good: 1
things: 2
Korea: 2
though,: 1
country's: 1
suppress: 1
as: 1
motivated: 1
jealousy.: 1
no: 1
different: 1
there's: 1
Korean: 1
likes: 1
after: 1
an: 1
plant: 1
than: 1
nice: 1
go: 1
his: 2
dried: 1
fish: 1
attentive: 1
people's: 1
needs: 1
in: 2
twinkling: 1
Korea's: 1
leadership: 1
bought,: 1
transported: 1
rebuilt: 1
discover: 1
reproduce: 1
secrets: 1
sweet: 1
people,: 1
bottles: 1
And: 3
When: 1
was: 1
last: 1
YOUR: 1
got: 1
YOU,: 1
do: 1
question: 1
are: 1
Or: 1
restaurant: 1
sleeping: 1
optional: 1
even: 2
have: 1
remove: 1
out: 1
Americans: 

In [17]:
# Stop the spark context
spark.stop()