In [None]:
"""
NOTE: to be able to start this Jupyter notebook, I added the statement below to $HOME/.bashrc
using the vi editor (in the Cloudera VM):
export PATH=$PATH:/home/cloudera/anaconda3/bin
and then ran the following in the terminal:

source $HOME/.bashrc
pyspark
"""

In [None]:
# A SparkContext is the way to access the Spark cluster; it is used to create RDDs,
# accumulators and broadcast variables on that cluster.

# To create a SparkContext, a SparkConf should be built first. The SparkConf holds
# the configuration parameters that define our Spark driver application and/or
# are used by Spark to allocate resources on the cluster, such as the memory size
# and the cores used by the executors running on the worker nodes.

In [None]:
from pyspark import SparkConf
from pyspark import SparkContext
# NOTE(review): the alias below binds `sc` to the SparkContext *class*, not to a live
# context; `sc` is rebound to a real context by getOrCreate() in the next cell.
from pyspark import SparkContext as sc

In [None]:
# Build the Spark configuration first, then obtain the context from it.
#
# SparkConf() starts from the default system parameters.
# local[*] runs Spark locally with as many worker threads as logical cores on the machine.
# local[2] would create two threads, which may be needed for Spark Streaming to avoid starvation.
conf = SparkConf().setMaster("local[*]")

# getOrCreate() instantiates a SparkContext and registers it as a singleton object.
# `sc` has to be a SparkContext instance: calling textFile() on the SparkContext class
# itself fails with "TypeError: textFile() missing 1 required positional argument: 'name'".
sc = SparkContext.getOrCreate(conf)

In [None]:
# Load the input text from HDFS; every element of the RDD 'lines' is one line of the file.
words_path = "hdfs:/user/cloudera/words.txt"
lines = sc.textFile(words_path)

In [None]:
# Count the number of lines in the RDD.
#
# This call originally threw an error saying the kernel did not know which Python
# version to use. To fix it, the lines below were added to $HOME/.bashrc:
#
#   export IPYTHON=1
#   export PYSPARK_PYTHON=python3
#   export PYSPARK_DRIVER_PYTHON=ipython3
#   export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
#
# Recorded output: 124456

# The count is kept as the LAST expression of the cell so that the notebook displays
# it; a bare string literal placed after it would be shown instead of the number.
lines.count()

In [None]:
# flatMap() applies the given function to every line of 'lines' and flattens the
# per-line results into one RDD of words.
# The lambda notation is an anonymous function in Python, i.e. a function defined
# without a name. Here it takes a single argument, line, and returns the list of
# words on that line.
#
# split() is called WITHOUT a separator on purpose: it splits on runs of whitespace
# and never returns empty strings. split(" ") returns '' for blank lines and for
# every extra consecutive space, and those empty strings would then be counted as
# a "word" in the final result.

words = lines.flatMap(lambda line: line.split())

In [None]:
# Mapper step: assign an initial count to each word.


def to_pair(word):
    """Return the tuple (word, 1): the word paired with an initial count of 1."""
    return (word, 1)


tuples = words.map(to_pair)

In [None]:
# 'flatMap' was used to build 'words' because each line is split into a set of words,
# i.e. there is a one-to-many mapping between input lines and output words.
# Plain 'map' was used to build 'tuples' because we want one tuple for every word,
# i.e. there is a one-to-one mapping between the input words and output tuples.
# 'flatMap' is used for one-to-many or one-to-none kinds of mappings.

In [None]:
# Reducer step: sum the count values of all tuples that share the same word.
# reduceByKey() calls the given function for tuples with the same key (the word);
# the function receives two count values and returns their sum.
# 'counts' is again an RDD.


def add_counts(a, b):
    """Return the sum of two count values belonging to the same word."""
    return a + b


counts = tuples.reduceByKey(add_counts)

In [None]:
# Persist the word counts as text in HDFS.
# coalesce(1) combines all the RDD partitions into a single partition, since we want
# a single output file; saveAsTextFile() then writes the RDD to the specified location.
# The location is a directory that receives part files (e.g. part-00000), not a
# single file named 'counts.txt'.
output_dir = 'hdfs:/user/cloudera/wordcount/outputDir2'
single_partition = counts.coalesce(1)
single_partition.saveAsTextFile(output_dir)

In [None]:
# Viewing the word counts written to HDFS
#
# The counts are written to hdfs:/user/cloudera/wordcount/outputDir2 by the cell
# above. saveAsTextFile() is deliberately NOT repeated here: by default Spark refuses
# to write into an output directory that already exists, so a second save to the
# same path would make this cell fail.
#
# To view the result, copy the output file from HDFS to the local file system.
# On the Cloudera VM terminal, run:
#
#   hadoop fs -ls /user/cloudera/wordcount/outputDir2   # list the contents of outputDir2
#
#   -rw-r--r--   1 cloudera cloudera          0 2018-08-25 06:21 /user/cloudera/wordcount/outputDir2/_SUCCESS
#   -rw-r--r--   1 cloudera cloudera    1055325 2018-08-25 06:21 /user/cloudera/wordcount/outputDir2/part-00000
#
#   hadoop fs -copyToLocal wordcount/outputDir2/part-00000 count.txt
#   more count.txt
#
# Recorded sample of count.txt (the '' key is what splitting lines on a single
# space produces for blank lines and repeated spaces):
#
#   ('', 517065)
#   ('VENTIDIUS', 3)
#   ('Stockfish,', 1)
#   ('Corin,', 2)
#   ('Begin', 6)