In [7]:
# Import the tools needed to use Spark.
# findspark.init() locates the local Spark installation and adds PySpark to
# sys.path, so it has to run before `import pyspark` when PySpark is not
# already importable from the current environment.
import findspark
findspark.init()

import pyspark

# Reuse the active SparkContext if one exists instead of creating a second one,
# which keeps this cell safe to re-run in the same kernel.
sc = pyspark.SparkContext.getOrCreate()

In [31]:
# Check which version of Spark this context is running
sc.version

'3.1.1'

In [12]:
# Read the README.md file, which is located in the same folder, into an RDD
# (each item of the RDD is one line of the file)
readme = sc.textFile("README.md")

In [14]:
# Count the number of items (lines) in the RDD
readme.count()

98

In [15]:
# Show the first item in the RDD
readme.first()

'# Apache Spark'

In [16]:
# Use the filter transformation to build a new RDD that keeps only the lines
# of the file containing the substring "Spark"
linesWithSpark = readme.filter(lambda line: "Spark" in line)

In [17]:
# Find out how many lines contain the word "Spark".
# Reuses the linesWithSpark RDD defined in the previous cell instead of
# spelling out the same filter a second time.
linesWithSpark.count()

18

In [19]:
# Find the largest number of words on any single line of README.md.
# Note: the result is the word count (an int), not the text of that line.
# map() turns each line into its word count; reduce() keeps the larger of each pair.
readme.map(lambda line: len(line.split())).reduce(lambda a, b: a if (a > b) else b)

14

In [20]:
def larger_of(a, b):
    """Return the larger of ``a`` and ``b`` (``b`` when they compare equal).

    Named ``larger_of`` rather than ``max`` so that the built-in ``max`` is
    not shadowed for every cell executed afterwards in this kernel.
    """
    if a > b:
        return a
    return b


# Largest number of words on any line, using a named function as the reducer
readme.map(lambda line: len(line.split())).reduce(larger_of)

14

In [21]:
# Word count with flatMap, map and reduceByKey:
#   1. flatMap    - split every line into individual words
#   2. map        - pair each word with the number 1
#   3. reduceByKey - add up the 1s that share the same word
wordCounts = (
    readme
    .flatMap(lambda text: text.split())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [23]:
# Bring every (word, count) pair back to the driver as a Python list
wordCounts.collect()

[('#', 1),
 ('Apache', 1),
 ('Spark', 14),
 ('is', 6),
 ('It', 2),
 ('provides', 1),
 ('high-level', 1),
 ('APIs', 1),
 ('in', 5),
 ('Scala,', 1),
 ('Java,', 1),
 ('an', 3),
 ('optimized', 1),
 ('engine', 1),
 ('supports', 2),
 ('computation', 1),
 ('analysis.', 1),
 ('set', 2),
 ('of', 5),
 ('tools', 1),
 ('SQL', 2),
 ('MLlib', 1),
 ('machine', 1),
 ('learning,', 1),
 ('GraphX', 1),
 ('graph', 1),
 ('processing,', 1),
 ('Documentation', 1),
 ('latest', 1),
 ('programming', 1),
 ('guide,', 1),
 ('[project', 2),
 ('README', 1),
 ('only', 1),
 ('basic', 1),
 ('instructions.', 1),
 ('Building', 1),
 ('using', 2),
 ('[Apache', 1),
 ('run:', 1),
 ('do', 2),
 ('this', 1),
 ('downloaded', 1),
 ('documentation', 3),
 ('project', 1),
 ('site,', 1),
 ('at', 2),
 ('Spark"](http://spark.apache.org/docs/latest/building-spark.html).', 1),
 ('Interactive', 2),
 ('Shell', 2),
 ('The', 1),
 ('way', 1),
 ('start', 1),
 ('Try', 1),
 ('following', 2),
 ('1000:', 2),
 ('scala>', 1),
 ('1000).count()', 1),


In [25]:
# Determine the most frequent word in the README and how many times it was used.
# Each item is a (word, count) tuple; the reducer keeps whichever of the two
# has the larger count (index 1 of the tuple).
wordCounts.reduce(lambda a, b: a if (a[1] > b[1]) else b)

('the', 21)

In [26]:
# Print the number of README lines that contain "Spark"
print(linesWithSpark.count())

18


In [27]:
from timeit import Timer


def count():
    """Run the count action on linesWithSpark and return the number of lines."""
    return linesWithSpark.count()


# Timer accepts a zero-argument callable directly, so no wrapping lambda is needed
t = Timer(count)

In [28]:
# Total time in seconds for 50 calls of count(), measured before the RDD is cached
print(t.timeit(number=50))

93.94240620000028


In [29]:
# Mark the RDD for caching. cache() is lazy: the data is only stored the first
# time an action computes it, so run one count() up front to populate the cache
# and keep that one-off cost out of the timed runs.
linesWithSpark.cache()
linesWithSpark.count()

# Total time in seconds for 50 calls of count() against the cached RDD
print(t.timeit(number=50))

97.42261800000006
