For help, look here:
https://spark.apache.org/docs/latest/rdd-programming-guide.html

In [2]:
# List the entries at the DBFS root to see which pre-loaded datasets are there
root_entries = dbutils.fs.ls("dbfs:/")
display(root_entries)

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/tmp/,tmp/,0


In [3]:
# Build an RDD from the Spark README text file (sc is the SparkContext)
readme_path = "dbfs:/databricks-datasets/SPARK_README.md"
rdd = sc.textFile(readme_path)

In [4]:
# Peek at the first 20 lines of the RDD
rdd.take(num=20)

In [5]:
# Example: lambda functions
# Each line is split on single spaces; flatMap flattens the per-line lists into one RDD of words.
words = rdd.flatMap(lambda line: line.split(" "))
words.take(num=10)

In [6]:
# Starting from the words RDD of the previous cell:
# 1. count all the words
wordCounts = (
    words
    .map(lambda word: (word, 1))                      # pair every word with a count of 1
    .reduceByKey(lambda total, count: total + count)  # add up the counts per distinct word
)

for pair in wordCounts.collect():
  print(pair)

In [7]:
# 2. change all capital letters to lower case
# The word is lower-cased while its (word, 1) pair is built, so spellings that
# differ only in case end up under the same key.
lowerCaseWords = (
    words
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda total, count: total + count)
)

for pair in lowerCaseWords.collect():
  print(pair)

In [8]:
# 3. eliminate stopwords
# The stop words are listed in lower case. '' is included because splitting a line
# on a single space yields empty strings for blank lines and consecutive spaces.
stopWords = ['', '#', '##', 'and', 'to', 'in', 'the', 'a', 'an', 'for', 'on', 'is', 'of']

# Lower-case BEFORE filtering: the stop-word list is all lower case, so filtering the
# raw tokens first would let capitalised stop words ("The", "And", ...) slip through
# and be counted after they are lower-cased.
filteredWords = words.map(lambda x: x.lower())\
                     .filter(lambda x: x not in stopWords)\
                     .map(lambda x: (x, 1))\
                     .reduceByKey(lambda x,y: x + y)

for w in filteredWords.collect():
  print(w)

In [9]:
# 4. sort in alphabetical order
# Lower-case BEFORE filtering: stopWords holds lower-case entries only, so filtering
# the raw tokens first would keep capitalised stop words ("The", "And", ...).
# sortByKey() then orders the (word, count) pairs by the word.
sortedWords = words.map(lambda x: x.lower())\
                   .filter(lambda x: x not in stopWords)\
                   .map(lambda x: (x, 1))\
                   .reduceByKey(lambda x,y: x + y)\
                   .sortByKey()

for w in sortedWords.collect():
  print(w)

In [10]:
# 5.** remove punctuation
# Every listed punctuation character is mapped to a space before the line is split,
# so punctuation attached to a word no longer produces a separate key.
punctuation_table = str.maketrans(",.!?:;[]()-/=+#", " " * 15)

same_words = (
    rdd
    .map(lambda line: line.translate(punctuation_table))
    .flatMap(lambda line: line.split())   # split() with no argument also drops empty tokens
    .map(lambda word: word.lower())
    .filter(lambda word: word not in stopWords)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda total, count: total + count)
    .sortByKey()
)

for pair in same_words.collect():
  print(pair)

In [11]:
# 6. sort from most to least frequent word
# Lower-case BEFORE filtering: stopWords holds lower-case entries only, so filtering
# the raw tokens first would keep capitalised stop words ("The", "And", ...).
# The pairs are flipped to (count, word) so that sortByKey orders them by count;
# ascending=False puts the most frequent words first.
sortedWords_2 = words.map(lambda x: x.lower())\
                     .filter(lambda x: x not in stopWords)\
                     .map(lambda x: (x, 1))\
                     .reduceByKey(lambda x,y: x + y)\
                     .map(lambda x: (x[1], x[0])) \
                     .sortByKey(ascending=False)

for w in sortedWords_2.collect():
  print(w)