In [1]:
# Setup of Apache Spark context
# - Needed to install openjdk8, scala and findspark on ubuntu
# - Also copied spark folder to /usr/local/spark-2.3.1-bin-hadoop2.7
# - defined $SPARK_HOME variable
# Started jupyter notebook normally with 'jupyter notebook' command
# Import of findspark and initializing it to import pyspark
# If this block runs without errors, Spark is ready to be used!
import os
import findspark
# Prefer the SPARK_HOME environment variable (the setup notes above say it is
# defined) and only fall back to the fixed install path when it is unset or
# empty, so the notebook is not tied to one machine's directory layout.
findspark.init(os.environ.get('SPARK_HOME') or '/usr/local/spark-2.3.1-bin-hadoop2.7')
import pyspark

In [2]:
# Import all needed functions and configurations
from pyspark.sql.functions import regexp_replace, trim, col, lower, split, explode, desc
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
# Build the Spark configuration for this notebook.
conf = SparkConf().setAppName('pyspark')
# getOrCreate returns the already running context when this cell is executed
# again in the same kernel; calling the SparkContext constructor a second time
# would raise because only one context may be active at once.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used below to read the text file into a DataFrame.
sqlContext = SQLContext(sc)

In [3]:
# Helper methods
def removePunctuation(text):
    """Build an expression that strips punctuation from ``text``.

    Two replacement passes are applied: the first drops every character that
    is neither a word character nor whitespace, the second drops underscores
    (which count as word characters and therefore survive the first pass).
    The result is then trimmed and lower-cased.

    :param text: value handed to ``regexp_replace``; the notebook passes
        ``col('value')``.
    :return: the result of ``lower(trim(...))`` applied to the cleaned text.
    """
    # Raw string literal: backslash-w and backslash-s are not valid Python
    # escapes, so the non-raw form triggers an invalid-escape warning on
    # modern interpreters. The pattern passed on is exactly the same.
    withoutSymbols = regexp_replace(text, r'[^\w\s]', '')
    withoutUnderscores = regexp_replace(withoutSymbols, '_', '')
    return lower(trim(withoutUnderscores))
# Count how often each word occurs in the given data
def countWords(text):
    """Group ``text`` by its 'word' column and count the rows per group.

    :param text: object exposing ``groupBy``; the notebook passes the
        DataFrame holding one word per row.
    :return: whatever ``text.groupBy('word').count()`` produces.
    """
    groupedByWord = text.groupBy('word')
    return groupedByWord.count()



# ***** Beginning of exercise: ********
# Read the Shakespeare text file and keep a single column holding each line
# with punctuation removed, trimmed and lower-cased.
rawLines = sqlContext.read.text("shakespeare_cleaned.txt")
cleanedLine = removePunctuation(col('value'))
shakespeare = rawLines.select(cleanedLine)

# Show the first 10 rows untruncated to check that the file loaded correctly
shakespeare.show(10, truncate=False)

+------------------------------------------------------------------+
|lower(trim(regexp_replace(regexp_replace(value, [^\w\s], ), _, )))|
+------------------------------------------------------------------+
|1609                                                              |
|                                                                  |
|the sonnets                                                       |
|                                                                  |
|by william shakespeare                                            |
|                                                                  |
|                                                                  |
|                                                                  |
|1                                                                 |
|from fairest creatures we desire increase                         |
+------------------------------------------------------------------+
only showing top 10 rows



In [4]:
# Split every cleaned line on whitespace and emit one row per token in a
# column named "word"; empty tokens are filtered out afterwards.
# The separator is a raw string because backslash-s is not a valid Python
# escape; the regex handed to split is unchanged.
lineTokens = split(shakespeare[0], r"\s")
shakespeareWords = shakespeare.select(explode(lineTokens).alias("word")).where('word != ""')
shakespeareWords.show()

+-----------+
|       word|
+-----------+
|       1609|
|        the|
|    sonnets|
|         by|
|    william|
|shakespeare|
|          1|
|       from|
|    fairest|
|  creatures|
|         we|
|     desire|
|   increase|
|       that|
|    thereby|
|    beautys|
|       rose|
|      might|
|      never|
|        die|
+-----------+
only showing top 20 rows



In [8]:
# Total number of words across the whole text
overallWordCount = shakespeareWords.count()
display('Overall words: %s' % overallWordCount)

'Overall words: 882996'

In [9]:
# Count each word, sort by descending count, and print the 24 most frequent
# words as the exercise asks.
wordFrequencies = countWords(shakespeareWords)
exerciseResult = wordFrequencies.orderBy(desc('count'))
exerciseResult.show(24)

+----+-----+
|word|count|
+----+-----+
| the|27361|
| and|26028|
|   i|20681|
|  to|19150|
|  of|17463|
|   a|14593|
| you|13615|
|  my|12480|
|  in|10956|
|that|10890|
|  is| 9134|
| not| 8497|
|with| 7771|
|  me| 7768|
|  it| 7678|
| for| 7558|
| his| 6857|
|  be| 6857|
|your| 6655|
|this| 6602|
| but| 6265|
|  he| 6251|
|have| 5880|
|  as| 5733|
+----+-----+
only showing top 24 rows

