In [27]:
# Use Apache Spark to count the words in the complete works of Shakespeare.
# Set up the environment with the following:
# - spark-2.3.0-bin-hadoop2.7.tgz
# - jdk1.8.0_171.jdk
# - scala-2.11.12
# - sbt-0.13.17.tgz
#
# Started jupyter notebook with the following command:
# 'PYSPARK_DRIVER_PYTHON="jupyter" PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark'

In [28]:
# import all relevant functions
from pyspark.sql.functions import regexp_replace, trim, col, lower, split, explode, desc

# helper methods #
# Remove all punctuation
def removePunctuation(text):
    """Build a column expression that strips punctuation, trims and lowercases.

    Two regex passes are applied to ``text``. The first deletes every
    character that is neither a regex word character nor whitespace, which
    includes apostrophes. The second deletes underscores, which count as
    word characters and therefore survive the first pass. The result is
    then trimmed and converted to lower case.

    Args:
        text: The column to clean, as accepted by ``regexp_replace``.

    Returns:
        The expression produced by ``lower(trim(...))`` over the cleaned text.
    """
    # Raw string: in a plain literal "\w" and "\s" are invalid Python escape
    # sequences and trigger a warning. The runtime value of the pattern is
    # unchanged, so the auto-generated column name stays the same.
    withoutSymbols = regexp_replace(text, r'[^\w\s]', '')
    withoutUnderscores = regexp_replace(withoutSymbols, '_', '')
    return lower(trim(withoutUnderscores))

# Count words in the given text
def wordCount(text):
    """Return the number of occurrences of each distinct word.

    Args:
        text: Despite the name, not a string but an object exposing
            ``groupBy`` (here a DataFrame) with a column named 'word'.

    Returns:
        Whatever ``count()`` yields on the rows grouped by 'word'.
    """
    groupedByWord = text.groupBy('word')
    occurrences = groupedByWord.count()
    return occurrences

In [29]:
# Read the complete works of Shakespeare line by line ...
rawLines = sqlContext.read.text("shakespeare_cleaned.txt")

# ... and normalise every line (punctuation removed, trimmed, lowercased)
shakeSpeare = rawLines.select(removePunctuation(col('value')))

# Display just the first ten rows, without truncating long lines
shakeSpeare.show(10, truncate=False)

+------------------------------------------------------------------+
|lower(trim(regexp_replace(regexp_replace(value, [^\w\s], ), _, )))|
+------------------------------------------------------------------+
|1609                                                              |
|                                                                  |
|the sonnets                                                       |
|                                                                  |
|by william shakespeare                                            |
|                                                                  |
|                                                                  |
|                                                                  |
|1                                                                 |
|from fairest creatures we desire increase                         |
+------------------------------------------------------------------+
only showing top 10 rows



In [30]:
# Split every cleaned line into single words, one row per word.
# The pattern is a raw string so the regex escape reaches Spark untouched
# ("\s" in a plain literal is an invalid Python escape sequence), and "\s+"
# treats a run of whitespace as one separator instead of emitting an empty
# token between each pair of adjacent whitespace characters.
shakeSpearWords = (
    shakeSpeare
    .select(explode(split(shakeSpeare[0], r"\s+")).alias("word"))
    .where('word != ""')  # blank lines still yield one empty token; drop it
)
shakeSpearWords.show()

+-----------+
|       word|
+-----------+
|       1609|
|        the|
|    sonnets|
|         by|
|    william|
|shakespeare|
|          1|
|       from|
|    fairest|
|  creatures|
|         we|
|     desire|
|   increase|
|       that|
|    thereby|
|    beautys|
|       rose|
|      might|
|      never|
|        die|
+-----------+
only showing top 20 rows



In [31]:
# Total number of words in the whole corpus
totalWordCount = shakeSpearWords.count()
print(totalWordCount)

882996


In [32]:
# Count each distinct word, rank by frequency (most frequent first)
# and display the 24 most used words
wordFrequencies = wordCount(shakeSpearWords)
topWordsAndCounts = wordFrequencies.orderBy(desc('count'))
topWordsAndCounts.show(24)

+----+-----+
|word|count|
+----+-----+
| the|27361|
| and|26028|
|   i|20681|
|  to|19150|
|  of|17463|
|   a|14593|
| you|13615|
|  my|12480|
|  in|10956|
|that|10890|
|  is| 9134|
| not| 8497|
|with| 7771|
|  me| 7768|
|  it| 7678|
| for| 7558|
| his| 6857|
|  be| 6857|
|your| 6655|
|this| 6602|
| but| 6265|
|  he| 6251|
|have| 5880|
|  as| 5733|
+----+-----+
only showing top 24 rows

