# Word count using Data Frames

In [14]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *

In [15]:
spark = (SparkSession
        .builder
        .appName("Word Count")
        .getOrCreate())

In [26]:
lines = spark.read.text("../word-count/input/*.txt")

In [27]:
lines.printSchema()

root
 |-- value: string (nullable = true)



In [28]:
words = lines.withColumn("words", explode(split(trim(col("value")), " ")))

In [29]:
words.show(100)

+--------------------+---------------+
|               value|          words|
+--------------------+---------------+
|                    |               |
|***The Project Gu...|         ***The|
|***The Project Gu...|        Project|
|***The Project Gu...|      Gutenberg|
|***The Project Gu...|          Etext|
|***The Project Gu...|             of|
|***The Project Gu...|            The|
|***The Project Gu...|      Gutenberg|
|***The Project Gu...|Encyclopedia***|
|!!!Remember. . .t...|   !!!Remember.|
|!!!Remember. . .t...|              .|
|!!!Remember. . .t...|           .the|
|!!!Remember. . .t...|    information|
|!!!Remember. . .t...|      contained|
|!!!Remember. . .t...|         herein|
|!!!Remember. . .t...|             is|
|!!!Remember. . .t...|           OLD.|
|!!!Remember. . .t...|              .|
|!!!Remember. . .t...|           .!!!|
|*!!!It reflects t...|         *!!!It|
|*!!!It reflects t...|       reflects|
|*!!!It reflects t...|            the|
|*!!!It reflects t...|   

In [30]:
wordcounts = words.groupBy("words").count()

In [31]:
wordcounts.printSchema()

root
 |-- words: string (nullable = false)
 |-- count: long (nullable = false)



In [32]:
wordcounts.show(10)

+---------+-----+
|    words|count|
+---------+-----+
|       By| 1279|
|   Volume|   13|
|    still| 1617|
|    those| 2165|
|     some| 3835|
|     cit.|   20|
|   waters|  132|
|  reddish|   21|
|      art|  982|
|connected|  262|
+---------+-----+
only showing top 10 rows



In [33]:
wordcounts.sort(col("count"), ascending=False).show(10)

+-----+------+
|words| count|
+-----+------+
|  the|178944|
|     |175257|
|   of|114801|
|  and| 82425|
|   to| 63000|
|   in| 55577|
|    a| 54487|
|   is| 29562|
|  was| 26047|
|    I| 24753|
+-----+------+
only showing top 10 rows



In [34]:
wordcounts.filter(col("words") == "Rabbit").show(10)

+------+-----+
| words|count|
+------+-----+
|Rabbit|   29|
+------+-----+

