In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Obtain the Spark session used by every cell below; evaluating `spark`
# as the cell's last expression displays the session object.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
spark

The following code reads a text file containing many rows. We want to count the total number of words in the rows that start with "BG:".

In [2]:
# Location of the biographies listing, relative to the notebook.
BIOGRAPHIES_PATH = "./data/biographies.list.gz"

# Load the file as text: the resulting DataFrame has a single string
# column named `value` (see the preview below).
df = spark.read.text(BIOGRAPHIES_PATH)

# Preview the rows without truncating long lines.
df.show(truncate=False)

+-------------------------------------------------------------------------------+
|value                                                                          |
+-------------------------------------------------------------------------------+
|CRC: 0x72F64CB3  File: biographies.list  Date: Sat Dec 19 00:00:00 2015        |
|                                                                               |
|BIOGRAPHY LIST                                                                 |
|-------------------------------------------------------------------------------|
|NM: "Ghost", Matthew Clark                                                     |
|                                                                               |
|NK: Ghost                                                                      |
|                                                                               |
|HT: 5' 9"                                                                      |
|               

In order to reach our aim, the following code: 
* Keeps only the rows that start with "BG:".
* Removes all the punctuation
* Splits each row on whitespace
* Removes from each resulting array all the empty strings
* Removes the `BG`'s ([sorry guys](https://images-na.ssl-images-amazon.com/images/I/71Qh%2BLuRE0L._AC_SX522_.jpg))
* Counts the number of words for each row
* Sums the per-row word counts to obtain the total number of words

In [3]:
# Turn every "BG:" row into an array of words plus the array's length.
#
# NOTE: this cell rebinds `df`, so it is not idempotent: after it has run,
# `value` is an array column and the string operations below no longer apply.
# Re-run the loading cell before re-running this one.
df = (
    df
    # Keep only the biography-text rows.
    .where(F.col("value").startswith("BG:"))
    # Strip punctuation. The pattern is a raw string so that `\p` reaches
    # Spark's regex engine unchanged instead of being treated by Python as an
    # (invalid) string escape sequence.
    .select(F.regexp_replace("value", r"\p{Punct}", "").alias("value"))
    # Split on runs of whitespace (raw string for the same reason as above).
    .select(F.split(F.col("value"), r"\s+").alias("value"))
    # Drop the empty strings that splitting can leave behind.
    .select(F.array_remove("value", "").alias("value"))
    # The "BG:" prefix became the token "BG" once punctuation was stripped;
    # remove it. This drops every array element equal to "BG", not only the
    # leading one.
    .select(F.array_remove("value", "BG").alias("value"))
    # Number of remaining words in each row.
    .select("value", F.size("value").alias("word_count"))
)

df.show(truncate=False)

+-----------------------------------------------------------------------------------------+----------+
|value                                                                                    |word_count|
+-----------------------------------------------------------------------------------------+----------+
|[Richard, Matthew, Clark, was, born, and, raised, in, southeastern, NC, in, a]           |12        |
|[small, town, called, Chadbourn, NC]                                                     |5         |
|[]                                                                                       |0         |
|[He, has, a, tragic, tale, filled, with, emotion, that, becomes, one, with, the]         |13        |
|[canvas, he, paints, Clark, has, a, way, through, any, media, to, lead, you, into, a]    |15        |
|[world, that, traps, your, soul]                                                         |5         |
|[]                                                                      

In [4]:
# Add up the per-row `word_count` values into a single grand total.
#
# NOTE: `df` is rebound to the one-row result, which no longer has a
# `word_count` column, so this cell cannot be re-run on its own; re-run
# the cells above first.
word_count_total = F.sum(F.col("word_count"))
df = df.select(word_count_total)
df.show(truncate=False)

+---------------+
|sum(word_count)|
+---------------+
|25581588       |
+---------------+

