In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, explode, count, collect_list
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import NGram, StopWordsRemover, Tokenizer

In [2]:
from os import environ, path
# Ask PySpark to pull in the spark-xml package when it launches; this has to
# run before the SparkSession below is created for the setting to be seen.
environ.update({'PYSPARK_SUBMIT_ARGS': '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell'})

In [3]:
# Get the Spark session for this notebook (an already-running session is
# reused rather than a second one being started).
spark = (
    SparkSession.builder
    .appName("Python Spark wordcount")
    .getOrCreate()
)

In [4]:
# Stop-word stage: the only "stop word" configured is the "." token, so this
# effectively strips sentence-final periods from the token lists.
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=["."],
)

# N-gram stage: builds word bigrams (n=2) from the filtered token lists.
ngram = NGram(
    n=2,
    inputCol="filtered_words",
    outputCol="ngrams",
)

In [5]:
def _first_fields(rows):
    """Return the first field of every entry in ``rows``.

    Used as the body of the ``take_firsts`` UDF: each entry of the input
    array is indexed positionally (``entry[0]``) to pull out the word.
    NOTE(review): this relies on the word text being the first field of the
    ``w`` struct as inferred by the XML reader -- confirm against the schema.

    Args:
        rows: a sequence of indexable entries, or ``None`` when the column
            is null for this row (e.g. a sentence without any ``<w>`` child).

    Returns:
        A list with one item per non-null entry. An empty list is returned
        for null input so downstream stages receive an array, not a null.
    """
    if rows is None:
        # A null array would otherwise raise TypeError and fail the whole job.
        return []
    return [entry[0] for entry in rows if entry is not None]


# take word from each list in list (null-safe)
take_firsts = udf(_first_fields, ArrayType(StringType()))

In [6]:
# Read the gzipped subtitle XML files into a DataFrame, one row per <s> tag.
# Every directory level is spelled out in the glob because
# spark does not support recursive load.
df_xml = (
    spark.read
    .format("com.databricks.spark.xml")
    .options(rowTag="s", rootTag="document")
    .load("../subtitles/*/*/*/*/*/*/*.xml.gz")
)

In [7]:
# Reformat to only lists of words: derive a "words" column from the "w"
# column, then discard the raw columns that are no longer needed.
df_words = (
    df_xml
    .withColumn("words", take_firsts(col("w")))
    .drop("_emphasis", "time", "w")
)

In [8]:
# Remove stopwords, writing the result to the remover's output column.
# Any existing copy of that column is dropped first so this cell can be
# re-run on its own: df_words is reassigned in place, and the transformer
# refuses to run when its output column is already present. Dropping a
# column that does not exist is a no-op, so the first run is unchanged.
df_words = stopwords_remover.transform(
    df_words.drop(stopwords_remover.getOutputCol())
)

In [9]:
# Make ngrams with n=2 (words) from the filtered token lists.
# Any existing copy of the n-gram output column is dropped first so this
# cell can be re-run on its own: df_words is reassigned in place, and the
# transformer refuses to run when its output column is already present.
# Dropping a column that does not exist is a no-op, so the first run is
# unchanged.
df_words = ngram.transform(
    df_words.drop(ngram.getOutputCol())
)

In [10]:
# Preview the first 20 rows, with long cell values truncated for display.
df_words.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+
|_id|               words|      filtered_words|              ngrams|
+---+--------------------+--------------------+--------------------+
|  1|[Die, muur, sal, ...|[Die, muur, sal, ...|[Die muur, muur s...|
|  2|[Die, beveiliging...|[Die, beveiliging...|[Die beveiliging,...|
|  3|[', n, Regime, va...|[', n, Regime, va...|[' n, n Regime, R...|
|  4|[Daarom, is, ek, ...|[Daarom, is, ek, ...|[Daarom is, is ek...|
|  5|[", Ek, is, ', n,...|[", Ek, is, ', n,...|[" Ek, Ek is, is ...|
|  6|[Die, WONDER, van...|[Die, WONDER, van...|[Die WONDER, WOND...|
|  7|[-, Oos-Berlyn, 1...|[-, Oos-Berlyn, 1...|[- Oos-Berlyn, Oo...|
|  8|  [Hello, grootseun]|  [Hello, grootseun]|   [Hello grootseun]|
|  9|[Ek, het, nie, ty...|[Ek, het, nie, ty...|[Ek het, het nie,...|
| 10|[Die, prof., was,...|[Die, prof., was,...|[Die prof., prof....|
| 11|[maar, as, hy, jo...|[maar, as, hy, jo...|[maar as, as hy, ...|
| 12|[Hy, doen, ma, ',...|[Hy, doe