In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, explode, count

In [2]:
from os import environ, path
# Ask PySpark to fetch the spark-xml package when it launches its shell.
environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell',
})

In [3]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Python Spark wordcount")
    .getOrCreate()
)

In [4]:
def _first_or_none(items):
    """Return the first element of ``items``, or None if there is nothing to take.

    A NULL column value reaches a Python UDF as None, and an empty sequence
    has no element 0; indexing either would raise inside the UDF. Returning
    None instead lets the row flow through as a NULL result.
    """
    if items is None or len(items) == 0:
        return None
    return items[0]


# Column function: take first from list. No return type is passed to udf(),
# exactly as before, so Spark's default UDF return type still applies.
take_first = udf(_first_or_none)

In [7]:
# Read the gzipped subtitle XML files into a DataFrame via the spark-xml
# data source, using "s" as the row tag.
df_xml = (
    spark.read
    .format("com.databricks.spark.xml")
    .option("rowTag", "s")
    .option("rootTag", "document")
    # One wildcard per directory level: spark does not support recursive load.
    .load("../subtitles/*/*/*/*/*/*/*.xml.gz")
)

In [8]:
# Word-frequency table: one row per distinct word, most frequent first.
df_wc = (
    df_xml
    # Flatten the "w" array so every entry gets its own row.
    .withColumn("words", explode(col("w")))
    # Discard the columns that are not needed for counting.
    .drop("_emphasis", "_id", "time", "w")
    # Keep only the first item of each exploded entry as the word.
    .withColumn("word", take_first(col("words")))
    # Count occurrences per word; the aggregate column is named "count(word)".
    .groupBy("word").agg(count(col("word")))
    .sort(col("count(word)").desc())
)

In [10]:
# Print the ten most frequent words without truncating cell contents.
df_wc.show(n=10, truncate=False)

+----+-----------+
|word|count(word)|
+----+-----------+
|.   |13115      |
|,   |9955       |
|die |5915       |
|nie |4973       |
|?   |4758       |
|is  |4630       |
|'   |3826       |
|het |3769       |
|!   |3582       |
|n   |3420       |
+----+-----------+
only showing top 10 rows



In [None]:
# Display the web UI address reported by this session's SparkContext.
spark.sparkContext.uiWebUrl