### Preconditions
- Make sure data has been downloaded with `download_data.py`
- Make sure the first pass of cleaning has been performed with `clean_files.scala`

In [9]:
// Obtain the active SparkSession, creating one only if none exists yet.
val ss = org.apache.spark.sql.SparkSession.builder().getOrCreate()

// Session implicits: the $"col" column syntax and the encoders behind .as[...] / .toDF.
import ss.implicits._
// Built-in column functions (e.g. split) used by the cells below.
import org.apache.spark.sql.functions._

In [10]:
// Topics whose cleaned CSV exports are expected under ../dat/ (see the preconditions above).
val files = List("cooking", "crypto", "robotics", "biology", "travel", "diy")

/* Load every cleaned per-topic CSV, stack them into a single DataFrame, and turn the
 * space-separated `tags` string into an array column. */
val df_all = files.map { topic =>
  // Read through `ss`, the session this notebook obtained itself, rather than relying on
  // a kernel-injected `spark` variable that may not exist in every environment.
  ss.read.format("csv").option("header", "true").load(s"../dat/${topic}_clean.csv")
}.reduce(_ union _).withColumn("tags", split($"tags", " "))

In [11]:
// Print the first rows of the combined DataFrame (side-effecting, hence the explicit parentheses).
df_all.show()

+---+--------------------+--------------------+--------------------+
| id|               title|             content|                tags|
+---+--------------------+--------------------+--------------------+
|  1|How can I get che...|<p>My chocolate c...|[baking, cookies,...|
|  2|How should I cook...|<p>I've heard of ...|[oven, cooking-ti...|
|  3|What is the diffe...|<p>I always use b...|              [eggs]|
|  4|What is the diffe...|<p>And can I use ...|[substitutions, p...|
|  5|In a tomato sauce...|<p>It seems that ...|[sauce, pasta, to...|
|  6|What ingredients ...|<p>I have a recip...|[substitutions, h...|
|  9|What is the inter...|<p>I'd like to kn...|[food-safety, bee...|
| 11|How should I poac...|<p>What's the bes...|[eggs, basics, po...|
| 12|How can I make my...|<p>My ice cream d...|         [ice-cream]|
| 17|How long and at w...|<p>I'm interested...|[baking, chicken,...|
| 23|Besides salmon, w...|<p>I've fallen in...|[grilling, salmon...|
| 27|Do I need to sift...|<p>Is th

In [4]:
// Total number of rows across all loaded topics; this triggers a Spark job.
df_all.count()

87000

In [12]:
// Keep only the tags column and expose it as an RDD with one Seq[String] of tags per row.
val rdd = df_all.select($"tags").as[Seq[String]].rdd

In [15]:
// Bring all rows to the driver and pair every tag occurrence with the number of
// hyphen-separated words it contains, e.g. "baking-soda" -> ("baking-soda", 2).
val tags = for {
  rowTags <- rdd.collect
  tag     <- rowTags
} yield (tag, tag.split("-").size)

// One row per distinct (tag, length) pair, as a DataFrame for the queries below.
val tagsDF = tags.toSeq.distinct.toDF("tag", "length")

// The tag made of the most hyphen-separated words.
tags.maxBy { case (_, wordCount) => wordCount }

(meet-in-the-middle-attack,5)

In [16]:
// Print the first rows of the distinct (tag, length) table.
tagsDF.show()

+--------------------+------+
|                 tag|length|
+--------------------+------+
|              baking|     1|
|             cookies|     1|
|             texture|     1|
|                oven|     1|
|        cooking-time|     2|
|               bacon|     1|
|                eggs|     1|
|       substitutions|     1|
|please-remove-thi...|     4|
|         baking-soda|     2|
|       baking-powder|     2|
|               sauce|     1|
|               pasta|     1|
|            tomatoes|     1|
|     italian-cuisine|     2|
|               herbs|     1|
|             parsley|     1|
|         food-safety|     2|
|                beef|     1|
|              basics|     1|
+--------------------+------+
only showing top 20 rows



### TODO: load all data and do a histogram over tag lengths

In [21]:
// Longest tags first. An explicit descending order replaces sorting on the negated column,
// which only works for numeric columns; truncate = false prints full tag names.
tagsDF.orderBy($"length".desc).show(truncate = false)

+-------------------------+------+
|tag                      |length|
+-------------------------+------+
|meet-in-the-middle-attack|5     |
|proof-provenance-of-funds|4     |
|us-visa-waiver-program   |4     |
|please-remove-this-tag   |4     |
|90-180-visa-rules        |4     |
|man-in-the-middle        |4     |
|ho-chi-minh-city         |4     |
|gondolas-and-cable-cars  |4     |
|great-wall-of-china      |4     |
|change-purpose-of-travel |4     |
|proof-of-onward-travel   |4     |
|san-francisco-bay-area   |4     |
|t-and-t-citizens         |4     |
|self-leveling-concrete   |3     |
|glass-top-range          |3     |
|diy-vs-pro               |3     |
|damp-proof-course        |3     |
|thermostat-c-wire        |3     |
|grounding-and-bonding    |3     |
|sliding-glass-door       |3     |
+-------------------------+------+
only showing top 20 rows

