In [1]:
/* Dependencies fetched by coordinate (group, artifact, version) through the AddDeps magic */
%AddDeps edu.stanford.nlp stanford-corenlp 3.7.0
%AddDeps com.google.protobuf protobuf-java 2.6.1
%AddDeps com.databricks spark-csv_2.10 1.5.0 --transitive

// Non-repo dependencies: jars added from local file paths (CoreNLP models and the project's own se_2.10 build)
%AddJar file:lib/corenlp-models.jar
%AddJar file:SE/target/scala-2.10/se_2.10-1.1.jar

Marking edu.stanford.nlp:stanford-corenlp:3.7.0 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps8127662170221276672/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps8127662170221276672/https/repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.7.0/stanford-corenlp-3.7.0.jar
Marking com.google.protobuf:protobuf-java:2.6.1 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps8127662170221276672/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps8127662170221276672/https/repo1.maven.org/maven2/com/google/protobuf/protobuf-java/2.6.1/protobuf-java-2.6.1.jar
Marking com.databricks:spark-csv_2.10:1.5.0 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps8127662170221276672/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps8127662170221276672/https/repo1.maven.org/maven2/com/databricks/spark-csv_2.10/1.5.0/spark-csv_2.10-1.5.0.jar
-> New file at /tmp/toree_add_deps8127662170221276672/https

In [2]:
// Project feature-engineering helpers (presumably the source of makeStdFeatures used below; verify)
import com.evan.kaggle.se.FeatureEngineering._
// Obtain the SQLContext associated with the SparkContext `sc`
val sqlContext = org.apache.spark.sql.SQLContext.getOrCreate(sc)
// Must follow the val above: enables the $"col" syntax and .toDF used in later cells
import sqlContext.implicits._
// Column functions such as split
import org.apache.spark.sql.functions._
// Row extractor used when pattern matching on DataFrame rows
import org.apache.spark.sql.Row

In [3]:
// Dataset names; each one maps to the file ../dat/<name>_clean.csv
val files = List("cooking", "crypto", "robotics", "biology", "travel", "diy")

/* Load every file into a DataFrame, stack them into one, and split the
   space-delimited "tags" string into an array column */
val df_all = {
  // Reads a single CSV, taking column names from its header row
  def readClean(name: String) = {
    val reader = sqlContext.read.format("com.databricks.spark.csv").option("header", "true")
    reader.load(s"../dat/${name}_clean.csv")
  }

  val stacked = files.map(readClean).reduce((acc, next) => acc.unionAll(next))
  stacked.withColumn("tags", split($"tags", " "))
}

In [4]:
// Preview the combined DataFrame (long cell values are truncated in the printed table)
df_all.show()

+---+--------------------+--------------------+--------------------+
| id|               title|             content|                tags|
+---+--------------------+--------------------+--------------------+
|  1|How can I get che...|<p>My chocolate c...|[baking, cookies,...|
|  2|How should I cook...|<p>I've heard of ...|[oven, cooking-ti...|
|  3|What is the diffe...|<p>I always use b...|              [eggs]|
|  4|What is the diffe...|<p>And can I use ...|[substitutions, p...|
|  5|In a tomato sauce...|<p>It seems that ...|[sauce, pasta, to...|
|  6|What ingredients ...|<p>I have a recip...|[substitutions, h...|
|  9|What is the inter...|<p>I'd like to kn...|[food-safety, bee...|
| 11|How should I poac...|<p>What's the bes...|[eggs, basics, po...|
| 12|How can I make my...|<p>My ice cream d...|         [ice-cream]|
| 17|How long and at w...|<p>I'm interested...|[baking, chicken,...|
| 23|Besides salmon, w...|<p>I've fallen in...|[grilling, salmon...|
| 27|Do I need to sift...|<p>Is th

In [5]:
// Project down to (title, content) string pairs for feature extraction.
// The typed patterns match only non-null Strings, so a row holding a null in
// either column raises a MatchError when the RDD is evaluated.
val rdd = df_all.select("title", "content").rdd.map { row =>
  row match {
    case Row(title: String, body: String) => (title, body)
  }
}

In [6]:
// Feature extractor with its first argument fixed to 5.
// NOTE(review): the name says "twoGram" but the argument is 5; confirm what the
// parameter controls in FeatureEngineering.makeStdFeatures.
val twoGramFeats = makeStdFeatures(5) _

// One output row per extracted feature record, flattened across all (title, content) pairs.
// The result is cached: without it every action on featDF (show, count, the
// rows-per-input ratio) reruns the whole extraction lineage, and a single full
// count was measured at roughly 3120 seconds. cache() is lazy, so partitions are
// materialised by the first action that touches them and reused afterwards.
val featDF = rdd.flatMap(x => twoGramFeats(x._1, x._2)).toDF().cache()

In [7]:
// Preview the extracted feature rows
featDF.show()

+---------+-------+---------+------------------+--------+--------+-------+
|    nGram|posTags|  depTags|            relPos|numWords|hasUpper|isTitle|
+---------+-------+---------+------------------+--------+--------+-------+
|      <p>|     NN| compound|               0.0|       1|   false|  false|
|       my|   PRP$|nmod:poss|0.1111111111111111|       1|    true|  false|
|chocolate|     NN| compound|0.2222222222222222|       1|   false|  false|
|     chip|    NNS| compound|0.3333333333333333|       1|   false|  false|
|   cookie|    NNS|    nsubj|0.4444444444444444|       1|   false|  false|
|       be|    VBP|      cop|0.5555555555555556|       1|   false|  false|
|   always|     RB|   advmod|0.6666666666666666|       1|   false|  false|
|      too|     RB|   advmod|0.7777777777777778|       1|   false|  false|
|    crisp|     JJ|     root|0.8888888888888888|       1|   false|  false|
|      how|    WRB|   advmod|               0.0|       1|    true|  false|
|      can|     MD|      

In [8]:
// Wall-clock timing of a full count over featDF
val s = System.nanoTime()
val diff = {
  // The count is evaluated (and printed) inside the timed region
  println(s"There are ${featDF.count()} rows")
  // Nanoseconds to seconds
  (System.nanoTime() - s) / 1e9d
}
println(s"Elapsed: $diff seconds")

There are 50603365 rows
Elapsed: 3120.469312262 seconds


In [None]:
// Average number of feature rows produced per row of df_all
featDF.count().toDouble / df_all.count()