In [None]:
/* Add Deps */
%AddDeps edu.stanford.nlp stanford-corenlp 3.7.0
%AddDeps com.google.protobuf protobuf-java 2.6.1
%AddDeps com.databricks spark-csv_2.10 1.5.0 --transitive

// Non-repo dependencies 
%AddJar file:lib/corenlp-models.jar
%AddJar file:SE/target/scala-2.10/se_2.10-1.1.jar

In [2]:
import com.evan.kaggle.se.FeatureEngineering._
val sqlContext = org.apache.spark.sql.SQLContext.getOrCreate(sc)
import sqlContext.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
import scala.collection.immutable.HashSet

In [3]:
val samp = sqlContext.read.parquet("union.parquet").sample(false, .5)

In [4]:
samp.count

43419

In [5]:
val feat_udf = udf((t: String, c: String, ta: String) => 
    makeTrFeatures(3)(t, c, ta))

In [6]:
val flat = samp.select(explode(feat_udf($"title", $"content", $"tags")))
val flat2 = flat.select($"col.*")

In [7]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.feature.VectorAssembler

//change name
val data = flat2.withColumn("isTag", when($"isTag", lit("Y")).otherwise(lit("N"))).
    withColumn("hasUpper", when($"hasUpper", lit(1.0)).otherwise(0.0)).persist

val labelIndexer = new StringIndexer().
  setInputCol("isTag").
  setOutputCol("indexedLabel").
  fit(data)

val hashingTF = new HashingTF().setInputCol("posTags").
    setInputCol("depTags").
    setOutputCol("rawFeatures").setNumFeatures(20)

val idf = new IDF().setInputCol("rawFeatures").
    setOutputCol("text_features")

val assembler = new VectorAssembler().
    setInputCols(Array("text_features", "relPos", "hasUpper", "numWords")).
    setOutputCol("features")

In [8]:
// Split the data into training and test sets (30% held out for testing)
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

// Train a RandomForest model.
val rf = new RandomForestClassifier().setLabelCol("indexedLabel").
  setFeaturesCol("features").
  setNumTrees(10)

// Convert indexed labels back to original labels.
val labelConverter = new IndexToString().
  setInputCol("prediction").
  setOutputCol("predictedLabel").
  setLabels(labelIndexer.labels)

In [10]:
data.write.save("featurized.parquet")

In [18]:
// Chain indexers and forest in a Pipeline
val pipeline = new Pipeline().setStages(Array(labelIndexer, hashingTF, idf, assembler, rf, labelConverter))

// Train model.  This also runs the indexers.
val model = pipeline.fit(trainingData)

// Make predictions.
val predictions = model.transform(testData).persist

// Select example rows to display.
predictions.select("predictedLabel", "indexedLabel", "features").show(5)

+--------------+------------+--------------------+
|predictedLabel|indexedLabel|            features|
+--------------+------------+--------------------+
|             N|         0.0|(23,[5,21,22],[2....|
|             N|         0.0|(23,[12,20,22],[1...|
|             N|         0.0|(23,[19,20,22],[2...|
|             N|         0.0|(23,[19,21,22],[2...|
|             N|         0.0|(23,[16,20,21,22]...|
+--------------+------------+--------------------+
only showing top 5 rows



In [14]:
val binaryClassificationEvaluator = new BinaryClassificationEvaluator().
  setLabelCol("indexedLabel").
  setRawPredictionCol("rawPrediction")

def printlnMetric(metricName: String): Unit = {
  println(metricName + " = " + binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions))
}

printlnMetric("areaUnderROC")
printlnMetric("areaUnderPR")

areaUnderROC = 0.8787221210280507
areaUnderPR = 0.08621660482920898


In [17]:
val n_rows = data.count.toDouble
val n_pos = data.filter($"isTag" === "Y").count

println(s"Total # of rows: $n_rows")
println(s"Perctage of positive examples: ${n_pos/n_rows}%")

Total # of rows: 1.0535507E7
Perctage of positive examples: 0.012138191356144513%


In [19]:
val yeses = predictions.filter($"predictedLabel" === "Y")
yeses.show

                                                                                +-----+-------+-------+------+--------+--------+-------+-----+------------+-----------+-------------+--------+-------------+-----------+----------+--------------+
|nGram|posTags|depTags|relPos|numWords|hasUpper|isTitle|isTag|indexedLabel|rawFeatures|text_features|features|rawPrediction|probability|prediction|predictedLabel|
+-----+-------+-------+------+--------+--------+-------+-----+------------+-----------+-------------+--------+-------------+-----------+----------+--------------+
+-----+-------+-------+------+--------+--------+-------+-----+------------+-----------+-------------+--------+-------------+-----------+----------+--------------+



In [20]:
yeses.count

0

## This $\Uparrow$ is NOT any good