In [None]:
%AddDeps edu.stanford.nlp stanford-corenlp 3.6.0
%AddDeps com.google.protobuf protobuf-java 2.6.1
%AddDeps com.databricks spark-csv_2.10 1.4.0 --transitive
%AddDeps net.sf.jwordnet jwnl 1.4_rc3 --transitive

// Non-repo dependencies
%AddJar file:lib/corenlp-models.jar

In [None]:
// Scratch expression; presumably a quick check that the kernel evaluates cells -- TODO confirm or remove
2+2

In [10]:
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern
import edu.stanford.nlp.simple.Sentence
import org.apache.spark.sql._
import scala.io.Source
import org.apache.spark.sql.functions._
// Obtain the SQLContext for `sc` via SQLContext.getOrCreate and bind it to a val,
// so that its implicits can be imported on the following line.
val sqlContext = SQLContext.getOrCreate(sc)
import sqlContext.implicits._
import scala.collection.immutable.HashSet
import edu.stanford.nlp.semgraph.SemanticGraph
import java.io.File
import org.apache.spark.sql.types._
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations.{COMPOUND_MODIFIER, ADJECTIVAL_MODIFIER, NEGATION_MODIFIER, ADVERBIAL_MODIFIER}
import collection.JavaConverters._

In [28]:
/* Sentiment labels: neutral, positive, negative and unknown (word not in any lexicon) */
val NEU = "N"
val POS = "+"
val NEG = "-"
val UNK = "?"

/*
    Reads every line of the file at `path` into a List and closes the
    underlying handle afterwards, so no file descriptor is left open.
*/
def readLexicon(path: String): List[String] =
{
    val src = Source.fromFile(path)
    try src.getLines.toList finally src.close()
}

/* Word lists, one entry per line */
val positive = readLexicon("extraction_dat/positive.txt")
val negative = readLexicon("extraction_dat/negative.txt")
val neutral = readLexicon("extraction_dat/neutral.txt")
val nonfoods = readLexicon("extraction_dat/nonfoods.txt")

/* Hash-set views of the word lists for membership tests */
val pos_hash = HashSet(positive: _*)
val neg_hash = HashSet(negative: _*)
val neu_hash = HashSet(neutral: _*)
val non_hash = HashSet(nonfoods: _*)

/**** Pattern Broadcasting ****/
/*
    Returns the first line of the file at `path` and closes the underlying
    handle afterwards. Only the first line is read. An empty file raises
    NoSuchElementException.
*/
def readFirstLine(path: String): String =
{
    val src = Source.fromFile(path)
    try src.getLines.next() finally src.close()
}

/* "x is y" pattern: compiled once from the first line of the rule file, then broadcast */
val x_is_y_pattern = readFirstLine("rules/xiy.semgrex")
val xiyPatBr = sc.broadcast(SemgrexPattern.compile(x_is_y_pattern))
/* "good for" pattern: compiled once from the first line of the rule file, then broadcast */
val g4_pattern = readFirstLine("rules/g4.semgrex")
val g4PatBr = sc.broadcast(SemgrexPattern.compile(g4_pattern))

/**** Utilities ****/
/*
    Normalises raw review text before parsing.

    s: the raw text; must not be null.
    Returns the text with non-ASCII characters removed, punctuation padded
    with spaces, a period inserted at lower-case/upper-case boundaries, and
    everything lower-cased.
*/
def cleanString(s: String): String =
{
    /* Remove non-ASCII chars */
    val asciiOnly = s.replaceAll("[^\\p{ASCII}]+", "")

    /* Space out scrunched words by padding each punctuation mark with spaces */
    val spaced = asciiOnly.replaceAll("[\\.\\!\\?\\,\\;\\:]", " $0 ")

    /*
        Infer period (this may not work well...): a lower-case letter directly
        followed by an upper-case one is treated as a missing sentence break.
        This runs after the padding step, so the inferred period is not padded.
    */
    val withPeriods = spaced.replaceAll("(\\p{Lower})(\\p{Upper})", "$1. $2")

    /*
        Lower-case independently of the JVM default locale. Under a Turkish
        locale "I" would otherwise become the non-ASCII dotless "ı".
    */
    withPeriods.toLowerCase(java.util.Locale.ROOT)
}

/* Looks `adj` up in the positive, negative and neutral lexicons (in that order); UNK when it is in none */
def applySentiment(adj: String): String = adj match {
    case word if pos_hash(word) => POS
    case word if neg_hash(word) => NEG
    case word if neu_hash(word) => NEU
    case _ => UNK
}
/* Flips a sentiment label: POS and NEU become NEG, NEG becomes POS, anything else is returned unchanged */
def reverseSent(s: String):String = s match {
    case POS | NEU => NEG
    case NEG => POS
    case other => other
}

/*
    Builds the aspect string for `node`: the words of its compound and
    adjectival modifier children followed by the node's own word, all joined
    with "_".

    g: the dependency graph containing `node`.
    node: the head word of the aspect.
    Returns e.g. "<mod>_<mod>_<word>", or just the node's word when it has no
    such modifiers.
*/
def pullAspect(g: SemanticGraph, node: edu.stanford.nlp.ling.IndexedWord): String =
{
    /*
        getChildrenWithRelns hands back a Set, whose iteration order is not
        guaranteed to follow the sentence. Sort by token index so modifiers
        always appear in the order they were written.
    */
    val mods = g.getChildrenWithRelns(node, List(COMPOUND_MODIFIER, ADJECTIVAL_MODIFIER).asJava)
        .asScala.toList.sortBy(_.index)
    val joined = mods.map(_.word).mkString("_")
    val prefix = if (joined != "") joined + "_" else ""
    prefix + node.word
}

def pullAdjSentiment(g: SemanticGraph, node: edu.stanford.nlp.ling.IndexedWord): (String, String) =
{
    /*
        Returns (adjective phrase, sentiment label) for the adjective at `node`.
        The phrase is first looked up as a whole; the fallbacks below apply
        only when that lookup is UNK:
            - not_too X => Neutral
            - too _ X => Negative
            - not X => Opposite of X sentiment
        Without "too" or a negation, the bare adjective is looked up.
    */

    /* "too" is searched among all children of the adjective, whatever the relation */
    val too = g.getChildren(node).asScala.toList.map(_.word).find(_ == "too").getOrElse("")
    /* First negation-modifier child, if any */
    val neg = g.getChildrenWithReln(node, NEGATION_MODIFIER).asScala.toList.headOption.map(_.word).getOrElse("")
    val adj = node.word

    /* Sentiment of `phrase`, or `fallback` when the phrase is in no lexicon */
    def sentimentOr(phrase: String)(fallback: => String): String =
    {
        val looked = applySentiment(phrase)
        if (looked == UNK) fallback else looked
    }

    (too != "", neg != "") match {
        case (true, true) =>
            /* Sentiment is neutral */
            val phrase = neg + "_" + too + "_" + adj
            (phrase, sentimentOr(phrase)(NEU))
        case (true, false) =>
            /* Sentiment is negative */
            val phrase = too + "_" + adj
            (phrase, sentimentOr(phrase)(NEG))
        case (false, true) =>
            /* Sentiment is the opposite of adj */
            val phrase = neg + "_" + adj
            (phrase, sentimentOr(phrase)(reverseSent(applySentiment(adj))))
        case _ =>
            /* vanilla case */
            (adj, applySentiment(adj))
    }
}

/*
    Applies the broadcast "x is y" pattern to `review` and returns one
    (aspect, adjective phrase, sentiment) triple per match, in match order.
    Matches whose aspect word is in non_hash are skipped.
*/
def extrXIYPattern(review: SemanticGraph): Vector[(String, String, String)] =
{
    val matcher = xiyPatBr.value.matcher(review)

    /* Advances the matcher until it is exhausted, accumulating triples */
    @scala.annotation.tailrec
    def gather(found: Vector[(String, String, String)]): Vector[(String, String, String)] =
    {
        if (!matcher.find) found
        else
        {
            val aspNode = matcher.getNode("aspect")
            if (non_hash.contains(aspNode.word)) gather(found)
            else
            {
                val adjNode = matcher.getNode("adj")
                val graph = matcher.getGraph
                val aspect = pullAspect(graph, aspNode)
                val (adjPhrase, sentiment) = pullAdjSentiment(graph, adjNode)
                gather(found :+ ((aspect, adjPhrase, sentiment)))
            }
        }
    }

    gather(Vector.empty)
}

/*
    Applies the broadcast "good for" pattern to `review` and returns one
    (aspect, adjective, POS) triple per match, in match order. Matches whose
    aspect word is in non_hash are skipped.
*/
def extrG4Pattern(review: SemanticGraph): Vector[(String, String, String)] =
{
    val matcher = g4PatBr.value.matcher(review)

    /* Advances the matcher until it is exhausted, accumulating triples */
    @scala.annotation.tailrec
    def gather(found: Vector[(String, String, String)]): Vector[(String, String, String)] =
    {
        if (!matcher.find) found
        else
        {
            /* the matched adjective, e.g. good vs great */
            val adj = matcher.getNode("pos_jj").word

            /* the noun, with any comp and/or jj modifying it */
            val aspNode = matcher.getNode("aspect")
            if (non_hash.contains(aspNode.word)) gather(found)
            else
            {
                val aspect = pullAspect(matcher.getGraph, aspNode)
                gather(found :+ ((aspect, adj, POS)))
            }
        }
    }

    gather(Vector.empty)
}

/* Broadcast vector of the pattern extractors, applied in this order to each review */
val pattExtrBrs = sc.broadcast(
    Vector[SemanticGraph => Vector[(String, String, String)]](extrXIYPattern, extrG4Pattern)
)

/*
    Runs every broadcast pattern extractor over the given review.
    Returns null for a null or empty review. Otherwise the review is cleaned
    and parsed into a dependency graph, and the triples from all extractors
    are returned concatenated in extractor order.
*/
def extractPatterns(review: String): Vector[(String, String, String)] = review match {
    case null | "" =>
        null
    case text =>
        val depGraph: SemanticGraph = new Sentence(cleanString(text)).dependencyGraph
        pattExtrBrs.value.flatMap(extractor => extractor(depGraph))
}

/* Spark SQL UDF wrapper around extractPatterns */
val ep_udf = udf((review: String) => extractPatterns(review))
