In [9]:
%AddDeps edu.stanford.nlp stanford-corenlp 3.7.0
%AddDeps com.google.protobuf protobuf-java 2.6.1

// Non-repo dependencies 
%AddJar file:lib/corenlp-models.jar

Marking edu.stanford.nlp:stanford-corenlp:3.7.0 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps2087003410623593241/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps2087003410623593241/https/repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.7.0/stanford-corenlp-3.7.0.jar
Marking com.google.protobuf:protobuf-java:2.6.1 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps2087003410623593241/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps2087003410623593241/https/repo1.maven.org/maven2/com/google/protobuf/protobuf-java/2.6.1/protobuf-java-2.6.1.jar
Starting download from file:lib/corenlp-models.jar
Finished download of corenlp-models.jar


In [23]:
import edu.stanford.nlp.simple.Sentence
import edu.stanford.nlp.semgraph.SemanticGraph

In [24]:
val post = "In prokaryotic translation, how critical for efficient translation is the location of the ribosome binding site, relative to the start codon? Ideally, it is supposed to be -7b away from the start. How about if it is -9 bases away or even more? Will this have an observable effect on translation?"

In [25]:
val s = new Sentence(post)

In [27]:
s.posTags

[IN, JJ, NN, ,, WRB, JJ, IN, JJ, NN, VBZ, DT, NN, IN, DT, NN, NN, NN, ,, JJ, TO, DT, NN, NN, ., RB, ,, PRP, VBZ, VBN, TO, VB, CD, NN, RB, IN, DT, NN, ., WRB, RB, IN, PRP, VBZ, CD, NNS, RB, CC, RB, RBR, ., MD, DT, VB, DT, JJ, NN, IN, NN, .]

In [41]:
import collection.JavaConverters._
// mk dep/ pos lists
var w_d_p: List[(String, String, String)] = 
           (s.words.asScala, s.incomingDependencyLabels.asScala.map(_.get.toString), s.posTags.asScala).zipped.toList

// filter punuation, unzip
val (w, d, p) = w_d_p.filterNot(_._2 == "punct").unzip3

In [42]:
w

List(In, prokaryotic, translation, how, critical, for, efficient, translation, is, the, location, of, the, ribosome, binding, site, relative, to, the, start, codon, Ideally, it, is, supposed, to, be, -7, b, away, from, the, start, How, about, if, it, is, -9, bases, away, or, even, more, Will, this, have, an, observable, effect, on, translation)

In [37]:
a.toList

List((In,case,IN), (prokaryotic,amod,JJ), (translation,nmod:in,NN), (,,punct,,), (how,advmod,WRB), (critical,dep,JJ), (for,case,IN), (efficient,amod,JJ), (translation,nmod:for,NN), (is,root,VBZ), (the,det,DT), (location,nsubj,NN), (of,case,IN), (the,det,DT), (ribosome,compound,NN), (binding,compound,NN), (site,nmod:of,NN), (,,punct,,), (relative,amod,JJ), (to,case,TO), (the,det,DT), (start,compound,NN), (codon,nmod:to,NN), (?,punct,.), (Ideally,advmod,RB), (,,punct,,), (it,nsubj:xsubj,PRP), (is,auxpass,VBZ), (supposed,parataxis,VBN), (to,mark,TO), (be,cop,VB), (-7,nummod,CD), (b,xcomp,NN), (away,advmod,RB), (from,case,IN), (the,det,DT), (start,nmod:from,NN), (.,punct,.), (How,advmod,WRB), (about,advmod,RB), (if,mark,IN), (it,nsubj,PRP), (is,cop,VB...

In [16]:
import collection.JavaConverters._

In [3]:
def toTuple[A <: Object](as:List[A]):Product = {
  val tupleClass = Class.forName("scala.Tuple" + as.size)
  tupleClass.getConstructors.apply(0).newInstance(as:_*).asInstanceOf[Product]
}

In [4]:
def nGramsTup[A <: Object](n: Int, in: List[A]): List[Product] = 
{
    in.sliding(n).map(toTuple).toList
}

def nGramList[A](n: Int, in: List[A]): List[List[A]] = 
{
    in.sliding(n).map(_.toList).toList
}

In [5]:
val t = List("abc", "def", "ghi", "jkl", "mno", "pqr")

In [6]:
nGramsTup(3, t)

List((abc,def,ghi), (def,ghi,jkl), (ghi,jkl,mno), (jkl,mno,pqr))

In [7]:
val grams = (1 to 5).flatMap(nGramList(_, t))

In [8]:
grams.map(_.mkString("-"))

Vector(abc, def, ghi, jkl, mno, pqr, abc-def, def-ghi, ghi-jkl, jkl-mno, mno-pqr, abc-def-ghi, def-ghi-jkl, ghi-jkl-mno, jkl-mno-pqr, abc-def-ghi-jkl, def-ghi-jkl-mno, ghi-jkl-mno-pqr, abc-def-ghi-jkl-mno, def-ghi-jkl-mno-pqr)

## Manipulation Steps
- Get pos tags
- Make `(word,pos)`, `(word dep)` tuples
- Make nGrams for n 1 through 5

In [15]:
/*
    Returns Seq of case classes  with the following entries
    posTags: String = string of - delimited pos tags for this ngram
    depTags: String = string of - delimited dependency tags for this ngram
    relPos: Double[0,1] = relative position of first element of the ngra,
    numWords: Int = length of the ngram
    isTitle: Boolean = indicates if this
    isTag: Boolean = target classification value, inidcates if this ngram is a tag

*/

case class TrainingFeatures(nGram: String, posTags: String, depTags: String, relPos: Double, numWords: Int, isTitle: Boolean, isTag: Boolean)

case class StdFeatures(nGram: String, posTags: String, depTags: String, relPos: Double, numWords: Int, isTitle: Boolean)

def mkTrFeat(s: StdFeatures, isTag: Boolean): TrainingFeatures = 
                TrainingFeatures(s.nGram, s.posTags, s.depTags, s.relPos, s.numWords, s.isTitle, isTag)

/* OR I can use a UDF to make the transition */

def makeTrainingFeatures(n: Int)(title: String, content: String, tags: List[String]): TrainingFeatures =
{
    /* get standard features */
    
    /* apply tags and return training features*/
    
}

def makeStdFeatures(n: Int)(title: String, content: String): Seq[StdFeatures] = 
{
    // mk sentence
    val s = new Sentence(post)
    
    // mk dep/ pos lists
    var w_d_p: List[(String, String, String)] = 
               (s.words.asScala, s.incomingDependencyLabels.asScala.map(_.get.toString), s.posTags.asScala).zipped.toList

    // filter punuation, unzip
    val (w, d, p) = w_d_p.filterNot(_._2 == "punct").unzip3
    
    // calc rel pos
    val relPos = (0 until w.length).map(_/w.length)
    
    (1 to n).flatMap{nGramLen =>
        
    }
    
    // mk ngrams, ADD to master list
    
    // uxe transpose on list of lists, mk output objects
}

val punctReg = """[\p{Punct}]""".r
def isPunc(s: String): Boolean =
{
    punctReg.findFirstIn(s).isDefined
}

Name: Compile Error
Message: <console>:22: error: type mismatch;
 found   : Unit
 required: TrainingFeatures
       {
       ^
StackTrace: 

In [4]:
case class A(x: Int, y: String)
case class B(a: A, z: Double)

In [2]:
val sqlContext = org.apache.spark.sql.SQLContext.getOrCreate(sc)
import sqlContext.implicits._

In [8]:
Seq(B(A(1,"2"),3d))

Name: Compile Error
Message: <console>:30: error: not enough arguments for method withColumn: (colName: String, col: org.apache.spark.sql.Column)org.apache.spark.sql.DataFrame.
Unspecified value parameter col.
              Seq(B(A(1,"2"),3d)).toDF.withColumn(explode($"a"))
                                                 ^
StackTrace: 

In [11]:
List((1,2,3)).unzip3

(List(1),List(2),List(3))

In [20]:
(0 to 5, 'a' to 'f', 10 to 15).zipped.toList

List((0,a,10), (1,b,11), (2,c,12), (3,d,13), (4,e,14), (5,f,15))