In [1]:
%AddDeps edu.stanford.nlp stanford-corenlp 3.7.0
%AddDeps com.google.protobuf protobuf-java 2.6.1

// Non-repo dependencies 
%AddJar file:lib/corenlp-models.jar

Marking edu.stanford.nlp:stanford-corenlp:3.7.0 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps6042086609772552663/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps6042086609772552663/https/repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.7.0/stanford-corenlp-3.7.0.jar
Marking com.google.protobuf:protobuf-java:2.6.1 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps6042086609772552663/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps6042086609772552663/https/repo1.maven.org/maven2/com/google/protobuf/protobuf-java/2.6.1/protobuf-java-2.6.1.jar
Starting download from file:lib/corenlp-models.jar
Finished download of corenlp-models.jar


In [2]:
import edu.stanford.nlp.simple.Sentence
import edu.stanford.nlp.semgraph.SemanticGraph

In [3]:
// Sample input text that the cells below tokenise, tag and parse.
val post = "In prokaryotic translation, how critical for efficient translation is the location of the ribosome binding site, relative to the start codon? Ideally, it is supposed to be -7b away from the start. How about if it is -9 bases away or even more? Will this have an observable effect on translation?"

In [6]:
// Wrap the whole post in one CoreNLP simple-API `Sentence`; later cells read words, tags and dependencies from `s`.
// NOTE(review): `post` contains several sentences but is handed over as a single Sentence —
// confirm this is intended (the simple API also has a `Document` type for multi-sentence text).
val s = new Sentence(post)

In [7]:
// Part-of-speech tags of the sentence, one tag per token.
// (The cell previously read `s. s.posTags`, which parses as `s.s.posTags`; the stray `s. ` prefix is removed.)
s.posTags

[IN, JJ, NN, ,, WRB, JJ, IN, JJ, NN, VBZ, DT, NN, IN, DT, NN, NN, NN, ,, JJ, TO, DT, NN, NN, ., RB, ,, PRP, VBZ, VBN, TO, VB, CD, NN, RB, IN, DT, NN, ., WRB, RB, IN, PRP, VBZ, CD, NNS, RB, CC, RB, RBR, ., MD, DT, VB, DT, JJ, NN, IN, NN, .]

In [16]:
import collection.JavaConverters._

In [17]:
// Pair every token with its part-of-speech tag as (word, tag).
{
  val tokens = s.words.asScala
  val tags = s.posTags.asScala
  tokens.zip(tags)
}

ArrayBuffer((In,IN), (prokaryotic,JJ), (translation,NN), (,,,), (how,WRB), (critical,JJ), (for,IN), (efficient,JJ), (translation,NN), (is,VBZ), (the,DT), (location,NN), (of,IN), (the,DT), (ribosome,NN), (binding,NN), (site,NN), (,,,), (relative,JJ), (to,TO), (the,DT), (start,NN), (codon,NN), (?,.), (Ideally,RB), (,,,), (it,PRP), (is,VBZ), (supposed,VBN), (to,TO), (be,VB), (-7,CD), (b,NN), (away,RB), (from,IN), (the,DT), (start,NN), (.,.), (How,WRB), (about,RB), (if,IN), (it,PRP), (is,VBZ), (-9,CD), (bases,NNS), (away,RB), (or,CC), (even,RB), (more,RBR), (?,.), (Will,MD), (this,DT), (have,VB), (an,DT), (observable,JJ), (effect,NN), (on,IN), (translation,NN), (?,.))

In [46]:
// Pair every token with the label of its incoming dependency edge as (word, label).
{
  val tokens = s.words.asScala
  // Each label arrives wrapped and is unwrapped with `.get`.
  // NOTE(review): presumably this throws for a token that has no incoming edge — confirm.
  val labels = s.incomingDependencyLabels.asScala.map(label => label.get)
  tokens.zip(labels)
}

ArrayBuffer((In,case), (prokaryotic,amod), (translation,nmod:in), (,,punct), (how,advmod), (critical,dep), (for,case), (efficient,amod), (translation,nmod:for), (is,root), (the,det), (location,nsubj), (of,case), (the,det), (ribosome,compound), (binding,compound), (site,nmod:of), (,,punct), (relative,amod), (to,case), (the,det), (start,compound), (codon,nmod:to), (?,punct), (Ideally,advmod), (,,punct), (it,nsubj:xsubj), (is,auxpass), (supposed,parataxis), (to,mark), (be,cop), (-7,nummod), (b,xcomp), (away,advmod), (from,case), (the,det), (start,nmod:from), (.,punct), (How,advmod), (about,advmod), (if,mark), (it,nsubj), (is,cop), (-9,nummod), (bases,dep), (away,compound:prt), (or,cc), (even,advmod), (more,conj:or), (?,punct), (Wil...

In [50]:
// Every pair of adjacent tokens (bigrams), in sentence order.
{
  val tokens = s.words.asScala
  tokens.sliding(2).toList
}

List(ArrayBuffer(In, prokaryotic), ArrayBuffer(prokaryotic, translation), ArrayBuffer(translation, ,), ArrayBuffer(,, how), ArrayBuffer(how, critical), ArrayBuffer(critical, for), ArrayBuffer(for, efficient), ArrayBuffer(efficient, translation), ArrayBuffer(translation, is), ArrayBuffer(is, the), ArrayBuffer(the, location), ArrayBuffer(location, of), ArrayBuffer(of, the), ArrayBuffer(the, ribosome), ArrayBuffer(ribosome, binding), ArrayBuffer(binding, site), ArrayBuffer(site, ,), ArrayBuffer(,, relative), ArrayBuffer(relative, to), ArrayBuffer(to, the), ArrayBuffer(the, start), ArrayBuffer(start, codon), ArrayBuffer(codon, ?), ArrayBuffer(?, Ideally), ArrayBuffer(Ideally, ,), ArrayBuffer(,, it), ArrayBuffer(it, is), ArrayBuffer(is, ...

In [53]:
/**
  * Reflectively packs the elements of `as` into the `scala.TupleN` class whose
  * arity equals `as.size`.
  *
  * The class is looked up by name, so a size for which no `scala.TupleN` class
  * exists makes `Class.forName` fail with a ClassNotFoundException.
  *
  * @param as elements to pack, in order
  * @return the tuple, typed only as `Product` because its arity is chosen at run time
  */
def toTuple[A <: Object](as:List[A]):Product = {
  val arity = as.size
  // Take the first public constructor of the tuple class and feed it the elements as varargs.
  val constructor = Class.forName("scala.Tuple" + arity).getConstructors()(0)
  constructor.newInstance(as:_*).asInstanceOf[Product]
}

In [60]:
/**
  * Builds every contiguous n-gram of `in` as a tuple of arity `n`.
  *
  * Only full windows are converted: when `in` is non-empty but shorter than `n`,
  * `sliding` emits one truncated window, which is not an n-gram and would
  * otherwise come back as a tuple of the wrong arity.
  *
  * @param n  size of each n-gram; `sliding` rejects values below 1
  * @param in elements to slide over
  * @return one tuple per length-`n` window, in order; empty when `in` has fewer than `n` elements
  */
def nGramsTup[A <: Object](n: Int, in: List[A]): List[Product] =
  in.sliding(n)
    .filter(_.lengthCompare(n) == 0) // drop the truncated window produced for short inputs
    .map(window => toTuple(window))
    .toList

/**
  * Returns every contiguous n-gram of `in`, in order of appearance.
  *
  * Only full windows are kept: when `in` is non-empty but shorter than `n`,
  * `sliding` emits one truncated window, which is not an n-gram.
  *
  * @param n  size of each n-gram; `sliding` rejects values below 1
  * @param in elements to slide over
  * @return all length-`n` windows of `in`; empty when `in` has fewer than `n` elements
  */
def nGramList[A](n: Int, in: List[A]): List[List[A]] =
  in.sliding(n)
    .filter(_.lengthCompare(n) == 0) // drop the truncated window produced for short inputs
    .toList

In [62]:
// Small six-element sample list used to try out the n-gram helpers.
val t = List("abc", "def", "ghi", "jkl", "mno", "pqr")

In [63]:
// Trigrams of the sample list, as tuples.
nGramsTup(n = 3, in = t)

List((abc,def,ghi), (def,ghi,jkl), (ghi,jkl,mno), (jkl,mno,pqr))

In [66]:
// All n-grams of the sample list for n = 1 through 5, shortest first.
val grams = for {
  n <- 1 to 5
  gram <- nGramList(n, t)
} yield gram

In [73]:
// Render each n-gram as a single hyphen-joined string.
for (gram <- grams) yield gram.mkString("-")

Vector(abc, def, ghi, jkl, mno, pqr, abc-def, def-ghi, ghi-jkl, jkl-mno, mno-pqr, abc-def-ghi, def-ghi-jkl, ghi-jkl-mno, jkl-mno-pqr, abc-def-ghi-jkl, def-ghi-jkl-mno, ghi-jkl-mno-pqr, abc-def-ghi-jkl-mno, def-ghi-jkl-mno-pqr)

## Manipulation Steps
- Get POS tags
- Make `(word, pos)` and `(word, dep)` tuples
- Make n-grams for n = 1 through 5