## Features:
- `LEN`: number of words in tuple (ranges from 1 to 5)
- `HAS_NN`: boolean indicating if the 
- `DEP_PATT`: string dependency pattern
- `PREV_2_W`: previous 2 words
- `NEXT_2_W`: next 2 words
- `NEXT_W`: next word
- `PREV_W`: next word
- `PREV_2_T`: previous 2 tags
- `NEXT_2_T`: next 2 tags
- `NEXT_T`: next tag
- `PREV_T`: prev tag
- `DOC_FR`: document frequency
- `IS_TITLE`: boolean indicating is this tuple from the title(true) or the post(false)
- `REL_POS`: proportion of sentence preceding the first element of the tuple

### Imports

In [1]:
/* Add Deps */
%AddDeps edu.stanford.nlp stanford-corenlp 3.7.0
%AddDeps com.google.protobuf protobuf-java 2.6.1

// Non-repo dependencies 
%AddJar file:lib/corenlp-models.jar

Marking edu.stanford.nlp:stanford-corenlp:3.7.0 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps6007752999750887948/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps6007752999750887948/https/repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.7.0/stanford-corenlp-3.7.0.jar
Marking com.google.protobuf:protobuf-java:2.6.1 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps6007752999750887948/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps6007752999750887948/https/repo1.maven.org/maven2/com/google/protobuf/protobuf-java/2.6.1/protobuf-java-2.6.1.jar
Starting download from file:lib/corenlp-models.jar
Finished download of corenlp-models.jar


In [2]:
/* Class Imports */
import edu.stanford.nlp.simple.Sentence
import edu.stanford.nlp.simple.Document
import collection.JavaConverters._

### Utility functions

In [28]:
/* Make n-grams from input list */
/* TODO: use iterators to save conversion costs*/
def mkNgram[A](n: Int, in: List[A]): List[List[A]] = 
{
    in.sliding(n).map(_.toList).toList
}

/* LDP is just a (lemma, hasUpper), dependency, part of speech triple */
type LDP = ((String, Boolean), String, String)

/* Maps a sentence to a list of LDP triples */
def mkLPDlists(s: Sentence): List[LDP] = 
{
    val w = s.words.asScala.toList
    val trips = (s.lemmas.asScala.zip(w).map(x => (x._1, x._2.exists(_.isUpper))), 
                                            s.incomingDependencyLabels.asScala.map(_.get.toString),
                                            s.posTags.asScala).zipped.toList
    trips.filterNot(_._2 == "punct")
}

/* 
    TODO: Think about  filters and implement them
    The following is a set of filters to remove undesirable nrgram candidates:
    - Ends with article (a/the)
    - Starts or ends with a conjunction
    - ???
*/
def ngramFilter(in: List[List[LDP]]): List[List[LDP]] =
{
    in
}

### Feature Structure: 

For now I am implementing the following features:

- posTags: String = string of - delimited pos tags for this ngram
- depTags: String = string of - delimited dependency tags for this ngram
- relPos: Double[0,1] = relative position of first element of the ngra,
- numWords: Int = length of the ngram
- hasUpper: contains uppercase char?
- isTitle: Boolean = indicates if this
- isTag: Boolean = target classification value, inidcates if this ngram is a tag

These features will be represented in my `TrainingFeatures` and `StdFeatures` classes:

In [12]:
/* Simple struct representing training features */
case class TrainingFeatures(nGram: String, posTags: String, depTags: String, relPos: Double,
                            numWords: Int, hasUpper: Boolean, isTitle: Boolean, isTag: Boolean)

/* Simple struct representing std features */
case class StdFeatures(nGram: String, posTags: String, depTags: String, relPos: Double,
                            numWords: Int, hasUpper: Boolean, isTitle: Boolean)
                            
/* Creates a TrainingFeatures instance from a StdFeatures instance */
def mkTrFeat(s: StdFeatures, isTag: Boolean): TrainingFeatures = 
                TrainingFeatures(s.nGram, s.posTags, s.depTags, s.relPos, s.numWords,
                                 s.hasUpper, s.isTitle, isTag)

## Feature Generation Steps
- Get pos tags
- Make `(lemma, dependency, part of speech)` triples
- Make nGrams for n 1 through 5

In [30]:

def makeStdFeatures(n: Int)(title: String, content: String): Seq[StdFeatures] = 
{
    // mk post sentence
    val post_sents = (new Document(content)).sentences.asScala.toList
    
    // mk title sentence
    val title_sents = (new Document(title)).sentences.asScala.toList
    
    // gather lemmas, pos tags, deps
    // these features will be also be ngrammed 
    val post_feats: List[(List[LDP], Boolean)] = post_sents.map(x => (mkLPDlists(x), false))
    val title_feats: List[(List[LDP], Boolean)] = title_sents.map(x => (mkLPDlists(x), true))
    
    /* N-Grammable features (plus isTitle)*/
    val gram_feats_it: List[(List[LDP], Boolean)] = post_feats ++ title_feats
    
    /* Loop over all ngram lengths */
    (1 to n).flatMap{nGramLen => {
        /* loop over all sentences in this post+title */
        gram_feats_it.flatMap{ case (n_grammable: List[LDP], isTitle: Boolean) => {
            /* make n grams from n-grammable items */
            val ngrams: List[List[LDP]] = mkNgram(nGramLen, n_grammable)
            
            /* sentence length is the lenth of n_grammable*/
            val senLen: Double = n_grammable.length.toDouble
            
            /* TODO: apply filters */
            // val ngrams_good = ngramFilter(ngrams)
            
            /* Map ngrams to StdFeature */
            ngrams.zipWithIndex.map{case (ngram: List[LDP], index: Int) => {
                /* here we have the ngram for each l, d, and p */
                val (l, d, p) = ngram.unzip3
                
                /* Now we can calculate all of the std features: */
                val ngram = l.map(_._1).mkString("-").toLowerCase
                val posTags = p.mkString("-").toLowerCase
                val depTags = d.mkString("-").toLowerCase
                val relPos = index / senLen
                val numWords = n
                val hasUpper = l.exists(_._2)
                StdFeatures(ngram, posTags, depTags, relPos, numWords, hasUpper, isTitle)
            }}
        }
    }}}
}

Name: Compile Error
Message: <console>:72: error: recursive value x$1 needs type
                       val (l, d, p) = ngram.unzip3
                            ^
<console>:76: error: value mkString is not a member of Any
                       val posTags = p.mkString("-").toLowerCase
                                       ^
<console>:77: error: value mkString is not a member of Any
                       val depTags = d.mkString("-").toLowerCase
                                       ^
StackTrace: 

In [26]:
val (a,b) = List(((1, 'a'), 2), ((3, 'b'), 4)).zipWithIndex.unzip

In [27]:
println(a)

List(((1,a),2), ((3,b),4))
