### Note: this notebook is for demo purposes only; the actual feature engineering is done in `com.evan.kaggle.se.FeatureEngineering`

## Features:
- `LEN`: number of words in tuple (ranges from 1 to 5)
- `HAS_NN`: boolean indicating if the tuple contains a noun (a word with an `NN*` POS tag)
- `DEP_PATT`: string dependency pattern
- `PREV_2_W`: previous 2 words
- `NEXT_2_W`: next 2 words
- `NEXT_W`: next word
- `PREV_W`: previous word
- `PREV_2_T`: previous 2 tags
- `NEXT_2_T`: next 2 tags
- `NEXT_T`: next tag
- `PREV_T`: prev tag
- `DOC_FR`: document frequency
- `IS_TITLE`: boolean indicating whether this tuple comes from the title (true) or the post (false)
- `REL_POS`: proportion of sentence preceding the first element of the tuple

### Imports

In [2]:
/* Add Deps */
%AddDeps edu.stanford.nlp stanford-corenlp 3.7.0
%AddDeps com.google.protobuf protobuf-java 2.6.1

// Non-repo dependencies 
%AddJar file:lib/corenlp-models.jar

Marking edu.stanford.nlp:stanford-corenlp:3.7.0 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps7449986899999614546/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps7449986899999614546/https/repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.7.0/stanford-corenlp-3.7.0.jar
Marking com.google.protobuf:protobuf-java:2.6.1 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps7449986899999614546/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps7449986899999614546/https/repo1.maven.org/maven2/com/google/protobuf/protobuf-java/2.6.1/protobuf-java-2.6.1.jar
Starting download from file:lib/corenlp-models.jar
Finished download of corenlp-models.jar


In [3]:
/* Class Imports */
import edu.stanford.nlp.simple.Sentence
import edu.stanford.nlp.simple.Document
import collection.JavaConverters._

### Utility functions

In [29]:
import scala.collection.mutable.ArrayBuffer
/* Splits a list of 4-tuples into four parallel lists, keeping the input order */
def unzip4[A, B, C, D](in: List[(A, B, C, D)]): (List[A], List[B], List[C], List[D]) =
    (in.map(_._1), in.map(_._2), in.map(_._3), in.map(_._4))

/**
 * Makes all contiguous n-grams of exactly `n` elements from the input list,
 * in their original left-to-right order.
 *
 * When the input has fewer than `n` elements the result is empty: there is no
 * n-gram of that length to build.
 *
 * TODO: use iterators to save conversion costs
 *
 * @param n  window length; must be at least 1 (`sliding` rejects smaller sizes)
 * @param in elements to window over
 * @return every window of length `n`, possibly empty
 */
def mkNgram[A](n: Int, in: List[A]): List[List[A]] = 
{
    /* sliding yields one truncated window when the input is shorter than n;
       such a window is not an n-gram, so keep only full-length windows */
    in.sliding(n).filter(_.length == n).toList
}

/**
 * LPDU is a (lemma, part of speech, dependency, hasUpper) quad describing one token,
 * as filled in by `mkLPDUlist`.
 *
 * @param l lemma of the token
 * @param p part-of-speech tag of the token
 * @param d incoming dependency label of the token
 * @param u true if the token's word contains at least one uppercase character
 */
case class LPDU(
    l: String,
    p: String,
    d: String,
    u: Boolean
)

/* Maps a sentence to a list of LPDU quads, one per token whose dependency label is not "punct" */
def mkLPDUlist(s: Sentence): List[LPDU] = 
{
    val lemmas = s.lemmas
    val tags = s.posTags
    /* NOTE(review): `.get` presumably unwraps an Optional and would throw on an empty one;
       confirm every token always carries an incoming dependency label */
    val deps = s.incomingDependencyLabels.asScala.map(_.get.toString)
    /* one flag per word: does it contain any uppercase character? */
    val upper = s.words.asScala.map(_.exists(_.isUpper))

    /* walk the token positions, skipping punctuation */
    (0 until lemmas.size).collect {
        case i if deps(i) != "punct" => LPDU(lemmas.get(i), tags.get(i), deps(i), upper(i))
    }.toList
}

/* 
    TODO: Think about filters and implement them
    The following is a set of filters to remove undesirable n-gram candidates:
    - Ends with article (a/the)
    - Starts or ends with a conjunction
    - ???
    None of these is implemented yet: for now this is a pass-through that
    returns its input unchanged.
*/
def ngramFilter(in: List[List[LPDU]]): List[List[LPDU]] =
{
    in
}

In [30]:
/* Demo: the "," and "!" do not appear in the result because mkLPDUlist skips tokens labelled "punct" */
mkLPDUlist(new Sentence("This, is a test!"))

List(LPDU(this,DT,nsubj,true), LPDU(be,VBZ,cop,false), LPDU(a,DT,det,false), LPDU(test,NN,root,false))

### Feature Structure: 

For now I am implementing the following features:

- posTags: String = string of - delimited pos tags for this ngram
- depTags: String = string of - delimited dependency tags for this ngram
- relPos: Double[0,1] = relative position of the first element of the ngram within its sentence
- numWords: Int = length of the ngram
- hasUpper: contains uppercase char?
- isTitle: Boolean = indicates if this ngram comes from the title (true) or the post (false)
- isTag: Boolean = target classification value, indicates if this ngram is a tag

These features will be represented in my `TrainingFeatures` and `StdFeatures` classes:

In [5]:
/**
 * Simple struct representing training features: the std features plus the target label.
 *
 * @param nGram    "-"-delimited lemmas of the ngram
 * @param posTags  "-"-delimited POS tags of the ngram
 * @param depTags  "-"-delimited dependency tags of the ngram
 * @param relPos   relative position of the ngram's first element in its sentence
 * @param numWords length of the ngram
 * @param hasUpper true if any word of the ngram contains an uppercase character
 * @param isTitle  true if the ngram comes from the title, false if from the post
 * @param isTag    target classification value: true if this ngram is a tag
 */
case class TrainingFeatures(
    nGram: String,
    posTags: String,
    depTags: String,
    relPos: Double,
    numWords: Int,
    hasUpper: Boolean,
    isTitle: Boolean,
    isTag: Boolean
)

/**
 * Simple struct representing std features (everything except the target label).
 *
 * @param nGram    "-"-delimited lemmas of the ngram
 * @param posTags  "-"-delimited POS tags of the ngram
 * @param depTags  "-"-delimited dependency tags of the ngram
 * @param relPos   relative position of the ngram's first element in its sentence
 * @param numWords length of the ngram
 * @param hasUpper true if any word of the ngram contains an uppercase character
 * @param isTitle  true if the ngram comes from the title, false if from the post
 */
case class StdFeatures(
    nGram: String,
    posTags: String,
    depTags: String,
    relPos: Double,
    numWords: Int,
    hasUpper: Boolean,
    isTitle: Boolean
)
                            
/* Builds a TrainingFeatures by copying every field of a StdFeatures and attaching the isTag label */
def mkTrFeat(s: StdFeatures, isTag: Boolean): TrainingFeatures = 
    TrainingFeatures(
        nGram = s.nGram,
        posTags = s.posTags,
        depTags = s.depTags,
        relPos = s.relPos,
        numWords = s.numWords,
        hasUpper = s.hasUpper,
        isTitle = s.isTitle,
        isTag = isTag
    )

## Feature Generation Steps
- Get pos tags
- Make `(lemma, part of speech, dependency, hasUpper)` quads (`LPDU`)
- Make nGrams for n 1 through 5

In [33]:

/**
 * Generates StdFeatures for every ngram of length 1 through `n` found in the
 * sentences of a post and its title.
 *
 * Results are ordered by ngram length first, then by sentence (post sentences
 * before title sentences), then by position within the sentence.
 *
 * @param n       largest ngram length to generate
 * @param title   title text; its ngrams get isTitle = true
 * @param content post text; its ngrams get isTitle = false
 * @return one StdFeatures per ngram
 */
def makeStdFeatures(n: Int)(title: String, content: String): Seq[StdFeatures] = 
{
    /* split the post and the title into sentences */
    val postSentences = (new Document(content)).sentences.asScala.toList
    val titleSentences = (new Document(title)).sentences.asScala.toList

    /* n-grammable tokens of each sentence, paired with its isTitle flag (post first) */
    val taggedSentences: List[(List[LPDU], Boolean)] =
        postSentences.map(sent => (mkLPDUlist(sent), false)) ++
        titleSentences.map(sent => (mkLPDUlist(sent), true))

    /* TODO: apply filters (see ngramFilter) to the ngrams before mapping them */
    for {
        nGramLen <- 1 to n
        (tokens, isTitle) <- taggedSentences
        /* sentence length is the number of n-grammable tokens */
        sentenceLen = tokens.length.toDouble
        (ngram, index) <- mkNgram(nGramLen, tokens).zipWithIndex
    } yield StdFeatures(
        ngram.map(_.l).mkString("-"),   /* "-"-joined lemmas */
        ngram.map(_.p).mkString("-"),   /* "-"-joined POS tags */
        ngram.map(_.d).mkString("-"),   /* "-"-joined dependency tags */
        index / sentenceLen,            /* relative position of the first element */
        nGramLen,
        ngram.exists(_.u),              /* any token with an uppercase character? */
        isTitle
    )
}

In [35]:
/* Demo: features for all 1- to 3-grams of a sample title/post; reversed so the longest ngrams are listed first */
makeStdFeatures(3)("This is a title.", "While this is the post.").reverse

Vector(StdFeatures(be-a-title,VBZ-DT-NN,cop-det-root,0.25,3,false,true), StdFeatures(this-be-a,DT-VBZ-DT,nsubj-cop-det,0.0,3,true,true), StdFeatures(be-the-post,VBZ-DT-NN,cop-det-root,0.4,3,false,false), StdFeatures(this-be-the,DT-VBZ-DT,nsubj-cop-det,0.2,3,false,false), StdFeatures(while-this-be,IN-DT-VBZ,mark-nsubj-cop,0.0,3,true,false), StdFeatures(a-title,DT-NN,det-root,0.5,2,false,true), StdFeatures(be-a,VBZ-DT,cop-det,0.25,2,false,true), StdFeatures(this-be,DT-VBZ,nsubj-cop,0.0,2,true,true), StdFeatures(the-post,DT-NN,det-root,0.6,2,false,false), StdFeatures(be-the,VBZ-DT,cop-det,0.4,2,false,false), StdFeatures(this-be,DT-VBZ,nsubj-cop,0.2,2,false,false), StdFeatures(while-this,IN-DT,mark-nsubj,0.0,2,true,false), StdFeatures(title,NN,root,0.75,1,false,tr...