# Resources

https://nlp.stanford.edu/courses/cs224n/2004/may-steinberg-project.pdf  
http://cogcomp.org/Data/QA/QC/  
https://www.nltk.org/api/nltk.classify.html?highlight=maxent  

In [61]:
import numpy as np
import pandas as pd
import nltk
from nltk.classify.maxent import MaxentClassifier, MaxentFeatureEncodingI, accuracy
from nltk.stem import WordNetLemmatizer

# This uses corenlp server, but changing it to use jar files shouldn't be too much work
from nltk.tag.stanford import CoreNLPNERTagger
from nltk.tag.stanford import CoreNLPPOSTagger

ner_tagger = CoreNLPNERTagger(url='http://localhost:9000')
pos_tagger = CoreNLPPOSTagger(url='http://localhost:9000')

wordnet_lemmatizer = WordNetLemmatizer()

In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.classify.maxent import MaxentClassifier, MaxentFeatureEncodingI, accuracy
import spacy

nlp = spacy.load('en')

# Load data

In [3]:
corpus = list()

with open('../data/input/train_5500.txt', 'r') as f:
    for line in f:
        corpus.append(line[:-1].split())
        
print('Training corpus length:', len(corpus))

labels = [line.pop(0) for line in corpus]

Training corpus length: 5452


In [4]:
corpus_test = list()

with open('../data/input/trec_10.txt', 'r') as f:
    for line in f:
        corpus_test.append(line[:-1].split())
        
print('Test corpus length:', len(corpus_test))

labels_test = [line.pop(0) for line in corpus_test]

Test corpus length: 500


# Feature creation

This is where a lot of the hard work happens. Common features include:
* lemma / stem
* POS
* NER
* parse trees
* WordNet features
* bigrams

In [None]:
# Lazy feature creation - nltk
def create_features(question):
    ner_tags = [i[1] for i in ner_tagger.tag(question)]
    pos_tags = [i[1] for i in pos_tagger.tag(question)]
    
    features = {}
    features['person'] = 'PERSON' in ner_tags
    features['proper_noun'] = 'NNP' in pos_tags
    features['lemma_1'] = wordnet_lemmatizer.lemmatize(question[0])
    return features

In [7]:
# Lazy feature creation - spacy
def create_features(doc):
    pos_tags = [token.pos_ for token in doc]
    ner_tags = [ent.label_ for ent in doc.ents]
    
    features = {}
    features
    features['person'] = 'PERSON' in ner_tags
    features['work_of_art'] = 'WORK_OF_ART' in ner_tags
    features['proper_noun'] = ('NNP' or 'NNPS') in pos_tags
    features['length'] = len(doc)
    features['lemma_1'] = doc[0].lemma_
    
    return features

# Training

#### NLTK

In [None]:
questions = [(question, label) for question, label in zip(corpus, labels)]

feats = [(create_features(q), l) for (q, l) in questions]

In [64]:
questions_test = [(question, label) for question, label in zip(corpus_test, labels_test)]

feats_test = [(create_features(q), l) for (q, l) in questions_test]

#### Spacy

In [9]:
questions = [(question, label) for question, label in zip(corpus, labels)]
    
feats = [(create_features(nlp.make_doc(' '.join(q))), l) for (q, l) in questions]

In [10]:
questions_test = [(question, label) for question, label in zip(corpus_test, labels_test)]

feats_test = [(create_features(nlp.make_doc(' '.join(q))), l) for (q, l) in questions_test]

#### Train

In [11]:
maxent_classifier = MaxentClassifier.train(feats, max_iter=10, trace=3)

  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.91202        0.002
             2          -2.41749        0.223
             3          -2.20347        0.377
             4          -2.04261        0.392
             5          -1.91910        0.403
             6          -1.82222        0.404
             7          -1.74480        0.405
             8          -1.68196        0.405
             9          -1.63020        0.405
         Final          -1.58703        0.406


# Results

In [12]:
accuracy(maxent_classifier, feats_test)

0.468

In [13]:
predictions = maxent_classifier.classify_many([fs for (fs, l) in feats_test])

pd.options.display.max_columns = 100
crosstab = pd.crosstab(np.array(predictions), np.array([l for (fs, l) in feats_test]))
crosstab.style.apply(lambda x: ['background: lightblue' if x.name == i else '' 
                                for i,_ in x.iteritems()])

col_0,ABBR:abb,ABBR:exp,DESC:def,DESC:desc,DESC:manner,DESC:reason,ENTY:animal,ENTY:body,ENTY:color,ENTY:currency,ENTY:dismed,ENTY:event,ENTY:food,ENTY:instru,ENTY:lang,ENTY:other,ENTY:plant,ENTY:product,ENTY:sport,ENTY:substance,ENTY:techmeth,ENTY:termeq,ENTY:veh,HUM:desc,HUM:gr,HUM:ind,HUM:title,LOC:city,LOC:country,LOC:mount,LOC:other,LOC:state,NUM:count,NUM:date,NUM:dist,NUM:money,NUM:other,NUM:perc,NUM:period,NUM:speed,NUM:temp,NUM:weight
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
DESC:def,1,6,122,3,0,2,4,0,3,3,1,0,1,1,0,1,3,0,0,9,0,0,0,0,1,2,0,6,0,0,6,1,0,3,1,0,6,0,0,2,1,0
DESC:manner,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,3,0,0,0,0,2,1,1
DESC:reason,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
HUM:desc,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
HUM:ind,0,2,1,4,0,0,12,2,7,3,1,1,3,0,2,11,1,4,1,6,1,7,3,1,5,49,1,11,3,2,18,5,0,18,6,1,5,3,2,1,2,2
LOC:city,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
LOC:other,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,24,0,0,0,0,0,0,0,0,0,0,0
NUM:count,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,6,2,1,0,5,1,0,1
NUM:date,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,1,0,26,0,0,0,0,0,0,0,0


In [15]:
import pickle

with open('..\models\maxent_classifier.pkl', 'wb') as f:
    pickle.dump(maxent_classifier, f)