In [1]:
import os
import itertools
from spacy.en import English
import numpy as np

In [2]:
os.chdir("..")
os.getcwd()

'/mnt/Storage/Coding_Projects/Candidate_Classifier'

In [3]:
os.getcwd()

'/mnt/Storage/Coding_Projects/Candidate_Classifier'

In [4]:
from candidate_classifier.string_processing import *
from candidate_classifier.nltk_model.ngram_classifier import NgramClassifierMulti
from candidate_classifier.tokenizers import *

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import make_scorer, classification_report, f1_score
# from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import VotingClassifier

import codecs
from pprint import pprint
from time import time
import logging

In [6]:
# from sklearn.calibration import CalibratedClassifierCV
from candidate_classifier.calibration import CalibratedClassifierCV

In [7]:
import sklearn
sklearn.__version__

'0.18.dev0'

In [8]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hndlr = logging.StreamHandler()
hndlr.setFormatter(fmt)
logger.addHandler(hndlr)

In [10]:
#nlp = English(load_vectors=False)

In [9]:
sents_path = 'candidate_classifier/data/processed/clean_sents.txt'
labels_path = 'candidate_classifier/data/processed/sent_labels.txt'

In [10]:
def docs():
    with codecs.open(sents_path, mode='r', encoding='utf-8') as _f:
        for line in _f:
            yield line

def labels():
    with codecs.open(labels_path, mode='r', encoding='utf-8') as _f:
        for line in _f:
            yield line.strip()
labels_list = list(labels())
candidates = sorted(list(set(labels_list)))
docs_list = list(docs())
print candidates

[u'BUSH', u'CARSON', u'CHRISTIE', u'CLINTON', u'CRUZ', u'KASICH', u'RUBIO', u'SANDERS', u'TRUMP']


In [11]:
def get_scores(clf, X, y):
    scores = cross_val_score(clf, X, y, cv=10, scoring='f1_samples')
    print scores
    print "\n"
    # Use 1.96 * std b/c 95% of the data should lie in that range, 
    # which means this represents a 95% confidence interval
    print "F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 1.960)
    
def fancy_scorer(y, y_pred, **kwargs):
    # Print classification report
    print classification_report(y, y_pred, target_names=candidates)
    return f1_score(y, y_pred, labels=candidates, average='weighted')    

def f1_weighted_scorer(y, y_pred, **kwargs):
    return f1_score(y, y_pred, labels=candidates, average='weighted')

# Decorator to get averages from cross-validation
def avg

In [None]:
avg

In [12]:
def process_grid(pipe, params, data, labels):
    grid_search = GridSearchCV(pipe, params, n_jobs=1, scoring=make_scorer(f1_weighted_scorer), cv=3, verbose=1)
    
    print "Performing grid search..."
    print "pipeline:", [name for name, _ in pipe.steps]
    print "parameters:"
    pprint(params)
    
    t0 = time()
    grid_search.fit(data, labels)
    
    print "done in %0.3fs" % (time() - t0)
    print ''
    
    print "Best score: %0.3f" % grid_search.best_score_
    print "Best parameters set:"
    
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print "\t%s: %r" % (param_name, best_parameters[param_name])
    
    return grid_search

# Data
- Should create a validation set before tuning

- Data is tokenized and pre-processed ahead of time

In [13]:
s_transformer = TransformerABC(prefilter_substitutions=['strip'], tokenizer=lemmas_merge_ents)

In [14]:
data = list(s_transformer(docs_list))

In [35]:
' '.join(t.ent_type_ for t in nlp(docs_list[5001]))

u'CARDINAL                                     '

# Classifiers

In [15]:
noop = lambda x: x

In [16]:
mnb_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=noop, 
                                 lowercase=True, 
                                 ngram_range=(1,3),
                                 token_pattern=r".*", 
                                 tokenizer=noop)),
        ('clf', MultinomialNB(alpha=0.075))
])

mnb_calibrated_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=noop, 
                                 lowercase=True, 
                                 ngram_range=(1,3),
                                 token_pattern=r".*", 
                                 tokenizer=noop)),
        ('clf', CalibratedClassifierCV(MultinomialNB(alpha=0.075), method='sigmoid', cv=10))
])

mnb_iso_calibrated_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=noop, 
                                 lowercase=True, 
                                 ngram_range=(1,3),
                                 token_pattern=r".*", 
                                 tokenizer=noop)),
        ('clf', CalibratedClassifierCV(MultinomialNB(alpha=0.075), method='isotonic', cv=10))
])

ng_pipe = Pipeline([
        ('clf', NgramClassifierMulti(pad_ngrams=True))
])

ng_calibrated_pipe = Pipeline([
        ('clf', CalibratedClassifierCV(NgramClassifierMulti(pad_ngrams=True), method='sigmoid', cv=5))
])


In [17]:
eclf = VotingClassifier(estimators=[('mnb', mnb_pipe),
                                    ('ngram', ng_pipe)],
                        voting='soft')

e_pipe = Pipeline([('eclf', eclf)])

eclf_calibrated = VotingClassifier(estimators=[('mnb', mnb_calibrated_pipe),
                                               ('ngram', ng_calibrated_pipe)],
                                   voting='soft')

e_calibrated_pipe = Pipeline([('eclf', eclf)])

In [17]:
e_grid_params = {
    'eclf__weights': [p for p in itertools.permutations([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 2)]
}

## Ensemble 
Even after using a grid search to find optimal weights the performance of the ensemble was very poor compared to each classifier one it's own.  
- *Ensemble*: 0.527
- MNB: 0.638
- Ngram: 0.603

I think that the reason for this is that the voting classifier uses the predicted probabilities `predict_proba` instead of using the actual predictions and both MNB and the ngram classifier are terrible at predicting the probability of a class given a sample, even though they're very good a predicting the actual class.  There's even a warning about this in the sklearn User Guide.  

The Ngram classifier isn't actually all that terrible as predicing the probabilities, it's just a very different kind of calculation and model.

To combat this, I thought I'd try a new class/feature that is only available in scikit-learn 18.0 (installed from master on github).  This class uses some held-out data to tune the output of `predict_proba`.

#### Attempt 2
After debugging the predict_proba method in the NgramClassifierMulti I tried again, as well as re-running MNB and Ngram separately (I suspect that lowercaing before entity detection may have actually boosted performance in the other tests for some reason) I got this:
- MNB: 0.636
- Ngram: 
- Ensemble: 0.636
- Calibrated Ensemble: 0.636

Both calibrated and uncalibrated ensembles are identical in performance to MNB by itself.  And I'm not entirely sure what to do about it now...

### Uncalibrated Ensemble
Unfortunately, even after a 5 hour grid-search for optimal weights for the two classifiers, the ensemble performed terribly compared to the individual classifiers.

In [18]:
# Best score: 0.527
# Best parameters set:
#    eclf__weights: (0.1, 0.2)
eclf_grid = process_grid(e_pipe, e_grid_params, np.asarray(data), labels_list)

Performing grid search...
pipeline: ['eclf']
parameters:
{'eclf__weights': [(0.1, 0.2),
                   (0.1, 0.3),
                   (0.1, 0.4),
                   (0.1, 0.5),
                   (0.1, 0.6),
                   (0.1, 0.7),
                   (0.1, 0.8),
                   (0.1, 0.9),
                   (0.1, 1.0),
                   (0.2, 0.1),
                   (0.2, 0.3),
                   (0.2, 0.4),
                   (0.2, 0.5),
                   (0.2, 0.6),
                   (0.2, 0.7),
                   (0.2, 0.8),
                   (0.2, 0.9),
                   (0.2, 1.0),
                   (0.3, 0.1),
                   (0.3, 0.2),
                   (0.3, 0.4),
                   (0.3, 0.5),
                   (0.3, 0.6),
                   (0.3, 0.7),
                   (0.3, 0.8),
                   (0.3, 0.9),
                   (0.3, 1.0),
                   (0.4, 0.1),
                   (0.4, 0.2),
                   (0.4, 0.3),
             

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed: 55.0min
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed: 221.4min
[Parallel(n_jobs=1)]: Done 270 out of 270 | elapsed: 300.1min finished


done in 18056.854s

Best score: 0.527
Best parameters set:
	eclf__weights: (0.1, 0.2)


In [23]:
# 0.63627281
cross_val_score(e_pipe, np.asarray(data), y=np.asarray(labels_list), cv=3, scoring='f1_weighted')

array([ 0.6347393 ,  0.62871688,  0.64536225])

In [24]:
sum([ 0.6347393 ,  0.62871688,  0.64536225])/3.0

0.63627281

### Ensemble of Calibrated Classifiers
Parameters to try and tune:
- Calibration cv proportion
- Calibration method
- Voting weights

If I do this single-threaded it's going to take days.  So before moving any further, I need to get the NgramClassifier to be pickleable so that training can be paralellized.  

#### Notes:
- I'm wondering if the Calibration is actually overpowering the ngram model predictions because the probabilities are so small.
    - Maybe try normalizing them agian?

In [18]:
# 0.63627281
scores = cross_val_score(e_calibrated_pipe, np.asarray(data), y=np.asarray(labels_list), cv=3, scoring='f1_weighted')
print scores
print np.mean(scores)

array([ 0.6347393 ,  0.62871688,  0.64536225])

In [19]:
sum([ 0.6347393 ,  0.62871688,  0.64536225])/3.0

0.63627281

### Ngram Classifier
I didn't go and do full testing for this as it's very slow, but I'm going to go with sigmoid calibration and cv=5 for now

In [47]:
ng_clf = NgramClassifierMulti(pad_ngrams=True)
scores = cross_val_score(ng_clf, np.asarray(data), y=np.asarray(labels_list), cv=3, scoring='f1_weighted')
print scores
print np.mean(scores)

[ 0.5755364   0.56865313  0.58511841]
0.576435981044


In [26]:
# 0.51829223 | cv 2
# 0.56232063 | cv 5
ng_sig_calibrated = CalibratedClassifierCV(NgramClassifierMulti(pad_ngrams=True), method='sigmoid', cv=5)
cross_val_score(ng_sig_calibrated, np.asarray(data), y=np.asarray(labels_list), cv=3, scoring='f1_weighted')

array([ 0.56352883,  0.55589307,  0.56753999])

In [29]:
sum([ 0.5055898 ,  0.49214416,  0.52167745])/3.0

0.50647047

In [27]:
# 0.50647047  | cv 2
ng_iso_calibrated = CalibratedClassifierCV(NgramClassifierMulti(pad_ngrams=True), method='isotonic', cv=5)
scores = cross_val_score(ng_iso_calibrated, np.asarray(data), y=np.asarray(labels_list), cv=3, scoring='f1_weighted')
print scores
print np.mean(scores)

array([ 0.5055898 ,  0.49214416,  0.52167745])

#### How much difference does the calibration actually make?
It looks like it makes a HUGE difference.  About 100 orders of magnitude.  And the relative sizes change too which is interesting.  That must be a broduct of the training as well.

In [20]:
# No calibration
ng_clf = NgramClassifierMulti(pad_ngrams=True)
ng_clf.fit(np.asarray(data[:5000]), y=np.asarray(labels_list[:5000]))

NgramClassifierMulti(alpha=0.01, n=4, pad_ngrams=True)

In [21]:
ng_clf.predict_proba(np.asarray([data[5001]]))

array([[ 2.2641113e-97,  8.7620716e-91,  3.6653292e-108,  2.12166e-100,
         1.8747944e-102,  2.5529805e-95,  3.7461873e-93,  3.7618544e-98,
         4.8920979e-94]], dtype=float128)

In [20]:
# Sigmoid calibration
ng_sig_calibrated = CalibratedClassifierCV(NgramClassifierMulti(pad_ngrams=True), method='sigmoid', cv=5)
ng_sig_calibrated.fit(np.asarray(data[:5000]), y=np.asarray(labels_list[:5000]))

CalibratedClassifierCV(base_estimator=NgramClassifierMulti(alpha=0.01, n=4, pad_ngrams=True),
            cv=5, method='sigmoid')

In [21]:
ng_sig_calibrated.predict_proba(np.asarray([data[5001]]))

array([[ 0.03016343,  0.2648251 ,  0.0078094 ,  0.01818411,  0.00965148,
         0.11937918,  0.22514597,  0.05931179,  0.26552953]])

In [None]:
# Isotonic Calibration
ng_iso_calibrated = CalibratedClassifierCV(NgramClassifierMulti(pad_ngrams=True), method='isotonic', cv=5)
ng_iso_calibrated.fit(np.asarray(data[:5000]), y=np.asarray(labels_list[:5000]))

In [None]:
ng_iso_calibrated.predict_proba(np.asarray(data[5001:5011]))

In [26]:
labels_list[5001]

u'SANDERS'

In [27]:
data[5001]

[u'One',
 u'candidate',
 u'say',
 u',',
 u'you',
 u'know',
 u'what',
 u',',
 u'i',
 u'do',
 u'not',
 u'think',
 u'it',
 u"'",
 u'a',
 u'great',
 u'idea',
 u'that',
 u'we',
 u'sell',
 u'automatic',
 u'weapon',
 u'in',
 u'this',
 u'country',
 u'that',
 u'be',
 u'use',
 u'by',
 u'the',
 u'military',
 u'to',
 u'kill',
 u'people',
 u'very',
 u'rapidly',
 u'.']

In [38]:
mnb_clf.predict_proba([data[5001]])

array([[  3.62153317e-16,   9.91291701e-12,   5.59500722e-23,
          5.63048562e-18,   3.68156402e-24,   7.30748436e-04,
          2.69235845e-08,   1.16932402e-07,   9.99269108e-01]])

### MNB
As far as I can tell, the calibration isn't really helping with the predictions of MNB.  I'm assuming that's because tuning the estimated probabilities doesn't really impact the predictions and all that's happening here is that the classifier has less data to learn from.

In [46]:
# no calibration
# 0.636272811277
scores = cross_val_score(mnb_pipe, np.asarray(data), y=labels_list, cv=3, scoring='f1_weighted')
print scores
print np.mean(scores)

[ 0.6347393   0.62871688  0.64536225]
0.636272811277


In [58]:
# Sigmoid calibration
# score          | Calibration cv
# 0.58836935204  | cv 2
# 0.656218712521 | cv 4
# 0.656513250898 | cv 5
# 0.659902762496 | cv 10
scores = cross_val_score(mnb_calibrated_pipe, np.asarray(data), y=labels_list, cv=10, scoring='f1_weighted')
print scores
print np.mean(scores)

[ 0.66328298  0.64550859  0.65134172  0.65005093  0.68389557  0.66615237
  0.63893501  0.6602161   0.67554382  0.66410054]
0.659902762496


In [59]:
# Isotonic calibration
# score          | Calibration cv
# 0.625965743338 | cv 2
# 0.649969046504 | cv 4
# 0.648737116078 | cv 5
# 0.646710928805 | cv 10
scores = cross_val_score(mnb_iso_calibrated_pipe, np.asarray(data), y=labels_list, cv=10, scoring='f1_weighted')
print scores
print np.mean(scores)

[ 0.64189394  0.62043737  0.63846839  0.64250606  0.67504492  0.65079924
  0.62704031  0.64975024  0.66868239  0.65248642]
0.646710928805


#### Now much difference is calibration making in the outputs?
It looks very similar to ngram calibrated.  It's normalized and there's definitely a difference

In [39]:
mnb_clf = Pipeline([
        ('vect', CountVectorizer(preprocessor=noop, 
                                 lowercase=True, 
                                 ngram_range=(1,3),
                                 token_pattern=r".*", 
                                 tokenizer=noop)),
        ('clf', MultinomialNB(alpha=0.075))
])
mnb_clf.fit(np.asarray(data[:5000]), y=labels_list[:5000])

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3),
        preprocessor=<function <lambda> at 0x7...5f266500>, vocabulary=None)), ('clf', MultinomialNB(alpha=0.075, class_prior=None, fit_prior=True))])

In [40]:
mnb_clf.predict_proba([data[5001]])

array([[  3.62153317e-16,   9.91291701e-12,   5.59500722e-23,
          5.63048562e-18,   3.68156402e-24,   7.30748436e-04,
          2.69235845e-08,   1.16932402e-07,   9.99269108e-01]])

In [44]:
mnb_cal = Pipeline([
        ('vect', CountVectorizer(preprocessor=noop, 
                                 lowercase=True, 
                                 ngram_range=(1,3),
                                 token_pattern=r".*", 
                                 tokenizer=noop)),
        ('clf', CalibratedClassifierCV(MultinomialNB(alpha=0.075), method='sigmoid', cv=5))
])
mnb_cal.fit(np.asarray(data[:5000]), y=np.asarray(labels_list[:5000]))

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3),
        preprocessor=<function <lambda> at 0x7...=MultinomialNB(alpha=0.075, class_prior=None, fit_prior=True),
            cv=5, method='sigmoid'))])

In [45]:
mnb_cal.predict_proba([data[5001]])

array([[ 0.04600529,  0.03726734,  0.03414099,  0.08494592,  0.06160214,
         0.09824496,  0.08294551,  0.0630373 ,  0.49181055]])

In [20]:
ngm = NgramClassifierMulti(pad_ngrams=True)

In [21]:
ngm.fit(np.asarray(data[:1000]), labels_list[:1000])

NgramClassifierMulti(alpha=0.01, n=4, pad_ngrams=True)

In [22]:
ngm.predict([["I", "know", 'these', 'people', '.']])

array([u'TRUMP'], 
      dtype='<U8')

In [32]:
ngm.predict_proba(np.asarray([["I", "know", 'these', 'people', '.']]))

array([[  4.59896116e-30,   7.29195013e-24,   1.21394829e-24,
          1.09368510e-25,   2.22115853e-30,   3.58526733e-30,
          2.05846351e-25,   1.24726343e-25,   4.70065852e-23]])

In [33]:
-np.log2(ngm.predict_proba(np.asarray([["I", "know", 'these', 'people', '.']])))

array([[ 97.45653483,  76.85996958,  79.4465673 ,  82.91900496,
         98.50653048,  97.81576215,  82.0066345 ,  82.72943617,  74.1714833 ]])

In [27]:

mnb_pipe.fit(np.asarray(data[:1000]), labels_list[:1000])

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3),
        preprocessor=<function <lambda> at 0x...fe55878c0>, vocabulary=None)), ('clf', MultinomialNB(alpha=0.75, class_prior=None, fit_prior=True))])

In [34]:
mnb_pipe.predict_proba(np.asarray([["I", "know", 'these', 'people', '.']]))

array([[ 0.00298788,  0.02707586,  0.0074726 ,  0.03757738,  0.00335146,
         0.01125558,  0.02626028,  0.07577926,  0.80823969]])

In [55]:
-np.log(mnb_pipe.predict_proba(np.asarray([["I", "know", 'these', 'people', '.']])))

array([[ 5.81319012,  3.60911254,  4.89651188,  3.2813531 ,  5.69835901,
         4.48689127,  3.63969773,  2.57993067,  0.21289661]])

In [58]:
mnb_pipe.steps

[('vect',
  CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=False, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 3),
          preprocessor=<function <lambda> at 0x7f4fe55878c0>,
          stop_words=None, strip_accents=None, token_pattern='.*',
          tokenizer=<function <lambda> at 0x7f4fe55878c0>, vocabulary=None)),
 ('clf', MultinomialNB(alpha=0.75, class_prior=None, fit_prior=True))]

In [60]:
mnb_v = mnb_pipe.steps[0][1]
mnb_clf = mnb_pipe.steps[1][1]

mnb_clf._joint_log_likelihood(mnb_v.transform(np.asarray([["I", "know", 'these', 'people', '.']])))

array([[-55.07560706, -52.87152948, -54.15892882, -52.54377004,
        -54.96077594, -53.7493082 , -52.90211467, -51.84234761,
        -49.47531355]])

In [64]:
from scipy.misc import logsumexp

In [65]:
jll = mnb_clf._joint_log_likelihood(mnb_v.transform(np.asarray([["I", "know", 'these', 'people', '.']])))
logsumexp(jll, axis=1)

array([-49.26241694])

In [70]:
jll

array([[-55.07560706, -52.87152948, -54.15892882, -52.54377004,
        -54.96077594, -53.7493082 , -52.90211467, -51.84234761,
        -49.47531355]])

In [66]:
jll - np.atleast_2d(logsumexp(jll, axis=1)).T

array([[-5.81319012, -3.60911254, -4.89651188, -3.2813531 , -5.69835901,
        -4.48689127, -3.63969773, -2.57993067, -0.21289661]])

In [67]:
np.atleast_2d(logsumexp(jll, axis=1)).T

array([[-49.26241694]])

In [68]:
jll - logsumexp(jll, axis=1)

array([[-5.81319012, -3.60911254, -4.89651188, -3.2813531 , -5.69835901,
        -4.48689127, -3.63969773, -2.57993067, -0.21289661]])

In [69]:
jll.shape

(1, 9)

In [21]:
[(i,j) for i in range(4) for j in range(i+1, 4)]

[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

In [22]:
X = np.asarray(data[1001:1011])
idxs = [(i,j) for i in range(9) for j in range(i+1, 9)]
confidences = [est.predict_proba(X) for est in ngm.estimators_]

In [23]:
confidences[0].shape

(10, 2)

In [24]:
probs = np.zeros((len(ngm.estimators_), len(ngm.classes_), 10))

for i, c in enumerate(confidences):
    tup = idxs[i]
    for j, col in enumerate(tup):
        probs[i, col, :] = c[:, j]
        
probs.shape

(36, 9, 10)

In [25]:
probs.mean(axis=0).T.shape

(10, 9)

In [52]:
t = np.array([[[1,0,1,0], [0,1,0,1]], [[0,1,0,1],[1,0,1,0]], [[1,0,1,0], [0,1,0,1]]])
print t.shape
t

(3, 2, 4)


array([[[1, 0, 1, 0],
        [0, 1, 0, 1]],

       [[0, 1, 0, 1],
        [1, 0, 1, 0]],

       [[1, 0, 1, 0],
        [0, 1, 0, 1]]])

In [54]:
t.sum(0)

array([[2, 1, 2, 1],
       [1, 2, 1, 2]])