In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
import itertools
from collections import defaultdict
from spacy.en import English

In [2]:
os.chdir("..")
os.getcwd()

'/mnt/Storage/Coding_Projects/Candidate_Classifier'

In [3]:
os.getcwd()

'/mnt/Storage/Coding_Projects/Candidate_Classifier'

## Corpora

In [4]:
from candidate_classifier.nltk_model import NgramModel
from candidate_classifier import utils
from nltk.probability import LaplaceProbDist, LidstoneProbDist
from nltk.corpus import PlaintextCorpusReader
import os
from candidate_classifier.debate_corpus_reader import DebateCorpusReader
from candidate_classifier.string_processing import *

In [5]:
nlp = English(entity=False, load_vectors=False)

In [6]:
class TransformerWrapper(object):
    def __init__(self, transformer):
        self.transformer = transformer
    
    def tokenize(self, s):
        return self.transformer(s)

class DummyTokenizer(object):
    def tokenize(self, s):
        return s
    
def sent_tokenizer(s):
    doc = nlp(s)
    return [u''.join(t.text_with_ws for t in sent) for sent in doc.sents]


# def word_tokenizer(s):
#     toks = nlp(s)
#     return ['<S>'] + [t.lower_ for t in toks] + ['</S>']

In [7]:
# Replace [*] with ''
# Replace '. . .' with '...'
# Replace multiple ellipses with single ...
# Remove all sentences that end with ...

BRACKET_PATTERN = re.compile(r"\[[a-zA-Z ]*\]", re.U)
SPACED_ELLIPSIS_PATTERN = re.compile(r"((?:\.\s){3})")
MULTI_ELLIPSIS_PATTERN = re.compile(r"(?:(?:\.){3} ?)+")
ELLIPSIS_BRACKET_PATTERN = re.compile(r"(?:(?:\.){3}) *\[[a-zA-Z ]*\] *(?:(?:\.){3} ?)")
ENDS_WITH_ELLIPSIS = lambda s: s[-3:] == '...'
STARTS_WITH_ELLIPSIS = lambda s: s[:3] == '...'
STARTS_WITH_DASH = lambda s: s[0] == '-'
ENDS_WITH_DASH = lambda s: s[-1] == '-'

In [8]:
doc_transformer = TransformerABC(
    prefilter_substitutions=['html entities',
                             'deaccent',
                             'whitespace',
                             (ELLIPSIS_BRACKET_PATTERN, ' '),
                             BRACKET_PATTERN,
                             (SPACED_ELLIPSIS_PATTERN, '...'),
                             (MULTI_ELLIPSIS_PATTERN, '...'),
                             'whitespace',
                             'strip'],
    tokenizer=sent_tokenizer)

sent_filter = TransformerABC(
    prefilter_substitutions=['puntuation', 'strip'],
    filters=[('len', 49)])

In [9]:
dcr = DebateCorpusReader('candidate_classifier/data/raw', '.*\.txt', 
                             sent_tokenizer=TransformerWrapper(doc_transformer), 
                             word_tokenizer=DummyTokenizer())
candidates = ['BUSH', 'CARSON', 'CHRISTIE', 'CRUZ', 'KASICH', 'RUBIO', 'TRUMP', 'CLINTON', 'SANDERS']

## Preprocess to sentences
I have officially decided that the task is classifying sentences and snippits.  As such, each document will be a sentence.  

#### Precrocessing Steps
- Tokenize to sentences and normalize encoding
- remove all non-ascii characters
- All whitespace to spaces (no newlines)
- Filter all sentences that are less than 50 characters (after removing all punctuation)
- Group all sentences together with their labels and shuffle them
- Write text and labels to (separate) line-delimited files


In [112]:
cleaned_sents = []
for label, sents in dcr.grouped_sents(speakers=candidates).iteritems():
    for sent in sents:
#             stripped = sent.strip(string.punctuation+ ' \n\t')
#             if len(stripped) >= 35:
#                 cleaned_sents.append((label, sent))
        if sent_filter(sent):
            cleaned_sents.append((label, sent))

In [113]:
len(cleaned_sents)

8851

In [114]:
import random
random.shuffle(cleaned_sents)

In [12]:
sents_path = 'candidate_classifier/data/processed/clean_sents.txt'
labels_path = 'candidate_classifier/data/processed/sent_labels.txt'

In [116]:
import codecs

sents_file = codecs.open(sents_path, mode='w', encoding='utf-8')
labels_file = codecs.open(labels_path, mode='w', encoding='utf-8')
    
for sent in cleaned_sents:
    sents_file.write(u'%s\n' % sent[1])
    labels_file.write(u'%s\n' % sent[0])

sents_file.close()
labels_file.close()

## Stats

In [100]:
import pprint
import json
from tabulate import tabulate

In [74]:
def sentence_stats(dcr, filt, speakers=None):
    """
    Returns a dictionary for the given speakers with stats for all sentences
    dcr is a DebateCorpusReader
    filt is a callable that will turn sentences we want to filter into a falsy value
    """
    if speakers is None:
        speakers = dcr.speakers()
    stats = {s:dict() for s in speakers}
    
    for speaker, sents in dcr.grouped_sents(speakers=speakers).iteritems():
        stats[speaker]['count'] = len(sents)
        lengths = [len(s) for s in sents]
        mean_length = np.mean(lengths)
        std = np.std(lengths)
        stats[speaker]['length'] = '%0.2f (+/- %0.2f)' % (mean_length, std*1.960)
    
    return stats
    

In [110]:
sent_stats = sentence_stats(dcr, sent_filter)

In [109]:
for tup in sorted(sent_stats.iteritems(), key=lambda tup: tup[1]['count'], reverse=True):
#     print "%s\t\t\tcount: %s\tlength: %s" % (tup[0], tup[1]['count'], tup[1]['length'])
    print "{: <20} {: <10} {: <20}".format(tup[0], tup[1]['count'], tup[1]['length'])

TRUMP                2647       57.55 (+/- 91.10)   
SANDERS              1948       90.94 (+/- 144.33)  
CLINTON              1867       102.53 (+/- 147.05) 
RUBIO                1833       87.63 (+/- 119.96)  
CRUZ                 1504       90.61 (+/- 131.77)  
KASICH               1410       80.09 (+/- 124.65)  
BUSH                 1374       78.07 (+/- 115.14)  
CHRISTIE             961        86.49 (+/- 126.81)  
CARSON               907        88.96 (+/- 122.32)  
PAUL                 704        76.77 (+/- 100.19)  
FIORINA              546        80.49 (+/- 120.56)  
MUIR                 510        71.71 (+/- 124.24)  
DICKERSON            485        73.13 (+/- 116.95)  
BLITZER              456        50.85 (+/- 92.77)   
TAPPER               429        56.00 (+/- 95.77)   
COOPER               427        68.51 (+/- 114.06)  
RADDATZ              321        78.61 (+/- 118.99)  
HUCKABEE             270        87.93 (+/- 128.88)  
KELLY                246        70.84 (+/- 120

## Classifier

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
import numpy as np
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import make_scorer, classification_report, f1_score
import codecs
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.grid_search import GridSearchCV

In [13]:
def docs():
    with codecs.open(sents_path, mode='r', encoding='utf-8') as _f:
        for line in _f:
            yield line

def labels():
    with codecs.open(labels_path, mode='r', encoding='utf-8') as _f:
        for line in _f:
            yield line.strip()
labels_list = list(labels())
candidates = sorted(list(set(labels_list)))
docs_list = list(docs())
print candidates

[u'BUSH', u'CARSON', u'CHRISTIE', u'CLINTON', u'CRUZ', u'KASICH', u'RUBIO', u'SANDERS', u'TRUMP']


In [118]:
simple_cleaner = TransformerABC(prefilter_substitutions=['punct', 'strip', 'lower'])

In [62]:
def get_scores(clf, X, y):
    scores = cross_val_score(clf, X, y, cv=10, scoring='f1_samples')
    print scores
    print "\n"
    # Use 1.96 * std b/c 95% of the data should lie in that range, 
    # which means this represents a 95% confidence interval
    print "F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 1.960)

In [119]:
simple_vect = CountVectorizer(preprocessor=simple_cleaner, tokenizer=lambda s: s.split())

In [120]:
simple_pipe = make_pipeline(simple_vect, MultinomialNB())

In [14]:
def fancy_scorer(y, y_pred, **kwargs):
    # Print classification report
    print classification_report(y, y_pred, target_names=candidates)
    return f1_score(y, y_pred, labels=candidates, average='weighted')    

def f1_weighted_scorer(y, y_pred, **kwargs):
    return f1_score(y, y_pred, labels=candidates, average='weighted')

In [124]:
cross_val_score(simple_pipe, docs_list, labels_list, cv=10, scoring=make_scorer(fancy_scorer))

             precision    recall  f1-score   support

       BUSH       0.73      0.39      0.50        83
     CARSON       0.44      0.12      0.19        65
   CHRISTIE       0.65      0.27      0.39        62
    CLINTON       0.48      0.71      0.57       136
       CRUZ       0.65      0.50      0.56       101
     KASICH       0.59      0.42      0.49        84
      RUBIO       0.47      0.61      0.53       122
    SANDERS       0.56      0.65      0.60       123
      TRUMP       0.54      0.76      0.63       113

avg / total       0.56      0.54      0.52       889

             precision    recall  f1-score   support

       BUSH       0.63      0.35      0.45        83
     CARSON       0.50      0.12      0.20        65
   CHRISTIE       0.65      0.24      0.35        62
    CLINTON       0.48      0.73      0.58       135
       CRUZ       0.59      0.53      0.56       101
     KASICH       0.53      0.48      0.50        84
      RUBIO       0.47      0.62      0.54

array([ 0.52231296,  0.51771904,  0.51920321,  0.5269497 ,  0.53510422,
        0.53976703,  0.51430653,  0.52304192,  0.50787193,  0.5573232 ])

## Grid Searches

In [27]:
simple_cleaner = TransformerABC(prefilter_substitutions=['strip', 'lower'])

### Tokenizers

In [54]:
def simple_tokenizer(s):
    return s.split()

def lemmas_tokenizer(s):
    return [s.lemma_ for s in nlp(s)]

def lemmas_no_punct_tokenizer(s):
    return [s.lemma_ for s in nlp(s) if not any([s.is_punct, s.is_space])]

def lemmas_no_punt_no_num(s):
    return [s.lemma_ for s in nlp(s) if not any([s.is_punct, s.is_space, s.is_digit, s.is_like_num])]

def lemmas_sub_nums(s):
    pass

def lemmas_no_punct_sub_nums():
    pass

# Spacy tokenization, no lemmatization...

### Stopwords Sets

In [26]:
sk_stops = ENGLISH_STOP_WORDS

### Pipelines

In [15]:
mnb_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=simple_cleaner)),
        ('clf', MultinomialNB())
])

sgd_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=simple_cleaner)),
        ('clf', SGDClassifier())
])

rf_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=simple_cleaner)),
        ('clf', RandomForestClassifier())
])

svm_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=simple_cleaner)),
        ('clf', LinearSVC())
])

# TODO:
# - TFIDF
# - log-count ratio?
# - binarize

pipes = [mnb_pipe, sgd_pipe, rf_pipe, svm_pipe]

### Grids

In [73]:
mnb_grid_params = {
    'vect__ngram_range': ((1,1),(1,2),(1,3)),
    'vect__stop_words': ('english', None),
    'vect__tokenizer': (simple_tokenizer, lemmas_tokenizer, lemmas_no_punct_tokenizer),
    'clf__alpha': (0.001, 0.01, 0.1, 1)
}

sgd_grid_params = {
    'vect__ngram_range': ((1,1),(1,2),(1,3)),
    'vect__stop_words': ('english', None),
    'vect__tokenizer': (lemmas_tokenizer, lemmas_no_punct_tokenizer),
    'clf__loss': ('hinge', 'log', 'perceptron'),
    'clf__penalty': ('l1', 'l2', 'elasticnet'),
    'clf__alpha': (0.0001, 0.00001, 0.000001),
    'clf__l1_ratio': (0.15, 0.05, 0.005),
    'clf__fit_intercept': (True, False),
    'clf__n_iter': (10, 50, 85)
}

rf_grid_params = {}

svm_grid_params = {}

grids = [mnb_grid_params, sgd_grid_params, rf_grid_params, svm_grid_params]

In [32]:
from pprint import pprint
from time import time
import logging

In [33]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hndlr = logging.StreamHandler()
hndlr.setFormatter(fmt)
logger.addHandler(hndlr)

In [52]:
def process_grid(pipe, params, data, labels):
    grid_search = GridSearchCV(pipe, params, n_jobs=-1, scoring=make_scorer(f1_weighted_scorer), cv=3, verbose=1)
    
    print "Performing grid search..."
    print "pipeline:", [name for name, _ in pipe.steps]
    print "parameters:"
    pprint(params)
    
    t0 = time()
    grid_search.fit(data, labels)
    
    print "done in %0.3fs" % (time() - t0)
    print ''
    
    print "Best score: %0.3f" % grid_search.best_score_
    print "Best parameters set:"
    
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print "\t%s: %r" % (param_name, best_parameters[param_name])
    
    return grid_search

In [53]:
mnb_grid = process_grid(mnb_pipe, mnb_grid_params, docs_list, labels_list)

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__alpha': (0.001, 0.01, 0.1, 1),
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__stop_words': ('english', None),
 'vect__tokenizer': (<function simple_tokenizer at 0x7f5254a35b90>,
                     <function lemmas_tokenizer at 0x7f5254a35f50>,
                     <function lemmas_no_punct_tokenizer at 0x7f5254a35ed8>)}
Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:  5.5min finished


done in 343.143s

Best score: 0.634
Best parameters set:
	clf__alpha: 0.1
	vect__ngram_range: (1, 3)
	vect__stop_words: None
	vect__tokenizer: <function lemmas_tokenizer at 0x7f5254a35f50>


In [58]:
mnb_best.grid_scores_

[mean: 0.48942, std: 0.00521, params: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function simple_tokenizer at 0x7f5254a35b90>, 'vect__stop_words': 'english', 'clf__alpha': 0.001},
 mean: 0.50824, std: 0.00670, params: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function lemmas_tokenizer at 0x7f5254a35f50>, 'vect__stop_words': 'english', 'clf__alpha': 0.001},
 mean: 0.50508, std: 0.00504, params: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function lemmas_no_punct_tokenizer at 0x7f5254a35ed8>, 'vect__stop_words': 'english', 'clf__alpha': 0.001},
 mean: 0.50453, std: 0.00474, params: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function simple_tokenizer at 0x7f5254a35b90>, 'vect__stop_words': None, 'clf__alpha': 0.001},
 mean: 0.52849, std: 0.00333, params: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function lemmas_tokenizer at 0x7f5254a35f50>, 'vect__stop_words': None, 'clf__alpha': 0.001},
 mean: 0.52745, std: 0.00589, params: {'vect__ngram_range': (1, 1)

In [66]:
print classification_report(labels_list, mnb_best.best_estimator_.predict(docs_list), target_names=candidates)

             precision    recall  f1-score   support

       BUSH       1.00      1.00      1.00       828
     CARSON       1.00      1.00      1.00       644
   CHRISTIE       1.00      1.00      1.00       616
    CLINTON       1.00      1.00      1.00      1351
       CRUZ       1.00      1.00      1.00      1009
     KASICH       1.00      1.00      1.00       834
      RUBIO       1.00      1.00      1.00      1214
    SANDERS       1.00      1.00      1.00      1226
      TRUMP       1.00      1.00      1.00      1129

avg / total       1.00      1.00      1.00      8851



In [67]:
mnb_tuned_pipe = Pipeline([
        ('vect', CountVectorizer(preprocessor=simple_cleaner,
                                 ngram_range=(1,3),
                                 tokenizer=lemmas_tokenizer)),
        ('clf', MultinomialNB(alpha=0.01))
])

In [68]:
cross_val_score(mnb_tuned_pipe, docs_list, labels_list, cv=10, scoring=make_scorer(fancy_scorer))

             precision    recall  f1-score   support

       BUSH       0.64      0.63      0.63        83
     CARSON       0.60      0.55      0.58        65
   CHRISTIE       0.50      0.47      0.48        62
    CLINTON       0.69      0.79      0.73       136
       CRUZ       0.73      0.61      0.67       101
     KASICH       0.58      0.62      0.60        84
      RUBIO       0.66      0.62      0.64       122
    SANDERS       0.77      0.74      0.76       123
      TRUMP       0.67      0.75      0.71       113

avg / total       0.66      0.66      0.66       889

             precision    recall  f1-score   support

       BUSH       0.65      0.60      0.63        83
     CARSON       0.55      0.51      0.53        65
   CHRISTIE       0.59      0.60      0.59        62
    CLINTON       0.64      0.73      0.68       135
       CRUZ       0.73      0.68      0.70       101
     KASICH       0.56      0.63      0.59        84
      RUBIO       0.64      0.63      0.63

array([ 0.66223059,  0.64484832,  0.63723762,  0.64807637,  0.68796454,
        0.65502752,  0.65799286,  0.66213803,  0.66871257,  0.66857936])

In [69]:
sum([ 0.66223059,  0.64484832,  0.63723762,  0.64807637,  0.68796454,
        0.65502752,  0.65799286,  0.66213803,  0.66871257,  0.66857936])/10.0

0.6592807780000001

In [74]:
sgd_grid = process_grid(sgd_pipe, sgd_grid_params, docs_list, labels_list)

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__alpha': (0.0001, 1e-05, 1e-06),
 'clf__fit_intercept': (True, False),
 'clf__l1_ratio': (0.15, 0.05, 0.005),
 'clf__loss': ('hinge', 'log', 'perceptron'),
 'clf__n_iter': (10, 50, 85),
 'clf__penalty': ('l1', 'l2', 'elasticnet'),
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__stop_words': ('english', None),
 'vect__tokenizer': (<function lemmas_tokenizer at 0x7f52783ee848>,
                     <function lemmas_no_punct_tokenizer at 0x7f5254e21b18>)}
Fitting 3 folds for each of 5832 candidates, totalling 17496 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 30.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 48.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 69.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 94.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 123.0min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 155.6min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 192.0min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 232.1min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 276.4min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 324.7min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 376.2min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 431.7min
[Parallel(n_jobs=-1)]: Done 12784 tasks      | ela

done in 40381.515s

Best score: 0.579
Best parameters set:
	clf__alpha: 0.0001
	clf__fit_intercept: False
	clf__l1_ratio: 0.05
	clf__loss: 'hinge'
	clf__n_iter: 85
	clf__penalty: 'l2'
	vect__ngram_range: (1, 3)
	vect__stop_words: None
	vect__tokenizer: <function lemmas_no_punct_tokenizer at 0x7f5254e21b18>


In [48]:
process_grid(mnb_pipe, mnb_grid_params, docs_list, labels_list)

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__alpha': (0.001, 0.01, 0.1, 1),
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__stop_words': ('english', None),
 'vect__tokenizer': (<function simple_tokenizer at 0x7f5254a35b90>,
                     <function lemmas_tokenizer at 0x7f5254a35f50>,
                     <function lemmas_no_punct_tokenizer at 0x7f5254a35ed8>)}
Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  5.0min
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed: 23.6min
[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed: 54.2min
[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed: 89.2min finished


done in 5363.179s

Best score: 0.659
Best parameters set:
	clf__alpha: 0.1
	vect__ngram_range: (1, 3)
	vect__stop_words: None
	vect__tokenizer: <function lemmas_tokenizer at 0x7f5254a35f50>


## Scratch

In [17]:
txt = u"Listening to this, do you think this is the tone — this immigration debate that republicans need to take to win back Hispanics into our party especially states like where we are in Nevada that has a pretty Hispanic community?"

In [18]:
toks = nlp(txt)
' | '.join(tok.lemma_ for tok in toks)

u'listen | to | this | , | do | you | think | this | be | the | tone | -- | this | immigration | debate | that | republicans | need | to | take | to | win | back | hispanic | into | our | party | especially | state | like | where | we | be | in | nevada | that | have | a | pretty | hispanic | community | ?'

In [20]:
'--' in set(string.punctuation)

False

In [25]:
toks[11].is_punct

True