In [1]:
from sklearn.cross_validation import cross_val_score
import numpy as np

In [26]:
class Dummy(object):
    """Minimal estimator that predicts class 1 for every sample.

    It exposes only the three methods defined here (fit, predict,
    get_params) and learns nothing from the data it is given.
    """

    def fit(self, X, y):
        """Ignore the training data and return this estimator unchanged."""
        return self

    def predict(self, X):
        """Return a list containing one ``1`` for each element of ``X``."""
        n_samples = len(X)
        return [1 for _ in range(n_samples)]

    def get_params(self, deep=False):
        """Return an empty dict: this estimator has no parameters."""
        return dict()

In [42]:
# Toy dataset for the Dummy estimator: 100 single-feature rows of zeros,
# each labelled 1.
data = np.zeros(shape=(100, 1))
labels = [1] * len(data)

In [44]:
# Sanity check of the scoring pipeline: Dummy always predicts 1 and every
# label is 1, so each of the 10 folds is expected to score a perfect F1.
cross_val_score(Dummy(), data, y=labels, cv=10, scoring='f1')

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

# NGramModel Classifier

In [1]:
from sklearn.cross_validation import cross_val_score
import numpy as np
import ujson
import os
from sklearn.cross_validation import KFold

In [2]:
# Move up to the directory that contains the `candidate_classifier` package,
# which the import and the relative data path in the following cells rely on.
# The guard keeps this cell idempotent: re-running it no longer climbs one
# more directory level each time.
if not os.path.isdir('candidate_classifier'):
    os.chdir('..')

In [3]:
from candidate_classifier.nltk_model.ngram_classifier import NgramClassifier

In [33]:
# Load the pre-processed data; it is indexed below by candidate name and
# then by 'sents'.
with open('candidate_classifier/data/processed/processed.json', 'rb') as _f:
    processed = ujson.load(_f)

# Keep only entries whose len() exceeds 5.  List comprehensions are used so
# the results are real lists wherever this runs: the code below calls len()
# on them and later concatenates them with `+`, which a lazy filter object
# would not support.
trump_sents = [s for s in processed['TRUMP']['sents'] if len(s) > 5]
trump_labels = [1]*len(trump_sents)      # Trump -> class 1
hillary_sents = [s for s in processed['CLINTON']['sents'] if len(s) > 5]
hillary_labels = [0]*len(hillary_sents)  # Clinton -> class 0

In [34]:
# Binary dataset: all Trump sentences (label 1) followed by all Clinton
# sentences (label 0).  Note that this rebinds `data` and `labels`, which
# earlier held the toy arrays used with Dummy.
data = trump_sents+hillary_sents
labels = trump_labels + hillary_labels

In [6]:
# Trump-vs-Clinton evaluation: shuffled 10-fold cross-validation with a fixed
# seed (random_state=1), scored with scoring='f1'.
classifier = NgramClassifier()
cross_val_score(classifier, data, y=labels, cv=KFold(len(data), n_folds=10, shuffle=True, random_state=1), scoring='f1')

array([ 0.85441527,  0.85308057,  0.81516588,  0.85645933,  0.8337469 ,
        0.85254692,  0.81108312,  0.82506527,  0.85046729,  0.85148515])

In [35]:
# Re-run of the shuffled, seeded 10-fold F1 evaluation.
# NOTE(review): this code is identical to the In [6] cell, yet the recorded
# scores differ, so something outside this cell changed between the two runs
# (presumably the sentence filtering or NgramClassifier itself) - confirm and
# record what changed.
classifier = NgramClassifier()
cross_val_score(classifier, data, y=labels, cv=KFold(len(data), n_folds=10, shuffle=True, random_state=1), scoring='f1')

array([ 0.89090909,  0.90502793,  0.89940828,  0.89230769,  0.93055556,
        0.95402299,  0.8914956 ,  0.88501742,  0.90449438,  0.89020772])

In [36]:
# Same folds and seed as the runs above, but scored with 'f1_weighted'
# instead of 'f1'.
classifier = NgramClassifier()
cross_val_score(classifier, data, y=labels, cv=KFold(len(data), n_folds=10, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.88146989,  0.88778878,  0.88804892,  0.88457193,  0.9339934 ,
        0.94740645,  0.87793914,  0.89103201,  0.88832336,  0.87784199])

In [37]:
# Scratch check: with return_counts=True, np.unique returns the sorted
# distinct values together with the number of occurrences of each.
a = np.asarray([1, 2, 6, 4, 2, 3, 2])
np.unique(a, return_counts=True)

(array([1, 2, 3, 4, 6]), array([1, 3, 1, 1, 1]))

In [16]:
import itertools

In [20]:
def rangeg(m):
    """Generate 0, 1, 2, ... for as long as the value stays below ``m``."""
    curr = 0
    while True:
        # Stop as soon as the counter is no longer strictly below the bound.
        if not curr < m:
            return
        yield curr
        curr += 1

In [22]:
# Experiment: how far does takewhile advance the underlying generator?
# takewhile has to pull the first failing element (3) out of `r` in order to
# test it, and that element is then lost - so the next value taken directly
# from `r` is 4, not 3.
r = rangeg(5)
it = itertools.takewhile(lambda x: x < 3, r)
list(it)  # drain the takewhile iterator; the values themselves are not needed
next(r)   # built-in next() instead of the Python-2-only r.next() method

4

In [24]:
from nltk.util import ngrams

In [27]:
# Bigrams padded on both sides with explicit start/end symbols.
list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))

[('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

In [30]:
# Trigrams with left padding only.  The pad symbol passed here is itself a
# 2-tuple; as the output shows, it is not unpacked - the whole tuple is
# placed in each pad slot.
list(ngrams([1,2,3,4,5], 3, pad_left=True, left_pad_symbol=('',)*2))

[(('', ''), ('', ''), 1), (('', ''), 1, 2), (1, 2, 3), (2, 3, 4), (3, 4, 5)]

In [29]:
# Plain bigrams with no padding, for comparison with the padded variants.
list(ngrams([1,2,3,4,5], 2))

[(1, 2), (2, 3), (3, 4), (4, 5)]

In [31]:
# Trigrams padded on both sides: the output shows two pad symbols (n - 1) at
# each end of the sequence.
list(ngrams([1,2,3,4,5], 3, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))

[('<s>', '<s>', 1),
 ('<s>', 1, 2),
 (1, 2, 3),
 (2, 3, 4),
 (3, 4, 5),
 (4, 5, '</s>'),
 (5, '</s>', '</s>')]

# Multi-Class

In [6]:
from sklearn.multiclass import OneVsOneClassifier

In [7]:
# Build the multi-class dataset: every sentence of the selected candidates,
# labelled with the candidate's name.  Unlike the binary dataset, sentences
# are not filtered by length here.
candidates = {'TRUMP', 'CLINTON', 'SANDERS', 'RUBIO', 'KASICH', 'BUSH', 'CHRISTIE', 'CARSON', 'CRUZ'}
labels = []
data = []
# .items() (rather than the Python-2-only .iteritems()) keeps this loop
# runnable on both Python 2 and Python 3.
for name, d in processed.items():
    if name in candidates:
        data.extend(d['sents'])
        labels.extend([name]*len(d['sents']))
# Displayed as a check that labels and samples stay aligned.
len(labels), len(data)

(11814, 11814)

In [15]:
# Multi-class evaluation: NgramClassifier wrapped in a one-vs-one scheme,
# shuffled 10-fold cross-validation with a fixed seed, scored with
# 'f1_weighted'.
multi_clf = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=10, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.5502757 ,  0.53792265,  0.53453339,  0.51430339,  0.54701641,
        0.53162138,  0.50450235,  0.53508308,  0.52657834,  0.50224594])

In [27]:
# Using class probabilities where the chosen class always equals 1
# NOTE(review): that variant is not selected anywhere in this cell, so it
# presumably reflects the state of NgramClassifier's source at the time of
# the run - confirm.  This run uses 3 folds (the run above used 10).
multi_clf2 = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf2, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=3, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.50750453,  0.5083891 ,  0.49957373])

In [10]:
# Using completely raw (no normalization) class probabilities
# NOTE(review): as with the previous run, the variant is not selected in this
# cell; it presumably reflects NgramClassifier's source at the time of the
# run - confirm.  Shuffled, seeded 3-fold CV scored with 'f1_weighted'.
multi_clf2 = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf2, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=3, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.50432043,  0.50617436,  0.49509918])

In [10]:
# Size check of the multi-class dataset; the recorded value matches the count
# displayed when the dataset was built.
len(data)

11814

In [8]:
# Fresh one-vs-one wrapper, used below for a manual fit/score run instead of
# cross_val_score.
multi = OneVsOneClassifier(NgramClassifier())

In [13]:
# Set the wrapped estimator's `n` parameter to 4 through the
# `estimator__<param>` nested-parameter name.
multi.set_params(estimator__n=4)

OneVsOneClassifier(estimator=NgramClassifier(alpha=0.01, n=4), n_jobs=1)

In [14]:
# Fit on the first 5000 samples only.
# NOTE(review): `data` was built by appending one candidate's sentences at a
# time, so this prefix presumably does not cover every candidate - confirm
# before reading anything into the resulting model.
multi.fit(np.asarray(data[:5000]), labels[:5000])

OneVsOneClassifier(estimator=NgramClassifier(alpha=0.01, n=4), n_jobs=1)

In [15]:
# NOTE(review): the slice scored here (2000:4000) lies entirely inside the
# slice the model was fitted on (:5000), so the value below measures fit on
# training samples, not held-out performance.  It is not comparable to the
# cross-validated scores earlier in this notebook.
multi.score(data[2000:4000], labels[2000:4000])

0.86899999999999999