In [None]:
# NGramModel Classifier

In [1]:
import os

import numpy as np
import ujson
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score

In [2]:
# Move to the project root so the relative data path and the
# `candidate_classifier` package used below resolve.
# Guarded so that re-running this cell does not climb one directory
# further each time (a bare os.chdir('..') is not idempotent).
if not os.path.isdir('candidate_classifier'):
    os.chdir('..')

In [3]:
from candidate_classifier.nltk_model.ngram_classifier import NgramClassifier, logsumexp2, NgramClassifierMulti

In [4]:
# Read the preprocessed corpus (a dict keyed by candidate name).
with open('candidate_classifier/data/processed/processed.json', 'rb') as _f:
    processed = ujson.load(_f)

# Keep only sentences whose length exceeds 5, then label them:
# Trump sentences are class 1, Clinton sentences are class 0.
trump_sents = list(sent for sent in processed['TRUMP']['sents'] if len(sent) > 5)
hillary_sents = list(sent for sent in processed['CLINTON']['sents'] if len(sent) > 5)
trump_labels = len(trump_sents) * [1]
hillary_labels = len(hillary_sents) * [0]

In [5]:
# Combine both candidates into one dataset; sentences and labels are
# concatenated in the same order (Trump first) so they stay aligned.
data = trump_sents+hillary_sents
labels = trump_labels + hillary_labels

In [6]:
# Binary Trump-vs-Clinton baseline: weighted F1 over 10 shuffled folds,
# with the shuffle seed fixed at 1 so the folds are repeatable.
# KFold must be imported in the notebook's top import cell; this cell
# raises NameError on a fresh kernel otherwise.
classifier = NgramClassifier()
cross_val_score(
    classifier,
    data,
    y=labels,
    cv=KFold(len(data), n_folds=10, shuffle=True, random_state=1),
    scoring='f1_weighted',
)

array([ 0.88146989,  0.88778878,  0.88804892,  0.88457193,  0.9339934 ,
        0.94740645,  0.87793914,  0.89103201,  0.88832336,  0.87784199])

In [6]:
# Train a fresh classifier on the whole binary dataset (no hold-out) so the
# cells below can inspect its predictions and internals.
classifier = NgramClassifier()
# fit() is the last expression, so its return value is what the cell displays.
classifier.fit(np.asarray(data), labels)

NgramClassifier(alpha=0.01, n=4, pad_ngrams=False)

In [7]:
# Smoke test: classify one hand-written, already-tokenized sentence
# (wrapped in an outer list so it is a batch of size one).
classifier.predict(np.asarray([["I", "know", 'these', 'people', '.']]))

[1]

In [12]:
# Inspect the classifier's private ratio helper on the first Clinton sentence.
# NOTE(review): the recorded output shows two printed numbers before the
# returned value; presumably the two per-model scores -- confirm in NgramClassifier.
classifier._calc_prob_ratio(hillary_sents[0])

322.671493884 663.767076994


340.79549174251457

In [13]:
# Recompute the m1/m2 ratio by hand for the first Clinton sentence.
# NOTE(review): exp2(-x) assumes prob_seq returns a negative log2-probability
# -- confirm against NgramModel.prob_seq.
sq = hillary_sents[0]
p1 = np.exp2(-classifier.m1.prob_seq(sq))
p2 = np.exp2(-classifier.m2.prob_seq(sq))


# Linear-space ratio scaled by classifier.y_ratio (presumably the class-prior
# ratio -- TODO confirm). These magnitudes can overflow/underflow for long sentences.
(p1/p2) * classifier.y_ratio

3.8874552394508698e+102

In [15]:
# Same hand computation of the m1/m2 ratio, now for the first Trump sentence.
# NOTE(review): exp2(-x) assumes prob_seq returns a negative log2-probability
# -- confirm against NgramModel.prob_seq.
sq = trump_sents[0]
p1 = np.exp2(-classifier.m1.prob_seq(sq))
p2 = np.exp2(-classifier.m2.prob_seq(sq))
# Alternative kept for reference: use the raw prob_seq values directly.
# p1 = classifier.m1.prob_seq(sq)
# p2 = classifier.m2.prob_seq(sq)


# Linear-space ratio scaled by classifier.y_ratio (presumably the class-prior
# ratio -- TODO confirm).
(p1/p2) * classifier.y_ratio

1.0092272438876763e-57

In [11]:
# Inspect the classifier's private ratio helper on the first Trump sentence.
# NOTE(review): the recorded output shows two printed numbers before the
# returned value; presumably the two per-model scores -- confirm in NgramClassifier.
classifier._calc_prob_ratio(trump_sents[0])

534.068689223 345.03213024


-189.33665035136295

# Multi-Class

In [5]:
from sklearn.multiclass import OneVsOneClassifier

In [5]:
# Rebuild `data` / `labels` for the multi-class task: every sentence of each
# selected candidate, labelled with the candidate's name. This overwrites the
# binary `data` / `labels` defined earlier in the notebook.
data, labels = [], []
for name, d in processed.iteritems():
    # Skip any speaker outside the nine candidates of interest.
    if name not in {'TRUMP', 'CLINTON', 'SANDERS', 'RUBIO', 'KASICH', 'BUSH', 'CHRISTIE', 'CARSON', 'CRUZ'}:
        continue
    data += d['sents']
    labels += [name] * len(d['sents'])
# Sanity check: both lists must have the same length.
(len(labels), len(data))

(11814, 11814)

In [15]:
# Multi-class baseline: one-vs-one ensemble of NgramClassifiers, scored with
# weighted F1 over 10 shuffled folds (shuffle seed fixed at 1).
multi_clf = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=10, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.5502757 ,  0.53792265,  0.53453339,  0.51430339,  0.54701641,
        0.53162138,  0.50450235,  0.53508308,  0.52657834,  0.50224594])

In [27]:
# Using class probabilities where the chosen class always equals 1
# NOTE(review): nothing in this cell selects that variant -- presumably it was
# the state of NgramClassifier's source when the cell ran. Record the exact
# version/commit so the scores below can be reproduced.
# Scored with weighted F1 over 3 shuffled folds (shuffle seed fixed at 1).
multi_clf2 = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf2, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=3, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.50750453,  0.5083891 ,  0.49957373])

In [10]:
# Using completely raw (no normalization) class probabilities
# NOTE(review): nothing in this cell selects that variant -- presumably it was
# the state of NgramClassifier's source when the cell ran. Record the exact
# version/commit so the scores below can be reproduced.
# Scored with weighted F1 over 3 shuffled folds (shuffle seed fixed at 1).
multi_clf2 = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf2, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=3, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.50432043,  0.50617436,  0.49509918])

In [None]:
# Now in log-space

In [11]:
# Raw probabilities
# NOTE(review): nothing in this cell selects a log-space or raw variant --
# presumably it was the state of NgramClassifier's source when the cell ran.
# Record the exact version/commit so the scores below can be reproduced.
# Scored with weighted F1 over 3 shuffled folds (shuffle seed fixed at 1).
multi_clf3 = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf3, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=3, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.51246686,  0.51584491,  0.50119768])

In [10]:
# Normalized
# NOTE(review): nothing in this cell selects a normalized variant --
# presumably it was the state of NgramClassifier's source when the cell ran.
# Record the exact version/commit so the scores below can be reproduced.
# Scored with weighted F1 over 3 shuffled folds (shuffle seed fixed at 1).
multi_clf4 = OneVsOneClassifier(NgramClassifier())
cross_val_score(multi_clf4, np.asarray(data), y=labels, cv=KFold(len(data), n_folds=3, shuffle=True, random_state=1), scoring='f1_weighted')

array([ 0.50796537,  0.51213101,  0.49695502])

1 job:  7.5376     6.930153    7.134821
2 jobs: 6.603988   6.448725    6.533559
3 jobs: 

8 jobs: 6.286621   5.711126

In [6]:
import time
from sklearn.multiclass import OneVsOneClassifier
import cPickle as pickle

In [7]:
start = time.clock()
# ngm = NgramClassifierMulti(use_dictionary=True, n_jobs=8)
ngm = OneVsOneClassifier(NgramClassifier(), n_jobs=4)
cv = StratifiedKFold(labels, n_folds=3, shuffle=True, random_state=1)
d = np.asarray(data[:5000])
l = np.asarray(labels[:5000])
# scores = cross_val_score(ngm, np.asarray(d), y=l, cv=cv, scoring='f1_weighted')
print ngm.n_jobs
ngm.fit(d, l)
print ngm.n_jobs
print "Time: %s" % (time.clock() - start)
# print scores
# print np.mean(scores)

4
4
Time: 6.531385


In [6]:
from joblib import Parallel, delayed
from sklearn.multiclass import _fit_ovo_binary as fovob

In [8]:
# Fit the one-vs-one binary estimators by hand (mirroring what
# OneVsOneClassifier does internally) on the first 8000 samples, so the
# individual fitted estimators can be inspected afterwards.
X = np.asarray(data[:8000])
y = np.asarray(labels[:8000])
estimator = NgramClassifier(use_dictionary=True)
classes = np.unique(y)
n_classes = len(classes)

# One task per unordered pair of distinct classes: each class is paired
# with every class that follows it in the sorted `classes` array.
estimators = Parallel(n_jobs=100)(
    delayed(fovob)(estimator, X, y, first, second)
    for idx, first in enumerate(classes)
    for second in classes[idx + 1:]
)

Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training
Finished training


In [18]:
# Peek at the `m1` n-gram model held by the first fitted pairwise estimator.
estimators[0].m1

<NgramModel with 19064 4-grams>

In [20]:
import types
import collections

In [21]:
# Sanity check: a list is iterable but is not itself an iterator, so this is False.
isinstance([], collections.Iterator)

False

In [32]:
import copy_reg
import types

In [33]:
def pickle_function(func):
    """Reduce ``func`` for pickling by serializing it with dill.

    Args:
        func: the function object to serialize.

    Returns:
        A ``(callable, args)`` pair in the form copy_reg reducers return:
        ``unpickle_function`` plus a one-element tuple holding the dill payload.
    """
    # Imported locally: the notebook never imports dill elsewhere, so the
    # bare `dill` name raised NameError the first time this reducer ran.
    import dill
    return unpickle_function, (dill.dumps(func),)

def unpickle_function(data):
    """Rebuild a function from a payload produced by ``dill.dumps``.

    Like pickle, dill can execute arbitrary code while loading, so this must
    only be given payloads from a trusted source.
    """
    import dill
    return dill.loads(data)

# Register the dill-based reducer for plain Python functions.
# NOTE(review): the stock pickle/cPickle picklers appear to special-case
# function objects before consulting copy_reg -- verify this hook is
# actually invoked for the objects being pickled.
copy_reg.pickle(types.FunctionType, pickle_function, unpickle_function)

In [35]:
# Check that the fitted one-vs-one model can be serialized.
# `pickle` here is cPickle (imported under that alias earlier in the notebook).
p = pickle.dumps(ngm)