In [1]:
import codecs
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
with open("data/imdb.pickle.z", 'rb') as f:
    compressed_data = f.read()

uncompressed_data = codecs.decode(compressed_data, 'zlib_codec')
imdb_data = pickle.loads(uncompressed_data)

In [3]:
imdb_data.keys()

dict_keys(['DESCR', 'test_corpus', 'train_corpus', 'y_test', 'y_train'])

In [4]:
print(imdb_data['DESCR'])

IMDB reviews dataset. Binary classification (0 = negative, 1 = positive) 25K in train, 25K in test. Data source: http://ai.stanford.edu/~amaas/data/sentiment/


In [5]:
print(imdb_data['train_corpus'][7])

I don't know who to blame, the timid writers or the clueless director. It seemed to be one of those movies where so much was paid to the stars (Angie, Charlie, Denise, Rosanna and Jon) that there wasn't enough left to really make a movie. This could have been very entertaining, but there was a veil of timidity, even cowardice, that hung over each scene. Since it got an R rating anyway why was the ubiquitous bubble bath scene shot with a 70-year-old woman and not Angie Harmon? Why does Sheen sleepwalk through potentially hot relationships WITH TWO OF THE MOST BEAUTIFUL AND SEXY ACTRESSES in the world? If they were only looking for laughs why not cast Whoopi Goldberg and Judy Tenuta instead? This was so predictable I was surprised to find that the director wasn't a five year old. What a waste, not just for the viewers but for the actors as well.


In [6]:
tp = r"(?u)\b[\w\'/]+\b"
vect = CountVectorizer(token_pattern=tp, lowercase=True, ngram_range=(1, 1), min_df=5, binary=True)

In [7]:
X_train = vect.fit_transform(imdb_data['train_corpus'])

In [8]:
X_train.shape

(25000, 28283)

In [13]:
feature_names = np.asarray(vect.get_feature_names())

In [17]:
X_test = vect.transform(imdb_data['test_corpus'])

In [18]:
y_train = imdb_data['y_train']
y_test = imdb_data['y_test']

In [19]:
from sklearn.feature_selection import mutual_info_classif

In [21]:
mis = mutual_info_classif(X_train, y_train, discrete_features=True)

In [23]:
inds = np.argsort(mis)[::-1]

for i in inds[:20]:
    print("%s \t %0.3f" %(feature_names[i], mis[i]))

bad 	 0.039
worst 	 0.036
waste 	 0.024
awful 	 0.022
great 	 0.020
excellent 	 0.015
terrible 	 0.014
worse 	 0.013
stupid 	 0.013
wonderful 	 0.013
boring 	 0.012
no 	 0.011
horrible 	 0.011
nothing 	 0.011
poor 	 0.010
best 	 0.010
crap 	 0.010
minutes 	 0.009
perfect 	 0.009
poorly 	 0.009


In [24]:
from sklearn.naive_bayes import BernoulliNB

In [25]:
clf = BernoulliNB()

In [26]:
clf.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [27]:
clf.score(X_test, y_test)

0.8328

In [28]:
clf.feature_log_prob_

array([[-4.66295929, -5.52162091, -5.58349631, ..., -3.62250292,
        -7.13105882, -6.86869455],
       [-5.822726  , -9.43364391, -5.93713635, ..., -3.66532291,
        -7.824206  , -5.69597429]])

In [29]:
tw = clf.feature_log_prob_[0] - clf.feature_log_prob_[1]

In [30]:
inds = np.argsort(tw)

In [31]:
for i in inds[:10]:
    print(feature_names[i], tw[i])

edie -3.713572066704308
7/10 -3.316780039849572
mcintire -3.218875824868201
din -3.218875824868201
gunga -3.1780538303479453
tsui -3.1780538303479453
antwone -3.1780538303479453
8/10 -3.0633909220278053
quibble -3.0445224377234226
sabu -3.0445224377234226


In [33]:
for i in inds[::-1][:10]:
    print(i, feature_names[i], tw[i])

183 2/10 4.102643365036796
1 0/10 3.9120230054281464
248 4/10 3.5263605246161616
223 3/10 3.508555899982655
26789 uwe 3.349904087274605
15488 manos 3.3322045101752042
2992 boll 3.3322045101752042
12040 hobgoblins 3.1354942159291497
13906 kareena 3.091042453358316
23041 slater 3.091042453358316


In [38]:
freq = np.sum(X_train, axis=0)
freq.A1.shape

(28283,)

In [39]:
freq = freq.A1

In [40]:
freq[26789]

57

In [43]:
clf.feature_count_[0][26789]

56.0

In [44]:
clf.feature_count_[1][26789]

1.0

In [45]:
clf.class_count_

array([12500., 12500.])

In [46]:
np.log(57/2)

3.349904087274605

In [47]:
tw[26789]

3.349904087274605

In [49]:
fw[26789]

-0.004409708488700152

In [50]:
test_freq = np.sum(X_test, axis=0)
test_freq = test_freq.A1
test_freq[26789]

37

In [51]:
f_flp = np.log(1 - np.exp(clf.feature_log_prob_))
fw = f_flp[0] - f_flp[1]

In [52]:
inds = np.argsort(fw)

In [53]:
for i in inds[:10]:
    print(feature_names[i], fw[i])

this -0.3864571440629736
bad -0.3067256092399112
to -0.30489420133288414
the -0.2988553730499781
i -0.26543646350446326
movie -0.25638760989336107
no -0.21106827640831793
was -0.20910609163862004
just -0.18518968825741583
even -0.1769056828209219


In [54]:
for i in inds[::-1][:10]:
    print(feature_names[i], fw[i])

and 0.32179606614928247
great 0.22945363222121803
is 0.2088301632951297
as 0.15477796683162204
best 0.1414586899087999
very 0.13517175912206814
well 0.1272317971276138
also 0.12473679462096474
love 0.11913853194103155
his 0.1132547508867996


In [56]:
b = clf.class_log_prior_[0] - clf.class_log_prior_[1]
b

0.0

## Empty Document

In [58]:
lr = b + np.sum(fw)
lr

-1.501305882003622

In [59]:
np.exp(lr) / (1+np.exp(lr))

0.18223083690589836

In [66]:
empty_doc = ['']
x_empty_doc = vect.transform(empty_doc)

In [67]:
clf.predict_proba(x_empty_doc)

array([[0.18223084, 0.81776916]])

## Increase min_df

In [68]:
tp = r"(?u)\b[\w\'/]+\b"
vect = CountVectorizer(token_pattern=tp, lowercase=True, ngram_range=(1, 1), min_df=100, binary=True)

In [69]:
X_train = vect.fit_transform(imdb_data['train_corpus'])
X_test = vect.transform(imdb_data['test_corpus'])
feature_names = np.asarray(vect.get_feature_names())

In [70]:
X_train.shape

(25000, 3898)

In [71]:
clf.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [72]:
clf.score(X_test, y_test)

0.84524

In [73]:
tw = clf.feature_log_prob_[0] - clf.feature_log_prob_[1]
inds = np.argsort(tw)

In [74]:
for i in inds[:10]:
    print(i, feature_names[i], tw[i])

50 7/10 -3.316780039849572
54 8/10 -3.0633909220278053
59 9/10 -2.6741486494265283
6 10/10 -2.5320245727650708
1359 flawless -2.4336133554004498
3332 superbly -2.3116349285139632
2501 perfection -2.181224235989778
538 captures -2.0204432009178808
3835 wonderfully -2.0130995935431146
2774 refreshing -1.8951950564762976


In [75]:
for i in inds[::-1][:10]:
    print(i, feature_names[i], tw[i])

23 2/10 4.102643365036796
39 4/10 3.5263605246161616
36 3/10 3.508555899982655
3242 stinker 2.9856819377004893
3646 unwatchable 2.9856819377004893
1763 incoherent 2.9061201148643034
3 1/10 2.8200552594787043
2266 mst3k 2.7269186854065923
3626 unfunny 2.6164830075767194
3740 waste 2.5323731870269244


In [77]:
f_flp = np.log(1 - np.exp(clf.feature_log_prob_))
fw = f_flp[0] - f_flp[1]
inds = np.argsort(fw)

In [78]:
for i in inds[:10]:
    print(i, feature_names[i], tw[i])

3465 this 0.0398445002541834
318 bad 1.0810442692719002
3506 to 0.019679469704074748
3434 the 0.00250050543677105
1717 i 0.07902676054863811
2259 movie 0.1637615471528644
2332 no 0.4371426562149239
3737 was 0.11408757773529388
1896 just 0.25410577691817693
1168 even 0.3510252113966139


In [79]:
for i in inds[::-1][:10]:
    print(i, feature_names[i], tw[i])

192 and -0.011090992684172463
1525 great -0.7040970638295319
1836 is -0.023899835627466004
263 as -0.08515251986732686
385 best -0.5974627223168865
3678 very -0.25547316908216366
3773 well -0.29690444184241827
172 also -0.36141304177351596
2073 love -0.5642988095268091
1651 his -0.15235175289500624


In [80]:
lr = b + np.sum(fw)
lr

-4.070899985689966

In [81]:
np.exp(lr) / (1+np.exp(lr))

0.01677579693254762