In [66]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import f1_score, confusion_matrix
import regex as re

In [45]:
sentiment = load_files('data/imdb1/', random_state=41)
sentiment.target_names

['neg', 'pos']

In [46]:
train_data, test_data, train_target, test_target = train_test_split(sentiment.data, sentiment.target, random_state=41)

In [47]:
vectorizer = CountVectorizer()
vectorized_train_data = vectorizer.fit_transform(train_data)

In [48]:
#Binary Word Counts
vectorizer1 = CountVectorizer(binary=True,lowercase=True)
vectorized_train_data1 = vectorizer1.fit_transform(train_data)

In [50]:
#Models
clf = MultinomialNB()
clf1 = MultinomialNB()
#Fitting Models
clf.fit(vectorized_train_data, train_target)
clf1.fit(vectorized_train_data1, train_target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [51]:
#Scoring Models
print 'Word Count Accuracy : {}'.format(clf.score(vectorizer.transform(test_data), test_target))
print clf1.score(vectorizer1.transform(test_data), test_target)

0.818
0.826


In [52]:

print confusion_matrix(test_target, clf.predict(vectorizer.transform(test_data)))
print confusion_matrix(test_target, clf1.predict(vectorizer1.transform(test_data)))

[[205  50]
 [ 41 204]]
[[207  48]
 [ 39 206]]


In [22]:
f1s = []
kf = KFold(len(sentiment.data), n_folds=10)
for train, test in kf:
    train_fold = [sentiment.data[i] for i in train]
    test_fold = [sentiment.data[i] for i in test]
    vectorized_train_data = vectorizer.fit_transform(train_fold)
    clf.fit(vectorized_train_data, sentiment.target[train])
    y_pred = clf.predict(vectorizer.transform(test_fold))
    f1 = f1_score(sentiment.target[test], y_pred)
    print f1
    f1s.append(f1)

0.819047619048
0.816143497758
0.85
0.782608695652
0.835978835979
0.78612716763
0.817777777778
0.788571428571
0.78021978022
0.820754716981


In [23]:
np.mean(f1s)

0.80972295196166522

In [24]:
vectorizer2 = CountVectorizer(stop_words = 'english')
clf2 = MultinomialNB()

f1s2 = []
for train, test in KFold(len(sentiment.data), n_folds=10):
    train_fold = [sentiment.data[i] for i in train]
    test_fold = [sentiment.data[i] for i in test]
    vectorized_train_data = vectorizer2.fit_transform(train_fold)
    clf2.fit(vectorized_train_data, sentiment.target[train])
    y_pred = clf2.predict(vectorizer2.transform(test_fold))
    f1 = f1_score(sentiment.target[test], y_pred)
    print f1
    f1s2.append(f1)

0.819047619048
0.821428571429
0.847290640394
0.778378378378
0.789189189189
0.78612716763
0.826666666667
0.784090909091
0.780748663102
0.816901408451


In [25]:
np.mean(f1s2)

0.80498692133777894

In [63]:
def negate_sequence(text):
    negation = False
    delims = "?.,!:;"
    exp = re.compile(r"(not\b|no\b|n't\b)")
    result = []
    words = text.split()
    prev = None
    pprev = None
    for word in words:
        stripped = word.strip(delims).lower()
        negated = "not_" + stripped if negation else stripped
        result.append(negated)

#         if any(neg in word for neg in ):
        if exp.match(word):
            negation = not negation

        if any(c in word for c in delims):
            negation = False

    return ' '.join(result)

In [64]:
sentiment = load_files('data/imdb1/', random_state=41)

negated_sentiment = [negate_sequence(i) for i in sentiment.data]

train_data, test_data, train_target, test_target = train_test_split(negated_sentiment, sentiment.target, random_state=41)
vectorizer2 = CountVectorizer(binary=True,lowercase=True)
vectorized_train_data2 = vectorizer2.fit_transform(train_data)

In [65]:
clf2 = MultinomialNB()
clf2.fit(vectorized_train_data2, train_target)
print clf2.score(vectorizer2.transform(test_data), test_target)
print confusion_matrix(test_target, clf2.predict(vectorizer2.transform(test_data)))

0.844
[[211  44]
 [ 34 211]]


In [69]:
clf3 = BernoulliNB()
vectorizer3 = CountVectorizer(binary=True,lowercase=True)
vectorized_train_data3 = vectorizer3.fit_transform(train_data)
clf3.fit(vectorized_train_data2, train_target)
print clf3.score(vectorizer3.transform(test_data), test_target)
print confusion_matrix(test_target, clf3.predict(vectorizer3.transform(test_data)))

0.802
[[222  33]
 [ 66 179]]


In [70]:
clf4 = BernoulliNB()
train_data, test_data, train_target, test_target = train_test_split(sentiment.data, sentiment.target, random_state=41)
vectorizer4 = CountVectorizer(binary=True,lowercase=True)
vectorized_train_data4 = vectorizer4.fit_transform(train_data)
clf4.fit(vectorized_train_data4, train_target)
print clf4.score(vectorizer4.transform(test_data), test_target)
print confusion_matrix(test_target, clf4.predict(vectorizer4.transform(test_data)))

0.798
[[220  35]
 [ 66 179]]
