## Load in basic prereqs and set up objects

In [1]:
from nltk import sent_tokenize, word_tokenize
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Bag-of-words vectorizer that delegates tokenization to NLTK's word_tokenize.
# token_pattern is only consulted by the built-in regex tokenizer; setting it
# to None makes that explicit and avoids scikit-learn's "token_pattern will
# not be used since tokenizer is not None" UserWarning on recent versions.
cv = CountVectorizer(
    min_df=1,
    tokenizer=word_tokenize,
    token_pattern=None,
)

## Load in the data

In [3]:
# Read each polarity file into a one-column DataFrame ('review') and tag every
# row with its sentiment label in a 'type' column.
# The files are opened in `with` blocks so the handles are closed as soon as
# the lines have been read, instead of being left open for the kernel's lifetime.
# NOTE(review): the files are read with the platform default encoding, as
# before — confirm that matches the dataset's encoding.
with open("../data/rt-polaritydata/rt-polarity.pos") as pos_file:
    pos = pd.DataFrame(
        [line.rstrip() for line in pos_file],
        columns=['review'])
pos['type'] = 'positive'

with open("../data/rt-polaritydata/rt-polarity.neg") as neg_file:
    neg = pd.DataFrame(
        [line.rstrip() for line in neg_file],
        columns=['review'])
neg['type'] = 'negative'

In [4]:
# Fraction of each class's reviews drawn into the training set
# (passed as `frac` to DataFrame.sample when building the train split).
TRAIN_PROP = 0.8

In [5]:
# Seed for the train/test split. Without a fixed random_state every
# Restart & Run All draws a different sample, so the vocabulary size and the
# reported test accuracy would not be reproducible.
SPLIT_SEED = 42

# Sample the same proportion from each class so the training set keeps the
# class balance of the full data, then stack the two samples.
train_pos = pos.sample(frac=TRAIN_PROP, random_state=SPLIT_SEED)
train_neg = neg.sample(frac=TRAIN_PROP, random_state=SPLIT_SEED)
train_reviews = pd.concat([train_pos, train_neg])

In [6]:
# The test set is everything that was not sampled into the training set:
# drop the training rows (by index label) from each class, then stack them.
held_out = [
    pos.drop(index=train_pos.index),
    neg.drop(index=train_neg.index),
]
test_pos, test_neg = held_out
test_reviews = pd.concat(held_out)

In [7]:
# Learn the vocabulary from the training reviews only and build their
# document-term count matrix in the same pass.
train_texts = train_reviews['review']
train_bow = cv.fit_transform(train_texts)

Let's look at the size of the vocabulary when no additional transformations are applied to the sentences:

In [8]:
# vocabulary_ maps every learned token to its column index, so its length is
# the vocabulary size. Unlike get_feature_names() (deprecated in scikit-learn
# 1.0 and removed in 1.2), this attribute is available on every version.
print(len(cv.vocabulary_))

18211


## Don't forget to get the bag of words for the test set!

In [9]:
# Encode the test reviews with the vocabulary already fitted on the training
# set (transform only, no re-fitting).
test_texts = test_reviews['review']
test_bow = cv.transform(test_texts)

In [10]:
# Display the sparse matrix summary (shape, dtype, number of stored entries).
train_bow

<8530x18211 sparse matrix of type '<class 'numpy.int64'>'
	with 165345 stored elements in Compressed Sparse Row format>

In [11]:
# Preview the counts as a dense matrix.
# NOTE(review): this materialises the whole 8530 x 18211 int64 matrix
# (about 1.2 GB) just for display; consider slicing a few rows first,
# e.g. train_bow[:5].toarray(), if memory is tight.
train_bow.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

## Train our classifier with bag-of-words features

In [12]:
from sklearn.linear_model import LogisticRegressionCV

In [13]:
# Upper bound on solver iterations handed to the classifier below.
MAX_ITER = 1000

# Logistic regression with built-in cross-validation; every other
# hyperparameter is left at its library default.
model = LogisticRegressionCV(
    max_iter=MAX_ITER,
)

In [14]:
# Fit on the training bag-of-words counts against the sentiment labels.
train_labels = train_reviews['type']
model.fit(X=train_bow, y=train_labels)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=1000, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [15]:
# Score the fitted model on the held-out test reviews.
test_labels = test_reviews['type']
model.score(X=test_bow, y=test_labels)

0.7575046904315197