## Load in basic prereqs and set up objects

In [1]:
from nltk import sent_tokenize, word_tokenize
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

In [2]:
# Bag-of-words vectorizer; tokenization is delegated to NLTK's word_tokenize
# instead of scikit-learn's built-in token pattern.
# NOTE(review): word_tokenize needs the NLTK tokenizer data to be installed —
# confirm it is available in the target environment.
cv = CountVectorizer(min_df=1, tokenizer=word_tokenize)

## Load in the data

In [3]:
def _load_reviews(path, label):
    """Read one review per line from `path` into a labelled DataFrame.

    Parameters
    ----------
    path : str
        Text file holding one review per line.
    label : str
        Value stored in the 'type' column for every row.

    Returns
    -------
    pandas.DataFrame
        Column 'review' holds each line with trailing whitespace stripped;
        column 'type' holds the constant `label`.
    """
    # The context manager closes the file handle deterministically once the
    # lines have been read.
    # NOTE(review): decoding relies on the platform default encoding —
    # confirm it matches the data files.
    with open(path) as handle:
        reviews = pd.DataFrame(
            [line.rstrip() for line in handle], columns=['review'])
    reviews['type'] = label
    return reviews


pos = _load_reviews("../data/rt-polaritydata/rt-polarity.pos", 'positive')
neg = _load_reviews("../data/rt-polaritydata/rt-polarity.neg", 'negative')

In [4]:
# Fraction of each class sampled into the training set; the remaining rows
# form the test set.
TRAIN_PROP = 0.8

In [5]:
# Fixed seeds make the train/test split identical on every Restart & Run All;
# without them each run samples different rows and the reported numbers drift.
SPLIT_SEED = 42
# Sample each class separately so both keep the same TRAIN_PROP share.
train_pos = pos.sample(frac=TRAIN_PROP, random_state=SPLIT_SEED)
# A distinct seed keeps the negative draw from reusing the positive draw's
# random stream.
train_neg = neg.sample(frac=TRAIN_PROP, random_state=SPLIT_SEED + 1)
train_reviews = pd.concat([train_pos, train_neg])

In [6]:
# The held-out rows of each class are whatever its training sample did not
# pick: drop the sampled index labels from the full frame.
test_pos, test_neg = (
    full.drop(train.index)
    for full, train in ((pos, train_pos), (neg, train_neg))
)
# Stack positives first, then negatives, mirroring the training set layout.
test_reviews = pd.concat([test_pos, test_neg])

In [7]:
# Learn the vocabulary from the training reviews only and encode them as a
# bag-of-words matrix; the test reviews are encoded later with the already
# fitted vectorizer.
train_bow = cv.fit_transform(train_reviews['review'])

Let's see the size of the vocabulary with no fancy transformations on the sentence:

In [8]:
# vocabulary_ maps each learned term to its column index, so its length is the
# vocabulary size. It is used instead of get_feature_names(), which newer
# scikit-learn releases no longer provide.
print(len(cv.vocabulary_))

18085


### Build a principal component analysis (PCA) object and then transform the BOW into a smaller dimension

In [9]:
# Number of principal components kept after the projection.
N_COMPONENTS = 1000
# Whitening rescales each projected component to unit variance. It does not
# control centering: per the scikit-learn docs, PCA always centers its input.
WHITEN = True
# Seed for the decomposition so repeated runs give the same projection.
# NOTE(review): for a matrix of this size scikit-learn may select its
# randomized SVD solver, which is not deterministic without a seed — confirm
# against the installed version.
PCA_SEED = 42
pca = PCA(n_components=N_COMPONENTS, whiten=WHITEN, random_state=PCA_SEED)

In [10]:
# The sparse bag-of-words matrix is densified before being handed to PCA.
# toarray() yields a plain ndarray, whereas todense() yields a numpy.matrix,
# which recent scikit-learn releases refuse as input.
train_vecs = pca.fit_transform(train_bow.toarray())

## Don't forget to get the bag of words for the test set!

In [11]:
# Encode the test reviews with the vectorizer and PCA fitted on the training
# data (transform only, no refitting). toarray() yields a plain ndarray,
# whereas todense() yields a numpy.matrix, which recent scikit-learn releases
# refuse as input.
test_vecs = pca.transform(
    cv.transform(test_reviews['review']).toarray())

In [12]:
# Display the PCA-projected test-set feature matrix.
test_vecs

array([[ 1.90726938, -2.35433578,  2.97862158, ...,  1.24696455,
        -0.13664707,  0.97219216],
       [-1.5266682 , -0.33614623, -1.30526253, ..., -0.20930712,
         0.003721  , -0.04018791],
       [-0.3852074 , -0.08889897, -0.44076016, ..., -0.26394594,
         0.119574  ,  0.55185054],
       ...,
       [ 1.99043076,  1.35390085, -1.07393365, ...,  0.86028029,
        -0.47860171, -0.55720906],
       [-0.81057942,  0.2947098 ,  0.11331359, ...,  0.36269267,
        -0.03196597, -0.01421735],
       [-0.83558672, -0.6724384 , -0.19066866, ..., -0.30272174,
         0.14110603, -0.00721906]])

## Train our classifier with the PCA-reduced (LSA-style) features and assess test accuracy

In [13]:
from sklearn.linear_model import LogisticRegressionCV

In [14]:
# Iteration cap passed to the logistic-regression solver.
MAX_ITER = 1000
# Cross-validated logistic regression; every other setting is left at its
# library default.
model = LogisticRegressionCV(max_iter=MAX_ITER)

In [15]:
# Fit on the PCA-projected training features; the 'type' column
# ('positive' / 'negative') is the target label.
model.fit(X=train_vecs, y=train_reviews['type'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=1000, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [16]:
# Score the fitted model on the held-out test features and labels.
model.score(X=test_vecs, y=test_reviews['type'])

0.7363977485928705