## Load in basic prereqs and set up objects

In [1]:
from nltk import sent_tokenize, word_tokenize
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

In [2]:
cv = CountVectorizer(min_df=1, tokenizer=word_tokenize)

## Load in the data

In [3]:
pos = pd.DataFrame(
    [x.rstrip() for x
     in open("../data/rt-polaritydata/rt-polarity.pos").readlines()],
columns=['review'])
pos['type'] = 'positive'
neg = pd.DataFrame(
    [x.rstrip() for x
     in open("../data/rt-polaritydata/rt-polarity.neg").readlines()],
columns=['review'])
neg['type'] = 'negative'

In [4]:
TRAIN_PROP = 0.8

In [5]:
train_pos = pos.sample(frac=TRAIN_PROP)
train_neg = neg.sample(frac=TRAIN_PROP)
train_reviews = pd.concat([train_pos, train_neg])

In [6]:
test_pos = pos.drop(train_pos.index)
test_neg = neg.drop(train_neg.index)
test_reviews = pd.concat([test_pos, test_neg])

In [7]:
train_bow = cv.fit_transform(train_reviews['review'])

Let's see the size of the vocabulary with no fancy transformations on the sentence:

In [8]:
print(len(cv.get_feature_names()))

18177


### Create matrix of relevant word vectors

In [9]:
w2v_vecs = pd.np.load('../data/word_linear_sg_100d/words100.npy')
w2v_words = set([x.rstrip() for x in
    open('../data/word_linear_sg_100d/words100.vocab', 'r').readlines()])

In [10]:
pretrained_vecs = dict(zip(w2v_words, w2v_vecs))
w2v_words_to_index = {}
for word in cv.vocabulary_:
    index = cv.vocabulary_[word]
    if word in pretrained_vecs:
        w2v_words_to_index[index] = pretrained_vecs[word]
    else:
        w2v_words_to_index[index] = pd.np.zeros(100)
word_vecs = pd.DataFrame(w2v_words_to_index)[
    sorted(list(cv.vocabulary_.values()))].values.T

In [11]:
train_vecs = word_vecs.T.dot(train_bow.T.todense()).T

## Don't forget to get the bag of words for the test set!

In [12]:
test_bow = cv.transform(test_reviews['review'])
test_vecs = word_vecs.T.dot(test_bow.T.todense()).T

## Train our classifier with LSA features and assess test accuracy

In [13]:
from sklearn.linear_model import LogisticRegressionCV

In [14]:
MAX_ITER = 1000
model = LogisticRegressionCV(max_iter=MAX_ITER)

In [15]:
model.fit(X=train_vecs, y=train_reviews['type'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=1000, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [16]:
model.score(X=test_vecs, y=test_reviews['type'])

0.5834896810506567