In [1]:
from nltk import sent_tokenize, word_tokenize
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import download as nltk_download

In [2]:
pos = pd.DataFrame(
    [x.rstrip() for x
     in open("../data/rt-polaritydata/rt-polarity.pos").readlines()],
columns=['review'])
pos['type'] = 'positive'
neg = pd.DataFrame(
    [x.rstrip() for x
     in open("../data/rt-polaritydata/rt-polarity.neg").readlines()],
columns=['review'])
neg['type'] = 'negative'

In [3]:
TRAIN_PROP = 0.8
train_pos = pos.sample(frac=TRAIN_PROP)
train_neg = neg.sample(frac=TRAIN_PROP)
train_reviews = pd.concat([train_pos, train_neg])
test_pos = pos.drop(train_pos.index)
test_neg = neg.drop(train_neg.index)
test_reviews = pd.concat([test_pos, test_neg])

### TF-IDF first requires calculation of the inverse document frequency
#### N / log(df(w_i))

In [4]:
cv = CountVectorizer(min_df=1, tokenizer=word_tokenize)
train_bow = cv.fit_transform(train_reviews['review'])
train_tfidf = cv.transform(train_reviews['review'])

In [5]:
N = train_bow.shape[0]
train_tfidf[train_tfidf >= 1] = 1

In [6]:
idf = N / (train_tfidf.sum(axis=0) + 1)

In [7]:
idf

matrix([[  99.18604651, 2132.5       ,  775.45454545, ..., 4265.        ,
         4265.        , 4265.        ]])

In [8]:
train_tfidf = train_bow.multiply(idf)

In [9]:
train_tfidf.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
np.log(idf)

matrix([[4.59699734, 7.66505028, 6.65344937, ..., 8.35819746, 8.35819746,
         8.35819746]])

## Train our classifier with tfidf features

In [11]:
from sklearn.linear_model import LogisticRegressionCV

MAX_ITER = 1000
model = LogisticRegressionCV(max_iter=MAX_ITER)
model.fit(X=train_tfidf, y=train_reviews['type'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=1000, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

### Now, generate tf-idf features for test bag-of-words

In [12]:
test_tfidf = cv.transform(test_reviews['review']).multiply(idf)

In [13]:
model.score(X=test_tfidf, y=test_reviews['type'])

0.7049718574108818