In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

## Load Data

In [2]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

In [3]:
negids[0:10]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt']

In [4]:
negfeats = [(movie_reviews.words(fileids=[f])) for f in negids]

In [5]:
negfeats[0].count

<bound method AbstractLazySequence.count of ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]>

In [6]:
documents = [(' '.join(list(movie_reviews.words(fileid))), category) \
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [7]:
documents = pd.DataFrame(documents, columns=['review', 'category'])

## Train Test Split

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(documents.review.values,\
                                                    documents.category.values, test_size=0.25)

In [9]:
X_train.shape

(1500,)

In [10]:
np.unique(Y_test, return_counts=True)

(array(['neg', 'pos'], dtype=object), array([226, 274]))

In [11]:
print('train on %d instances, test on %d instances' % (X_train.shape[0], X_test.shape[0]))

train on 1500 instances, test on 500 instances


## Encode Features - Bag of Words/Unigram

In [12]:
count_vec = CountVectorizer(lowercase=True, ngram_range=(1,1), stop_words='english')

In [13]:
X_train = count_vec.fit_transform(X_train)

## Classifier

In [14]:
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Train

In [15]:
classifier.fit(X=X_train, y=Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Prediction

In [16]:
predictions = classifier.predict(X=count_vec.transform(X_test))

In [17]:
predictions_probs = classifier.predict_proba(X=count_vec.transform(X_test))

In [28]:
print("Test Accuracy: ", accuracy_score(y_pred=predictions, y_true=Y_test))

Test Accuracy:  0.828


In [19]:
predictions_probs[0:10]

array([[9.18668337e-03, 9.90813317e-01],
       [3.92922068e-02, 9.60707793e-01],
       [1.69495111e-01, 8.30504889e-01],
       [2.40983729e-02, 9.75901627e-01],
       [8.94275268e-03, 9.91057247e-01],
       [3.95475270e-03, 9.96045247e-01],
       [1.24444797e-02, 9.87555520e-01],
       [9.92718301e-01, 7.28169851e-03],
       [1.39779662e-01, 8.60220338e-01],
       [5.69450921e-04, 9.99430549e-01]])

In [20]:
Y_train[0:10]

array(['pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg',
       'neg'], dtype=object)

In [21]:
predictions[0:10]

array(['pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos',
       'pos'], dtype=object)

In [29]:
print("Test AUC ROC ", roc_auc_score(y_score=predictions_probs[:, 0], y_true=Y_test=='neg'))

Test AUC ROC  0.904302047671339


## Visualizing Influential Words

In [24]:
sorted_index = np.argsort(classifier.coef_)

### Negative

In [25]:
np.array(count_vec.get_feature_names())[sorted_index][0, 0:20]

array(['bad', 'worst', 'boring', 'supposed', 'script', 'plot',
       'unfortunately', 'waste', 'reason', 'poor', 'stupid', 'guess',
       'write', 'looks', 'awful', 'attempt', 'terrible', 'women',
       'ridiculous', 'wasted'], dtype='<U58')

### Positve

In [26]:
np.array(count_vec.get_feature_names())[np.flip(sorted_index)][0, 0:20]

array(['hilarious', 'fun', 'memorable', 'great', 'different', 'pace',
       'quite', 'terrific', 'things', 'true', 'especially', 'excellent',
       'town', 'gives', 'american', 'bit', 'people', 'perfectly', 'frank',
       'entertaining'], dtype='<U58')