In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
# Read in the movie reviews.
# They are stored under the "Movie Reviews/Reviews" directory:
#   1000 positive reviews in subdirectory pos
#   1000 negative reviews in subdirectory neg
import time  # to see how long it takes
import os    # for file handling in an OS-appropriate way

start_time = time.time()
labels = {'pos': 1, 'neg': 0}  # dictionary to have numeric sentiment

# NOTE(review): absolute, machine-specific location -- edit this one line to
# point at your own copy of the data. '%s' is filled with the class name.
path_template = '/Users/dewan/Dropbox/Documents/CLASS/CIS417/Data/Movie Reviews/Reviews/%s'

# Collect (text, label) pairs in a plain list and build the DataFrame once at
# the end; appending to a DataFrame inside the loop copies the whole frame on
# every iteration.
rows = []
for class_name in ('pos', 'neg'):  # positives are loaded first, then negatives
    path = path_template % class_name
    for file_name in os.listdir(path):  # every file in the class directory
        with open(os.path.join(path, file_name), 'r') as infile:
            rows.append((infile.read(), labels[class_name]))

# One row per review file; default integer index (it carries no meaning).
mr_raw = DataFrame(rows, columns=['review', 'sentiment'])

# Single pre-formatted string so the line prints identically on Python 2 and 3.
print("Elapsed time:  %.2f seconds" % (time.time() - start_time))

Elapsed time:  3.59 seconds


In [3]:
# Sanity check: (rows, columns) of the assembled frame -- one row per review
# file, two columns ('review', 'sentiment').
mr_raw.shape

(2000, 2)

In [4]:
# First rows: the 'pos' directory was read first, so these carry sentiment 1.
mr_raw.head()

Unnamed: 0,review,sentiment
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,""" jaws "" is a rare film that grabs your atten...",1
4,moviemaking is a lot like being the general ma...,1


In [5]:
# Last rows: the 'neg' directory was read last, so these carry sentiment 0.
mr_raw.tail()

Unnamed: 0,review,sentiment
1995,"if anything , "" stigmata "" should be taken as ...",0
1996,"john boorman's "" zardoz "" is a goofy cinematic...",0
1997,the kids in the hall are an acquired taste . \...,0
1998,there was a time when john carpenter was a gre...,0
1999,two party guys bob their heads to haddaway's d...,0


In [None]:
# pickle mr_raw so the directory scan does not have to be repeated
import pickle
# Context manager: the file is flushed and closed even if dump() raises.
with open("03 mr raw.pkl", "wb") as pkl_file:
    pickle.dump(mr_raw, pkl_file)
# read it in later
# (only unpickle files you created yourself: pickle.load can execute
#  arbitrary code embedded in an untrusted file)
# with open("03 mr raw.pkl", "rb") as pkl_file:
#     mr_raw = pickle.load(pkl_file)

In [6]:
# Pull the text and the labels out of the frame as arrays for scikit-learn.
# .loc is the label-based indexer; the older .ix indexer is deprecated and
# no longer exists in pandas >= 1.0.
X = mr_raw.loc[:, "review"].values   # .values returns arrays
y = mr_raw.loc[:, "sentiment"].values

In [7]:
from nltk.stem import SnowballStemmer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocessor(data):
    """Return `data` with every whitespace-separated token Snowball-stemmed.

    Parameters
    ----------
    data : str
        One raw document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; runs of whitespace
        (including newlines) in the input are collapsed by ``split()``.

    NOTE(review): per the scikit-learn documentation, passing a custom
    preprocessor to a vectorizer replaces its built-in lowercasing /
    accent-stripping stage -- confirm that is intended.
    """
    # Build the stemmer once per document rather than once per word.
    stem = SnowballStemmer("english").stem
    return " ".join([stem(word) for word in data.split()])

# Two-step pipeline: TF-IDF features ('vect') feeding a classifier ('clf').
# NOTE: despite the "lr_" prefix in the variable names, the classifier used
# here is Multinomial Naive Bayes.
lr_tfidf = Pipeline([
    ('vect', TfidfVectorizer(lowercase=True)),
    ('clf', MultinomialNB()),
])

# Vectorizer settings to search over (2 x 2 x 2 x 2 = 16 combinations).
# Keys are written as "<pipeline step>__<parameter name>".
param_grid = [
    {
        'vect__preprocessor': [None, preprocessor],  # default handling vs. stemming
        'vect__ngram_range': [(1,1),(1,2)],          # unigrams vs. unigrams + bigrams
        'vect__stop_words': ['english', None],       # drop English stop words or keep all
        'vect__use_idf': [False, True],              # False provides normalized counts
    },
]

# Exhaustive search, scored by accuracy with 5-fold cross-validation,
# using all available cores.
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [8]:
# Run the grid search: 16 parameter combinations x 5 CV folds = 80 fits
# (about 5.4 minutes with n_jobs=-1 in the recorded run below).
gs_lr_tfidf.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  66 out of  80 | elapsed:  4.0min remaining:   50.4s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  5.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1), (1, 2)], 'vect__use_idf': [False, True], 'vect__preprocessor': [None, <function preprocessor at 0x10d4e4500>], 'vect__stop_words': ['english', None]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='accuracy', verbose=1)

In [9]:
# Report the winning parameter combination and its cross-validated accuracy.
# For comparison: with R we had resub accuracy of 78% and test accuracy of 73%.
print('Best parameter set: %s ' % (gs_lr_tfidf.best_params_,))
print('CV Accuracy: %.3f' % (gs_lr_tfidf.best_score_,))

Best parameter set: {'vect__ngram_range': (1, 2), 'vect__use_idf': True, 'vect__preprocessor': None, 'vect__stop_words': None} 
CV Accuracy: 0.836


In [None]:
# if you had a separate test set, you can check the quality of model
# clf = gs_lr_tfidf.best_estimator_
# print('Test Accuracy: %.3f' % clf.score(X_test, y_test))