# Adam Brannigan

In [110]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Read in Data

In [246]:
#Read in data and display
df = pd.read_json('moviereviews/reviews.json')
unclassified = pd.read_json('moviereviews/unclassified.json')
unclassified = pd.Series(unclassified.data)
X = df.data
y = df.labels

In [247]:
df.head()
type(unclassified)

pandas.core.series.Series

In [248]:
df.tail()

Unnamed: 0,data,labels
780,"if anything , "" stigmata "" should be taken as ...",-1
781,"john boorman's "" zardoz "" is a goofy cinematic...",-1
782,the kids in the hall are an acquired taste . \...,-1
783,there was a time when john carpenter was a gre...,-1
784,two party guys bob their heads to haddaway's d...,-1


# Vectorizing The data

In [249]:
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 
#state=1 is just setting a seed so get same training set on all workbooks
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(588,)
(197,)
(588,)
(197,)


In [250]:
# instantiate vectorizer use stop words english to remove any uneseccery words
vect = CountVectorizer(stop_words='english',  max_df=0.3)
#fit(learn) the vocabulary of the training data
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [251]:
#check out the features
vect.get_feature_names()

['000',
 '007',
 '05',
 '10',
 '100',
 '1000',
 '10000',
 '101',
 '102',
 '103',
 '105',
 '106',
 '107',
 '108',
 '11',
 '110',
 '112',
 '114',
 '117',
 '12',
 '125',
 '126',
 '13',
 '131',
 '137',
 '138',
 '139',
 '13th',
 '14',
 '144',
 '14th',
 '15',
 '1521',
 '155',
 '16',
 '160',
 '1600',
 '1600s',
 '161',
 '1692',
 '16mm',
 '16x9',
 '17',
 '170',
 '175',
 '1799',
 '17th',
 '18',
 '180',
 '1800s',
 '1812',
 '1862',
 '1898',
 '1899',
 '18th',
 '19',
 '1900',
 '1900s',
 '1925',
 '1928',
 '1930',
 '1930s',
 '1932',
 '1933',
 '1937',
 '1938',
 '1939',
 '1940',
 '1940s',
 '1941',
 '1942',
 '1944',
 '1947',
 '1948',
 '1949',
 '1950',
 '1950s',
 '1954',
 '1956',
 '1957',
 '1959',
 '1960',
 '1960s',
 '1961',
 '1962',
 '1963',
 '1964',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1990s',
 '1

In [252]:
# create a document-term matrix from training data
X_train_dtmatrix = vect.transform(X_train)
X_train_dtmatrix

<588x23323 sparse matrix of type '<class 'numpy.int64'>'
	with 129854 stored elements in Compressed Sparse Row format>

In [253]:
# convert sparse matrix to a dense matrix
X_train_dtmatrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [254]:
#create dataframe with vocabulary and matrix together
pd.DataFrame(X_train_dtmatrix.toarray(), columns=vect.get_feature_names())

Unnamed: 0,000,007,05,10,100,1000,10000,101,102,103,...,zooming,zooms,zorg,zsigmond,zucker,zweibel,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [255]:
# create a document-term matrix from testing data
X_test_dtmatrix = vect.transform(X_test)
X_test_dtmatrix

<197x23323 sparse matrix of type '<class 'numpy.int64'>'
	with 38887 stored elements in Compressed Sparse Row format>

# Build and Evaluate the Model 
# MultinomialNB

In [256]:
# train the model using X_train_dtm and get the time
nb = MultinomialNB()
%time nb.fit(X_train_dtmatrix, y_train)
#evaluate score & time the model
%time print(nb.score(X_test_dtmatrix, y_test))

Wall time: 52 ms
0.766497461928934
Wall time: 3 ms


#  Logistic regression

In [257]:
lreg = LogisticRegression()
# train the model using X_train_dtm
%time lreg.fit(X_train_dtmatrix, y_train)
# calculate accuracy
%time print(lreg.score(X_test_dtmatrix, y_test))

Wall time: 112 ms
0.7817258883248731
Wall time: 0 ns


### Tuning the vectorizer
<p>After som tuning of the vectorizer I was able to improve the accuracy score on each model to 78%.
   <br> <B>MultinomialNB 
        <li>Defaults                    || 74% 
        <li>Stop_Words                   ||75%
        <li>Max_df = .5 & min_df3        ||78%
   <br> <B>LogisticRegression 
        <li>Defaults                    || 73% 
        <li>Stop_Words                   ||73%
        <li>Max_df = .3                  ||78%
            
<p>As MultinomialNB slightly outperformed on training time and only slightly worse on testing It was choosen .

# Classify Unlassified Data

In [342]:
# create a document-term matrix from Unlassified data
unclass_dtmatrix = vect.transform(unclassified)
unclass_dtmatrix

<15x23323 sparse matrix of type '<class 'numpy.int64'>'
	with 3570 stored elements in Compressed Sparse Row format>

In [343]:
#make prediction on unclassified data
unclass_predicted = nb.predict(unclass_dtmatrix)
unclass_predicted = pd.Series(unclass_predicted)

In [344]:
# Save the data and label series to a adtaframe and display predictions
data=[unclassified,unclass_predicted]
unclass_predictions = pd.concat(data,axis=1)
unclass_predictions.columns = ['Data', 'Labels']
unclass_predictions

Unnamed: 0,Data,Labels
0,"in 1912 , a ship set sail on her maiden voyage...",1
1,star wars : episode 1 - the phantom menace ( 1...,1
2,martin scorsese's triumphant adaptation of edi...,1
3,"over 40 years ago , a japanese production comp...",-1
4,it has been 20 years since a terrence malick f...,1
5,after watching the first ten minutes of this j...,1
6,'traffic violation' dr . daniel's review of u-...,-1
7,it is movies like these that make a jaded movi...,-1
8,instinct is the kind of movie that inexperienc...,-1
9,"when i saw the trailer for "" the sixth sense ,...",-1


In [345]:
y_pred_prob = nb.predict_proba(unclass_dtmatrix)[:, 1]
y_pred_prob

array([9.99989559e-01, 1.00000000e+00, 1.00000000e+00, 7.82174340e-36,
       1.00000000e+00, 1.00000000e+00, 3.89558320e-14, 6.56630157e-08,
       5.61378659e-03, 1.02692891e-10, 1.64839958e-39, 1.00000000e+00,
       1.18913117e-28, 2.42455628e-13, 1.42175397e-15])

In [351]:
y_test.shape
#unclass_dtmatrix.shape

(197,)

In [364]:
# calculate AUC score on sample of test data
nb.score(unclass_dtmatrix, y_test[45:60])

0.6666666666666666