In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from time import time

In [2]:
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t")

In [3]:
print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
print("Every 10th stopword:\n{}".format(list(ENGLISH_STOP_WORDS)[::10]))

Number of stop words: 318
Every 10th stopword:
['is', 'we', 'mostly', 'i', 'seem', 'may', 'least', 'five', 'amongst', 'find', 'fifty', 'moreover', 'whatever', 'or', 'namely', 'former', 'be', 'them', 'perhaps', 'in', 'too', 'sixty', 'whence', 'de', 'anyhow', 'first', 'else', 'those', 'give', 'and', 'throughout', 'latterly']


In [4]:
def simple_split(data,y,length,split_mark=0.7):
    if split_mark > 0. and split_mark < 1.0:
        n = int(split_mark*length)
    else:
        n = int(split_mark)
    X_train =  data[:n].copy()
    X_test =   data[n:].copy()
    y_train = y[:n].copy()
    y_test  = y[n:].copy()
    return X_train,X_test,y_train,y_test

In [5]:
vectorizer = CountVectorizer(min_df=5, stop_words="english")

In [6]:
d_train,d_test,y_train,y_test = simple_split(data,data.sentiment,len(data))
print(d_train.shape,d_test.shape,y_train.shape,y_test.shape)

(17500, 3) (7500, 3) (17500,) (7500,)


In [7]:
X_train = d_train.review
X_test = d_test.review
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [8]:
scores = cross_val_score(LogisticRegression(), X_train_vect, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))



Mean cross-validation accuracy: 0.87


In [9]:
logreg = LogisticRegression()
logreg.fit(X_train_vect, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train_vect, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test_vect, y_test)))


Training set score: 0.999
Test set score: 0.871


In [10]:
start = time()
pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None),
LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(d_train.review, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print('GridSearchCV took {} minutes'.format((time() - start)/60.))



Best cross-validation score: 0.89
GridSearchCV took 2.878620119889577 minutes


In [11]:
tfidf = grid.best_estimator_.named_steps["tfidfvectorizer"]


In [12]:
X_train_tfidf = tfidf.transform(d_train.review)

In [13]:
X_train_tfidf.shape

(17500, 22990)

In [14]:
print(X_train_tfidf)

  (0, 22901)	3.1953727427163816
  (0, 22754)	2.1251307485790303
  (0, 22725)	4.482154740974373
  (0, 22665)	8.824103152080845
  (0, 22647)	5.4360816627251864
  (0, 22525)	2.8686359473435417
  (0, 22513)	7.243652776519996
  (0, 22510)	6.473708987361189
  (0, 22505)	5.377464300512746
  (0, 22464)	6.574046362895457
  (0, 22462)	4.445654338754847
  (0, 22453)	6.039048352537808
  (0, 22411)	4.403113631564757
  (0, 22307)	2.8850603553763436
  (0, 22303)	7.074560329917086
  (0, 22285)	2.872787825999607
  (0, 22239)	4.077929558629529
  (0, 22236)	4.0378025946689515
  (0, 22235)	3.080642193606467
  (0, 22100)	5.682416965903773
  (0, 21895)	5.681053555643737
  (0, 21812)	4.354916341964563
  (0, 21749)	2.055445750299673
  (0, 21625)	4.602496810247816
  (0, 21272)	5.3450632836547545
  :	:
  (17499, 10052)	3.969843232833958
  (17499, 9497)	4.683221730226808
  (17499, 9027)	2.373632729936668
  (17499, 8877)	3.9131496132153023
  (17499, 8833)	2.7835083625821624
  (17499, 8691)	5.349478301863871
  (17

In [15]:
m = X_train_tfidf.max(axis=0)

In [16]:
print(m)

  (0, 0)	19.9286367482732
  (0, 1)	34.398364192335166
  (0, 2)	16.570213302696317
  (0, 3)	8.978253831908102
  (0, 4)	15.548562055164332
  (0, 5)	15.994849157792752
  (0, 6)	8.372118028337788
  (0, 7)	17.145577447599877
  (0, 8)	8.372118028337788
  (0, 9)	8.690571759456322
  (0, 10)	8.824103152080845
  (0, 11)	45.25228546877725
  (0, 12)	55.98824761330999
  (0, 13)	14.805434942299367
  (0, 14)	14.212903310013022
  (0, 15)	24.61519183102386
  (0, 16)	16.744236056675575
  (0, 17)	17.956507663816204
  (0, 18)	8.978253831908102
  (0, 19)	8.467428208142112
  (0, 20)	60.079201641084474
  (0, 21)	8.372118028337788
  (0, 22)	15.994849157792752
  (0, 23)	21.710716196685386
  (0, 24)	8.467428208142112
  :	:
  (0, 22965)	8.978253831908102
  (0, 22966)	35.91301532763241
  (0, 22967)	17.145577447599877
  (0, 22968)	8.690571759456322
  (0, 22969)	8.824103152080845
  (0, 22970)	16.570213302696317
  (0, 22971)	211.6857052035528
  (0, 22972)	64.49570480027158
  (0, 22973)	15.45098172682547
  (0, 22974)

In [17]:
m.toarray()

array([[19.92863675, 34.39836419, 16.5702133 , ...,  8.82410315,
        16.93485642, 25.40228462]])

In [18]:
tfidf_max_value_sorted = m.toarray().ravel().argsort()

In [19]:
feature_names = np.array(tfidf.get_feature_names())
print("Features with lowest tfidf:\n{}".format(
feature_names[tfidf_max_value_sorted[:20]]))
print("Features with highest tfidf: \n{}".format(
feature_names[tfidf_max_value_sorted[-20:]]))

Features with lowest tfidf:
['touches' 'poignant' 'briefly' 'importantly' 'root' 'instantly' 'wonders'
 'scripted' 'lacked' 'currently' 'suited' 'undoubtedly' 'highest'
 'disagree' 'draws' 'nearby' 'occurred' 'altogether' 'jealous' 'uneven']
Features with highest tfidf: 
['gypo' 'luzhin' 'ripley' 'kornbluth' 'paulie' 'blob' 'homer' 'taker'
 'dillinger' 'coop' 'vargas' 'gadget' 'dominick' 'jesse' 'bridget' 'the'
 'victor' 'victoria' 'zizek' 'timon']


In [20]:
sorted_by_idf = np.argsort(tfidf.idf_)
print("Features with lowest idf:\n{}".format(
feature_names[sorted_by_idf[:100]]))

Features with lowest idf:
['the' 'and' 'of' 'to' 'this' 'is' 'it' 'in' 'that' 'but' 'for' 'with'
 'was' 'as' 'on' 'movie' 'not' 'br' 'have' 'one' 'be' 'film' 'are' 'you'
 'all' 'at' 'an' 'by' 'from' 'so' 'like' 'who' 'they' 'there' 'if' 'his'
 'just' 'about' 'out' 'he' 'or' 'has' 'what' 'can' 'some' 'good' 'when'
 'more' 'time' 'very' 'up' 'only' 'even' 'no' 'my' 'would' 'see' 'really'
 'which' 'story' 'well' 'had' 'me' 'than' 'much' 'their' 'were' 'get'
 'other' 'do' 'been' 'most' 'her' 'don' 'also' 'into' 'first' 'made' 'how'
 'because' 'great' 'will' 'people' 'make' 'way' 'bad' 'we' 'could' 'any'
 'after' 'too' 'then' 'them' 'she' 'think' 'watch' 'acting' 'movies'
 'seen' 'its']


In [21]:
X_test_tfidf = tfidf.transform(d_test.review)

In [22]:
logreg = grid.best_estimator_.named_steps["logisticregression"]
print("Test set score: {:.3f}".format(logreg.score(X_test_tfidf, y_test)))

Test set score: 0.892


In [23]:
pred_logreg = logreg.predict(X_test_tfidf)
confusion = confusion_matrix(y_test, pred_logreg)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[3315  424]
 [ 383 3378]]


In [24]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
print("Training set score: {:.3f}".format(nb.score(X_train_tfidf, y_train)))
print("Test set score: {:.3f}".format(nb.score(X_test_tfidf, y_test)))



Training set score: 0.902
Test set score: 0.839


In [25]:
pred_nb = nb.predict(X_test_tfidf)
confusion = confusion_matrix(y_test, pred_nb)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[3249  490]
 [ 717 3044]]


In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000,n_jobs=6)
rf.fit(X_train_tfidf, y_train)
print("Training set score: {:.3f}".format(rf.score(X_train_tfidf, y_train)))
print("Test set score: {:.3f}".format(rf.score(X_test_tfidf, y_test)))

Training set score: 1.000
Test set score: 0.864


In [27]:
review = "This movie is not that good"
print(logreg.predict(tfidf.transform([review]))[0])

print(rf.predict(tfidf.transform([review]))[0])

print(nb.predict(tfidf.transform([review]))[0])


1
1
0


In [28]:
review = "This movie is not that bad"
print(logreg.predict(tfidf.transform([review]))[0])

print(rf.predict(tfidf.transform([review]))[0])

print(nb.predict(tfidf.transform([review]))[0])


0
0
0


In [29]:
review = "I was going to say something awesome or great or good, but I can't because the movie is so bad."
print(logreg.predict(tfidf.transform([review]))[0])

print(rf.predict(tfidf.transform([review]))[0])

print(nb.predict(tfidf.transform([review]))[0])


1
1
0
