In [57]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


In [2]:
phrases = ["The quick brown fox jumped over the lazy dog",
           "education is what you have left over after forgetting everything you ever learnt"]

In [3]:
vect = CountVectorizer()
vect.fit(phrases)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [4]:
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))

Vocabulary size: 19
Vocabulary content:
 {'the': 16, 'quick': 15, 'brown': 1, 'fox': 7, 'jumped': 10, 'over': 14, 'lazy': 11, 'dog': 2, 'education': 3, 'is': 9, 'what': 17, 'you': 18, 'have': 8, 'left': 13, 'after': 0, 'forgetting': 6, 'everything': 5, 'ever': 4, 'learnt': 12}


In [5]:
bag_of_words = vect.transform(phrases)

In [6]:
print(bag_of_words)

  (0, 1)	1
  (0, 2)	1
  (0, 7)	1
  (0, 10)	1
  (0, 11)	1
  (0, 14)	1
  (0, 15)	1
  (0, 16)	2
  (1, 0)	1
  (1, 3)	1
  (1, 4)	1
  (1, 5)	1
  (1, 6)	1
  (1, 8)	1
  (1, 9)	1
  (1, 12)	1
  (1, 13)	1
  (1, 14)	1
  (1, 17)	1
  (1, 18)	2


In [7]:
print("bag_of_words as an array:\n{}".format(bag_of_words.toarray()))

bag_of_words as an array:
[[0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 1 2 0 0]
 [1 0 0 1 1 1 1 0 1 1 0 0 1 1 1 0 0 1 2]]


In [13]:
vect.get_feature_names()

['after',
 'brown',
 'education',
 'ever',
 'everything',
 'forgetting',
 'fox',
 'have',
 'is',
 'jumped',
 'learnt',
 'left',
 'moon',
 'over',
 'the',
 'what',
 'you']

In [8]:
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t")

In [9]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [58]:
print("Samples per class: {}".format(np.bincount(data.sentiment)))

Samples per class: [12500 12500]


In [18]:
def simple_split(data,y,length,split_mark=0.7):
    if split_mark > 0. and split_mark < 1.0:
        n = int(split_mark*length)
    else:
        n = int(split_mark)
    X_train =  data[:n].copy()
    X_test =   data[n:].copy()
    y_train = y[:n].copy()
    y_test  = y[n:].copy()
    return X_train,X_test,y_train,y_test

In [26]:
vectorizer = CountVectorizer()

In [40]:
X_train,X_test,y_train,y_test = simple_split(data.review,data.sentiment,len(data))
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(17500,) (7500,) (17500,) (7500,)


In [60]:
print("Samples per class: {}".format(np.bincount(y_train)))
print("Samples per class: {}".format(np.bincount(y_test)))

Samples per class: [8761 8739]
Samples per class: [3739 3761]


In [41]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [43]:
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 19500 to 19530:\n{}".format(feature_names[19500:19530]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 65005
First 20 features:
['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']
Features 19500 to 19530:
['erroll', 'erroneous', 'erroneously', 'error', 'errors', 'erroy', 'errs', 'erruptions', 'ers', 'ersatz', 'erschbamer', 'erstwhile', 'erudite', 'erupt', 'erupted', 'erupting', 'eruption', 'eruptions', 'erupts', 'erwin', 'ery', 'erye', 'eréndira', 'es', 'esai', 'esau', 'escalate', 'escalated', 'escalates', 'escalating']
Every 2000th feature:
['00', 'airlift', 'association', 'bering', 'bronco', 'channeling', 'compounds', 'cusamano', 'digitized', 'dysfunctional', 'exclaiming', 'fluidity', 'girotti', 'harish', 'hushed', 'ioc', 'klass', 'loa', 'matter', 'moonwalks', 'notorious', 'parcel', 'polynesia', 'quincey', 'resourcefulness', 'satan', 'shovels', 'spells', 'superheros', 'thrower', 'ummm', 'viggo', 'works']


In [45]:
vectorizer.vocabulary_

{'with': 63728,
 'all': 2250,
 'this': 57865,
 'stuff': 55431,
 'going': 24332,
 'down': 17279,
 'at': 4082,
 'the': 57672,
 'moment': 37800,
 'mj': 37605,
 've': 61636,
 'started': 54657,
 'listening': 33896,
 'to': 58358,
 'his': 27044,
 'music': 38654,
 'watching': 62795,
 'odd': 40456,
 'documentary': 16869,
 'here': 26685,
 'and': 2756,
 'there': 57747,
 'watched': 62789,
 'wiz': 63780,
 'moonwalker': 37998,
 'again': 1818,
 'maybe': 36083,
 'just': 31182,
 'want': 62647,
 'get': 23765,
 'certain': 9833,
 'insight': 29452,
 'into': 29840,
 'guy': 25447,
 'who': 63352,
 'thought': 57914,
 'was': 62740,
 'really': 46724,
 'cool': 12754,
 'in': 28719,
 'eighties': 18394,
 'make': 35139,
 'up': 61140,
 'my': 38747,
 'mind': 37212,
 'whether': 63236,
 'he': 26290,
 'is': 30128,
 'guilty': 25297,
 'or': 40900,
 'innocent': 29371,
 'part': 42097,
 'biography': 6386,
 'feature': 21000,
 'film': 21398,
 'which': 63239,
 'remember': 47561,
 'see': 50884,
 'cinema': 10761,
 'when': 63222,
 '

In [46]:
i = 45000
j = 10
words = vectorizer.get_feature_names()[i:i+10]
pd.DataFrame(X_train[j:j+7,i:i+10].todense(), columns=words)

Unnamed: 0,producer,producer9and,producers,produces,producing,product,production,productions,productive,productively
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [47]:
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.88


In [48]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


Training set score: 0.999
Test set score: 0.879


In [49]:
pred_logreg = logreg.predict(X_test)
confusion = confusion_matrix(y_test, pred_logreg)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[3279  460]
 [ 451 3310]]


In [50]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
print("Training set score: {:.3f}".format(nb.score(X_train, y_train)))
print("Test set score: {:.3f}".format(nb.score(X_test, y_test)))



Training set score: 0.908
Test set score: 0.845


In [51]:
pred_nb = nb.predict(X_test)
confusion = confusion_matrix(y_test, pred_nb)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[3275  464]
 [ 702 3059]]


In [52]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print("Training set score: {:.3f}".format(rf.score(X_train, y_train)))
print("Test set score: {:.3f}".format(rf.score(X_test, y_test)))

Training set score: 0.993
Test set score: 0.744


In [53]:
review = "This movie is not that good"
print(logreg.predict(vectorizer.transform([review]))[0])

print(rf.predict(vectorizer.transform([review]))[0])

print(nb.predict(vectorizer.transform([review]))[0])


1
1
0


In [54]:
review = "This movie is not that bad"
print(logreg.predict(vectorizer.transform([review]))[0])

print(rf.predict(vectorizer.transform([review]))[0])

print(nb.predict(vectorizer.transform([review]))[0])


0
0
0


In [55]:
review = "I was going to say something awesome or great or good, but I can't because the movie is so bad."
print(logreg.predict(vectorizer.transform([review]))[0])

print(rf.predict(vectorizer.transform([review]))[0])

print(nb.predict(vectorizer.transform([review]))[0])


1
0
0


In [56]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross-validation score: 0.88
Best parameters:  {'C': 0.1}
