In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

In [3]:
# Two toy sentences used to show how ngram_range changes the vocabulary.
phrases = [
    "The quick brown fox jumped over the lazy dog",
    "education is what you have left over after forgetting everything you ever learnt",
]




In [4]:
# Unigrams only: ngram_range=(1, 1) keeps single tokens.
vect = CountVectorizer(ngram_range=(1, 1)).fit(phrases)
# vocabulary_ maps each term to its column index, so ordering the terms by
# that index yields the feature names without get_feature_names(), which has
# been removed from recent scikit-learn releases.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
print("Vocabulary size: {}".format(len(feature_names)))
print("Vocabulary:\n{}".format(feature_names))

Vocabulary size: 19
Vocabulary:
['after', 'brown', 'dog', 'education', 'ever', 'everything', 'forgetting', 'fox', 'have', 'is', 'jumped', 'lazy', 'learnt', 'left', 'over', 'quick', 'the', 'what', 'you']


In [5]:
# Trigrams only: ngram_range=(3, 3) keeps runs of exactly three tokens.
vect = CountVectorizer(ngram_range=(3, 3)).fit(phrases)
# vocabulary_ maps each term to its column index, so ordering the terms by
# that index yields the feature names without get_feature_names(), which has
# been removed from recent scikit-learn releases.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
print("Vocabulary size: {}".format(len(feature_names)))
print("Vocabulary:\n{}".format(feature_names))

Vocabulary size: 18
Vocabulary:
['after forgetting everything', 'brown fox jumped', 'education is what', 'everything you ever', 'forgetting everything you', 'fox jumped over', 'have left over', 'is what you', 'jumped over the', 'left over after', 'over after forgetting', 'over the lazy', 'quick brown fox', 'the lazy dog', 'the quick brown', 'what you have', 'you ever learnt', 'you have left']


In [6]:
# Unigrams, bigrams and trigrams together: ngram_range=(1, 3).
vect = CountVectorizer(ngram_range=(1, 3)).fit(phrases)
# vocabulary_ maps each term to its column index, so ordering the terms by
# that index yields the feature names without get_feature_names(), which has
# been removed from recent scikit-learn releases.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
print("Vocabulary size: {}".format(len(feature_names)))
print("Vocabulary:\n{}".format(feature_names))

Vocabulary size: 57
Vocabulary:
['after', 'after forgetting', 'after forgetting everything', 'brown', 'brown fox', 'brown fox jumped', 'dog', 'education', 'education is', 'education is what', 'ever', 'ever learnt', 'everything', 'everything you', 'everything you ever', 'forgetting', 'forgetting everything', 'forgetting everything you', 'fox', 'fox jumped', 'fox jumped over', 'have', 'have left', 'have left over', 'is', 'is what', 'is what you', 'jumped', 'jumped over', 'jumped over the', 'lazy', 'lazy dog', 'learnt', 'left', 'left over', 'left over after', 'over', 'over after', 'over after forgetting', 'over the', 'over the lazy', 'quick', 'quick brown', 'quick brown fox', 'the', 'the lazy', 'the lazy dog', 'the quick', 'the quick brown', 'what', 'what you', 'what you have', 'you', 'you ever', 'you ever learnt', 'you have', 'you have left']


In [7]:
# Tab-separated file of labelled reviews; later cells use its `review` and
# `sentiment` columns.
data = pd.read_csv("data/labeledTrainData.tsv", sep="\t")

In [8]:
def simple_split(data, y, length=None, split_mark=0.7):
    """Cut ``data`` and ``y`` at a single row position, without shuffling.

    Parameters
    ----------
    data : sliceable
        Samples; must support ``data[:n]`` / ``data[n:]`` and the slices must
        offer ``.copy()`` (lists, NumPy arrays, pandas objects all do).
    y : sliceable
        Targets aligned row-for-row with ``data``.
    length : int, optional
        Number of rows used to turn a fractional ``split_mark`` into a row
        count. Defaults to ``len(data)`` when omitted.
    split_mark : float or int, default 0.7
        Strictly between 0 and 1: fraction of ``length`` that goes to the
        training part (truncated towards zero). Any other non-negative value
        is taken as an absolute number of leading rows, so ``1.0`` means
        "one row", not "everything".

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, each an independent copy.

    Raises
    ------
    ValueError
        If ``split_mark`` is negative; that would otherwise be interpreted as
        a negative slice index and silently produce a misleading split.
    """
    if split_mark < 0:
        raise ValueError(
            "split_mark must be non-negative, got {!r}".format(split_mark)
        )
    if length is None:
        length = len(data)

    if 0.0 < split_mark < 1.0:
        # Fractional mark: scale by the row count.
        n = int(split_mark * length)
    else:
        # Absolute mark: use it directly as the cut position.
        n = int(split_mark)

    # Copies keep later edits to the parts from touching the inputs.
    X_train = data[:n].copy()
    X_test = data[n:].copy()
    y_train = y[:n].copy()
    y_test = y[n:].copy()
    return X_train, X_test, y_train, y_test

In [9]:
# Default split_mark: the leading 70% of rows form the training part and the
# remaining rows the test part; the sentiment column is the target.
d_train, d_test, y_train, y_test = simple_split(data, data.sentiment, len(data))
print(*(part.shape for part in (d_train, d_test, y_train, y_test)))

(17500, 3) (7500, 3) (17500,) (7500,)


In [10]:
# Search space; the prefixes are the step names make_pipeline derives from the
# lower-cased class names.
param_grid = {
    "logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

# Tf-idf features (min_df=5, no vector normalisation) feeding a logistic
# regression.
pipe = make_pipeline(
    TfidfVectorizer(min_df=5, norm=None),
    LogisticRegression(),
)

# 5-fold cross-validated search over every C / n-gram range combination,
# fitted on the training reviews only.
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(d_train.review, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))




Best cross-validation score: 0.90


In [11]:
# Parameter combination selected by the grid search.
best_params = grid.best_params_
print("Best parameters:\n{}".format(best_params))

Best parameters:
{'logisticregression__C': 0.001, 'tfidfvectorizer__ngram_range': (1, 3)}


In [12]:
# Rebuild the pipeline with the settings the grid search reported
# (C=0.001, 1- to 3-grams), fit it on the training reviews and score it on
# the held-out reviews.
pipe = make_pipeline(
    TfidfVectorizer(min_df=5, norm=None, ngram_range=(1, 3)),
    LogisticRegression(C=0.001),
)
pipe.fit(d_train.review, y_train)
for caption, reviews, labels in (
    ("Test set score: {:.3f}", d_test.review, y_test),
):
    print(caption.format(pipe.score(reviews, labels)))


Test set score: 0.900


In [18]:
# Confusion matrix of the logistic-regression pipeline on the held-out reviews.
# NOTE(review): the output saved with this cell sums to 17500 rows (the size of
# the training part, not the 7500 test rows) and its execution count is out of
# sequence, so it looks stale; re-run the notebook top to bottom to refresh it.
pred_logreg = pipe.predict(d_test.review)
confusion = confusion_matrix(y_true=y_test, y_pred=pred_logreg)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[8399  394]
 [ 372 8335]]


In [13]:
# Same tf-idf features as the logistic-regression pipeline, with a
# multinomial naive Bayes classifier on top.
pipe2 = make_pipeline(
    TfidfVectorizer(min_df=5, norm=None, ngram_range=(1, 3)),
    MultinomialNB(),
)
pipe2.fit(d_train.review, y_train)
# Score on the rows used for fitting, then on the held-out rows.
for caption, reviews, labels in (
    ("Train set score: {:.3f}", d_train.review, y_train),
    ("Test set score: {:.3f}", d_test.review, y_test),
):
    print(caption.format(pipe2.score(reviews, labels)))

Train set score: 0.973
Test set score: 0.890


In [14]:
# Confusion matrix of the naive Bayes pipeline on the held-out reviews.
pred_nb = pipe2.predict(d_test.review)
confusion = confusion_matrix(y_true=y_test, y_pred=pred_nb)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[3338  401]
 [ 425 3336]]


In [15]:
# Same tf-idf features, with a 1000-tree random forest on top.
# random_state is fixed so that a re-run of the notebook reproduces the same
# forest (and therefore the same scores and confusion matrix); without it the
# bootstrap samples and feature sub-sampling differ on every run.
pipe3 = make_pipeline(
    TfidfVectorizer(min_df=5, norm=None, ngram_range=(1, 3)),
    RandomForestClassifier(n_estimators=1000, n_jobs=4, random_state=0),
)
pipe3.fit(d_train.review, y_train)
# Score on the rows used for fitting, then on the held-out rows.
print("Train set score: {:.3f}".format(pipe3.score(d_train.review, y_train)))
print("Test set score: {:.3f}".format(pipe3.score(d_test.review, y_test)))


Train set score: 1.000
Test set score: 0.871


In [16]:
# Confusion matrix of the random-forest pipeline on the held-out reviews.
pred_rf = pipe3.predict(d_test.review)
confusion = confusion_matrix(y_true=y_test, y_pred=pred_rf)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[3204  535]
 [ 435 3326]]


In [17]:
# Same tf-idf features, with 500 rounds of gradient boosting on top.
# random_state is fixed so that a re-run of the notebook reproduces the same
# model; the estimator otherwise draws a fresh seed for its trees each run.
pipe4 = make_pipeline(
    TfidfVectorizer(min_df=5, norm=None, ngram_range=(1, 3)),
    GradientBoostingClassifier(n_estimators=500, random_state=0),
)
pipe4.fit(d_train.review, y_train)
# Score on the rows used for fitting, then on the held-out rows.
print("Train set score: {:.3f}".format(pipe4.score(d_train.review, y_train)))
print("Test set score: {:.3f}".format(pipe4.score(d_test.review, y_test)))


Train set score: 0.924
Test set score: 0.870


In [21]:
# Confusion matrix of the gradient-boosting pipeline on the held-out reviews.
pred_gb = pipe4.predict(d_test.review)
confusion = confusion_matrix(y_true=y_test, y_pred=pred_gb)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[3179  560]
 [ 413 3348]]


In [18]:
# Negated positive word: print the label each fitted pipeline (logistic
# regression, naive Bayes, random forest, gradient boosting) assigns.
review = ["This movie is not that good"]
for fitted in (pipe, pipe2, pipe3, pipe4):
    print(fitted.predict(review)[0])

0
0
1
1


In [19]:
# Negated negative word: print the label each fitted pipeline (logistic
# regression, naive Bayes, random forest, gradient boosting) assigns.
review = ["This movie is not that bad"]
for fitted in (pipe, pipe2, pipe3, pipe4):
    print(fitted.predict(review)[0])

0
0
1
0


In [20]:
# Several positive words followed by a negative verdict: print the label each
# fitted pipeline (logistic regression, naive Bayes, random forest, gradient
# boosting) assigns.
review = ["I was going to say something awesome or great or good, but I can't because the movie is so bad."]
for fitted in (pipe, pipe2, pipe3, pipe4):
    print(fitted.predict(review)[0])

0
0
0
1
