# Sentiment Analysis on Movie Reviews

First we import all the required libraries. There are quite a few of them, but we are going to try out many algorithms, so it makes sense.

In [1]:
import tarfile
import time
import numpy
import os.path
import urllib.request
import sklearn.datasets
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

If the data set is not present on the filesystem, we download and extract it; then we load it into memory.

In [2]:
# Make sure the local data directory exists before anything is written into it.
if not os.path.isdir('data/'):
    os.makedirs('data/')

# Fetch and unpack the corpus only when the extracted folder is missing.
if not os.path.isdir('data/txt_sentoken'):
    # Download the archive unless a copy is already on disk.
    if not os.path.isfile('data/review_polarity.tar.gz'):
        urllib.request.urlretrieve('http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz', filename='data/review_polarity.tar.gz')
    # Extract whether the archive was just downloaded or was already present:
    # an archive left over from an interrupted run must still be unpacked,
    # otherwise load_files() below has nothing to read.
    # NOTE(review): the archive comes over plain HTTP and extractall() trusts
    # the member paths it contains -- only run this against a trusted source.
    with tarfile.open(name='data/review_polarity.tar.gz') as archive:
        archive.extractall(path='data/')

# Load the reviews (one sub-folder per class) with a fixed shuffle seed.
reviews_data = sklearn.datasets.load_files('data/txt_sentoken', random_state=0)

We will keep 20% of the data as a test set and use the remaining 80% to train and validate the algorithms.

In [3]:
# Hold out 20% of the reviews as the test set; the remaining 80% is used for
# training and model selection. The seed is fixed (instead of None) so that the
# split, and therefore every score reported further down, is the same after a
# kernel restart.
reviews_data_train, reviews_data_test, target_train, target_test = train_test_split(
    reviews_data.data,
    reviews_data.target,
    test_size=0.20,
    random_state=0)

Now we can start testing out algorithms. We will begin with Naive Bayes, a good baseline for tf-idf features.

We also append the best-scoring parameters of each algorithm to a file for reference.

In [4]:
#Create a pipeline that performs the vectorization and then executes the ML algorithm
reviews_classifier_NB = Pipeline([('tfidf', TfidfVectorizer()),
                                  ('clf', MultinomialNB())])

#Create a dictionary of algorithm parameters to try out
parameters_NB = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'clf__alpha': (0.00001, 0.0001, 0.001, 0.01, 0.1, 1),
                 'clf__fit_prior': (True, False)}

#Define a grid search object that will execute the pipeline for all parameter combinations
gs_NB = GridSearchCV(reviews_classifier_NB, parameters_NB, n_jobs=-1, verbose=1)

#Fit the models to the data (and count time required)
start_time = time.time()
gs_NB = gs_NB.fit(reviews_data_train, target_train)
elapsed_time = time.time() - start_time

#Print results
print("Multinomial Naive Bayes results:")
print("Best score: %f" % gs_NB.best_score_)
print("Best parameters: %r" % gs_NB.best_params_)
print("Time required: %f seconds" % elapsed_time)

#We will use the test data to measure the performance.
#The report and the confusion matrix are computed once and reused for the file output.
target_predicted = gs_NB.predict(reviews_data_test)
report_text = metrics.classification_report(target_test, target_predicted, target_names=reviews_data.target_names)
confusion = metrics.confusion_matrix(target_test, target_predicted)
print(report_text)
print(confusion)

#Output to file (append mode, so the results of every algorithm accumulate in one log).
#The context manager closes the file even if one of the writes fails, and the
#header line identifies which algorithm this record belongs to.
with open('result.txt', 'a') as result_file:
    print("Multinomial Naive Bayes results:", file=result_file)
    print("Best score: %f" % gs_NB.best_score_, file=result_file)
    print("Best parameters: %r" % gs_NB.best_params_, file=result_file)
    print(elapsed_time, file=result_file)
    print(report_text, file=result_file)
    print(confusion, file=result_file)
    print("=================================================", file=result_file)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Multinomial Naive Bayes results:
Best score: 0.831875
Best parameters: {'clf__fit_prior': False, 'tfidf__ngram_range': (1, 2), 'clf__alpha': 0.1}
Time required: 58.015098 seconds
             precision    recall  f1-score   support

        neg       0.86      0.76      0.81       209
        pos       0.77      0.86      0.81       191

avg / total       0.82      0.81      0.81       400

[[159  50]
 [ 26 165]]


[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   50.6s finished


We can proceed with some Support Vector Machine algorithms. We will start with the Linear Support Vector Classifier.

In [5]:
#Create a pipeline that performs the vectorization and then executes the ML algorithm
reviews_classifier_linearSVC = Pipeline([('tfidf', TfidfVectorizer()),
                                         ('clf', LinearSVC())])

#Create a dictionary of algorithm parameters to try out
parameters_linearSVC = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                        'clf__tol': (0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001),
                        'clf__C': (0.8, 0.9, 1.0, 1.1, 1.2),
                        'clf__loss': ('hinge', 'squared_hinge'),
                        'clf__max_iter': (100, 1000, 10000),
                        'clf__penalty': ('l2',),
                        'clf__multi_class': ('ovr', 'crammer_singer')}

#Define a grid search object that will execute the pipeline for all parameter combinations
gs_linearSVC = GridSearchCV(reviews_classifier_linearSVC, parameters_linearSVC, n_jobs=-1, verbose=1)

#Fit the models to the data (and count time required)
start_time = time.time()
gs_linearSVC = gs_linearSVC.fit(reviews_data_train, target_train)
elapsed_time = time.time() - start_time

#Print results
print("Linear Support Vector Machine results:")
print("Best score: %f" % gs_linearSVC.best_score_)
print("Best parameters: %r" % gs_linearSVC.best_params_)
print("Time required: %f seconds" % elapsed_time)

#We will use the test data to measure the performance.
#The report and the confusion matrix are computed once and reused for the file output.
target_predicted = gs_linearSVC.predict(reviews_data_test)
report_text = metrics.classification_report(target_test, target_predicted, target_names=reviews_data.target_names)
confusion = metrics.confusion_matrix(target_test, target_predicted)
print(report_text)
print(confusion)

#Output to file (append mode, so the results of every algorithm accumulate in one log).
#The context manager closes the file even if one of the writes fails. The header
#is written to the file; previously it was printed to stdout a second time and
#the record in result.txt had no algorithm name.
with open('result.txt', 'a') as result_file:
    print("Linear Support Vector Machine results:", file=result_file)
    print("Best score: %f" % gs_linearSVC.best_score_, file=result_file)
    print("Best parameters: %r" % gs_linearSVC.best_params_, file=result_file)
    print(elapsed_time, file=result_file)
    print(report_text, file=result_file)
    print(confusion, file=result_file)
    print("=================================================", file=result_file)

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 28.0min finished


Linear Support Vector Machine results:
Best score: 0.839375
Best parameters: {'clf__max_iter': 100, 'clf__multi_class': 'ovr', 'clf__tol': 0.01, 'tfidf__ngram_range': (1, 1), 'clf__C': 1.2, 'clf__penalty': 'l2', 'clf__loss': 'hinge'}
Time required: 1683.585500 seconds
             precision    recall  f1-score   support

        neg       0.87      0.85      0.86       209
        pos       0.84      0.86      0.85       191

avg / total       0.86      0.85      0.86       400

[[177  32]
 [ 26 165]]
Linear Support Vector Machine results:


Now we will try other kernels of the SVM. We will put different kernels in the parameter list.

In [6]:
#Create a pipeline that performs the vectorization and then executes the ML algorithm
reviews_classifier_SVC = Pipeline([('tfidf', TfidfVectorizer()),
                                   ('clf', SVC())])

#Create a dictionary of algorithm parameters to try out
#NOTE(review): in the recorded run this search scored about 0.51 in
#cross-validation and the refit model predicted 'pos' for every test review.
#The grid never varies gamma and keeps C within 0.8-1.2, which may be why no
#kernel fits the tf-idf features -- consider widening the search; verify.
parameters_SVC = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'clf__C': (0.8, 0.9, 1.0, 1.1, 1.2),
                  'clf__kernel': ('poly', 'rbf', 'sigmoid'),
                  'clf__tol': (0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001),
                  'clf__decision_function_shape': ('ovr', 'ovo')}

#Define a grid search object that will execute the pipeline for all parameter combinations
gs_SVC = GridSearchCV(reviews_classifier_SVC, parameters_SVC, n_jobs=-1, verbose=1)

#Fit the models to the data (and count time required)
start_time = time.time()
gs_SVC = gs_SVC.fit(reviews_data_train, target_train)
elapsed_time = time.time() - start_time

#Print results
print("Support Vector Machine results:")
print("Best score: %f" % gs_SVC.best_score_)
print("Best parameters: %r" % gs_SVC.best_params_)
print("Time required: %f seconds" % elapsed_time)

#We will use the test data to measure the performance.
#The report and the confusion matrix are computed once and reused for the file output.
target_predicted = gs_SVC.predict(reviews_data_test)
report_text = metrics.classification_report(target_test, target_predicted, target_names=reviews_data.target_names)
confusion = metrics.confusion_matrix(target_test, target_predicted)
print(report_text)
print(confusion)

#Output to file (append mode, so the results of every algorithm accumulate in one log).
#The context manager closes the file even if one of the writes fails. The header
#is written to the file; previously it was printed to stdout a second time and
#the record in result.txt had no algorithm name.
with open('result.txt', 'a') as result_file:
    print("Support Vector Machine results:", file=result_file)
    print("Best score: %f" % gs_SVC.best_score_, file=result_file)
    print("Best parameters: %r" % gs_SVC.best_params_, file=result_file)
    print(elapsed_time, file=result_file)
    print(report_text, file=result_file)
    print(confusion, file=result_file)
    print("=================================================", file=result_file)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 12.9min finished


Support Vector Machine results:
Best score: 0.505625
Best parameters: {'tfidf__ngram_range': (1, 1), 'clf__C': 0.8, 'clf__decision_function_shape': 'ovr', 'clf__kernel': 'poly', 'clf__tol': 0.1}
Time required: 782.168875 seconds
             precision    recall  f1-score   support

        neg       0.00      0.00      0.00       209
        pos       0.48      1.00      0.65       191

avg / total       0.23      0.48      0.31       400

[[  0 209]
 [  0 191]]
Support Vector Machine results:


  'precision', 'predicted', average, warn_for)


Trying another family of algorithms, we proceed to the linear models.

The Passive-Aggressive algorithm yields the following results:

In [7]:
#Create a pipeline that performs the vectorization and then executes the ML algorithm
reviews_classifier_PassiveAggressive = Pipeline([('tfidf', TfidfVectorizer()),
                                                 ('clf', PassiveAggressiveClassifier())])

#Create a dictionary of algorithm parameters to try out
parameters_PassiveAggressive = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                                'clf__C': (0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2.),
                                'clf__fit_intercept': (True, False),
                                'clf__n_iter': (1, 2, 3, 5, 8, 13, 21),
                                'clf__shuffle': (True, False),
                                'clf__loss': ('hinge', 'squared_hinge'),
                                'clf__warm_start': (True, False)}

#Define a grid search object that will execute the pipeline for all parameter combinations
gs_PassiveAggressive = GridSearchCV(reviews_classifier_PassiveAggressive, parameters_PassiveAggressive, n_jobs=-1, verbose=1)

#Fit the models to the data (and count time required)
start_time = time.time()
gs_PassiveAggressive = gs_PassiveAggressive.fit(reviews_data_train, target_train)
elapsed_time = time.time() - start_time

#Print results
print("Passive Aggressive linear model results:")
print("Best score: %f" % gs_PassiveAggressive.best_score_)
print("Best parameters: %r" % gs_PassiveAggressive.best_params_)
print("Time required: %f seconds" % elapsed_time)

#We will use the test data to measure the performance.
#The report and the confusion matrix are computed once and reused for the file output.
target_predicted = gs_PassiveAggressive.predict(reviews_data_test)
report_text = metrics.classification_report(target_test, target_predicted, target_names=reviews_data.target_names)
confusion = metrics.confusion_matrix(target_test, target_predicted)
print(report_text)
print(confusion)

#Output to file (append mode, so the results of every algorithm accumulate in one log).
#The context manager closes the file even if one of the writes fails. The header
#is written to the file; previously it was printed to stdout a second time and
#the record in result.txt had no algorithm name.
with open('result.txt', 'a') as result_file:
    print("Passive Aggressive linear model results:", file=result_file)
    print("Best score: %f" % gs_PassiveAggressive.best_score_, file=result_file)
    print("Best parameters: %r" % gs_PassiveAggressive.best_params_, file=result_file)
    print(elapsed_time, file=result_file)
    print(report_text, file=result_file)
    print(confusion, file=result_file)
    print("=================================================", file=result_file)

Fitting 3 folds for each of 1792 candidates, totalling 5376 fits


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 3986 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 4936 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 5376 out of 5376 | elapsed: 20.3min finished


Passive Aggressive linear model results:
Best score: 0.847500
Best parameters: {'clf__shuffle': True, 'clf__fit_intercept': False, 'clf__loss': 'squared_hinge', 'tfidf__ngram_range': (1, 2), 'clf__C': 0.6, 'clf__n_iter': 5, 'clf__warm_start': False}
Time required: 1227.384459 seconds
             precision    recall  f1-score   support

        neg       0.88      0.85      0.86       209
        pos       0.84      0.87      0.86       191

avg / total       0.86      0.86      0.86       400

[[178  31]
 [ 25 166]]
Passive Aggressive linear model results:


Another member of the linear model family is the Ridge classifier. The results follow:

In [8]:
#Create a pipeline that performs the vectorization and then executes the ML algorithm
reviews_classifier_Ridge = Pipeline([('tfidf', TfidfVectorizer()),
                                     ('clf', RidgeClassifier())])

#Create a dictionary of algorithm parameters to try out
parameters_Ridge = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                    'clf__alpha': (0.00001, 0.0001, 0.001, 0.01, 0.1, 1.),
                    'clf__fit_intercept': (True, False),
                    'clf__normalize': (True, False),
                    'clf__tol': (0.00001, 0.0001, 0.001, 0.01, 0.1, 1.)}

#Define a grid search object that will execute the pipeline for all parameter combinations
gs_Ridge = GridSearchCV(reviews_classifier_Ridge, parameters_Ridge, n_jobs=-1, verbose=1)

#Fit the models to the data (and count time required)
start_time = time.time()
gs_Ridge = gs_Ridge.fit(reviews_data_train, target_train)
elapsed_time = time.time() - start_time

#Print results
print("Ridge linear model results:")
print("Best score: %f" % gs_Ridge.best_score_)
print("Best parameters: %r" % gs_Ridge.best_params_)
print("Time required: %f seconds" % elapsed_time)

#We will use the test data to measure the performance.
#The report and the confusion matrix are computed once and reused for the file output.
target_predicted = gs_Ridge.predict(reviews_data_test)
report_text = metrics.classification_report(target_test, target_predicted, target_names=reviews_data.target_names)
confusion = metrics.confusion_matrix(target_test, target_predicted)
print(report_text)
print(confusion)

#Output to file (append mode, so the results of every algorithm accumulate in one log).
#The context manager closes the file even if one of the writes fails. The header
#is written to the file; previously it was printed to stdout a second time and
#the record in result.txt had no algorithm name.
with open('result.txt', 'a') as result_file:
    print("Ridge linear model results:", file=result_file)
    print("Best score: %f" % gs_Ridge.best_score_, file=result_file)
    print("Best parameters: %r" % gs_Ridge.best_params_, file=result_file)
    print(elapsed_time, file=result_file)
    print(report_text, file=result_file)
    print(confusion, file=result_file)
    print("=================================================", file=result_file)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  4.4min finished


Ridge linear model results:
Best score: 0.846250
Best parameters: {'tfidf__ngram_range': (1, 2), 'clf__fit_intercept': False, 'clf__alpha': 0.01, 'clf__normalize': True, 'clf__tol': 0.001}
Time required: 269.739924 seconds
             precision    recall  f1-score   support

        neg       0.89      0.85      0.87       209
        pos       0.84      0.88      0.86       191

avg / total       0.86      0.86      0.86       400

[[177  32]
 [ 23 168]]
Ridge linear model results:


Another family of classifiers is the decision trees. We will give one such algorithm a go:

In [10]:
#Create a pipeline that performs the vectorization and then executes the ML algorithm
reviews_classifier_Decision_Tree = Pipeline([('tfidf', TfidfVectorizer()),
                                             ('clf', DecisionTreeClassifier())])

#Create a dictionary of algorithm parameters to try out
parameters_Decision_Tree = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                            'clf__criterion': ('gini', 'entropy'),
                            'clf__splitter': ('best', 'random'),
                            'clf__max_features': (None, 'sqrt', 'log2')}

#Define a grid search object that will execute the pipeline for all parameter combinations
gs_Decision_Tree = GridSearchCV(reviews_classifier_Decision_Tree, parameters_Decision_Tree, n_jobs=-1, verbose=1)

#Fit the models to the data (and count time required)
start_time = time.time()
gs_Decision_Tree = gs_Decision_Tree.fit(reviews_data_train, target_train)
elapsed_time = time.time() - start_time

#Print results
print("Decision tree results:")
print("Best score: %f" % gs_Decision_Tree.best_score_)
print("Best parameters: %r" % gs_Decision_Tree.best_params_)
print("Time required: %f seconds" % elapsed_time)

#We will use the test data to measure the performance.
#The report and the confusion matrix are computed once and reused for the file output.
target_predicted = gs_Decision_Tree.predict(reviews_data_test)
report_text = metrics.classification_report(target_test, target_predicted, target_names=reviews_data.target_names)
confusion = metrics.confusion_matrix(target_test, target_predicted)
print(report_text)
print(confusion)

#Output to file (append mode, so the results of every algorithm accumulate in one log).
#The context manager closes the file even if one of the writes fails. The header
#is written to the file; previously it was printed to stdout a second time and
#the record in result.txt had no algorithm name.
with open('result.txt', 'a') as result_file:
    print("Decision tree results:", file=result_file)
    print("Best score: %f" % gs_Decision_Tree.best_score_, file=result_file)
    print("Best parameters: %r" % gs_Decision_Tree.best_params_, file=result_file)
    print(elapsed_time, file=result_file)
    print(report_text, file=result_file)
    print(confusion, file=result_file)
    print("=================================================", file=result_file)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Decision tree results:
Best score: 0.646250
Best parameters: {'tfidf__ngram_range': (1, 2), 'clf__criterion': 'entropy', 'clf__splitter': 'random', 'clf__max_features': None}
Time required: 38.029461 seconds
             precision    recall  f1-score   support

        neg       0.62      0.60      0.61       209
        pos       0.57      0.59      0.58       191

avg / total       0.60      0.59      0.60       400

[[125  84]
 [ 78 113]]
Decision tree results:


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   23.0s finished


Finally we will try an ensemble method. We will use many random trees and assess the performance of this method:

In [11]:
#Create a pipeline that performs the vectorization and then executes the ML algorithm
reviews_classifier_ExtraTrees = Pipeline([('tfidf', TfidfVectorizer()),
                                          ('clf', ExtraTreesClassifier())])

#Create a dictionary of algorithm parameters to try out
#(the recorded run of this grid took about 90 minutes, so expect a long wait)
parameters_Extra_Trees = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                          'clf__n_estimators': (10, 100, 1000, 10000, 100000)}

#Define a grid search object that will execute the pipeline for all parameter combinations
gs_ExtraTrees = GridSearchCV(reviews_classifier_ExtraTrees, parameters_Extra_Trees, n_jobs=-1, verbose=1)

#Fit the models to the data (and count time required)
start_time = time.time()
gs_ExtraTrees = gs_ExtraTrees.fit(reviews_data_train, target_train)
elapsed_time = time.time() - start_time

#Print results
print("Ensemble extra trees results:")
print("Best score: %f" % gs_ExtraTrees.best_score_)
print("Best parameters: %r" % gs_ExtraTrees.best_params_)
print("Time required: %f seconds" % elapsed_time)

#We will use the test data to measure the performance.
#The report and the confusion matrix are computed once and reused for the file output.
target_predicted = gs_ExtraTrees.predict(reviews_data_test)
report_text = metrics.classification_report(target_test, target_predicted, target_names=reviews_data.target_names)
confusion = metrics.confusion_matrix(target_test, target_predicted)
print(report_text)
print(confusion)

#Output to file (append mode, so the results of every algorithm accumulate in one log).
#The context manager closes the file even if one of the writes fails. The header
#is written to the file; previously it was printed to stdout a second time and
#the record in result.txt had no algorithm name.
with open('result.txt', 'a') as result_file:
    print("Ensemble extra trees results:", file=result_file)
    print("Best score: %f" % gs_ExtraTrees.best_score_, file=result_file)
    print("Best parameters: %r" % gs_ExtraTrees.best_params_, file=result_file)
    print(elapsed_time, file=result_file)
    print(report_text, file=result_file)
    print(confusion, file=result_file)
    print("=================================================", file=result_file)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Ensemble extra trees results:
Best score: 0.856875
Best parameters: {'tfidf__ngram_range': (1, 2), 'clf__n_estimators': 10000}
Time required: 6123.715922 seconds
             precision    recall  f1-score   support

        neg       0.86      0.92      0.89       209
        pos       0.90      0.84      0.87       191

avg / total       0.88      0.88      0.88       400

[[192  17]
 [ 31 160]]
Ensemble extra trees results:


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 90.4min finished
