####  Import libraries as needed

In [1]:
import os
import re
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from scipy.stats import binom

import warnings
warnings.filterwarnings('ignore')

***
#### Download and uncompress datasets

The datasets being used are as follow:
* `Pang & Lee's movie review dataset “polarity dataset v2.0”, review_polarity.tar.gz, ` consists of `'positive'` and `'negative'` movie reviews that are stored in `pos` and `neg` directories, subdirectories of `txt_sentoken` of `review_polarity`.

* `Bing Liu’s sentiment lexicon dataset “opinion lexicon”, opinion-lexicon-English.rar, ` consists of `positive-words` and `negative-words` `.txt` files in `opinion-lexicon-English` directory.

* Link to [polarity dataset v2.0](http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)

* Link to [opinion lexicon](http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar)

* For `Mac` users, `The Unarchiver` is one of the apps that is used to uncompress `.rar` file.

***
#### Exploring Bing Liu's sentiment lexicon dataset

In [2]:
def extract_sentiment_lexicon(file_dir):
    """
    Extract and tokenize Bing Liu's sentiment lexicon.
    """
    try:
        # Create a sentiment lexicon dictionary
        sentiment_lexicon = {}
        with open(file_dir, errors='ignore') as f:
            # Read all the lines
            lines = f.readlines() 
            for line in lines:
                # Normalize the case and strip the lines
                line = line.lower().strip() 
                # Remove punctuation characters
                line = re.sub(r'[^\w\+\*\-]', ' ', line)
                # Extract a single word in a line
                if len(line.split(' ')) == 1 and line.split(' ')[0] != '':
                    sentiment_lexicon[line] = len(sentiment_lexicon)
        return sentiment_lexicon
    except:
        print('Extract data failed. Please check data directory.')

In [3]:
# File path refer to Bing Liu’s positive sentiment lexicon
pos_sentiment_lexicon_dir = './opinion-lexicon-English/positive-words.txt'
pos_sentiment_lexicon = extract_sentiment_lexicon(pos_sentiment_lexicon_dir)
print('Overview of positive sentiment lexicons:\n', 
      list(pos_sentiment_lexicon)[:10])

# File path refer to Bing Liu’s negative sentiment lexicon
neg_sentiment_lexicon_dir = './opinion-lexicon-English/negative-words.txt'
neg_sentiment_lexicon = extract_sentiment_lexicon(neg_sentiment_lexicon_dir)
print('\nOverview of negative sentiment lexicons:\n', 
      list(neg_sentiment_lexicon)[:10])

Overview of positive sentiment lexicons:
 ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']

Overview of negative sentiment lexicons:
 ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


In [4]:
print('Vocabulary of positive sentiment lexicon:', 
      len(pos_sentiment_lexicon.keys()), 'words')
print('Vocabulary of negative sentiment lexicon:', 
      len(neg_sentiment_lexicon.keys()), 'words')

Vocabulary of positive sentiment lexicon: 2006 words
Vocabulary of negative sentiment lexicon: 4783 words


***
#### Exploring Pang & Lee's movie review data

In [5]:
def read_movie_review_data(data_dir='./review_polarity/txt_sentoken'):
    """
    Extract moview review data from given directory.
    """
    try:
        # Data and labels to be returned
        data = []
        labels = []

        # Assume 2 sub-directories: neg, pos
        for sentiment in ['neg', 'pos']:
            dir_name = os.path.join(data_dir, sentiment)
            for fname in os.listdir(dir_name):
                if fname[-4:] == '.txt':
                    with open(os.path.join(dir_name, fname)) as f:
                        data.append(f.read())
                        if sentiment == 'neg':
                            labels.append(0)
                        else:
                            labels.append(1)
        return np.array(data), np.array(labels)
    except:
        print('Read data failed. Please check data directory.')

In [6]:
# Read movie review data
data, labels = read_movie_review_data()
print('Moive reviews: {} positive and {} negative'.format(
    len(labels) - np.sum(labels), len(labels) - np.sum(labels)))

Moive reviews: 1000 positive and 1000 negative


In [7]:
print('A quick look at one of the positive reviews:\n')
print(data[1400])

A quick look at one of the positive reviews:

the happy bastard's 30-second review : 
american pie 
the summer of raunch continues to spread into theatres with this latest yuk fest , filled with sick jokes and teen dialogue aplenty . 
if you go expecting dawson's creek , you're in for a problem . 
if your expectations are lower ( and better , i might add ) , you will enjoy the hell out of american pie . 
the movie casts several unknowns , with the only real recognizable one being sctv's own eugene levy as a happy-go-lucky dad . 
the story revolves around four high school seniors who have one goal before the school year gets out- get laid . 
that's pretty much it . 
throughout the movie , little sick comic bits are sprinkled throughout , including a memorable scene involving an apple pie ( i won't give it away , but you probably know what it is ) and an internet broadcast gone horribly awry . 
of course , the movie has some slightly sentimental bits , but they don't drag the movie's hum

In [8]:
print('A quick look at one of the negative reviews:\n')
print(data[1])

A quick look at one of the negative reviews:

the happy bastard's quick movie review 
damn that y2k bug . 
it's got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . 
little do they know the power within . . . 
going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . 
we don't know why the crew was really out in the middle of nowhere , we don't know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don't know why donald sutherland is stumbling around drunkenly throughout . 
here , it's just " hey , let's chase these people around with some robots " . 
the acting is below average , even from the likes of curtis . 
you're more 

***
#### Preprocess movie review data

In [9]:
def review_to_words(data):
    """
    Convert a raw text review into a sequence of words.
    """
    # Remove punctuation characters
    data_normalized = []
    for text in data:
        # Convert to lowercase and remove certain punctuation characters
        tmp_text = re.sub(r'[^\w\'\+\*\-/]', ' ', text.lower())
        data_normalized.append(' '.join(tmp_text.split()))

    return np.array(data_normalized)

In [10]:
data_normalized = review_to_words(data)
print('A quick look at one of the normalized reviews:\n\n', data_normalized[1])

A quick look at one of the normalized reviews:

 the happy bastard's quick movie review damn that y2k bug it's got a head start in this movie starring jamie lee curtis and another baldwin brother william this time in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on little do they know the power within going for the gore and bringing on a few action sequences here and there virus still feels very empty like a movie going for all flash and no substance we don't know why the crew was really out in the middle of nowhere we don't know the origin of what took over the ship just that a big pink flashy thing hit the mir and of course we don't know why donald sutherland is stumbling around drunkenly throughout here it's just hey let's chase these people around with some robots the acting is below average even from the likes of curtis you're more likely to get a kick out of her work in halloween h20

***
#### Form train and test sets

In [11]:
# 8/2 split
X_train, X_test, y_train, y_test = train_test_split(data_normalized,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=labels)

print("""Movie reviews: 
  train data = {}, positive train labels = {}, negative train labels = {}
  test data = {}, positive test labels = {}, negative test labels = {}""".
      format(len(X_train), np.sum(y_train), len(y_train) - np.sum(y_train),
             len(X_test), np.sum(y_test), len(y_test) - np.sum(y_test)))

Movie reviews: 
  train data = 1600, positive train labels = 800, negative train labels = 800
  test data = 400, positive test labels = 200, negative test labels = 200


***
#### Compute Bag-of-Words features

In [12]:
# Instantiate CountVectorizer
count_vector = CountVectorizer()

# Fit the training data and return the matrix
X_train_vectorized = count_vector.fit_transform(X_train).toarray()

# Transofmr the testing data and return the marix
X_test_vectorized = count_vector.transform(X_test).toarray()

In [13]:
print('Vocabulary of movie reviews:', len(count_vector.vocabulary_.keys()))
print('\nSample words:', list(count_vector.vocabulary_.keys())[:30])

Vocabulary of movie reviews: 36180

Sample words: ['still', 'can', 'figure', 'out', 'why', 'people', 'went', 'in', 'droves', 'to', 'see', 'this', 'movie', 'now', 'before', 'you', 'go', 'assuming', 'some', 'sort', 'of', 'high', 'brow', 'snob', 'who', 'appreciate', 'little', 'dumb', 'humor', 'let']


***
#### Classification using lexicon-based classifier

In [14]:
def lexicon_based_clf(X_test, pos, neg):
    """
    Implementation of the lexicon-based classifier mentioned by J. Eisenstein.
    Classify each document in texts as positive 
    iff it has more positive sentiment words 
    than negative sentiment words.
    """
    y_pred = []
    for text in X_test:
        pos_count = 0 # number of positive sentiment lexicons
        neg_count = 0 # number of negative sentiment lexicons
        for word in text.split():
            if word in pos:
                pos_count += 1
            if word in neg:
                neg_count += 1
        if pos_count > neg_count:
            y_pred.append(1)
        else:
            y_pred.append(0)
            
    return np.array(y_pred)

In [15]:
y_pred_lb = lexicon_based_clf(X_test, 
                           pos_sentiment_lexicon, 
                           neg_sentiment_lexicon)

print('Lexicon-based classifier performance:\n')
print(classification_report(y_test, 
                            y_pred_lb, 
                            target_names=['negative', 'positive']))

Lexicon-based classifier performance:

              precision    recall  f1-score   support

    negative       0.69      0.74      0.71       200
    positive       0.72      0.67      0.69       200

    accuracy                           0.70       400
   macro avg       0.70      0.70      0.70       400
weighted avg       0.70      0.70      0.70       400



***
#### Classification using Logistic Regression classifier

In [16]:
# Train a logistic regression classifier
logistic_clf = LogisticRegression(random_state=0).fit(X_train_vectorized, 
                                                      y_train)

In [17]:
y_pred_lg = logistic_clf.predict(X_test_vectorized)

print('Logistic regression classifier performance:\n')
print(classification_report(y_test, 
                            y_pred_lg, 
                            target_names=['negative', 'positive']))

Logistic regression classifier performance:

              precision    recall  f1-score   support

    negative       0.85      0.85      0.85       200
    positive       0.85      0.85      0.85       200

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



***
#### Determine whether the differences of the performances are significant

##### The binomial test on accuracy

In [18]:
# N instances in a test set that 
# lexicon-based classifier and logistic regression classifier disagree on
disagrees = ~(y_pred_lb == y_pred_lg)
N = np.sum(disagrees)
print('Number of instances disagree on:', N)

Number of instances disagree on: 124


In [19]:
# k instances in N instances that lexicon-based classifier is correct on
corrects_lb = (y_pred_lb == y_test)
correct_lb_on_disagrees = (corrects_lb == disagrees)
k = np.sum(correct_lb_on_disagrees)
print('Number of instances that lexicon-based classifier is correct on:', k)

Number of instances that lexicon-based classifier is correct on: 59


In [20]:
# Compute a two-tailed p-value
p = 0.5 # binary class
p_val = binom.pmf(k, N, p) * 2 # two-tailed
print('p_value:', p_val)
print('p_value < 0.05:', p_val < 0.05)

p_value: 0.12382904545780314
p_value < 0.05: False


Based on the `two-tailed p-value`, lexicon-based classifier and logistic regression classifier are equally accurate in terms of accuracy.

***
##### The bootstrap test on macro-F-measure

In [21]:
# number of bootstrap samples
M = 1000 

# number of test instances
N = len(X_test) 

# Matrix to store the difference of f-measure
delta_f_measure = np.zeros((M, 1)) 

# Implementation of bootstrap-sample procedure mentioned by J. Eisenstein.
for m in range(M):
    # Resample instances from the test set with replacement
    resample_index = np.random.choice(np.random.randint(0, N, N), 
                                      size=N, 
                                      replace=True)
    
    # Predictions of lexicon-based classifier on boostrap sample
    y_pred_lb = lexicon_based_clf(X_test[resample_index], 
                                       pos_sentiment_lexicon, 
                                       neg_sentiment_lexicon)
    
    # Predictions of logistic regression classifier on boostrap sample
    y_pred_lg = logistic_clf.predict(X_test_vectorized[resample_index])
    
    # Ground true labels of the boostrap sample
    y_true = y_test[resample_index]
    
    # f-meausre score of lexicon-based classifier
    f_measure_lb = f1_score(y_true, y_pred_lb)
    
    # f-measure score of logistic regression classifier
    f_measure_lg = f1_score(y_true, y_pred_lg)
    
    # Difference of f-measure between lexicon-based and logistic classifiers
    delta_f_measure[m, 0] = f_measure_lb - f_measure_lg
    
    # Monitor the progress of bootstrap samples
    sys.stdout.write("\rProgress: {}/{} boostrap samples".format(m+1, M))
    sys.stdout.flush()

Progress: 1000/1000 boostrap samples

In [22]:
print('Number of differences of f-measure that is less than or equal to 0:',
      np.sum(delta_f_measure <= 0))
print('Percentage of differences of f-measure that is less than or equal to 0: {:.2f}%'.
      format(np.sum(delta_f_measure <= 0)/M * 100))

Number of differences of f-measure that is less than or equal to 0: 1000
Percentage of differences of f-measure that is less than or equal to 0: 100.00%


Based on the boostrap test, logistic regression classifier is at least as good as lexicon-based one in terms of F-measure.

##### Conclusion

Thus, based on the results of the above tests, I would prefer logistic regression classifier to lexicon-based one.