# Non-deep-learning Sentiment Classification #

## Prepare tools ##

In [21]:
# utility
import pandas as pd
import numpy as np

import urllib.request # download files

from collections import Counter
########

# text pre-processing
from bs4 import BeautifulSoup # remove HTML tags
import nltk # natural language processing

from nltk.corpus import stopwords, wordnet # stop words

from nltk.stem.snowball import SnowballStemmer # stemming
from nltk import pos_tag, word_tokenize # identify POS tag, required by lemmatizer
from nltk.stem import WordNetLemmatizer # lemmatization
########

# feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import FeatureHasher
########

from sklearn.model_selection import train_test_split
########

# supervised learning model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
########

# processing time
import timeit
########

from sklearn.metrics import roc_auc_score

**Resources to be downloaded with `nltk` downloader:**
-  `stopwords` - stop words list
-  `wordnet` - lemmatization
-  `punkt` - tokenization
-  `averaged_perceptron_tagger` - POS tag of tokens

In [3]:
working_dir = "/home/lee/Documents/Datasets for GitHub/kaggle_movie_sentiment_analysis/"

# download to working directory
nltk.download(['stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'], download_dir=working_dir)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lee/Documents/Datasets for
[nltk_data]     GitHub/kaggle_movie_sentiment_analysis/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lee/Documents/Datasets for
[nltk_data]     GitHub/kaggle_movie_sentiment_analysis/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/lee/Documents/Datasets
[nltk_data]     for GitHub/kaggle_movie_sentiment_analysis/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/lee/Documents/Datasets for
[nltk_data]     GitHub/kaggle_movie_sentiment_analysis/...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Inspect data ##

In [4]:
labeledTrain = pd.read_csv(working_dir+"labeledTrainData.tsv", delimiter="\t", quoting=3)
# unlabeledTrain = pd.read_csv(working_dir+"unlabeledTrainData.tsv", delimiter="\t", quoting=3)
# test = pd.read_csv(working_dir+"testData.tsv", delimiter="\t", quoting=3)

In [5]:
print('Labeled dataset size: {}'.format(labeledTrain.shape))
print('Inspect labeled dataset:\n {}'.format(labeledTrain.head()))

Labeled dataset size: (25000, 3)
Inspect labeled dataset:
          id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ...


**Reviews are long; access the very first review as a scalar to inspect it. **

In [6]:
labeledTrain.loc[0,'review']

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

## Preprocess the textual data

**Need to clean the punctuations, stop words, and HTML tags, split sentences into individual words, convert all words to lower case and lemmatize them.**

** `nltk.data.path` is a list of locations to search for the downloaded files `nltk` needs. Earlier we downloaded those files to the working directory rather than any of its defaults. **

In [7]:
nltk.data.path.append(working_dir)

# remove html
labeledTrain['removehtml'] = labeledTrain['review'].apply(lambda x: BeautifulSoup(x, "html5lib").text)

# remove punctuation
labeledTrain['removepunc'] = labeledTrain['removehtml'].str.replace('[^\w\s]',' ')

# lower casing
labeledTrain['lower'] = labeledTrain['removepunc'].str.lower()

# tokenize and lemmatize
# adjective, satellite adjective, adverb, noun, verb = 'a', 's', 'r', 'n', 'v'
def lemmatize_after_pos(review):
    lemma_review = []
    for word, tag in pos_tag(word_tokenize(review)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        lemma = WordNetLemmatizer().lemmatize(word, wntag) if wntag else word
        lemma_review.append(lemma)
    return lemma_review

labeledTrain['lemmatized'] = labeledTrain['lower'].apply(lambda x: lemmatize_after_pos(x))

# remove stopwords
stop = stopwords.words('english')
labeledTrain['removestop'] = labeledTrain['lemmatized'].apply(lambda x: [item for item in x if item not in stop])

labeledTrain['processed_review'] = labeledTrain['removestop']

**Inspect the results. **

In [8]:
labeledTrain.loc[0, 'processed_review']

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentary',
 'watch',
 'wiz',
 'watch',
 'moonwalker',
 'maybe',
 'want',
 'get',
 'certain',
 'insight',
 'guy',
 'think',
 'really',
 'cool',
 'eighty',
 'maybe',
 'make',
 'mind',
 'whether',
 'guilty',
 'innocent',
 'moonwalker',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'remember',
 'go',
 'see',
 'cinema',
 'originally',
 'release',
 'subtle',
 'message',
 'mj',
 'feeling',
 'towards',
 'press',
 'also',
 'obvious',
 'message',
 'drug',
 'bad',
 'kay',
 'visually',
 'impressive',
 'course',
 'michael',
 'jackson',
 'unless',
 'remotely',
 'like',
 'mj',
 'anyway',
 'go',
 'hate',
 'find',
 'bore',
 'may',
 'call',
 'mj',
 'egotist',
 'consent',
 'making',
 'movie',
 'mj',
 'fan',
 'would',
 'say',
 'make',
 'fan',
 'true',
 'really',
 'nice',
 'actual',
 'feature',
 'film',
 'bit',
 'finally',
 'start',
 '20',
 'minute',
 'exclude',
 'smooth',
 'criminal',
 'sequence',
 'joe',
 'pe

In [9]:
labeledTrain.drop(['removehtml', 'lemmatized', 'removepunc', 'lower', 'removestop'], axis=1, inplace=True)

In [10]:
labeledTrain.columns

Index(['id', 'sentiment', 'review', 'processed_review'], dtype='object')

## Construct features

**Identify sentiment of individual words **

In [11]:
# download lexicon
urllib.request.urlretrieve('http://www.unc.edu/~ncaren/haphazard/negative.txt', working_dir+"/negative.txt")
urllib.request.urlretrieve('http://www.unc.edu/~ncaren/haphazard/positive.txt', working_dir+"/positive.txt")
########

pos_sent = open(working_dir+'/positive.txt').read()
positive_words = pos_sent.split('\n')

neg_sent = open(working_dir+'/negative.txt').read()
negative_words = neg_sent.split('\n')

print('First 10 positive words: \n {}'.format(positive_words[:10]))
print('First 10 negative words: \n {}'.format(negative_words[:10]))

del pos_sent, neg_sent

First 10 positive words: 
 ['abidance', 'abidance', 'abilities', 'ability', 'able', 'above', 'above-average', 'abundant', 'abundance', 'acceptance']
First 10 negative words: 
 ['abandoned', 'abandonment', 'aberration', 'aberration', 'abhorred', 'abhorrence', 'abhorrent', 'abhorrently', 'abhors', 'abhors']


**Count the numbers of positive and negative words in a review. **

In [12]:
# Intersect the list of a review with the list of positive/negative words. The result is a {word: frequency} dictionary.
# Add up all the word frequencies in a review to get a total count.

positive_counter = labeledTrain['processed_review'].apply(lambda x: Counter(x) & Counter(positive_words)) # The lambda function is the intersection of two counters
positive_counts = positive_counter.apply(lambda x: sum(x.values()))

negative_counter = labeledTrain['processed_review'].apply(lambda x: Counter(x) & Counter(negative_words))
negative_counts = negative_counter.apply(lambda x: sum(x.values()))

labeledTrain = pd.concat([labeledTrain, positive_counts.to_frame('positive_counts'), \
                          negative_counts.to_frame('negative_counts')], axis=1)

del positive_counter, positive_counts, negative_counter, negative_counts

**Bag of words**

**Simplest bag of words, each word has a score of 1. **

In [13]:
vectorizer = CountVectorizer()

bow = vectorizer.fit_transform(labeledTrain['processed_review'].apply(lambda x: ' '.join(x)))

print('Inspect 5 words in the vocabulary:\n {}'\
      .format({k: vectorizer.vocabulary_[k] for k in list(vectorizer.vocabulary_)[:5]}))

Inspect 5 words in the vocabulary:
 {'stuff': 56009, 'go': 24420, 'moment': 38575, 'mj': 38375, 'start': 55253}


**Feature hashing**

**Estimate the size of hashing space allowing 5% collision:**<br>
**$n$ (hashing space size) = $20k$ (feature space size)** [formula reference](https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087)<br>
[Raw bag of words with count as score](https://stackoverflow.com/questions/15507172/how-to-get-bag-of-words-from-textual-data)

In [15]:
hashing_space_size = int(2**np.floor(np.log(20*bow.shape[1])/np.log(2)))

raw_bagofwords = labeledTrain['processed_review'].apply(lambda row: Counter(row))

hasher = FeatureHasher(n_features=2**20)#hashing_space_size)
hashed = hasher.transform(raw_bagofwords)

## Train models ##

In [16]:
X_train, X_test, y_train, y_test = train_test_split(labeledTrain.drop(['sentiment'], axis=1), \
                                                    labeledTrain['sentiment'], test_size=0.33, \
                                                    random_state=0)

index_train = X_train.index.tolist()
index_test = X_test.index.tolist()

X_train_bow = bow.tocsr()[index_train, :] # slice a sparse matrix by row
X_test_bow = bow.tocsr()[index_test, :]

X_train_fh = hashed.tocsr()[index_train, :] 
X_test_fh = hashed.tocsr()[index_test, :]

### Logistic regression ###

In [23]:
logistic_counts = logistic_bow = logistic_fh = LogisticRegression()

tic = timeit.default_timer()
logistic_counts.fit(X_train.loc[:, ['positive_counts','negative_counts']], y_train)
print("Training time in seconds: {}".format(timeit.default_timer() - tic))

print('Training set accuracy: {}'\
      .format(logistic_counts\
              .score(X_train.loc[:, ['positive_counts','negative_counts']], y_train)))
print('Test set accuracy: {}'\
      .format(logistic_counts\
              .score(X_test.loc[:, ['positive_counts','negative_counts']], y_test)))
print('Test set AUC: {}'\
      .format(roc_auc_score(y_test, \
                            logistic_counts.predict_proba\
                            (X_test.loc[:, ['positive_counts','negative_counts']])[:,1])))

Training time in seconds: 0.03848735394421965
Training set accuracy: 0.7096716417910448
Test set accuracy: 0.7064242424242424
Test set AUC: 0.7698403966036869


In [24]:
tic = timeit.default_timer()
logistic_bow.fit(X_train_bow, y_train)
print("Training time in seconds: {}".format(timeit.default_timer() - tic))

print('Training set accuracy: {}'\
      .format(logistic_bow\
              .score(X_train_bow, y_train)))
print('Test set accuracy: {}'\
      .format(logistic_bow\
              .score(X_test_bow, y_test)))
print('Test set AUC: {}'\
      .format(roc_auc_score(y_test, \
                            logistic_bow.predict_proba\
                            (X_test_bow)[:,1])))

Training time in seconds: 3.2100722460309044
Training set accuracy: 0.9986268656716418
Test set accuracy: 0.8752727272727273
Test set AUC: 0.9379977302036263


In [25]:
tic = timeit.default_timer()
logistic_fh.fit(X_train_fh, y_train)
print("Training time in seconds: {}".format(timeit.default_timer() - tic))

print('Training set accuracy: {}'\
      .format(logistic_fh\
              .score(X_train_fh, y_train)))
print('Test set accuracy: {}'\
      .format(logistic_fh\
              .score(X_test_fh, y_test)))
print('Test set AUC: {}'\
      .format(roc_auc_score(y_test, \
                            logistic_fh.predict_proba\
                            (X_test_fh)[:,1])))

Training time in seconds: 8.690083815949038
Training set accuracy: 0.9987462686567165
Test set accuracy: 0.8772121212121212
Test set AUC: 0.9401828309282648


### Naive Bayes classifier ###

In [26]:
multinomial_nb_counts = multinomial_nb_bow = multinomial_nb_fh = MultinomialNB()

tic = timeit.default_timer()
multinomial_nb_counts.fit(X_train.loc[:, ['positive_counts','negative_counts']], y_train)
print("Training time in seconds: {}".format(timeit.default_timer() - tic))

print('Multinomial naive Bayes, word counts as features')
print('Training set accuracy: {}'\
      .format(multinomial_nb_counts\
              .score(X_train.loc[:, ['positive_counts','negative_counts']], y_train)))
print('Test set accuracy: {}'\
      .format(multinomial_nb_counts\
              .score(X_test.loc[:, ['positive_counts','negative_counts']], y_test)))
print('Test set AUC: {}'\
      .format(roc_auc_score(y_test, \
                            multinomial_nb_counts.predict_proba\
                            (X_test.loc[:, ['positive_counts','negative_counts']])[:,1])))

print('Multinomial naive Bayes, bag of words')
tic = timeit.default_timer()
multinomial_nb_bow.fit(X_train_bow, y_train)
print("Training time in seconds: {}".format(timeit.default_timer() - tic))


print('Training set accuracy: {}'\
      .format(multinomial_nb_bow\
              .score(X_train_bow, y_train)))
print('Test set accuracy: {}'\
      .format(multinomial_nb_bow\
              .score(X_test_bow, y_test)))
print('Test set AUC: {}'\
      .format(roc_auc_score(y_test, \
                            multinomial_nb_bow.predict_proba\
                            (X_test_bow)[:,1])))

Training time in seconds: 0.005976050975732505
Multinomial naive Bayes, word counts as features
Training set accuracy: 0.7094328358208956
Test set accuracy: 0.704
Test set AUC: 0.7682891907956695
Training time in seconds: 0.02025564794894308
Multinomial naive Bayes, bag of words
Training set accuracy: 0.9154626865671642
Test set accuracy: 0.8621818181818182
Test set AUC: 0.9263147265632415


**Feature hashing generates a matrix where some elements are negative. Multinomial naive Bayes requires the input matrix to be non-negative so I did not train a multinomial naive Bayes model on the hashed features. **

Conclusion: Naive Bayes takes less time to train and has similar performance to logistic regression. Feature hashing appears promising and I need to do more analyses with it to actually see its advantages and disadvantages. 