# Basic Sentiment Analysis On Movie Reviews

`nltk` is the most popular Python package for Natural Language Processing. It provides tools for importing, cleaning, and pre-processing human-language text, and for applying computational-linguistics algorithms such as sentiment analysis to it.

**Analysis 1**

This program uses the `movie_reviews` dataset from nltk's corpus, consisting of 2000 reviews labelled as positive or negative. A bag-of-words vocabulary is built from the 2000 most common words (after stop-word and punctuation removal) across the entire corpus, and each review is represented by a binary feature vector recording which of those 2000 words it contains. A basic sentiment analysis is then carried out with nltk's built-in Naive Bayes classifier, using an 80/20 split between training and test data.

In [26]:
import nltk
from nltk.corpus import movie_reviews as reviews
import random
import string

# Helper function to create one-hot feature vectors
def review_features(doc):
    """Encode a tokenised review as word-presence flags.

    Args:
        doc: iterable of word tokens making up one review.

    Returns:
        dict mapping every word in the module-level ``topKeys`` vocabulary
        to True if that word occurs in ``doc`` and False otherwise.
    """
    # Build the set once so each vocabulary lookup is a hash probe.
    present = set(doc)
    return {word: (word in present) for word in topKeys}

# Helper function for stopword removal
def build_bag_of_words_features_filtered(words):
    """Return a bag-of-words feature dict with stop words removed.

    Args:
        words: iterable of word tokens.

    Returns:
        dict mapping each token that is not in ``stopWords`` to 1.
        Repeated tokens collapse into a single key.
    """
    # ``stopWords`` is the module-level stop-word/punctuation collection;
    # it must be defined before this function is called.
    return {word: 1 for word in words if word not in stopWords}

# Helper function for stemming
def stem_with_porter(words):
    """Return a new list holding the Porter stem of each word, in order.

    Args:
        words: iterable of word tokens.

    Returns:
        list with one stemmed token per input token.
    """
    stemmer = nltk.PorterStemmer()
    stemmed = []
    for token in words:
        stemmed.append(stemmer.stem(token))
    return stemmed

# Load the reviews dataset as (token list, category) pairs. The pairs are
# generated category by category, so shuffle before the positional
# train/test split below.
docs = [
    (list(reviews.words(fileid)), cat)
    for cat in reviews.categories()
    for fileid in reviews.fileids(cat)
]
random.shuffle(docs)

# Create a Bag-of-Words
stopWords = nltk.corpus.stopwords.words("english") + list(string.punctuation) + ['--', '---']
# Set copy of the stop-word list: membership is tested once per corpus
# token, and a list would be scanned linearly every time.
stopWordSet = set(stopWords)
# Compare the lower-cased token so the filter agrees with the lower-casing
# applied when the frequencies are counted just below.
filtered_words = [word for word in reviews.words() if word.lower() not in stopWordSet]
#stemmed_words = stem_with_porter(filtered_words)
fd = nltk.FreqDist(word.lower() for word in filtered_words)
# Vocabulary: the 2000 most frequent remaining words.
# NOTE(review): the vocabulary is drawn from every review, including the
# ones that end up in the test split.
topKeys = [key for key, _count in fd.most_common(2000)]

data = [(review_features(doc), label) for doc, label in docs]

# 80% of the shuffled reviews train the classifier; the rest are held out.
dataCount = len(data)
trainCount = int(0.8 * dataCount)

trainData = data[:trainCount]
testData = data[trainCount:]
bayes = nltk.NaiveBayesClassifier.train(trainData)

print('train accuracy=', nltk.classify.accuracy(bayes, trainData))
print('test accuracy=', nltk.classify.accuracy(bayes, testData))

bayes.show_most_informative_features(20)

train accuracy= 0.864375
test accuracy= 0.8025
Most Informative Features
             outstanding = True              pos : neg    =     10.9 : 1.0
                  seagal = True              neg : pos    =      7.6 : 1.0
                   mulan = True              pos : neg    =      7.2 : 1.0
                   damon = True              pos : neg    =      6.3 : 1.0
                   awful = True              neg : pos    =      6.1 : 1.0
             wonderfully = True              pos : neg    =      5.8 : 1.0
                  poorly = True              neg : pos    =      5.8 : 1.0
               laughable = True              neg : pos    =      5.4 : 1.0
              ridiculous = True              neg : pos    =      5.3 : 1.0
                    lame = True              neg : pos    =      5.3 : 1.0
                 unfunny = True              neg : pos    =      5.2 : 1.0
                  wasted = True              neg : pos    =      5.1 : 1.0
                   waste = 

**Analysis 2**

This program uses the same dataset. However, each review here is represented by a bag-of-words built from the 200 most common words occurring in it (after which stop words and punctuation are dropped).

In [27]:
import nltk
from nltk.corpus import movie_reviews as reviews
from nltk.classify import NaiveBayesClassifier
import string

# Helper function to create Bag-of-Words Feature vectors
def build_bag_of_words_features_filtered(words):
    """Build a bag-of-words feature dict from a review's most frequent words.

    The tokens are lower-cased and counted, the 200 most common are taken,
    and any of those found in the module-level ``stopWords`` are discarded.

    Args:
        words: iterable of word tokens for one review.

    Returns:
        dict mapping each surviving token to 1.
    """
    counts = nltk.FreqDist(token.lower() for token in words)
    features = {}
    for token, _count in counts.most_common(200):
        # Stop words are removed after the top-200 cut, so the result
        # usually holds fewer than 200 entries.
        if token in stopWords:
            continue
        features[token] = 1
    return features

# Stop words plus single punctuation characters; read by
# build_bag_of_words_features_filtered when it filters features.
stopWords = nltk.corpus.stopwords.words("english") + list(string.punctuation)

negative_fileids = reviews.fileids('neg')
positive_fileids = reviews.fileids('pos')

# One (feature dict, label) pair per review file, kept per class so the
# split below can take the same number of reviews from each.
negative_features = [
    (build_bag_of_words_features_filtered(reviews.words(fileids=[fileid])), 'neg')
    for fileid in negative_fileids
]

positive_features = [
    (build_bag_of_words_features_filtered(reviews.words(fileids=[fileid])), 'pos')
    for fileid in positive_fileids
]

# The first `split` reviews of each class train the classifier; the
# remainder of each class is held out for testing.
split = 800
train_set = positive_features[:split] + negative_features[:split]
test_set = positive_features[split:] + negative_features[split:]

sentiment_classifier = NaiveBayesClassifier.train(train_set)

# Accuracies are reported as percentages.
train_accuracy = nltk.classify.util.accuracy(sentiment_classifier, train_set) * 100
test_accuracy = nltk.classify.util.accuracy(sentiment_classifier, test_set) * 100

print('train accuracy=', train_accuracy)
print('test accuracy=', test_accuracy)

sentiment_classifier.show_most_informative_features(20)

train accuracy= 99.75
test accuracy= 78.0
Most Informative Features
                   damon = 1                 pos : neg    =     12.3 : 1.0
             outstanding = 1                 pos : neg    =     10.6 : 1.0
                  seagal = 1                 neg : pos    =     10.3 : 1.0
              undercover = 1                 neg : pos    =      9.7 : 1.0
              schumacher = 1                 neg : pos    =      9.7 : 1.0
               stretched = 1                 neg : pos    =      9.0 : 1.0
                    lame = 1                 neg : pos    =      8.6 : 1.0
                downhill = 1                 neg : pos    =      8.3 : 1.0
                 chuckle = 1                 neg : pos    =      8.3 : 1.0
                instinct = 1                 neg : pos    =      8.3 : 1.0
             wonderfully = 1                 pos : neg    =      8.1 : 1.0
               uplifting = 1                 pos : neg    =      7.7 : 1.0
              capitalize = 1    