## Sentiment Analysis

Let's use the movie review corpus to train a sentiment classifier.

In [None]:
import random
import nltk

In [None]:
from nltk.corpus import movie_reviews

# Pair every review (as a list of word tokens) with its category label.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a fixed seed: the train/test split below is taken from this
# ordering, so an unseeded shuffle would give a different split (and a
# different accuracy) on every fresh run. A dedicated Random instance keeps
# the global random state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)

In [None]:
# Every entry in `documents` is a (words, label) tuple:
# the review as a list of words, followed by its classification.
# Unpack the first entry and take a look at both parts.
first_review_words, first_review_label = documents[0]

# classification
print(first_review_label)
print()  # blank line for spacing

# review
print(" ".join(first_review_words))


In [None]:
# Count how often each lowercased token occurs across the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Number of most frequent corpus words that are used as classifier features.
NUM_WORD_FEATURES = 2500

# most_common() yields (word, count) pairs; keep only the words.
word_features = [word for word, _count in all_words.most_common(NUM_WORD_FEATURES)]


def document_features(document, vocabulary=None):
    """Build the word-presence feature dict for a single document.

    Args:
        document: Iterable of word tokens making up the document.
        vocabulary: Iterable of words to test for. When omitted, the
            notebook-level ``word_features`` list is used.

    Returns:
        A dict with one ``'contains(<word>)'`` key per vocabulary word,
        mapped to ``True`` if that exact token occurs in ``document`` and
        ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each membership test a hash lookup; testing against the
    # raw token list would rescan the whole review for every vocabulary word.
    document_words = set(document)
    return {'contains({})'.format(word): (word in document_words)
            for word in vocabulary}

In [None]:
# Turn every (words, label) document into a (feature dict, label) pair.
featuresets = [(document_features(words), label) for words, label in documents]

# Hold out the first 100 feature sets for testing; train on everything after them.
test_set = featuresets[:100]
train_set = featuresets[100:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Score the trained classifier against the held-out test set.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

In [None]:
# List the 50 features the trained classifier ranks as most informative.
classifier.show_most_informative_features(n=50)